1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 */
27/*
28 * Copyright (c) 2010, Intel Corporation.
29 * All rights reserved.
30 */
31/*
32 * Portions Copyright 2009 Advanced Micro Devices, Inc.
33 */
34/*
35 * Copyright 2019 Joyent, Inc.
36 */
37
38/*
39 * CPU Identification logic
40 *
41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 * with the identification of CPUs, their features, and their topologies. More
43 * specifically, this file helps drive the following:
44 *
45 * 1. Enumeration of features of the processor which are used by the kernel to
46 *    determine what features to enable or disable. These may be instruction set
47 *    enhancements or features that we use.
48 *
49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 *    will be told about through the auxiliary vector.
51 *
52 * 3. Understanding the physical topology of the CPU such as the number of
 *    caches, how many cores it has, whether or not it supports simultaneous
 *    multi-threading (SMT), etc.
55 *
56 * ------------------------
57 * CPUID History and Basics
58 * ------------------------
59 *
60 * The cpuid instruction was added by Intel roughly around the time that the
 * original Pentium was introduced. The purpose of cpuid was to report, in a
 * programmatic fashion, information about the CPU that previously had to be
 * guessed at. For example, an important part of cpuid is that we can know
 * what extensions to the ISA exist. Using an unsupported opcode raises a #UD
 * exception, so this method allows a program (whether a user program or the
 * kernel) to determine what exists without crashing or getting a SIGILL. Of
 * course,
67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 * name shows up first in cpuid for a reason.
69 *
70 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 * its own meaning. The different leaves are broken down into different regions:
73 *
74 *	[ 0, 7fffffff ]			This region is called the 'basic'
75 *					region. This region is generally defined
76 *					by Intel, though some of the original
77 *					portions have different meanings based
78 *					on the manufacturer. These days, Intel
79 *					adds most new features to this region.
80 *					AMD adds non-Intel compatible
81 *					information in the third, extended
82 *					region. Intel uses this for everything
83 *					including ISA extensions, CPU
84 *					features, cache information, topology,
85 *					and more.
86 *
87 *					There is a hole carved out of this
88 *					region which is reserved for
89 *					hypervisors.
90 *
91 *	[ 40000000, 4fffffff ]		This region, which is found in the
92 *					middle of the previous region, is
93 *					explicitly promised to never be used by
94 *					CPUs. Instead, it is used by hypervisors
95 *					to communicate information about
96 *					themselves to the operating system. The
97 *					values and details are unique for each
98 *					hypervisor.
99 *
100 *	[ 80000000, ffffffff ]		This region is called the 'extended'
101 *					region. Some of the low leaves mirror
102 *					parts of the basic leaves. This region
103 *					has generally been used by AMD for
104 *					various extensions. For example, AMD-
105 *					specific information about caches,
106 *					features, and topology are found in this
107 *					region.
108 *
 * To query a leaf, you place the desired leaf number into %eax, zero %ecx
 * (which selects the sub-leaf, where applicable), and then issue the cpuid
 * instruction. At the first leaf in each of the ranges, one of the primary
 * things returned is the maximum valid leaf in that range. This allows for
 * discovery of what range of CPUID is valid.
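 *
 * To make the mechanics concrete, the following is a minimal user-space sketch
 * (not the kernel's own interface) of issuing cpuid and discovering the
 * maximum basic and extended leaves. It assumes a GCC or Clang toolchain on
 * x86; the cpuid_raw() helper and cpuid_regs struct are hypothetical names
 * used only for illustration here.
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	struct cpuid_regs {
 *		uint32_t eax, ebx, ecx, edx;
 *	};
 *
 *	// Issue cpuid with the given leaf in %eax and sub-leaf in %ecx.
 *	static void
 *	cpuid_raw(uint32_t leaf, uint32_t subleaf, struct cpuid_regs *r)
 *	{
 *		__asm__ volatile("cpuid"
 *		    : "=a" (r->eax), "=b" (r->ebx), "=c" (r->ecx), "=d" (r->edx)
 *		    : "a" (leaf), "c" (subleaf));
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		struct cpuid_regs r;
 *
 *		cpuid_raw(0x0, 0, &r);		// first basic leaf
 *		printf("max basic leaf: 0x%x\n", r.eax);
 *		cpuid_raw(0x80000000u, 0, &r);	// first extended leaf
 *		printf("max extended leaf: 0x%x\n", r.eax);
 *		return (0);
 *	}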
113 *
114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 * an invalid extended leaf will return the information for leaf 3.
121 *
122 * Some leaves are broken down into sub-leaves. This means that the value
123 * depends on both the leaf asked for in %eax and a secondary register. For
124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 * additional information. Or when getting topology information in leaf 0xb, the
126 * initial value in %ecx changes which level of the topology that you are
127 * getting information about.
128 *
129 * cpuid values are always kept to 32 bits regardless of whether or not the
130 * program is in 64-bit mode. When executing in 64-bit mode, the upper
 * 32 bits of each register are always set to zero so that the values are the
 * same regardless of execution mode.
133 *
134 * ----------------------
135 * Identifying Processors
136 * ----------------------
137 *
138 * We can identify a processor in two steps. The first step looks at cpuid leaf
 * 0. Leaf 0 contains the processor's vendor information. This is done by
 * returning a 12 character ASCII string spread across %ebx, %edx, and %ecx (in
 * that order). On AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
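 *
 * As a hedged illustration, reusing the hypothetical cpuid_raw() helper and
 * cpuid_regs struct sketched earlier, the vendor string can be reassembled
 * like this (note the %ebx, %edx, %ecx ordering):
 *
 *	#include <string.h>
 *
 *	// Build the 12 character vendor string from leaf 0.
 *	static void
 *	cpuid_vendor(char buf[13])
 *	{
 *		struct cpuid_regs r;
 *
 *		cpuid_raw(0x0, 0, &r);
 *		memcpy(&buf[0], &r.ebx, 4);
 *		memcpy(&buf[4], &r.edx, 4);
 *		memcpy(&buf[8], &r.ecx, 4);
 *		buf[12] = '\0';		// e.g. "GenuineIntel" or "AuthenticAMD"
 *	}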
142 *
143 * From there, a processor is identified by a combination of three different
144 * values:
145 *
146 *  1. Family
147 *  2. Model
148 *  3. Stepping
149 *
150 * Each vendor uses the family and model to uniquely identify a processor. The
151 * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since the
153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 * identify the exact processor. Different models are often used for the client
155 * (consumer) and server parts. Even though each processor often has major
156 * architectural differences, they still are considered the same family by
157 * Intel.
158 *
159 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer is family 0x15, and Zen is
 * family 0x17. Within a family, the model number is used to help identify
 * specific processors.
162 *
163 * The stepping is used to refer to a revision of a specific microprocessor. The
164 * term comes from equipment used to produce masks that are used to create
165 * integrated circuits.
166 *
 * The information is present in leaf 1, %eax. In technical documentation you
 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. When the base family field
 * is at its maximum value of 0xf, the extended family field, which occupies
 * previously reserved bits, is added to it. Similarly, the extended model field
 * supplies the upper four bits of the model number, allowing for a much larger
 * number of families and models.
173 *
174 * When we process this information, we store the full family, model, and
175 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176 * cpi_step, respectively. Whenever you are performing comparisons with the
177 * family, model, and stepping, you should use these members and not the raw
178 * values from cpuid. If you must use the raw values from cpuid directly, you
179 * must make sure that you add the extended model and family to the base model
180 * and family.
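 *
 * The following is a hedged sketch of that adjustment. The leaf 1 %eax value
 * would come from something like the hypothetical cpuid_raw() helper sketched
 * earlier, and the family 0x6 extended-model case (an Intel convention) is
 * included as commonly documented:
 *
 *	// Decode family, model, and stepping from leaf 1, %eax.
 *	static void
 *	cpuid_ident(uint32_t eax, uint32_t *family, uint32_t *model,
 *	    uint32_t *stepping)
 *	{
 *		uint32_t base_family = (eax >> 8) & 0xf;
 *		uint32_t base_model = (eax >> 4) & 0xf;
 *		uint32_t ext_family = (eax >> 20) & 0xff;
 *		uint32_t ext_model = (eax >> 16) & 0xf;
 *
 *		*family = base_family;
 *		*model = base_model;
 *		*stepping = eax & 0xf;
 *
 *		if (base_family == 0xf)
 *			*family += ext_family;
 *		if (base_family == 0xf || base_family == 0x6)
 *			*model |= ext_model << 4;
 *	}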
181 *
182 * In general, we do not use information about the family, model, and stepping
183 * to determine whether or not a feature is present; that is generally driven by
184 * specific leaves. However, when something we care about on the processor is
185 * not considered 'architectural' meaning that it is specific to a set of
186 * processors and not promised in the architecture model to be consistent from
187 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata
 * in the processor, are dealing with processor-specific features such as CPU
190 * performance counters, or we want to provide additional information for things
191 * such as fault management.
192 *
193 * While processors also do have a brand string, which is the name that people
194 * are familiar with when buying the processor, they are not meant for
195 * programmatic consumption. That is what the family, model, and stepping are
196 * for.
197 *
198 * ------------
199 * CPUID Passes
200 * ------------
201 *
202 * As part of performing feature detection, we break this into several different
203 * passes. The passes are as follows:
204 *
205 *	Pass 0		This is a primordial pass done in locore.s to deal with
206 *			Cyrix CPUs that don't support cpuid. The reality is that
207 *			we likely don't run on them any more, but there is still
208 *			logic for handling them.
209 *
210 *	Pass 1		This is the primary pass and is responsible for doing a
211 *			large number of different things:
212 *
 *			1. Determining which vendor manufactured the CPU and
 *			determining the family, model, and stepping information.
 *
 *			2. Gathering a large number of feature flags to
 *			determine which features the CPU supports and which
218 *			indicate things that we need to do other work in the OS
219 *			to enable. Features detected this way are added to the
220 *			x86_featureset which can be queried to
221 *			determine what we should do. This includes processing
222 *			all of the basic and extended CPU features that we care
223 *			about.
224 *
225 *			3. Determining the CPU's topology. This includes
226 *			information about how many cores and threads are present
227 *			in the package. It also is responsible for figuring out
228 *			which logical CPUs are potentially part of the same core
229 *			and what other resources they might share. For more
230 *			information see the 'Topology' section.
231 *
232 *			4. Determining the set of CPU security-specific features
233 *			that we need to worry about and determine the
234 *			appropriate set of workarounds.
235 *
236 *			Pass 1 on the boot CPU occurs before KMDB is started.
237 *
238 *	Pass 2		The second pass is done after startup(). Here, we check
239 *			other miscellaneous features. Most of this is gathering
240 *			additional basic and extended features that we'll use in
241 *			later passes or for debugging support.
242 *
243 *	Pass 3		The third pass occurs after the kernel memory allocator
244 *			has been fully initialized. This gathers information
245 *			where we might need dynamic memory available for our
246 *			uses. This includes several varying width leaves that
247 *			have cache information and the processor's brand string.
248 *
249 *	Pass 4		The fourth and final normal pass is performed after the
250 *			kernel has brought most everything online. This is
251 *			invoked from post_startup(). In this pass, we go through
252 *			the set of features that we have enabled and turn that
253 *			into the hardware auxiliary vector features that
254 *			userland receives. This is used by userland, primarily
255 *			by the run-time link-editor (RTLD), though userland
256 *			software could also refer to it directly.
257 *
258 *	Microcode	After a microcode update, we do a selective rescan of
259 *			the cpuid leaves to determine what features have
260 *			changed. Microcode updates can provide more details
261 *			about security related features to deal with issues like
262 *			Spectre and L1TF. On occasion, vendors have violated
263 *			their contract and removed bits. However, we don't try
264 *			to detect that because that puts us in a situation that
265 *			we really can't deal with. As such, the only thing we
266 *			rescan are security related features today. See
267 *			cpuid_pass_ucode().
268 *
269 * All of the passes (except pass 0) are run on all CPUs. However, for the most
270 * part we only care about what the boot CPU says about this information and use
271 * the other CPUs as a rough guide to sanity check that we have the same feature
272 * set.
273 *
274 * We do not support running multiple logical CPUs with disjoint, let alone
275 * different, feature sets.
276 *
277 * ------------------
278 * Processor Topology
279 * ------------------
280 *
281 * One of the important things that we need to do is to understand the topology
282 * of the underlying processor. When we say topology in this case, we're trying
283 * to understand the relationship between the logical CPUs that the operating
284 * system sees and the underlying physical layout. Different logical CPUs may
285 * share different resources which can have important consequences for the
286 * performance of the system. For example, they may share caches, execution
287 * units, and more.
288 *
289 * The topology of the processor changes from generation to generation and
290 * vendor to vendor.  Along with that, different vendors use different
291 * terminology, and the operating system itself uses occasionally overlapping
292 * terminology. It's important to understand what this topology looks like so
293 * one can understand the different things that we try to calculate and
294 * determine.
295 *
296 * To get started, let's talk about a little bit of terminology that we've used
297 * so far, is used throughout this file, and is fairly generic across multiple
298 * vendors:
299 *
300 * CPU
301 *	A central processing unit (CPU) refers to a logical and/or virtual
302 *	entity that the operating system can execute instructions on. The
303 *	underlying resources for this CPU may be shared between multiple
304 *	entities; however, to the operating system it is a discrete unit.
305 *
306 * PROCESSOR and PACKAGE
307 *
308 *	Generally, when we use the term 'processor' on its own, we are referring
309 *	to the physical entity that one buys and plugs into a board. However,
 *	because the term 'processor' has been overloaded and one might see it
 *	used to mean multiple different levels, we will instead use 'package' for
312 *	the rest of this file. The term package comes from the electrical
313 *	engineering side and refers to the physical entity that encloses the
314 *	electronics inside. Strictly speaking the package can contain more than
 *	just the CPU; for example, on many processors it may also have what's
 *	called an integrated graphics processing unit (GPU). Because the
317 *	package can encapsulate multiple units, it is the largest physical unit
318 *	that we refer to.
319 *
320 * SOCKET
321 *
 *	A socket refers to a unit on a system board (generally the motherboard)
323 *	that can receive a package. A single package, or processor, is plugged
324 *	into a single socket. A system may have multiple sockets. Often times,
 *	the term socket is used interchangeably with package and refers to the
 *	electrical component that has been plugged in, rather than the
 *	receptacle itself.
327 *
328 * CORE
329 *
330 *	A core refers to the physical instantiation of a CPU, generally, with a
331 *	full set of hardware resources available to it. A package may contain
332 *	multiple cores inside of it or it may just have a single one. A
333 *	processor with more than one core is often referred to as 'multi-core'.
334 *	In illumos, we will use the feature X86FSET_CMP to refer to a system
335 *	that has 'multi-core' processors.
336 *
337 *	A core may expose a single logical CPU to the operating system, or it
338 *	may expose multiple CPUs, which we call threads, defined below.
339 *
340 *	Some resources may still be shared by cores in the same package. For
341 *	example, many processors will share the level 3 cache between cores.
342 *	Some AMD generations share hardware resources between cores. For more
343 *	information on that see the section 'AMD Topology'.
344 *
345 * THREAD and STRAND
346 *
 *	In this file, a thread generally refers to a hardware resource and not
348 *	the operating system's logical abstraction. A thread is always exposed
349 *	as an independent logical CPU to the operating system. A thread belongs
350 *	to a specific core. A core may have more than one thread. When that is
351 *	the case, the threads that are part of the same core are often referred
352 *	to as 'siblings'.
353 *
354 *	When multiple threads exist, this is generally referred to as
355 *	simultaneous multi-threading (SMT). When Intel introduced this in their
356 *	processors they called it hyper-threading (HT). When multiple threads
357 *	are active in a core, they split the resources of the core. For example,
358 *	two threads may share the same set of hardware execution units.
359 *
360 *	The operating system often uses the term 'strand' to refer to a thread.
361 *	This helps disambiguate it from the software concept.
362 *
363 * CHIP
364 *
365 *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
366 *	base meaning, it is used to refer to a single integrated circuit, which
367 *	may or may not be the only thing in the package. In illumos, when you
368 *	see the term 'chip' it is almost always referring to the same thing as
369 *	the 'package'. However, many vendors may use chip to refer to one of
370 *	many integrated circuits that have been placed in the package. As an
371 *	example, see the subsequent definition.
372 *
373 *	To try and keep things consistent, we will only use chip when referring
374 *	to the entire integrated circuit package, with the exception of the
375 *	definition of multi-chip module (because it is in the name) and use the
376 *	term 'die' when we want the more general, potential sub-component
377 *	definition.
378 *
379 * DIE
380 *
381 *	A die refers to an integrated circuit. Inside of the package there may
382 *	be a single die or multiple dies. This is sometimes called a 'chip' in
383 *	vendor's parlance, but in this file, we use the term die to refer to a
384 *	subcomponent.
385 *
386 * MULTI-CHIP MODULE
387 *
388 *	A multi-chip module (MCM) refers to putting multiple distinct chips that
389 *	are connected together in the same package. When a multi-chip design is
390 *	used, generally each chip is manufactured independently and then joined
391 *	together in the package. For example, on AMD's Zen microarchitecture
392 *	(family 0x17), the package contains several dies (the second meaning of
393 *	chip from above) that are connected together.
394 *
395 * CACHE
396 *
397 *	A cache is a part of the processor that maintains copies of recently
398 *	accessed memory. Caches are split into levels and then into types.
399 *	Commonly there are one to three levels, called level one, two, and
400 *	three. The lower the level, the smaller it is, the closer it is to the
401 *	execution units of the CPU, and the faster it is to access. The layout
402 *	and design of the cache come in many different flavors, consult other
403 *	resources for a discussion of those.
404 *
405 *	Caches are generally split into two types, the instruction and data
406 *	cache. The caches contain what their names suggest, the instruction
407 *	cache has executable program text, while the data cache has all other
408 *	memory that the processor accesses. As of this writing, data is kept
409 *	coherent between all of the caches on x86, so if one modifies program
410 *	text before it is executed, that will be in the data cache, and the
411 *	instruction cache will be synchronized with that change when the
412 *	processor actually executes those instructions. This coherency also
413 *	covers the fact that data could show up in multiple caches.
414 *
415 *	Generally, the lowest level caches are specific to a core. However, the
 *	last level cache is shared between some number of cores. The number of
417 *	CPUs sharing this last level cache is important. This has implications
418 *	for the choices that the scheduler makes, as accessing memory that might
419 *	be in a remote cache after thread migration can be quite expensive.
420 *
421 *	Sometimes, the word cache is abbreviated with a '$', because in US
422 *	English the word cache is pronounced the same as cash. So L1D$ refers to
423 *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
424 *	in the rest of this theory statement for clarity.
425 *
426 * MEMORY CONTROLLER
427 *
428 *	The memory controller is a component that provides access to DRAM. Each
429 *	memory controller can access a set number of DRAM channels. Each channel
430 *	can have a number of DIMMs (sticks of memory) associated with it. A
431 *	given package may have more than one memory controller. The association
432 *	of the memory controller to a group of cores is important as it is
433 *	cheaper to access memory on the controller that you are associated with.
434 *
435 * NUMA
436 *
 *	NUMA, or non-uniform memory access, describes a way that systems are
 *	built. On x86, any processor core can address all of the memory in the
 *	system. However, when using multiple sockets or possibly within a
440 *	multi-chip module, some of that memory is physically closer and some of
441 *	it is further. Memory that is further away is more expensive to access.
442 *	Consider the following image of multiple sockets with memory:
443 *
444 *	+--------+                                                +--------+
445 *	| DIMM A |         +----------+      +----------+         | DIMM D |
446 *	+--------+-+       |          |      |          |       +-+------+-+
447 *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
448 *	  +--------+-+     |          |      |          |     +-+------+-+
449 *	    | DIMM C |     +----------+      +----------+     | DIMM F |
450 *	    +--------+                                        +--------+
451 *
452 *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
453 *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
454 *	access DIMMs A-C and more expensive to access D-F as it has to go
455 *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
456 *	D-F are cheaper than A-C. While the socket form is the most common, when
457 *	using multi-chip modules, this can also sometimes occur. For another
458 *	example of this that's more involved, see the AMD topology section.
459 *
460 *
461 * Intel Topology
462 * --------------
463 *
 * Most Intel processors since Nehalem (as of this writing the current gen is
 * Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
466 * the package is a single monolithic die. MCMs currently aren't used. Most
467 * parts have three levels of caches, with the L3 cache being shared between
468 * all of the cores on the package. The L1/L2 cache is generally specific to
469 * an individual core. The following image shows at a simplified level what
470 * this looks like. The memory controller is commonly part of something called
 * the 'Uncore', which used to be implemented as separate physical chips outside
 * the package, but is now part of the same chip.
473 *
474 *  +-----------------------------------------------------------------------+
475 *  | Package                                                               |
476 *  |  +-------------------+  +-------------------+  +-------------------+  |
477 *  |  | Core              |  | Core              |  | Core              |  |
478 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
479 *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
480 *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
481 *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
482 *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
483 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
484 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
485 *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
486 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
487 *  |  +-------------------+  +-------------------+  +-------------------+  |
488 *  | +-------------------------------------------------------------------+ |
489 *  | |                         Shared L3 Cache                           | |
490 *  | +-------------------------------------------------------------------+ |
491 *  | +-------------------------------------------------------------------+ |
492 *  | |                        Memory Controller                          | |
493 *  | +-------------------------------------------------------------------+ |
494 *  +-----------------------------------------------------------------------+
495 *
 * A side effect of this current architecture is that what we care about from a
 * scheduling and topology perspective is simplified. In general we care about
498 * understanding which logical CPUs are part of the same core and socket.
499 *
500 * To determine the relationship between threads and cores, Intel initially used
501 * the identifier in the advanced programmable interrupt controller (APIC). They
502 * also added cpuid leaf 4 to give additional information about the number of
503 * threads and CPUs in the processor. With the addition of x2apic (which
504 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
505 * additional cpuid topology leaf 0xB was added.
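 *
 * As a hedged sketch, using the hypothetical cpuid_raw() helper from earlier,
 * leaf 0xB can be walked as follows; the field layouts (shift count in
 * %eax[4:0], level type in %ecx[15:8], x2APIC ID in %edx) are as commonly
 * documented:
 *
 *	// Derive the APIC ID shift widths for the thread and core levels.
 *	static void
 *	cpuid_topo_shifts(uint32_t *smt_shift, uint32_t *core_shift)
 *	{
 *		struct cpuid_regs r;
 *		uint32_t level;
 *
 *		*smt_shift = *core_shift = 0;
 *		for (level = 0; ; level++) {
 *			cpuid_raw(0xb, level, &r);
 *			uint32_t type = (r.ecx >> 8) & 0xff;
 *			if (type == 0)		// invalid level: done
 *				break;
 *			if (type == 1)		// SMT (thread) level
 *				*smt_shift = r.eax & 0x1f;
 *			else if (type == 2)	// core level
 *				*core_shift = r.eax & 0x1f;
 *		}
 *	}
 *
 * With these shifts, the low smt_shift bits of the x2APIC ID identify the
 * thread within a core, and the bits up to core_shift identify the core within
 * the package.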
506 *
507 * AMD Topology
508 * ------------
509 *
510 * When discussing AMD topology, we want to break this into three distinct
511 * generations of topology. There's the basic topology that has been used in
512 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
513 * with family 0x15 (Bulldozer), and there's the topology that was introduced
514 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
515 * talking about.
516 *
517 * Until the introduction of family 0x17 (Zen), AMD did not implement something
518 * that they considered SMT. Whether or not the AMD processors have SMT
519 * influences many things including scheduling and reliability, availability,
520 * and serviceability (RAS) features.
521 *
522 * NODE
523 *
524 *	AMD uses the term node to refer to a die that contains a number of cores
525 *	and I/O resources. Depending on the processor family and model, more
526 *	than one node can be present in the package. When there is more than one
527 *	node this indicates a multi-chip module. Usually each node has its own
528 *	access to memory and I/O devices. This is important and generally
529 *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
530 *	result, we track this relationship in the operating system.
531 *
532 *	In processors with an L3 cache, the L3 cache is generally shared across
533 *	the entire node, though the way this is carved up varies from generation
534 *	to generation.
535 *
536 * BULLDOZER
537 *
538 *	Starting with the Bulldozer family (0x15) and continuing until the
539 *	introduction of the Zen microarchitecture, AMD introduced the idea of a
540 *	compute unit. In a compute unit, two traditional cores share a number of
541 *	hardware resources. Critically, they share the FPU, L1 instruction
542 *	cache, and the L2 cache. Several compute units were then combined inside
543 *	of a single node.  Because the integer execution units, L1 data cache,
544 *	and some other resources were not shared between the cores, AMD never
545 *	considered this to be SMT.
546 *
547 * ZEN
548 *
 *	The Zen family (0x17) uses a multi-chip module (MCM) design in which
 *	each module is called a Zeppelin. These modules are similar to the idea
 *	of nodes used
551 *	previously. Each of these nodes has two DRAM channels which all of the
552 *	cores in the node can access uniformly. These nodes are linked together
553 *	in the package, creating a NUMA environment.
554 *
555 *	The Zeppelin die itself contains two different 'core complexes'. Each
556 *	core complex consists of four cores which each have two threads, for a
557 *	total of 8 logical CPUs per complex. Unlike other generations,
558 *	where all the logical CPUs in a given node share the L3 cache, here each
559 *	core complex has its own shared L3 cache.
560 *
561 *	A further thing that we need to consider is that in some configurations,
562 *	particularly with the Threadripper line of processors, not every die
563 *	actually has its memory controllers wired up to actual memory channels.
564 *	This means that some cores have memory attached to them and others
565 *	don't.
566 *
567 *	To put Zen in perspective, consider the following images:
568 *
569 *      +--------------------------------------------------------+
570 *      | Core Complex                                           |
571 *      | +-------------------+    +-------------------+  +---+  |
572 *      | | Core       +----+ |    | Core       +----+ |  |   |  |
573 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
574 *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
575 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
576 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
577 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
578 *      | +-------------------+    +-------------------+  | C |  |
579 *      | +-------------------+    +-------------------+  | a |  |
580 *      | | Core       +----+ |    | Core       +----+ |  | c |  |
581 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
582 *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
583 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
584 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
585 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
586 *      | +-------------------+    +-------------------+  +---+  |
587 *      |                                                        |
588 *	+--------------------------------------------------------+
589 *
590 *  This first image represents a single Zen core complex that consists of four
591 *  cores.
592 *
593 *
594 *	+--------------------------------------------------------+
595 *	| Zeppelin Die                                           |
596 *	|  +--------------------------------------------------+  |
597 *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
598 *	|  +--------------------------------------------------+  |
599 *      |                           HH                           |
600 *	|          +-----------+    HH    +-----------+          |
601 *	|          |           |    HH    |           |          |
602 *	|          |    Core   |==========|    Core   |          |
603 *	|          |  Complex  |==========|  Complex  |          |
604 *	|          |           |    HH    |           |          |
605 *	|          +-----------+    HH    +-----------+          |
606 *      |                           HH                           |
607 *	|  +--------------------------------------------------+  |
608 *	|  |                Memory Controller                 |  |
609 *	|  +--------------------------------------------------+  |
610 *      |                                                        |
611 *	+--------------------------------------------------------+
612 *
 *  This image represents a single Zeppelin Die. Note how both core complexes
 *  are connected to the same memory controller and I/O units. While each core
615 *  complex has its own L3 cache as seen in the first image, they both have
616 *  uniform access to memory.
617 *
618 *
619 *                      PP                     PP
620 *                      PP                     PP
621 *           +----------PP---------------------PP---------+
622 *           |          PP                     PP         |
623 *           |    +-----------+          +-----------+    |
624 *           |    |           |          |           |    |
625 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
626 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
627 *           |    |           |          |           |    |
628 *           |    +-----------+ooo    ...+-----------+    |
629 *           |          HH      ooo  ...       HH         |
630 *           |          HH        oo..         HH         |
631 *           |          HH        ..oo         HH         |
632 *           |          HH      ...  ooo       HH         |
633 *           |    +-----------+...    ooo+-----------+    |
634 *           |    |           |          |           |    |
635 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
636 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
637 *           |    |           |          |           |    |
638 *           |    +-----------+          +-----------+    |
639 *           |          PP                     PP         |
640 *           +----------PP---------------------PP---------+
641 *                      PP                     PP
642 *                      PP                     PP
643 *
644 *  This image represents a single Zen package. In this example, it has four
645 *  Zeppelin dies, though some configurations only have a single one. In this
646 *  example, each die is directly connected to the next. Also, each die is
647 *  represented as being connected to memory by the 'M' character and connected
648 *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
649 *  die is made up of two core complexes, we have multiple different NUMA
650 *  domains that we care about for these systems.
651 *
652 * CPUID LEAVES
653 *
654 * There are a few different CPUID leaves that we can use to try and understand
655 * the actual state of the world. As part of the introduction of family 0xf, AMD
656 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
657 * processors that are in the system. Because families before Zen didn't have
658 * SMT, this was always the number of cores that were in the system. However, it
659 * should always be thought of as the number of logical threads to be consistent
660 * between generations. In addition we also get the size of the APIC ID that is
661 * used to represent the number of logical processors. This is important for
662 * deriving topology information.
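 *
 * A hedged sketch of that decode, again using the hypothetical cpuid_raw()
 * helper; the %ecx field positions (thread count in bits 7:0, APIC ID core
 * size in bits 15:12) are as commonly documented for this leaf:
 *
 *	static void
 *	cpuid_amd_ncpus(uint32_t *nthreads, uint32_t *coreidsz)
 *	{
 *		struct cpuid_regs r;
 *
 *		cpuid_raw(0x80000008u, 0, &r);
 *		*nthreads = (r.ecx & 0xff) + 1;	   // logical CPUs in package
 *		*coreidsz = (r.ecx >> 12) & 0xf;   // APIC ID bits for CPUs
 *		if (*coreidsz == 0) {
 *			// Older parts: derive the width from the count.
 *			while ((1u << *coreidsz) < *nthreads)
 *				(*coreidsz)++;
 *		}
 *	}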
663 *
664 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
665 * bit between Bulldozer and later families, but it is quite useful in
666 * determining the topology information. Because this information has changed
667 * across family generations, it's worth calling out what these mean
668 * explicitly. The registers have the following meanings:
669 *
670 *	%eax	The APIC ID. The entire register is defined to have a 32-bit
671 *		APIC ID, even though on systems without x2apic support, it will
672 *		be limited to 8 bits.
673 *
674 *	%ebx	On Bulldozer-era systems this contains information about the
675 *		number of cores that are in a compute unit (cores that share
676 *		resources). It also contains a per-package compute unit ID that
677 *		identifies which compute unit the logical CPU is a part of.
678 *
679 *		On Zen-era systems this instead contains the number of threads
680 *		per core and the ID of the core that the logical CPU is a part
681 *		of. Note, this ID is unique only to the package, it is not
682 *		globally unique across the entire system.
683 *
684 *	%ecx	This contains the number of nodes that exist in the package. It
685 *		also contains an ID that identifies which node the logical CPU
686 *		is a part of.
687 *
688 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
689 * cache layout to determine which logical CPUs are sharing which caches.
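 *
 * A hedged sketch of pulling these fields apart, using the hypothetical
 * cpuid_raw() helper from earlier and interpreting %ebx with its Zen-era
 * meaning (on Bulldozer the same bits describe the compute unit instead):
 *
 *	static void
 *	cpuid_amd_node(uint32_t *apicid, uint32_t *coreid,
 *	    uint32_t *threads_per_core, uint32_t *nodeid, uint32_t *nnodes)
 *	{
 *		struct cpuid_regs r;
 *
 *		cpuid_raw(0x8000001eu, 0, &r);
 *		*apicid = r.eax;			// extended APIC ID
 *		*coreid = r.ebx & 0xff;			// per-package core ID
 *		*threads_per_core = ((r.ebx >> 8) & 0xff) + 1;
 *		*nodeid = r.ecx & 0xff;			// node ID
 *		*nnodes = ((r.ecx >> 8) & 0x7) + 1;	// nodes per package
 *	}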
690 *
691 * illumos Topology
692 * ----------------
693 *
694 * Based on the above we synthesize the information into several different
695 * variables that we store in the 'struct cpuid_info'. We'll go into the details
696 * of what each member is supposed to represent and their uniqueness. In
697 * general, there are two levels of uniqueness that we care about. We care about
698 * an ID that is globally unique. That means that it will be unique across all
699 * entities in the system. For example, the default logical CPU ID is globally
700 * unique. On the other hand, there is some information that we only care about
701 * being unique within the context of a single package / socket. Here are the
702 * variables that we keep track of and their meaning.
703 *
704 * Several of the values that are asking for an identifier, with the exception
705 * of cpi_apicid, are allowed to be synthetic.
706 *
707 *
708 * cpi_apicid
709 *
710 *	This is the value of the CPU's APIC id. This should be the full 32-bit
711 *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
712 *	APIC ID. This value is globally unique between all logical CPUs across
713 *	all packages. This is usually required by the APIC.
714 *
715 * cpi_chipid
716 *
717 *	This value indicates the ID of the package that the logical CPU is a
718 *	part of. This value is allowed to be synthetic. It is usually derived by
719 *	taking the CPU's APIC ID and determining how many bits are used to
720 *	represent CPU cores in the package. All logical CPUs that are part of
721 *	the same package must have the same value.
722 *
723 * cpi_coreid
724 *
725 *	This represents the ID of a CPU core. Two logical CPUs should only have
726 *	the same cpi_coreid value if they are part of the same core. These
727 *	values may be synthetic. On systems that support SMT, this value is
728 *	usually derived from the APIC ID, otherwise it is often synthetic and
729 *	just set to the value of the cpu_id in the cpu_t.
730 *
731 * cpi_pkgcoreid
732 *
733 *	This is similar to the cpi_coreid in that logical CPUs that are part of
734 *	the same core should have the same ID. The main difference is that these
735 *	values are only required to be unique to a given socket.
736 *
737 * cpi_clogid
738 *
739 *	This represents the logical ID of a logical CPU. This value should be
740 *	unique within a given socket for each logical CPU. This is allowed to be
741 *	synthetic, though it is usually based off of the CPU's apic ID. The
 *	broader system expects that logical CPUs that are part of the same
 *	core have contiguous numbers. For example, if there were two threads per
 *	core, then the two IDs divided by two should be the same, and the first
 *	ID modulo two should be zero while the second should be one. For example,
 *	IDs 4 and 5 indicate two logical CPUs that are part of the same core. But
 *	IDs 5 and 6 represent two logical CPUs that are part of different cores.
748 *
749 *	While it is common for the cpi_coreid and the cpi_clogid to be derived
750 *	from the same source, strictly speaking, they don't have to be and the
751 *	two values should be considered logically independent. One should not
752 *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
753 *	some kind of relationship. While this is tempting, we've seen cases on
754 *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
755 *
756 * cpi_ncpu_per_chip
757 *
758 *	This value indicates the total number of logical CPUs that exist in the
759 *	physical package. Critically, this is not the number of logical CPUs
760 *	that exist for just the single core.
761 *
762 *	This value should be the same for all logical CPUs in the same package.
763 *
764 * cpi_ncore_per_chip
765 *
 *	This value indicates the total number of physical CPU cores that exist
 *	in the package. The system compares this value with cpi_ncpu_per_chip to
 *	determine if simultaneous multi-threading (SMT) is enabled. When
 *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
 *	the X86FSET_HTT feature is not set. If this value is greater than one,
 *	then we consider the processor to have the feature X86FSET_CMP, to
 *	indicate that there is support for more than one core (see the sketch
 *	following this list of members).
773 *
774 *	This value should be the same for all logical CPUs in the same package.
775 *
776 * cpi_procnodes_per_pkg
777 *
778 *	This value indicates the number of 'nodes' that exist in the package.
779 *	When processors are actually a multi-chip module, this represents the
780 *	number of such modules that exist in the package. Currently, on Intel
781 *	based systems this member is always set to 1.
782 *
783 *	This value should be the same for all logical CPUs in the same package.
784 *
785 * cpi_procnodeid
786 *
787 *	This value indicates the ID of the node that the logical CPU is a part
788 *	of. All logical CPUs that are in the same node must have the same value
789 *	here. This value must be unique across all of the packages in the
790 *	system.  On Intel based systems, this is currently set to the value in
791 *	cpi_chipid because there is only one node.
792 *
793 * cpi_cores_per_compunit
794 *
795 *	This value indicates the number of cores that are part of a compute
796 *	unit. See the AMD topology section for this. This member only has real
797 *	meaning currently for AMD Bulldozer family processors. For all other
798 *	processors, this should currently be set to 1.
799 *
800 * cpi_compunitid
801 *
802 *	This indicates the compute unit that the logical CPU belongs to. For
803 *	processors without AMD Bulldozer-style compute units this should be set
804 *	to the value of cpi_coreid.
805 *
806 * cpi_ncpu_shr_last_cache
807 *
808 *	This indicates the number of logical CPUs that are sharing the same last
809 *	level cache. This value should be the same for all CPUs that are sharing
810 *	that cache. The last cache refers to the cache that is closest to memory
811 *	and furthest away from the CPU.
812 *
813 * cpi_last_lvl_cacheid
814 *
815 *	This indicates the ID of the last cache that the logical CPU uses. This
816 *	cache is often shared between multiple logical CPUs and is the cache
817 *	that is closest to memory and furthest away from the CPU. This value
818 *	should be the same for a group of logical CPUs only if they actually
819 *	share the same last level cache. IDs should not overlap between
820 *	packages.
821 *
822 * cpi_ncore_bits
823 *
824 *	This indicates the number of bits that are required to represent all of
825 *	the cores in the system. As cores are derived based on their APIC IDs,
826 *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
827 *	this value to be larger than the actual number of IDs that are present
828 *	in the system. This is used to size tables by the CMI framework. It is
829 *	only filled in for Intel and AMD CPUs.
830 *
831 * cpi_nthread_bits
832 *
833 *	This indicates the number of bits required to represent all of the IDs
834 *	that cover the logical CPUs that exist on a given core. It's OK for this
835 *	value to be larger than the actual number of IDs that are present in the
836 *	system.  This is used to size tables by the CMI framework. It is
837 *	only filled in for Intel and AMD CPUs.
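 *
 * To tie several of these members together, here is a hedged sketch with
 * hypothetical local variables standing in for the real struct cpuid_info
 * fields; the example numbers assume a package with four cores and two
 * threads per core:
 *
 *	uint32_t ncpu_per_chip = 8;		// 4 cores x 2 threads
 *	uint32_t ncore_per_chip = 4;
 *	int has_htt = (ncpu_per_chip > ncore_per_chip);	// SMT present
 *	int has_cmp = (ncore_per_chip > 1);		// multi-core package
 *
 *	// The *_bits members cover the ID space, so round up to a power of
 *	// two. Here ncore_bits ends up as 2 and nthread_bits as 1.
 *	uint32_t ncore_bits = 0, nthread_bits = 0;
 *	while ((1u << ncore_bits) < ncore_per_chip)
 *		ncore_bits++;
 *	while ((1u << nthread_bits) < (ncpu_per_chip / ncore_per_chip))
 *		nthread_bits++;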
838 *
839 * -----------
840 * Hypervisors
841 * -----------
842 *
843 * If trying to manage the differences between vendors wasn't bad enough, it can
844 * get worse thanks to our friend hardware virtualization. Hypervisors are given
845 * the ability to interpose on all cpuid instructions and change them to suit
846 * their purposes. In general, this is necessary as the hypervisor wants to be
847 * able to present a more uniform set of features or not necessarily give the
848 * guest operating system kernel knowledge of all features so it can be
849 * more easily migrated between systems.
850 *
851 * When it comes to trying to determine topology information, this can be a
852 * double edged sword. When a hypervisor doesn't actually implement a cpuid
853 * leaf, it'll often return all zeros. Because of that, you'll often see various
 * checks scattered about that verify fields are non-zero before we assume we
 * can use them.
856 *
857 * When it comes to topology information, the hypervisor is often incentivized
858 * to lie to you about topology. This is because it doesn't always actually
859 * guarantee that topology at all. The topology path we take in the system
860 * depends on how the CPU advertises itself. If it advertises itself as an Intel
861 * or AMD CPU, then we basically do our normal path. However, when they don't
862 * use an actual vendor, then that usually turns into multiple one-core CPUs
863 * that we enumerate that are often on different sockets. The actual behavior
864 * depends greatly on what the hypervisor actually exposes to us.
865 *
866 * --------------------
867 * Exposing Information
868 * --------------------
869 *
870 * We expose CPUID information in three different forms in the system.
871 *
872 * The first is through the x86_featureset variable. This is used in conjunction
873 * with the is_x86_feature() function. This is queried by x86-specific functions
874 * to determine which features are or aren't present in the system and to make
875 * decisions based upon them. For example, users of this include everything from
876 * parts of the system dedicated to reliability, availability, and
877 * serviceability (RAS), to making decisions about how to handle security
878 * mitigations, to various x86-specific drivers. General purpose or
879 * architecture independent drivers should never be calling this function.
880 *
881 * The second means is through the auxiliary vector. The auxiliary vector is a
882 * series of tagged data that the kernel passes down to a user program when it
883 * begins executing. This information is used to indicate to programs what
884 * instruction set extensions are present. For example, information about the
885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 * since user programs cannot make use of it. However, things like the AVX
887 * instruction sets are. Programs use this information to make run-time
888 * decisions about what features they should use. As an example, the run-time
889 * link-editor (rtld) can relocate different functions depending on the hardware
890 * support available.
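 *
 * As a hedged example of the user-side view on illumos, a program can query
 * these bits with getisax(3C); AV_386_SSE2 is used purely as an example flag
 * and the full set lives in <sys/auxv_386.h>:
 *
 *	#include <sys/auxv.h>
 *	#include <stdint.h>
 *
 *	// Returns non-zero if the kernel advertised SSE2 in the aux vector.
 *	static int
 *	have_sse2(void)
 *	{
 *		uint32_t ui = 0;
 *
 *		(void) getisax(&ui, 1);
 *		return ((ui & AV_386_SSE2) != 0);
 *	}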
891 *
892 * The final form is through a series of accessor functions that all have the
893 * form cpuid_get*. This is used by a number of different subsystems in the
894 * kernel to determine more detailed information about what we're running on,
895 * topology information, etc. Some of these subsystems include processor groups
 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 * microcode, and performance monitoring. These functions all ASSERT that the
898 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 * are rearranged, then this needs to be adjusted.
900 *
901 * -----------------------------------------------
902 * Speculative Execution CPU Side Channel Security
903 * -----------------------------------------------
904 *
905 * With the advent of the Spectre and Meltdown attacks which exploit speculative
906 * execution in the CPU to create side channels there have been a number of
907 * different attacks and corresponding issues that the operating system needs to
908 * mitigate against. The following list is some of the common, but not
909 * exhaustive, set of issues that we know about and have done some or need to do
910 * more work in the system to mitigate against:
911 *
912 *   - Spectre v1
913 *   - swapgs (Spectre v1 variant)
914 *   - Spectre v2
915 *   - Meltdown (Spectre v3)
916 *   - Rogue Register Read (Spectre v3a)
917 *   - Speculative Store Bypass (Spectre v4)
918 *   - ret2spec, SpectreRSB
919 *   - L1 Terminal Fault (L1TF)
920 *   - Microarchitectural Data Sampling (MDS)
921 *
922 * Each of these requires different sets of mitigations and has different attack
923 * surfaces. For the most part, this discussion is about protecting the kernel
924 * from non-kernel executing environments such as user processes and hardware
925 * virtual machines. Unfortunately, there are a number of user vs. user
926 * scenarios that exist with these. The rest of this section will describe the
927 * overall approach that the system has taken to address these as well as their
928 * shortcomings. Unfortunately, not all of the above have been handled today.
929 *
930 * SPECTRE v2, ret2spec, SpectreRSB
931 *
932 * The second variant of the spectre attack focuses on performing branch target
933 * injection. This generally impacts indirect call instructions in the system.
934 * There are three different ways to mitigate this issue that are commonly
935 * described today:
936 *
937 *  1. Using Indirect Branch Restricted Speculation (IBRS).
938 *  2. Using Retpolines and RSB Stuffing
939 *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
940 *
941 * IBRS uses a feature added to microcode to restrict speculation, among other
942 * things. This form of mitigation has not been used as it has been generally
943 * seen as too expensive and requires reactivation upon various transitions in
944 * the system.
945 *
946 * As a less impactful alternative to IBRS, retpolines were developed by
947 * Google. These basically require one to replace indirect calls with a specific
948 * trampoline that will cause speculation to fail and break the attack.
949 * Retpolines require compiler support. We always build with retpolines in the
950 * external thunk mode. This means that a traditional indirect call is replaced
951 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
952 * of this is that all indirect function calls are performed through a register.
953 *
954 * We have to use a common external location of the thunk and not inline it into
 * the callsite so that we have a single place to patch these functions.
956 * As it turns out, we actually have three different forms of retpolines that
957 * exist in the system:
958 *
959 *  1. A full retpoline
960 *  2. An AMD-specific optimized retpoline
961 *  3. A no-op version
962 *
963 * The first one is used in the general case. The second one is used if we can
964 * determine that we're on an AMD system and we can successfully toggle the
965 * lfence serializing MSR that exists on the platform. Basically with this
966 * present, an lfence is sufficient and we don't need to do anywhere near as
967 * complicated a dance to successfully use retpolines.
968 *
969 * The third form described above is the most curious. It turns out that the way
970 * that retpolines are implemented is that they rely on how speculation is
971 * performed on a 'ret' instruction. Intel has continued to optimize this
972 * process (which is partly why we need to have return stack buffer stuffing,
973 * but more on that in a bit) and in processors starting with Cascade Lake
974 * on the server side, it's dangerous to rely on retpolines. Instead, a new
975 * mechanism has been introduced called Enhanced IBRS (EIBRS).
976 *
977 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
978 * physical core. However, if this is the case, we don't want to use retpolines
979 * any more. Therefore if EIBRS is present, we end up turning each retpoline
980 * function (called a thunk) into a jmp instruction. This means that we're still
981 * paying the cost of an extra jump to the external thunk, but it gives us
982 * flexibility and the ability to have a single kernel image that works across a
983 * wide variety of systems and hardware features.
984 *
985 * Unfortunately, this alone is insufficient. First, Skylake systems have
986 * additional speculation for the Return Stack Buffer (RSB) which is used to
987 * return from call instructions which retpolines take advantage of. However,
988 * this problem is not just limited to Skylake and is actually more pernicious.
989 * The SpectreRSB paper introduces several more problems that can arise with
990 * dealing with this. The RSB can be poisoned just like the indirect branch
991 * predictor. This means that one needs to clear the RSB when transitioning
992 * between two different privilege domains. Some examples include:
993 *
994 *  - Switching between two different user processes
995 *  - Going between user land and the kernel
996 *  - Returning to the kernel from a hardware virtual machine
997 *
998 * Mitigating this involves combining a couple of different things. The first is
999 * SMEP (supervisor mode execution protection) which was introduced in Ivy
1000 * Bridge. When an RSB entry refers to a user address and we're executing in the
1001 * kernel, speculation through it will be stopped when SMEP is enabled. This
1002 * protects against a number of the different cases that we would normally be
1003 * worried about such as when we enter the kernel from user land.
1004 *
1005 * To prevent against additional manipulation of the RSB from other contexts
 * such as a non-root VMX context attacking the kernel, we first look to enhanced
1007 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1008 * need to do to protect the kernel at this time.
1009 *
1010 * On CPUs without EIBRS we need to manually overwrite the contents of the
1011 * return stack buffer. We do this through the x86_rsb_stuff() function.
1012 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1013 * disabled when enhanced IBRS is present because Intel claims on such systems
1014 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1015 * to user attacks via the RSB.
1016 *
1017 * If SMEP is not present, then we would have to stuff the RSB every time we
1018 * transitioned from user mode to the kernel, which isn't very practical right
1019 * now.
1020 *
1021 * To fully protect user to user and vmx to vmx attacks from these classes of
1022 * issues, we would also need to allow them to opt into performing an Indirect
1023 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1024 *
1025 * By default, the system will enable RSB stuffing and the required variant of
1026 * retpolines and store that information in the x86_spectrev2_mitigation value.
1027 * This will be evaluated after a microcode update as well, though it is
1028 * expected that microcode updates will not take away features. This may mean
1029 * that a late loaded microcode may not end up in the optimal configuration
1030 * (though this should be rare).
1031 *
1032 * Currently we do not build kmdb with retpolines or perform any additional side
1033 * channel security mitigations for it. One complication with kmdb is that it
1034 * requires its own retpoline thunks and it would need to adjust itself based on
1035 * what the kernel does. The threat model of kmdb is more limited and therefore
1036 * it may make more sense to investigate using prediction barriers as the whole
1037 * system is only executing a single instruction at a time while in kmdb.
1038 *
1039 * SPECTRE v1, v4
1040 *
1041 * The v1 and v4 variants of spectre are not currently mitigated in the
1042 * system and require other classes of changes to occur in the code.
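 *
 * For reference, the canonical v1 gadget from the original Spectre paper looks
 * like the sketch below; the arrays and their sizes are hypothetical and
 * simply stand in for attacker-reachable data:
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	extern size_t array1_size;
 *	extern uint8_t array1[], array2[];
 *
 *	uint8_t
 *	victim(size_t x)
 *	{
 *		// If the bounds check mis-speculates for an out-of-bounds x,
 *		// the secret byte array1[x] still selects which cache line of
 *		// array2 is touched, and can later be recovered by timing.
 *		if (x < array1_size)
 *			return (array2[array1[x] * 4096]);
 *		return (0);
 *	}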
1043 *
1044 * SPECTRE v1 (SWAPGS VARIANT)
1045 *
 * The class of Spectre v1 vulnerabilities isn't limited to bounds checks; it
 * can generally affect any branch-dependent code. The swapgs issue is one
1048 * variant of this. If we are coming in from userspace, we can have code like
1049 * this:
1050 *
1051 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1052 *	je	1f
1053 *	movq	$0, REGOFF_SAVFP(%rsp)
1054 *	swapgs
1055 *	1:
1056 *	movq	%gs:CPU_THREAD, %rax
1057 *
1058 * If an attacker can cause a mis-speculation of the branch here, we could skip
1059 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1060 * load. If subsequent code can act as the usual Spectre cache gadget, this
1061 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1062 * any use of the %gs override.
1063 *
1064 * The other case is also an issue: if we're coming into a trap from kernel
1065 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1066 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1067 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1068 * case, and the fix is the same in both cases (an lfence at the branch target
1069 * 1: in this example), we'll just do it unconditionally.
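 *
 * With that mitigation applied, the earlier example becomes (sketch only; the
 * actual kernel entry code differs in its details):
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax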
1070 *
 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, making it
 * harder for user-space to actually set a useful %gsbase value: although it's
 * not clear, it might still be feasible via lwp_setprivate(), so we mitigate
 * anyway.
1075 *
1076 * MELTDOWN
1077 *
 * Meltdown, or Spectre v3, allowed a user process to read any data in its
 * address space regardless of whether or not the page tables in question
1080 * allowed the user to have the ability to read them. The solution to meltdown
1081 * is kernel page table isolation. In this world, there are two page tables that
1082 * are used for a process, one in user land and one in the kernel. To implement
1083 * this we use per-CPU page tables and switch between the user and kernel
1084 * variants when entering and exiting the kernel.  For more information about
1085 * this process and how the trampolines work, please see the big theory
1086 * statements and additional comments in:
1087 *
1088 *  - uts/i86pc/ml/kpti_trampolines.s
1089 *  - uts/i86pc/vm/hat_i86.c
1090 *
1091 * While Meltdown only impacted Intel systems and there are also Intel systems
1092 * that have Meltdown fixed (called Rogue Data Cache Load), we always have
1093 * kernel page table isolation enabled. While this may at first seem weird, an
1094 * important thing to remember is that you can't speculatively read an address
1095 * if it's never in your page table at all. Having user processes without kernel
1096 * pages present provides us with an important layer of defense in the kernel
1097 * against any other side channel attacks that exist and have yet to be
1098 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1099 * default, no matter the x86 system.
1100 *
1101 * L1 TERMINAL FAULT
1102 *
1103 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1104 * execution uses page table entries. Effectively, it is two different problems.
1105 * The first is that it ignores the not present bit in the page table entries
1106 * when performing speculative execution. This means that something can
1107 * speculatively read the listed physical address if it's present in the L1
1108 * cache under certain conditions (see Intel's documentation for the full set of
1109 * conditions). Secondly, this can be used to bypass hardware virtualization
1110 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1111 * instructions.
1112 *
1113 * For the non-hardware virtualized case, this is relatively easy to deal with.
1114 * We must make sure that all unmapped pages have an address of zero. This means
1115 * that they could read the first 4k of physical memory; however, we never use
1116 * that first page in the operating system and always skip putting it in our
 * memory map, even if firmware tells us we can use it. While other systems try
 * to put extra metadata in the address and reserved bits, which made this
 * problematic for them, we do not.
1120 *
1121 * For hardware virtual machines things are more complicated. Because they can
1122 * construct their own page tables, it isn't hard for them to perform this
1123 * attack against any physical address. The one wrinkle is that this physical
1124 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1125 * to flush the L1 data cache. We wrap this up in the function
1126 * spec_uarch_flush(). This function is also used in the mitigation of
1127 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1128 * hypervisors such as KVM or bhyve are responsible for performing this before
1129 * entering the guest.
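 *
 * On hardware that provides it, the flush itself boils down to a single MSR
 * write (see spec_uarch_flush_msr() later in this file):
 *
 *	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);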
1130 *
1131 * Because this attack takes place in the L1 cache, there's another wrinkle
1132 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1133 * designs. This means that when a thread enters a hardware virtualized context
1134 * and flushes the L1 data cache, the other thread on the processor may then go
1135 * ahead and put new data in it that can be potentially attacked. While one
1136 * solution is to disable SMT on the system, another option that is available is
1137 * to use a feature for hardware virtualization called 'SMT exclusion'. This
 * goes through and makes sure that if an HVM is being scheduled on one thread,
 * then whatever is running on the other thread belongs to the same hardware
 * virtual machine.
1140 * If an interrupt comes in or the guest exits to the broader system, then the
1141 * other SMT thread will be kicked out.
1142 *
1143 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1144 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1145 * perform L1TF related mitigations.
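 *
 * As a sketch, that check amounts to reading the MSR and testing the bit,
 * roughly as cpuid_scan_security() does later in this file:
 *
 *	if (rdmsr(MSR_IA32_ARCH_CAPABILITIES) & IA32_ARCH_CAP_RDCL_NO)
 *		(skip the L1TF mitigations)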
1146 *
1147 * MICROARCHITECTURAL DATA SAMPLING
1148 *
1149 * Microarchitectural data sampling (MDS) is a combination of four discrete
1150 * vulnerabilities that are similar issues affecting various parts of the CPU's
1151 * microarchitectural implementation around load, store, and fill buffers.
1152 * Specifically it is made up of the following subcomponents:
1153 *
1154 *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1155 *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1156 *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1157 *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1158 *
 * To begin addressing these, Intel has introduced another feature in microcode
 * called MD_CLEAR. This changes the semantics of the verw instruction so that
 * executing it (with a suitable memory operand) also flushes the state of the
 * affected buffers. The L1TF L1D flush mechanism is likewise updated when this
 * microcode is present to flush this state.
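 *
 * A minimal sketch of the flush sequence (the actual implementation is
 * x86_md_clear() itself; what matters is that verw is given a memory operand
 * holding a valid, writable selector, here assumed to be the kernel %ds):
 *
 *	subq	$8, %rsp
 *	movw	%ds, (%rsp)
 *	verw	(%rsp)
 *	addq	$8, %rsp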
1164 *
1165 * Primarily we need to flush this state whenever we transition from the kernel
1166 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1167 * little bit different. Here the structures are statically sized when a logical
1168 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1170 * mwait, or another ACPI method. To perform these flushes, we call
1171 * x86_md_clear() at all of these transition points.
1172 *
1173 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1174 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1175 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1176 * a no-op.
1177 *
1178 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1179 * particular, everything we've discussed above is only valid for a single
1180 * thread executing on a core. In the case where you have hyper-threading
1181 * present, this attack can be performed between threads. The theoretical fix
1182 * for this is to ensure that both threads are always in the same security
1183 * domain. This means that they are executing in the same ring and mutually
1184 * trust each other. Practically speaking, this would mean that a system call
1185 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1186 * Rather than implement this, we recommend that one disables hyper-threading
1187 * through the use of psradm -aS.
1188 *
1189 * SUMMARY
1190 *
1191 * The following table attempts to summarize the mitigations for various issues
1192 * and what's done in various places:
1193 *
1194 *  - Spectre v1: Not currently mitigated
1195 *  - swapgs: lfences after swapgs paths
1196 *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1197 *  - Meltdown: Kernel Page Table Isolation
1198 *  - Spectre v3a: Updated CPU microcode
1199 *  - Spectre v4: Not currently mitigated
1200 *  - SpectreRSB: SMEP and RSB Stuffing
1201 *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1202 *  - MDS: x86_md_clear, requires microcode, disabling hyper threading
1203 *
 * The following table lists the x86 feature set bits that indicate that a
 * given problem has been solved or a notable feature is present:
1206 *
1207 *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1208 *  - MDS_NO: All forms of MDS
1209 */
1210
1211#include <sys/types.h>
1212#include <sys/archsystm.h>
1213#include <sys/x86_archext.h>
1214#include <sys/kmem.h>
1215#include <sys/systm.h>
1216#include <sys/cmn_err.h>
1217#include <sys/sunddi.h>
1218#include <sys/sunndi.h>
1219#include <sys/cpuvar.h>
1220#include <sys/processor.h>
1221#include <sys/sysmacros.h>
1222#include <sys/pg.h>
1223#include <sys/fp.h>
1224#include <sys/controlregs.h>
1225#include <sys/bitmap.h>
1226#include <sys/auxv_386.h>
1227#include <sys/memnode.h>
1228#include <sys/pci_cfgspace.h>
1229#include <sys/comm_page.h>
1230#include <sys/mach_mmu.h>
1231#include <sys/ucode.h>
1232#include <sys/tsc.h>
1233#include <sys/kobj.h>
1234#include <sys/asm_misc.h>
1235
1236#ifdef __xpv
1237#include <sys/hypervisor.h>
1238#else
1239#include <sys/ontrap.h>
1240#endif
1241
1242uint_t x86_vendor = X86_VENDOR_IntelClone;
1243uint_t x86_type = X86_TYPE_OTHER;
1244uint_t x86_clflush_size = 0;
1245
1246#if defined(__xpv)
1247int x86_use_pcid = 0;
1248int x86_use_invpcid = 0;
1249#else
1250int x86_use_pcid = -1;
1251int x86_use_invpcid = -1;
1252#endif
1253
1254typedef enum {
1255	X86_SPECTREV2_RETPOLINE,
1256	X86_SPECTREV2_RETPOLINE_AMD,
1257	X86_SPECTREV2_ENHANCED_IBRS,
1258	X86_SPECTREV2_DISABLED
1259} x86_spectrev2_mitigation_t;
1260
1261uint_t x86_disable_spectrev2 = 0;
1262static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1263    X86_SPECTREV2_RETPOLINE;
1264
1265uint_t pentiumpro_bug4046376;
1266
1267uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1268
1269static char *x86_feature_names[NUM_X86_FEATURES] = {
1270	"lgpg",
1271	"tsc",
1272	"msr",
1273	"mtrr",
1274	"pge",
1275	"de",
1276	"cmov",
1277	"mmx",
1278	"mca",
1279	"pae",
1280	"cv8",
1281	"pat",
1282	"sep",
1283	"sse",
1284	"sse2",
1285	"htt",
1286	"asysc",
1287	"nx",
1288	"sse3",
1289	"cx16",
1290	"cmp",
1291	"tscp",
1292	"mwait",
1293	"sse4a",
1294	"cpuid",
1295	"ssse3",
1296	"sse4_1",
1297	"sse4_2",
1298	"1gpg",
1299	"clfsh",
1300	"64",
1301	"aes",
1302	"pclmulqdq",
1303	"xsave",
1304	"avx",
1305	"vmx",
1306	"svm",
1307	"topoext",
1308	"f16c",
1309	"rdrand",
1310	"x2apic",
1311	"avx2",
1312	"bmi1",
1313	"bmi2",
1314	"fma",
1315	"smep",
1316	"smap",
1317	"adx",
1318	"rdseed",
1319	"mpx",
1320	"avx512f",
1321	"avx512dq",
1322	"avx512pf",
1323	"avx512er",
1324	"avx512cd",
1325	"avx512bw",
1326	"avx512vl",
1327	"avx512fma",
1328	"avx512vbmi",
1329	"avx512_vpopcntdq",
1330	"avx512_4vnniw",
1331	"avx512_4fmaps",
1332	"xsaveopt",
1333	"xsavec",
1334	"xsaves",
1335	"sha",
1336	"umip",
1337	"pku",
1338	"ospke",
1339	"pcid",
1340	"invpcid",
1341	"ibrs",
1342	"ibpb",
1343	"stibp",
1344	"ssbd",
1345	"ssbd_virt",
1346	"rdcl_no",
1347	"ibrs_all",
1348	"rsba",
1349	"ssb_no",
1350	"stibp_all",
1351	"flush_cmd",
1352	"l1d_vmentry_no",
1353	"fsgsbase",
1354	"clflushopt",
1355	"clwb",
1356	"monitorx",
1357	"clzero",
1358	"xop",
1359	"fma4",
1360	"tbm",
1361	"avx512_vnni",
1362	"amd_pcec",
1363	"mb_clear",
1364	"mds_no",
1365	"core_thermal",
1366	"pkg_thermal"
1367};
1368
1369boolean_t
1370is_x86_feature(void *featureset, uint_t feature)
1371{
1372	ASSERT(feature < NUM_X86_FEATURES);
1373	return (BT_TEST((ulong_t *)featureset, feature));
1374}
1375
1376void
1377add_x86_feature(void *featureset, uint_t feature)
1378{
1379	ASSERT(feature < NUM_X86_FEATURES);
1380	BT_SET((ulong_t *)featureset, feature);
1381}
1382
1383void
1384remove_x86_feature(void *featureset, uint_t feature)
1385{
1386	ASSERT(feature < NUM_X86_FEATURES);
1387	BT_CLEAR((ulong_t *)featureset, feature);
1388}
1389
1390boolean_t
1391compare_x86_featureset(void *setA, void *setB)
1392{
1393	/*
1394	 * We assume that the unused bits of the bitmap are always zero.
1395	 */
1396	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1397		return (B_TRUE);
1398	} else {
1399		return (B_FALSE);
1400	}
1401}
1402
1403void
1404print_x86_featureset(void *featureset)
1405{
1406	uint_t i;
1407
1408	for (i = 0; i < NUM_X86_FEATURES; i++) {
1409		if (is_x86_feature(featureset, i)) {
1410			cmn_err(CE_CONT, "?x86_feature: %s\n",
1411			    x86_feature_names[i]);
1412		}
1413	}
1414}
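
/*
 * Typical consumers simply test individual bits against the global feature
 * set, e.g. (hypothetical caller):
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_SMEP))
 *		(SMEP may be relied upon)
 */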
1415
1416/* Note: This is the maximum size for the CPU, not the size of the structure. */
1417static size_t xsave_state_size = 0;
1418uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1419boolean_t xsave_force_disable = B_FALSE;
1420extern int disable_smap;
1421
1422/*
 * This is set to the platform type we are running on.
1424 */
1425static int platform_type = -1;
1426
1427#if !defined(__xpv)
1428/*
1429 * Variable to patch if hypervisor platform detection needs to be
1430 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1431 */
1432int enable_platform_detection = 1;
1433#endif
1434
1435/*
1436 * monitor/mwait info.
1437 *
1438 * size_actual and buf_actual are the real address and size allocated to get
 * proper mwait_buf alignment.  buf_actual and size_actual should be passed
 * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
 * processor cache-line alignment, but this is not guaranteed in the future.
1442 */
1443struct mwait_info {
1444	size_t		mon_min;	/* min size to avoid missed wakeups */
1445	size_t		mon_max;	/* size to avoid false wakeups */
1446	size_t		size_actual;	/* size actually allocated */
1447	void		*buf_actual;	/* memory actually allocated */
1448	uint32_t	support;	/* processor support of monitor/mwait */
1449};
1450
1451/*
1452 * xsave/xrestor info.
1453 *
1454 * This structure contains HW feature bits and the size of the xsave save area.
1455 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1456 * (xsave_state) to describe the xsave layout. However, at runtime the
1457 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1458 * xsave_state structure simply represents the legacy layout of the beginning
1459 * of the xsave area.
1460 */
1461struct xsave_info {
1462	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1463	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1464	size_t		xsav_max_size;  /* max size save area for HW features */
1465	size_t		ymm_size;	/* AVX: size of ymm save area */
1466	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1467	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1468	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1469	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1470	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1471	size_t		opmask_size;	/* AVX512: size of opmask save */
1472	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1473	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1474	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1475	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1476	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1477};
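
/*
 * These offsets and sizes are generally populated from CPUID leaf 0xD: for a
 * given state component n (n >= 2), sub-leaf n returns the size of that
 * component's save area in %eax and its offset from the start of the xsave
 * area in %ebx. A hedged sketch of how the AVX (ymm) fields might be gathered
 * (the authoritative logic lives elsewhere in this file):
 *
 *	struct cpuid_regs cp = { 0 };
 *	cp.cp_eax = 0xD;
 *	cp.cp_ecx = 2;			(AVX/ymm state component)
 *	(void) __cpuid_insn(&cp);
 *	cpi->cpi_xsave.ymm_size = cp.cp_eax;
 *	cpi->cpi_xsave.ymm_offset = cp.cp_ebx;
 */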
1478
1479
1480/*
1481 * These constants determine how many of the elements of the
1482 * cpuid we cache in the cpuid_info data structure; the
1483 * remaining elements are accessible via the cpuid instruction.
1484 */
1485
1486#define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1487#define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1488
1489/*
1490 * See the big theory statement for a more detailed explanation of what some of
1491 * these members mean.
1492 */
1493struct cpuid_info {
1494	uint_t cpi_pass;		/* last pass completed */
1495	/*
1496	 * standard function information
1497	 */
1498	uint_t cpi_maxeax;		/* fn 0: %eax */
1499	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1500	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1501
1502	uint_t cpi_family;		/* fn 1: extended family */
1503	uint_t cpi_model;		/* fn 1: extended model */
1504	uint_t cpi_step;		/* fn 1: stepping */
1505	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1506					/*		AMD: package/socket # */
1507	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1508	int cpi_clogid;			/* fn 1: %ebx: thread # */
1509	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1510	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1511	uint_t cpi_ncache;		/* fn 2: number of elements */
1512	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1513	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1514	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1515					/* Intel fn: 4, AMD fn: 8000001d */
	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1517	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1518	/*
1519	 * extended function information
1520	 */
1521	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1522	char cpi_brandstr[49];		/* fn 0x8000000[234] */
1523	uint8_t cpi_pabits;		/* fn 0x80000006: %eax */
1524	uint8_t	cpi_vabits;		/* fn 0x80000006: %eax */
1525	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1526	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1527
1528	id_t cpi_coreid;		/* same coreid => strands share core */
1529	int cpi_pkgcoreid;		/* core number within single package */
1530	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1531					/* Intel: fn 4: %eax[31-26] */
1532
1533	/*
1534	 * These values represent the number of bits that are required to store
1535	 * information about the number of cores and threads.
1536	 */
1537	uint_t cpi_ncore_bits;
1538	uint_t cpi_nthread_bits;
1539	/*
1540	 * supported feature information
1541	 */
1542	uint32_t cpi_support[6];
1543#define	STD_EDX_FEATURES	0
1544#define	AMD_EDX_FEATURES	1
1545#define	TM_EDX_FEATURES		2
1546#define	STD_ECX_FEATURES	3
1547#define	AMD_ECX_FEATURES	4
1548#define	STD_EBX_FEATURES	5
1549	/*
1550	 * Synthesized information, where known.
1551	 */
1552	uint32_t cpi_chiprev;		/* See X86_CHIPREV_* in x86_archext.h */
1553	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1554	uint32_t cpi_socket;		/* Chip package/socket type */
1555
1556	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1557	uint32_t cpi_apicid;
1558	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1559	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1560					/* Intel: 1 */
1561	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1562	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1563
1564	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1565};
1566
1567
1568static struct cpuid_info cpuid_info0;
1569
1570/*
1571 * These bit fields are defined by the Intel Application Note AP-485
1572 * "Intel Processor Identification and the CPUID Instruction"
1573 */
1574#define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1575#define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1576#define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1577#define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1578#define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1579#define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
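
/*
 * A sketch of how these pieces are typically combined into the effective
 * family and model (the authoritative logic lives in the cpuid passes; AMD
 * applies the extended model under slightly different conditions):
 *
 *	family = CPI_FAMILY(cpi);
 *	if (family == 0xf)
 *		family += CPI_FAMILY_XTD(cpi);
 *	model = CPI_MODEL(cpi);
 *	if (family == 0x6 || family >= 0xf)
 *		model += CPI_MODEL_XTD(cpi) << 4;
 */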
1580
1581#define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1582#define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1583#define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1584#define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1585#define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1586#define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1587#define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1588
1589#define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1590#define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1591#define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1592#define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1593
1594#define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1595#define	CPI_XMAXEAX_MAX		0x80000100
1596#define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1597#define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1598
1599/*
1600 * Function 4 (Deterministic Cache Parameters) macros
1601 * Defined by Intel Application Note AP-485
1602 */
1603#define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1604#define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1605#define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1606#define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1607#define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1608#define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1609#define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1610
1611#define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1612#define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1613#define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1614
1615#define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1616
1617#define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1618
1619
1620/*
1621 * A couple of shorthand macros to identify "later" P6-family chips
1622 * like the Pentium M and Core.  First, the "older" P6-based stuff
1623 * (loosely defined as "pre-Pentium-4"):
1624 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1625 */
1626#define	IS_LEGACY_P6(cpi) (			\
1627	cpi->cpi_family == 6 &&			\
1628		(cpi->cpi_model == 1 ||		\
1629		cpi->cpi_model == 3 ||		\
1630		cpi->cpi_model == 5 ||		\
1631		cpi->cpi_model == 6 ||		\
1632		cpi->cpi_model == 7 ||		\
1633		cpi->cpi_model == 8 ||		\
1634		cpi->cpi_model == 0xA ||	\
1635		cpi->cpi_model == 0xB)		\
1636)
1637
1638/* A "new F6" is everything with family 6 that's not the above */
1639#define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1640
1641/* Extended family/model support */
1642#define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1643	cpi->cpi_family >= 0xf)
1644
1645/*
1646 * Info for monitor/mwait idle loop.
1647 *
1648 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1649 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1650 * 2006.
1651 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1652 * Documentation Updates" #33633, Rev 2.05, December 2006.
1653 */
1654#define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
#define	MWAIT_EXTENSIONS	(0x00000002)	/* extensions supported */
1656#define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1657#define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1658#define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1659#define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1660#define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1661#define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1662/*
1663 * Number of sub-cstates for a given c-state.
1664 */
1665#define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1666	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1667
1668/*
1669 * XSAVE leaf 0xD enumeration
1670 */
1671#define	CPUID_LEAFD_2_YMM_OFFSET	576
1672#define	CPUID_LEAFD_2_YMM_SIZE		256
1673
1674/*
1675 * Common extended leaf names to cut down on typos.
1676 */
1677#define	CPUID_LEAF_EXT_0		0x80000000
1678#define	CPUID_LEAF_EXT_8		0x80000008
1679#define	CPUID_LEAF_EXT_1d		0x8000001d
1680#define	CPUID_LEAF_EXT_1e		0x8000001e
1681
1682/*
 * Functions we consume from cpuid_subr.c;  don't publish these in a header
 * file to try and keep people using the expected cpuid_* interfaces.
1685 */
1686extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1687extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1688extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1689extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1690extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1691
1692/*
 * Apply various platform-dependent restrictions where the
 * underlying platform restrictions mean the CPU can be marked
 * as less capable than its cpuid instruction would imply.
1696 */
1697#if defined(__xpv)
1698static void
1699platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1700{
1701	switch (eax) {
1702	case 1: {
1703		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1704		    0 : CPUID_INTC_EDX_MCA;
1705		cp->cp_edx &=
1706		    ~(mcamask |
1707		    CPUID_INTC_EDX_PSE |
1708		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1709		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1710		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1711		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1712		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1713		break;
1714	}
1715
1716	case 0x80000001:
1717		cp->cp_edx &=
1718		    ~(CPUID_AMD_EDX_PSE |
1719		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1720		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1721		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1722		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1723		    CPUID_AMD_EDX_TSCP);
1724		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1725		break;
1726	default:
1727		break;
1728	}
1729
1730	switch (vendor) {
1731	case X86_VENDOR_Intel:
1732		switch (eax) {
1733		case 4:
1734			/*
1735			 * Zero out the (ncores-per-chip - 1) field
1736			 */
1737			cp->cp_eax &= 0x03fffffff;
1738			break;
1739		default:
1740			break;
1741		}
1742		break;
1743	case X86_VENDOR_AMD:
1744		switch (eax) {
1745
1746		case 0x80000001:
1747			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1748			break;
1749
1750		case CPUID_LEAF_EXT_8:
1751			/*
1752			 * Zero out the (ncores-per-chip - 1) field
1753			 */
1754			cp->cp_ecx &= 0xffffff00;
1755			break;
1756		default:
1757			break;
1758		}
1759		break;
1760	default:
1761		break;
1762	}
1763}
1764#else
1765#define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
1766#endif
1767
1768/*
1769 *  Some undocumented ways of patching the results of the cpuid
1770 *  instruction to permit running Solaris 10 on future cpus that
1771 *  we don't currently support.  Could be set to non-zero values
1772 *  via settings in eeprom.
1773 */
1774
1775uint32_t cpuid_feature_ecx_include;
1776uint32_t cpuid_feature_ecx_exclude;
1777uint32_t cpuid_feature_edx_include;
1778uint32_t cpuid_feature_edx_exclude;
1779
1780/*
1781 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1782 */
1783void
1784cpuid_alloc_space(cpu_t *cpu)
1785{
1786	/*
1787	 * By convention, cpu0 is the boot cpu, which is set up
1788	 * before memory allocation is available.  All other cpus get
1789	 * their cpuid_info struct allocated here.
1790	 */
1791	ASSERT(cpu->cpu_id != 0);
1792	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1793	cpu->cpu_m.mcpu_cpi =
1794	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1795}
1796
1797void
1798cpuid_free_space(cpu_t *cpu)
1799{
1800	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1801	int i;
1802
1803	ASSERT(cpi != NULL);
1804	ASSERT(cpi != &cpuid_info0);
1805
1806	/*
1807	 * Free up any cache leaf related dynamic storage. The first entry was
1808	 * cached from the standard cpuid storage, so we should not free it.
1809	 */
1810	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1811		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1812	if (cpi->cpi_cache_leaf_size > 0)
1813		kmem_free(cpi->cpi_cache_leaves,
1814		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1815
1816	kmem_free(cpi, sizeof (*cpi));
1817	cpu->cpu_m.mcpu_cpi = NULL;
1818}
1819
1820#if !defined(__xpv)
1821/*
1822 * Determine the type of the underlying platform. This is used to customize
1823 * initialization of various subsystems (e.g. TSC). determine_platform() must
1824 * only ever be called once to prevent two processors from seeing different
1825 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1826 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1827 */
1828void
1829determine_platform(void)
1830{
1831	struct cpuid_regs cp;
1832	uint32_t base;
1833	uint32_t regs[4];
1834	char *hvstr = (char *)regs;
1835
1836	ASSERT(platform_type == -1);
1837
1838	platform_type = HW_NATIVE;
1839
1840	if (!enable_platform_detection)
1841		return;
1842
1843	/*
	 * If the Hypervisor CPUID bit is set, try to determine the hypervisor
1845	 * vendor signature, and set platform type accordingly.
1846	 *
1847	 * References:
1848	 * http://lkml.org/lkml/2008/10/1/246
1849	 * http://kb.vmware.com/kb/1009458
1850	 */
1851	cp.cp_eax = 0x1;
1852	(void) __cpuid_insn(&cp);
1853	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1854		cp.cp_eax = 0x40000000;
1855		(void) __cpuid_insn(&cp);
1856		regs[0] = cp.cp_ebx;
1857		regs[1] = cp.cp_ecx;
1858		regs[2] = cp.cp_edx;
1859		regs[3] = 0;
1860		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1861			platform_type = HW_XEN_HVM;
1862			return;
1863		}
1864		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1865			platform_type = HW_VMWARE;
1866			return;
1867		}
1868		if (strcmp(hvstr, HVSIG_KVM) == 0) {
1869			platform_type = HW_KVM;
1870			return;
1871		}
1872		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1873			platform_type = HW_BHYVE;
1874			return;
1875		}
1876		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1877			platform_type = HW_MICROSOFT;
1878	} else {
1879		/*
		 * Check for older VMware hardware versions. The VMware
		 * hypervisor is detected by performing an IN operation on the
		 * VMware hypervisor port and checking that the value returned
		 * in %ebx is the VMware hypervisor magic value.
1884		 *
1885		 * References: http://kb.vmware.com/kb/1009458
1886		 */
1887		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1888		if (regs[1] == VMWARE_HVMAGIC) {
1889			platform_type = HW_VMWARE;
1890			return;
1891		}
1892	}
1893
1894	/*
1895	 * Check Xen hypervisor. In a fully virtualized domain,
1896	 * Xen's pseudo-cpuid function returns a string representing the
1897	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1898	 * supported cpuid function. We need at least a (base + 2) leaf value
1899	 * to do what we want to do. Try different base values, since the
1900	 * hypervisor might use a different one depending on whether Hyper-V
1901	 * emulation is switched on by default or not.
1902	 */
1903	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1904		cp.cp_eax = base;
1905		(void) __cpuid_insn(&cp);
1906		regs[0] = cp.cp_ebx;
1907		regs[1] = cp.cp_ecx;
1908		regs[2] = cp.cp_edx;
1909		regs[3] = 0;
1910		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1911		    cp.cp_eax >= (base + 2)) {
1912			platform_type &= ~HW_NATIVE;
1913			platform_type |= HW_XEN_HVM;
1914			return;
1915		}
1916	}
1917}
1918
1919int
1920get_hwenv(void)
1921{
1922	ASSERT(platform_type != -1);
1923	return (platform_type);
1924}
1925
1926int
1927is_controldom(void)
1928{
1929	return (0);
1930}
1931
1932#else
1933
1934int
1935get_hwenv(void)
1936{
1937	return (HW_XEN_PV);
1938}
1939
1940int
1941is_controldom(void)
1942{
1943	return (DOMAIN_IS_INITDOMAIN(xen_info));
1944}
1945
1946#endif	/* __xpv */
1947
1948/*
1949 * Make sure that we have gathered all of the CPUID leaves that we might need to
1950 * determine topology. We assume that the standard leaf 1 has already been done
1951 * and that xmaxeax has already been calculated.
1952 */
1953static void
1954cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1955{
1956	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1957
1958	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1959		struct cpuid_regs *cp;
1960
1961		cp = &cpi->cpi_extd[8];
1962		cp->cp_eax = CPUID_LEAF_EXT_8;
1963		(void) __cpuid_insn(cp);
1964		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1965	}
1966
1967	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1968	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1969		struct cpuid_regs *cp;
1970
1971		cp = &cpi->cpi_extd[0x1e];
1972		cp->cp_eax = CPUID_LEAF_EXT_1e;
1973		(void) __cpuid_insn(cp);
1974	}
1975}
1976
1977/*
1978 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1979 * it to everything else. If not, and we're on an AMD system where 8000001e is
 * valid, then we use that. Otherwise, we fall back to the default value for the
1981 * APIC ID in leaf 1.
1982 */
1983static uint32_t
1984cpuid_gather_apicid(struct cpuid_info *cpi)
1985{
1986	/*
	 * Leaf B changes based on the arguments to it. Because we don't cache
1988	 * it, we need to gather it again.
1989	 */
1990	if (cpi->cpi_maxeax >= 0xB) {
1991		struct cpuid_regs regs;
1992		struct cpuid_regs *cp;
1993
1994		cp = &regs;
1995		cp->cp_eax = 0xB;
1996		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1997		(void) __cpuid_insn(cp);
1998
1999		if (cp->cp_ebx != 0) {
2000			return (cp->cp_edx);
2001		}
2002	}
2003
2004	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2005	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2006	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2007		return (cpi->cpi_extd[0x1e].cp_eax);
2008	}
2009
2010	return (CPI_APIC_ID(cpi));
2011}
2012
2013/*
2014 * For AMD processors, attempt to calculate the number of chips and cores that
2015 * exist. The way that we do this varies based on the generation, because the
2016 * generations themselves have changed dramatically.
2017 *
2018 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2019 * However, with the advent of family 17h (Zen) it actually tells us the number
2020 * of threads, so we need to look at leaf 0x8000001e if available to determine
2021 * its value. Otherwise, for all prior families, the number of enabled cores is
2022 * the same as threads.
2023 *
2024 * If we do not have leaf 0x80000008, then we assume that this processor does
2025 * not have anything. AMD's older CPUID specification says there's no reason to
2026 * fall back to leaf 1.
2027 *
2028 * In some virtualization cases we will not have leaf 8000001e or it will be
2029 * zero. When that happens we assume the number of threads is one.
2030 */
2031static void
2032cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2033{
2034	uint_t nthreads, nthread_per_core;
2035
2036	nthreads = nthread_per_core = 1;
2037
2038	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2039		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2040	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2041		nthreads = CPI_CPU_COUNT(cpi);
2042	}
2043
2044	/*
	 * For us to have multiple threads per core, and to know about it, we
	 * have to be at least at family 17h and have the cpuid bit that says
	 * we have extended topology.
2048	 */
2049	if (cpi->cpi_family >= 0x17 &&
2050	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2051	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2052		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2053	}
2054
2055	*ncpus = nthreads;
2056	*ncores = nthreads / nthread_per_core;
2057}
2058
2059/*
2060 * Seed the initial values for the cores and threads for an Intel based
2061 * processor. These values will be overwritten if we detect that the processor
2062 * supports CPUID leaf 0xb.
2063 */
2064static void
2065cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2066{
2067	/*
2068	 * Only seed the number of physical cores from the first level leaf 4
	 * information. The number of threads there indicates how many share the
2070	 * L1 cache, which may or may not have anything to do with the number of
2071	 * logical CPUs per core.
2072	 */
2073	if (cpi->cpi_maxeax >= 4) {
2074		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2075	} else {
2076		*ncores = 1;
2077	}
2078
2079	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2080		*ncpus = CPI_CPU_COUNT(cpi);
2081	} else {
2082		*ncpus = *ncores;
2083	}
2084}
2085
2086static boolean_t
2087cpuid_leafB_getids(cpu_t *cpu)
2088{
2089	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2090	struct cpuid_regs regs;
2091	struct cpuid_regs *cp;
2092
2093	if (cpi->cpi_maxeax < 0xB)
2094		return (B_FALSE);
2095
2096	cp = &regs;
2097	cp->cp_eax = 0xB;
2098	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2099
2100	(void) __cpuid_insn(cp);
2101
2102	/*
2103	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2104	 * indicates that the extended topology enumeration leaf is
2105	 * available.
2106	 */
2107	if (cp->cp_ebx != 0) {
2108		uint32_t x2apic_id = 0;
2109		uint_t coreid_shift = 0;
2110		uint_t ncpu_per_core = 1;
2111		uint_t chipid_shift = 0;
2112		uint_t ncpu_per_chip = 1;
2113		uint_t i;
2114		uint_t level;
2115
2116		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2117			cp->cp_eax = 0xB;
2118			cp->cp_ecx = i;
2119
2120			(void) __cpuid_insn(cp);
2121			level = CPI_CPU_LEVEL_TYPE(cp);
2122
2123			if (level == 1) {
2124				x2apic_id = cp->cp_edx;
2125				coreid_shift = BITX(cp->cp_eax, 4, 0);
2126				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2127			} else if (level == 2) {
2128				x2apic_id = cp->cp_edx;
2129				chipid_shift = BITX(cp->cp_eax, 4, 0);
2130				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2131			}
2132		}
2133
2134		/*
2135		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2136		 */
2137		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2138		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2139		    ncpu_per_core;
2140		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2141		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2142		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2143		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2144		cpi->cpi_procnodeid = cpi->cpi_chipid;
2145		cpi->cpi_compunitid = cpi->cpi_coreid;
2146
2147		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2148			cpi->cpi_nthread_bits = coreid_shift;
2149			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2150		}
2151
2152		return (B_TRUE);
2153	} else {
2154		return (B_FALSE);
2155	}
2156}
2157
2158static void
2159cpuid_intel_getids(cpu_t *cpu, void *feature)
2160{
2161	uint_t i;
2162	uint_t chipid_shift = 0;
2163	uint_t coreid_shift = 0;
2164	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2165
2166	/*
2167	 * There are no compute units or processor nodes currently on Intel.
2168	 * Always set these to one.
2169	 */
2170	cpi->cpi_procnodes_per_pkg = 1;
2171	cpi->cpi_cores_per_compunit = 1;
2172
2173	/*
2174	 * If cpuid Leaf B is present, use that to try and get this information.
2175	 * It will be the most accurate for Intel CPUs.
2176	 */
2177	if (cpuid_leafB_getids(cpu))
2178		return;
2179
2180	/*
2181	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2182	 * and ncore_per_chip. These represent the largest power of two values
2183	 * that we need to cover all of the IDs in the system. Therefore, we use
2184	 * those values to seed the number of bits needed to cover information
2185	 * in the case when leaf B is not available. These values will probably
2186	 * be larger than required, but that's OK.
2187	 */
2188	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2189	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2190
2191	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2192		chipid_shift++;
2193
2194	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2195	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2196
2197	if (is_x86_feature(feature, X86FSET_CMP)) {
2198		/*
2199		 * Multi-core (and possibly multi-threaded)
2200		 * processors.
2201		 */
2202		uint_t ncpu_per_core;
2203		if (cpi->cpi_ncore_per_chip == 1)
2204			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2205		else if (cpi->cpi_ncore_per_chip > 1)
2206			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2207			    cpi->cpi_ncore_per_chip;
2208		/*
2209		 * 8bit APIC IDs on dual core Pentiums
2210		 * look like this:
2211		 *
2212		 * +-----------------------+------+------+
2213		 * | Physical Package ID   |  MC  |  HT  |
2214		 * +-----------------------+------+------+
2215		 * <------- chipid -------->
2216		 * <------- coreid --------------->
2217		 *			   <--- clogid -->
2218		 *			   <------>
2219		 *			   pkgcoreid
2220		 *
2221		 * Where the number of bits necessary to
2222		 * represent MC and HT fields together equals
2223		 * to the minimum number of bits necessary to
2224		 * store the value of cpi->cpi_ncpu_per_chip.
2225		 * Of those bits, the MC part uses the number
2226		 * of bits necessary to store the value of
2227		 * cpi->cpi_ncore_per_chip.
2228		 */
2229		for (i = 1; i < ncpu_per_core; i <<= 1)
2230			coreid_shift++;
2231		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2232		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2233	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2234		/*
2235		 * Single-core multi-threaded processors.
2236		 */
2237		cpi->cpi_coreid = cpi->cpi_chipid;
2238		cpi->cpi_pkgcoreid = 0;
2239	} else {
2240		/*
2241		 * Single-core single-thread processors.
2242		 */
2243		cpi->cpi_coreid = cpu->cpu_id;
2244		cpi->cpi_pkgcoreid = 0;
2245	}
2246	cpi->cpi_procnodeid = cpi->cpi_chipid;
2247	cpi->cpi_compunitid = cpi->cpi_coreid;
2248}
2249
2250/*
2251 * Historically, AMD has had CMP chips with only a single thread per core.
2252 * However, starting in family 17h (Zen), this has changed and they now have
2253 * multiple threads. Our internal core id needs to be a unique value.
2254 *
2255 * To determine the core id of an AMD system, if we're from a family before 17h,
2256 * then we just use the cpu id, as that gives us a good value that will be
2257 * unique for each core. If instead, we're on family 17h or later, then we need
2258 * to do something more complicated. CPUID leaf 0x8000001e can tell us
 * how many threads are in the core. Based on that, we'll shift the APIC ID.
 * We can't use the normal core id in that leaf as it's only unique within the
 * socket, which is perfect for cpi_pkgcoreid, but not for us.
2262 */
2263static id_t
2264cpuid_amd_get_coreid(cpu_t *cpu)
2265{
2266	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2267
2268	if (cpi->cpi_family >= 0x17 &&
2269	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2270	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2271		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2272		if (nthreads > 1) {
2273			VERIFY3U(nthreads, ==, 2);
2274			return (cpi->cpi_apicid >> 1);
2275		}
2276	}
2277
2278	return (cpu->cpu_id);
2279}
2280
2281/*
 * Determining IDs on AMD is a more challenging task. This is notable because
 * of the following two facts:
2284 *
2285 *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2286 *     also no way to get an actual unique core id from the system. As such, we
2287 *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2288 *     however, guarantee that sibling cores of a chip will have sequential
2289 *     coreids starting at a multiple of the number of cores per chip - that is
2290 *     usually the case, but if the ACPI MADT table is presented in a different
2291 *     order then we need to perform a few more gymnastics for the pkgcoreid.
2292 *
 *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2294 *     called compute units. These compute units share the L1I cache, L2 cache,
2295 *     and the FPU. To deal with this, a new topology leaf was added in
2296 *     0x8000001e. However, parts of this leaf have different meanings
2297 *     once we get to family 0x17.
2298 */
2299
2300static void
2301cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2302{
2303	int i, first_half, coreidsz;
2304	uint32_t nb_caps_reg;
2305	uint_t node2_1;
2306	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2307	struct cpuid_regs *cp;
2308
2309	/*
2310	 * Calculate the core id (this comes from hardware in family 0x17 if it
2311	 * hasn't been stripped by virtualization). We always set the compute
2312	 * unit id to the same value. Also, initialize the default number of
2313	 * cores per compute unit and nodes per package. This will be
2314	 * overwritten when we know information about a particular family.
2315	 */
2316	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2317	cpi->cpi_compunitid = cpi->cpi_coreid;
2318	cpi->cpi_cores_per_compunit = 1;
2319	cpi->cpi_procnodes_per_pkg = 1;
2320
2321	/*
2322	 * To construct the logical ID, we need to determine how many APIC IDs
2323	 * are dedicated to the cores and threads. This is provided for us in
2324	 * 0x80000008. However, if it's not present (say due to virtualization),
2325	 * then we assume it's one. This should be present on all 64-bit AMD
2326	 * processors.  It was added in family 0xf (Hammer).
2327	 */
2328	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2329		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2330
2331		/*
2332		 * In AMD parlance chip is really a node while illumos
2333		 * uses chip as equivalent to socket/package.
2334		 */
2335		if (coreidsz == 0) {
2336			/* Use legacy method */
2337			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2338				coreidsz++;
2339			if (coreidsz == 0)
2340				coreidsz = 1;
2341		}
2342	} else {
2343		/* Assume single-core part */
2344		coreidsz = 1;
2345	}
2346	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2347
2348	/*
2349	 * The package core ID varies depending on the family. While it may be
2350	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2351	 * this value is the core id in the given node. For non-virtualized
2352	 * family 17h, we need to take the logical core id and shift off the
2353	 * threads like we do when getting the core id.  Otherwise, we can use
	 * the clogid as is. When family 17h is virtualized, the clogid should
	 * be sufficient: if we don't have valid data in the leaf, then we
	 * won't think we have SMT, in which case the cpi_clogid is all we
	 * need.
2358	 */
2359	if (cpi->cpi_family >= 0x17 &&
2360	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2361	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2362	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2363		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2364		if (nthreads > 1) {
2365			VERIFY3U(nthreads, ==, 2);
2366			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2367		} else {
2368			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2369		}
2370	} else {
2371		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2372	}
2373
2374	/*
2375	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2376	 * (bulldozer) or newer, then we can derive all of this from leaf
2377	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2378	 */
2379	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2380	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2381		cp = &cpi->cpi_extd[0x1e];
2382
2383		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2384		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2385
2386		/*
2387		 * For Bulldozer-era CPUs, recalculate the compute unit
2388		 * information.
2389		 */
2390		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2391			cpi->cpi_cores_per_compunit =
2392			    BITX(cp->cp_ebx, 15, 8) + 1;
2393			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2394			    (cpi->cpi_ncore_per_chip /
2395			    cpi->cpi_cores_per_compunit) *
2396			    (cpi->cpi_procnodeid /
2397			    cpi->cpi_procnodes_per_pkg);
2398		}
2399	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2400		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2401	} else if (cpi->cpi_family == 0x10) {
2402		/*
2403		 * See if we are a multi-node processor.
2404		 * All processors in the system have the same number of nodes
2405		 */
2406		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2407		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2408			/* Single-node */
2409			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2410			    coreidsz);
2411		} else {
2412
2413			/*
2414			 * Multi-node revision D (2 nodes per package
2415			 * are supported)
2416			 */
2417			cpi->cpi_procnodes_per_pkg = 2;
2418
2419			first_half = (cpi->cpi_pkgcoreid <=
2420			    (cpi->cpi_ncore_per_chip/2 - 1));
2421
2422			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2423				/* We are BSP */
2424				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2425			} else {
2426
2427				/* We are AP */
2428				/* NodeId[2:1] bits to use for reading F3xe8 */
2429				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2430
2431				nb_caps_reg =
2432				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2433
2434				/*
2435				 * Check IntNodeNum bit (31:30, but bit 31 is
2436				 * always 0 on dual-node processors)
2437				 */
2438				if (BITX(nb_caps_reg, 30, 30) == 0)
2439					cpi->cpi_procnodeid = node2_1 +
2440					    !first_half;
2441				else
2442					cpi->cpi_procnodeid = node2_1 +
2443					    first_half;
2444			}
2445		}
2446	} else {
2447		cpi->cpi_procnodeid = 0;
2448	}
2449
2450	cpi->cpi_chipid =
2451	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2452
2453	cpi->cpi_ncore_bits = coreidsz;
2454	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2455	    cpi->cpi_ncore_per_chip);
2456}
2457
2458static void
2459spec_uarch_flush_noop(void)
2460{
2461}
2462
2463/*
 * This wrmsr flushes the L1 data cache. When microcode that mitigates MDS is
 * present, it will also perform the flush of MDS-related micro-architectural
 * state that would normally be done by calling x86_md_clear().
2467 */
2468static void
2469spec_uarch_flush_msr(void)
2470{
2471	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2472}
2473
2474/*
2475 * This function points to a function that will flush certain
2476 * micro-architectural state on the processor. This flush is used to mitigate
2477 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2478 * function can point to one of three functions:
2479 *
 * - A noop, which is used either because we are vulnerable but do not have
 *   microcode available to help deal with the issue, or because we aren't
 *   vulnerable at all.
2483 *
 * - spec_uarch_flush_msr which will issue an L1D flush and, if microcode to
 *   mitigate MDS is present, also perform the equivalent of the MDS flush;
 *   however, it only flushes the MDS-related micro-architectural state on the
 *   current hyperthread and does nothing for its twin.
2488 *
2489 * - x86_md_clear which will flush the MDS related state. This is done when we
2490 *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2491 *   (RDCL_NO is set).
2492 */
2493void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2494
2495static void
2496cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2497{
2498	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2499
2500	/*
2501	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2502	 * has been fixed in hardware, it doesn't cover everything related to
2503	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2504	 * need to mitigate this.
2505	 */
2506	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2507	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2508		return;
2509	}
2510
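	/*
	 * When the MD_CLEAR microcode support is present, enable the verw
	 * based flush by patching the first byte of x86_md_clear() with a
	 * nop. (The function is assumed to start out as an immediate return
	 * so that it is a no-op on hardware without this support; the entry
	 * sequence itself lives in assembly elsewhere.)
	 */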
2511	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2512		const uint8_t nop = NOP_INSTR;
2513		uint8_t *md = (uint8_t *)x86_md_clear;
2514
2515		*md = nop;
2516	}
2517
2518	membar_producer();
2519}
2520
2521static void
2522cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2523{
2524	boolean_t need_l1d, need_mds;
2525	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2526
2527	/*
2528	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2529	 * hardware, then there's nothing left for us to do for enabling the
2530	 * flush. We can also go ahead and say that SMT exclusion is
2531	 * unnecessary.
2532	 */
2533	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2534	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2535	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2536		extern int smt_exclusion;
2537		smt_exclusion = 0;
2538		spec_uarch_flush = spec_uarch_flush_noop;
2539		membar_producer();
2540		return;
2541	}
2542
2543	/*
	 * The locations where we need to perform an L1D flush are required for
	 * mitigating both L1TF and MDS. When verw support is present in
2546	 * microcode, then the L1D flush will take care of doing that as well.
2547	 * However, if we have a system where RDCL_NO is present, but we don't
2548	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2549	 * L1D flush.
2550	 */
2551	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2552	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2553	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2554		need_l1d = B_TRUE;
2555	} else {
2556		need_l1d = B_FALSE;
2557	}
2558
2559	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2560	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2561		need_mds = B_TRUE;
2562	} else {
2563		need_mds = B_FALSE;
2564	}
2565
2566	if (need_l1d) {
2567		spec_uarch_flush = spec_uarch_flush_msr;
2568	} else if (need_mds) {
2569		spec_uarch_flush = x86_md_clear;
2570	} else {
2571		/*
2572		 * We have no hardware mitigations available to us.
2573		 */
2574		spec_uarch_flush = spec_uarch_flush_noop;
2575	}
2576	membar_producer();
2577}
2578
/*
 * We default to enabling RSB (Return Stack Buffer) mitigations. When enhanced
 * IBRS is in use (where Intel indicates stuffing is ineffective) or the
 * mitigation has been disabled outright, we patch the first byte of
 * x86_rsb_stuff() with a ret so that it becomes a no-op.
 */
2582static void
2583cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2584{
2585	const uint8_t ret = RET_INSTR;
2586	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2587
2588	switch (mit) {
2589	case X86_SPECTREV2_ENHANCED_IBRS:
2590	case X86_SPECTREV2_DISABLED:
2591		*stuff = ret;
2592		break;
2593	default:
2594		break;
2595	}
2596}
2597
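/*
 * Wire the kernel's generic __x86_indirect_thunk_<reg> entry points to the
 * variant matching the selected mitigation by copying the corresponding
 * __x86_indirect_thunk_<type>_<reg> implementation over them: "gen" for
 * classic retpolines, "amd" for the lfence-based variant, and "jmp" (a plain
 * indirect jump) when retpolines are unnecessary or disabled.
 */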
2598static void
2599cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2600{
2601	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2602	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2603	    "_r14", "_r15" };
2604	const uint_t nthunks = ARRAY_SIZE(thunks);
2605	const char *type;
2606	uint_t i;
2607
2608	if (mit == x86_spectrev2_mitigation)
2609		return;
2610
2611	switch (mit) {
2612	case X86_SPECTREV2_RETPOLINE:
2613		type = "gen";
2614		break;
2615	case X86_SPECTREV2_RETPOLINE_AMD:
2616		type = "amd";
2617		break;
2618	case X86_SPECTREV2_ENHANCED_IBRS:
2619	case X86_SPECTREV2_DISABLED:
2620		type = "jmp";
2621		break;
2622	default:
		panic("asked to update retpoline state with unknown state!");
2624	}
2625
2626	for (i = 0; i < nthunks; i++) {
2627		uintptr_t source, dest;
2628		int ssize, dsize;
2629		char sourcebuf[64], destbuf[64];
2630		size_t len;
2631
2632		(void) snprintf(destbuf, sizeof (destbuf),
2633		    "__x86_indirect_thunk%s", thunks[i]);
2634		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2635		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2636
2637		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2638		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2639		VERIFY3U(source, !=, 0);
2640		VERIFY3U(dest, !=, 0);
2641		VERIFY3S(dsize, >=, ssize);
2642		bcopy((void *)source, (void *)dest, ssize);
2643	}
2644}
2645
2646static void
2647cpuid_enable_enhanced_ibrs(void)
2648{
2649	uint64_t val;
2650
2651	val = rdmsr(MSR_IA32_SPEC_CTRL);
2652	val |= IA32_SPEC_CTRL_IBRS;
2653	wrmsr(MSR_IA32_SPEC_CTRL, val);
2654}
2655
2656#ifndef __xpv
2657/*
2658 * Determine whether or not we can use the AMD optimized retpoline
2659 * functionality. We use this when we know we're on an AMD system and we can
2660 * successfully verify that lfence is dispatch serializing.
2661 */
2662static boolean_t
2663cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2664{
2665	uint64_t val;
2666	on_trap_data_t otd;
2667
2668	if (cpi->cpi_vendor != X86_VENDOR_AMD)
2669		return (B_FALSE);
2670
2671	/*
2672	 * We need to determine whether or not lfence is serializing. It always
2673	 * is on families 0xf and 0x11. On others, it's controlled by
2674	 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
2675	 * crazy old family, don't try to do anything.
2676	 */
2677	if (cpi->cpi_family < 0xf)
2678		return (B_FALSE);
2679	if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2680		return (B_TRUE);
2681
2682	/*
2683	 * While it may be tempting to use get_hwenv(), there is no promise
2684	 * that a hypervisor will actually declare itself to be one in a
2685	 * friendly way. As such, try to read and set the MSR. If we can then
2686	 * read back the value we set (it wasn't just set to zero), then we go
2687	 * for it.
2688	 */
2689	if (!on_trap(&otd, OT_DATA_ACCESS)) {
2690		val = rdmsr(MSR_AMD_DECODE_CONFIG);
2691		val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2692		wrmsr(MSR_AMD_DECODE_CONFIG, val);
2693		val = rdmsr(MSR_AMD_DECODE_CONFIG);
2694	} else {
2695		val = 0;
2696	}
2697	no_trap();
2698
2699	if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2700		return (B_TRUE);
2701	return (B_FALSE);
2702}
2703#endif	/* !__xpv */
2704
2705static void
2706cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2707{
2708	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2709	x86_spectrev2_mitigation_t v2mit;
2710
2711	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2712	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2713		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2714			add_x86_feature(featureset, X86FSET_IBPB);
2715		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2716			add_x86_feature(featureset, X86FSET_IBRS);
2717		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2718			add_x86_feature(featureset, X86FSET_STIBP);
2719		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2720			add_x86_feature(featureset, X86FSET_STIBP_ALL);
2721		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2722			add_x86_feature(featureset, X86FSET_SSBD);
2723		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2724			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2725		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2726			add_x86_feature(featureset, X86FSET_SSB_NO);
2727		/*
2728		 * Don't enable enhanced IBRS unless we're told that we should
2729		 * prefer it and it has the same semantics as Intel's. This is
2730		 * split into two bits rather than a single one.
2731		 */
2732		if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2733		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2734			add_x86_feature(featureset, X86FSET_IBRS_ALL);
2735		}
2736
2737	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2738	    cpi->cpi_maxeax >= 7) {
2739		struct cpuid_regs *ecp;
2740		ecp = &cpi->cpi_std[7];
2741
2742		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2743			add_x86_feature(featureset, X86FSET_MD_CLEAR);
2744		}
2745
2746		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2747			add_x86_feature(featureset, X86FSET_IBRS);
2748			add_x86_feature(featureset, X86FSET_IBPB);
2749		}
2750
2751		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2752			add_x86_feature(featureset, X86FSET_STIBP);
2753		}
2754
2755		/*
2756		 * Don't read the arch caps MSR on xpv where we lack the
2757		 * on_trap().
2758		 */
2759#ifndef __xpv
2760		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2761			on_trap_data_t otd;
2762
2763			/*
2764			 * Be paranoid and assume we'll get a #GP.
2765			 */
2766			if (!on_trap(&otd, OT_DATA_ACCESS)) {
2767				uint64_t reg;
2768
2769				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2770				if (reg & IA32_ARCH_CAP_RDCL_NO) {
2771					add_x86_feature(featureset,
2772					    X86FSET_RDCL_NO);
2773				}
2774				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2775					add_x86_feature(featureset,
2776					    X86FSET_IBRS_ALL);
2777				}
2778				if (reg & IA32_ARCH_CAP_RSBA) {
2779					add_x86_feature(featureset,
2780					    X86FSET_RSBA);
2781				}
2782				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2783					add_x86_feature(featureset,
2784					    X86FSET_L1D_VM_NO);
2785				}
2786				if (reg & IA32_ARCH_CAP_SSB_NO) {
2787					add_x86_feature(featureset,
2788					    X86FSET_SSB_NO);
2789				}
2790				if (reg & IA32_ARCH_CAP_MDS_NO) {
2791					add_x86_feature(featureset,
2792					    X86FSET_MDS_NO);
2793				}
2794			}
2795			no_trap();
2796		}
2797#endif	/* !__xpv */
2798
2799		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2800			add_x86_feature(featureset, X86FSET_SSBD);
2801
2802		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2803			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2804	}
2805
2806	if (cpu->cpu_id != 0) {
2807		if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2808			cpuid_enable_enhanced_ibrs();
2809		}
2810		return;
2811	}
2812
2813	/*
2814	 * Go through and initialize various security mechanisms that we should
2815	 * only do on a single CPU. This includes Spectre V2, L1TF, and MDS.
2816	 */
2817
2818	/*
2819	 * By default we've come in with retpolines enabled. Check whether we
2820	 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
2821	 * by default, but disabled if we are using enhanced IBRS.
2822	 */
2823	if (x86_disable_spectrev2 != 0) {
2824		v2mit = X86_SPECTREV2_DISABLED;
2825	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
2826		cpuid_enable_enhanced_ibrs();
2827		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
2828#ifndef __xpv
2829	} else if (cpuid_use_amd_retpoline(cpi)) {
2830		v2mit = X86_SPECTREV2_RETPOLINE_AMD;
2831#endif	/* !__xpv */
2832	} else {
2833		v2mit = X86_SPECTREV2_RETPOLINE;
2834	}
2835
2836	cpuid_patch_retpolines(v2mit);
2837	cpuid_patch_rsb(v2mit);
2838	x86_spectrev2_mitigation = v2mit;
2839	membar_producer();
2840
2841	/*
2842	 * We need to determine what changes are required for mitigating L1TF
2843	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
2844	 * is required.
2845	 *
2846	 * If any of these are present, then we need to flush u-arch state at
2847	 * various points. For MDS, we need to do so whenever we change to a
2848	 * lesser privilege level or we are halting the CPU. For L1TF we need to
2849	 * flush the L1D cache at VM entry. When we have microcode that handles
2850	 * MDS, the L1D flush also clears the other u-arch state that
2851	 * md_clear does.
2852	 */
2853
2854	/*
2855	 * Update whether or not we need to be taking explicit action against
2856	 * MDS.
2857	 */
2858	cpuid_update_md_clear(cpu, featureset);
2859
2860	/*
2861	 * Determine whether SMT exclusion is required and whether or not we
2862	 * need to perform an l1d flush.
2863	 */
2864	cpuid_update_l1d_flush(cpu, featureset);
2865}
2866
2867/*
2868 * Set up the XFeature_Enabled_Mask register. Required by the xsave feature.
2869 */
2870void
2871setup_xfem(void)
2872{
2873	uint64_t flags = XFEATURE_LEGACY_FP;
2874
2875	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2876
2877	if (is_x86_feature(x86_featureset, X86FSET_SSE))
2878		flags |= XFEATURE_SSE;
2879
2880	if (is_x86_feature(x86_featureset, X86FSET_AVX))
2881		flags |= XFEATURE_AVX;
2882
2883	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2884		flags |= XFEATURE_AVX512;
2885
2886	set_xcr(XFEATURE_ENABLED_MASK, flags);
2887
2888	xsave_bv_all = flags;
2889}
2890
2891static void
2892cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2893{
2894	struct cpuid_info *cpi;
2895
2896	cpi = cpu->cpu_m.mcpu_cpi;
2897
2898	if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2899		cpuid_gather_amd_topology_leaves(cpu);
2900	}
2901
2902	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2903
2904	/*
2905	 * Before we can calculate the IDs that we should assign to this
2906	 * processor, we need to understand how many cores and threads it has.
2907	 */
2908	switch (cpi->cpi_vendor) {
2909	case X86_VENDOR_Intel:
2910		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2911		    &cpi->cpi_ncore_per_chip);
2912		break;
2913	case X86_VENDOR_AMD:
2914		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2915		    &cpi->cpi_ncore_per_chip);
2916		break;
2917	default:
2918		/*
2919		 * it would behave. The most common case is virtualization
2920		 * they would behave. The most common case is virtualization
2921		 * today, though there are also 64-bit VIA chips. Assume that
2922		 * all we can get is the basic Leaf 1 HTT information.
2923		 */
2924		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2925			cpi->cpi_ncore_per_chip = 1;
2926			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2927		}
2928		break;
2929	}
2930
2931	/*
2932	 * Based on the calculated number of threads and cores, potentially
2933	 * assign the HTT and CMT features.
2934	 */
2935	if (cpi->cpi_ncore_per_chip > 1) {
2936		add_x86_feature(featureset, X86FSET_CMP);
2937	}
2938
2939	if (cpi->cpi_ncpu_per_chip > 1 &&
2940	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2941		add_x86_feature(featureset, X86FSET_HTT);
2942	}
2943
2944	 * Now that those are set up, we need to go through and calculate the
2945	 * rest of the topology parameters. If we think the CPU doesn't
2946	 * the rest of the parameters that exist. If we think the CPU doesn't
2947	 * have either SMT (HTT) or CMP, then we basically go through and fake
2948	 * up information in some way. The most likely case for this is
2949	 * virtualization where we have a lot of partial topology information.
2950	 */
2951	if (!is_x86_feature(featureset, X86FSET_HTT) &&
2952	    !is_x86_feature(featureset, X86FSET_CMP)) {
2953		/*
2954		 * This is a single core, single-threaded processor.
2955		 */
2956		cpi->cpi_procnodes_per_pkg = 1;
2957		cpi->cpi_cores_per_compunit = 1;
2958		cpi->cpi_compunitid = 0;
2959		cpi->cpi_chipid = -1;
2960		cpi->cpi_clogid = 0;
2961		cpi->cpi_coreid = cpu->cpu_id;
2962		cpi->cpi_pkgcoreid = 0;
2963		if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2964			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2965		} else {
2966			cpi->cpi_procnodeid = cpi->cpi_chipid;
2967		}
2968	} else {
2969		switch (cpi->cpi_vendor) {
2970		case X86_VENDOR_Intel:
2971			cpuid_intel_getids(cpu, featureset);
2972			break;
2973		case X86_VENDOR_AMD:
2974			cpuid_amd_getids(cpu, featureset);
2975			break;
2976		default:
2977			/*
2978			 * In this case, it's hard to say what we should do.
2979			 * We're going to model them to the OS as single core
2980			 * threads. We don't have a good identifier for them, so
2981			 * we're just going to use the cpu id and place them all
2982			 * on a single chip.
2983			 *
2984			 * This case has historically been different from the
2985			 * case above where we don't have HTT or CMP. While they
2986			 * could be combined, we've opted to keep it separate to
2987			 * minimize the risk of topology changes in weird cases.
2988			 */
2989			cpi->cpi_procnodes_per_pkg = 1;
2990			cpi->cpi_cores_per_compunit = 1;
2991			cpi->cpi_chipid = 0;
2992			cpi->cpi_coreid = cpu->cpu_id;
2993			cpi->cpi_clogid = cpu->cpu_id;
2994			cpi->cpi_pkgcoreid = cpu->cpu_id;
2995			cpi->cpi_procnodeid = cpi->cpi_chipid;
2996			cpi->cpi_compunitid = cpi->cpi_coreid;
2997			break;
2998		}
2999	}
3000}
3001
3002/*
3003 * Gather relevant CPU features from leaf 6 which covers thermal information. We
3004 * always gather leaf 6 if it's supported; however, we only look for features on
3005 * Intel systems as AMD does not currently define any of the features we look
3006 * for below.
3007 */
3008static void
3009cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3010{
3011	struct cpuid_regs *cp;
3012	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3013
3014	if (cpi->cpi_maxeax < 6) {
3015		return;
3016	}
3017
3018	cp = &cpi->cpi_std[6];
3019	cp->cp_eax = 6;
3020	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3021	(void) __cpuid_insn(cp);
3022	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3023
3024	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3025		return;
3026	}
3027
3028	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3029		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3030	}
3031
3032	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3033		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3034	}
3035}
3036
3037void
3038cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3039{
3040	uint32_t mask_ecx, mask_edx;
3041	struct cpuid_info *cpi;
3042	struct cpuid_regs *cp;
3043	int xcpuid;
3044#if !defined(__xpv)
3045	extern int idle_cpu_prefer_mwait;
3046#endif
3047
3048	/*
3049	 * Space statically allocated for BSP, ensure pointer is set
3050	 */
3051	if (cpu->cpu_id == 0) {
3052		if (cpu->cpu_m.mcpu_cpi == NULL)
3053			cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3054	}
3055
3056	add_x86_feature(featureset, X86FSET_CPUID);
3057
3058	cpi = cpu->cpu_m.mcpu_cpi;
3059	ASSERT(cpi != NULL);
3060	cp = &cpi->cpi_std[0];
3061	cp->cp_eax = 0;
3062	cpi->cpi_maxeax = __cpuid_insn(cp);
3063	{
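		/*
		 * cpuid leaf 0 returns the vendor string in %ebx, %edx, %ecx
		 * order; assemble it into cpi_vendorstr.
		 */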
3064		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3065		*iptr++ = cp->cp_ebx;
3066		*iptr++ = cp->cp_edx;
3067		*iptr++ = cp->cp_ecx;
3068		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3069	}
3070
3071	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3072	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3073
3074	/*
3075	 * Limit the range in case of weird hardware
3076	 */
3077	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3078		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3079	if (cpi->cpi_maxeax < 1)
3080		goto pass1_done;
3081
3082	cp = &cpi->cpi_std[1];
3083	cp->cp_eax = 1;
3084	(void) __cpuid_insn(cp);
3085
3086	/*
3087	 * Extract identifying constants for easy access.
3088	 */
3089	cpi->cpi_model = CPI_MODEL(cpi);
3090	cpi->cpi_family = CPI_FAMILY(cpi);
3091
3092	if (cpi->cpi_family == 0xf)
3093		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3094
3095	/*
3096	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3097	 * Intel, and presumably everyone else, uses model == 0xf, as
3098	 * one would expect (max value means possible overflow).  Sigh.
3099	 */
3100
3101	switch (cpi->cpi_vendor) {
3102	case X86_VENDOR_Intel:
3103		if (IS_EXTENDED_MODEL_INTEL(cpi))
3104			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3105		break;
3106	case X86_VENDOR_AMD:
3107		if (CPI_FAMILY(cpi) == 0xf)
3108			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3109		break;
3110	default:
3111		if (cpi->cpi_model == 0xf)
3112			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3113		break;
3114	}
3115
3116	cpi->cpi_step = CPI_STEP(cpi);
3117	cpi->cpi_brandid = CPI_BRANDID(cpi);
3118
3119	/*
3120	 * *default* assumptions:
3121	 * - believe %edx feature word
3122	 * - ignore %ecx feature word
3123	 * - 32-bit virtual and physical addressing
3124	 */
3125	mask_edx = 0xffffffff;
3126	mask_ecx = 0;
3127
3128	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3129
3130	switch (cpi->cpi_vendor) {
3131	case X86_VENDOR_Intel:
3132		if (cpi->cpi_family == 5)
3133			x86_type = X86_TYPE_P5;
3134		else if (IS_LEGACY_P6(cpi)) {
3135			x86_type = X86_TYPE_P6;
3136			pentiumpro_bug4046376 = 1;
3137			/*
3138			 * Clear the SEP bit when it was set erroneously
3139			 */
3140			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3141				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3142		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3143			x86_type = X86_TYPE_P4;
3144			/*
3145			 * We don't currently depend on any of the %ecx
3146			 * features until Prescott, so we'll only check
3147			 * this from P4 onwards.  We might want to revisit
3148			 * that idea later.
3149			 */
3150			mask_ecx = 0xffffffff;
3151		} else if (cpi->cpi_family > 0xf)
3152			mask_ecx = 0xffffffff;
3153		/*
3154		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3155		 * to obtain the monitor linesize.
3156		 */
3157		if (cpi->cpi_maxeax < 5)
3158			mask_ecx &= ~CPUID_INTC_ECX_MON;
3159		break;
3160	case X86_VENDOR_IntelClone:
3161	default:
3162		break;
3163	case X86_VENDOR_AMD:
3164#if defined(OPTERON_ERRATUM_108)
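		/*
		 * Work around Opteron erratum 108 by treating model 0xe as
		 * model 0xc, both in the raw leaf 1 data and in cpi_model.
		 */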
3165		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3166			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3167			cpi->cpi_model = 0xc;
3168		} else
3169#endif
3170		if (cpi->cpi_family == 5) {
3171			/*
3172			 * AMD K5 and K6
3173			 *
3174			 * These CPUs have an incomplete implementation
3175			 * of MCA/MCE which we mask away.
3176			 */
3177			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3178
3179			/*
3180			 * Model 0 uses the wrong (APIC) bit
3181			 * to indicate PGE.  Fix it here.
3182			 */
3183			if (cpi->cpi_model == 0) {
3184				if (cp->cp_edx & 0x200) {
3185					cp->cp_edx &= ~0x200;
3186					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3187				}
3188			}
3189
3190			/*
3191			 * Early models had problems w/ MMX; disable.
3192			 */
3193			if (cpi->cpi_model < 6)
3194				mask_edx &= ~CPUID_INTC_EDX_MMX;
3195		}
3196
3197		/*
3198		 * For newer families, SSE3 and CX16, at least, are valid;
3199		 * enable all
3200		 */
3201		if (cpi->cpi_family >= 0xf)
3202			mask_ecx = 0xffffffff;
3203		/*
3204		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3205		 * to obtain the monitor linesize.
3206		 */
3207		if (cpi->cpi_maxeax < 5)
3208			mask_ecx &= ~CPUID_INTC_ECX_MON;
3209
3210#if !defined(__xpv)
3211		/*
3212		 * AMD has not historically used MWAIT in the CPU's idle loop.
3213		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
3214		 * know for certain that in at least family 17h, per AMD, mwait
3215		 * is preferred. Families in-between are less certain.
3216		 */
3217		if (cpi->cpi_family < 0x17) {
3218			idle_cpu_prefer_mwait = 0;
3219		}
3220#endif
3221
3222		break;
3223	case X86_VENDOR_TM:
3224		/*
3225		 * work around the NT workaround in CMS 4.1
3226		 */
3227		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3228		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3229			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3230		break;
3231	case X86_VENDOR_Centaur:
3232		/*
3233		 * work around the NT workarounds again
3234		 */
3235		if (cpi->cpi_family == 6)
3236			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3237		break;
3238	case X86_VENDOR_Cyrix:
3239		/*
3240		 * We rely heavily on the probing in locore
3241		 * to actually figure out what parts, if any,
3242		 * of the Cyrix cpuid instruction to believe.
3243		 */
3244		switch (x86_type) {
3245		case X86_TYPE_CYRIX_486:
3246			mask_edx = 0;
3247			break;
3248		case X86_TYPE_CYRIX_6x86:
3249			mask_edx = 0;
3250			break;
3251		case X86_TYPE_CYRIX_6x86L:
3252			mask_edx =
3253			    CPUID_INTC_EDX_DE |
3254			    CPUID_INTC_EDX_CX8;
3255			break;
3256		case X86_TYPE_CYRIX_6x86MX:
3257			mask_edx =
3258			    CPUID_INTC_EDX_DE |
3259			    CPUID_INTC_EDX_MSR |
3260			    CPUID_INTC_EDX_CX8 |
3261			    CPUID_INTC_EDX_PGE |
3262			    CPUID_INTC_EDX_CMOV |
3263			    CPUID_INTC_EDX_MMX;
3264			break;
3265		case X86_TYPE_CYRIX_GXm:
3266			mask_edx =
3267			    CPUID_INTC_EDX_MSR |
3268			    CPUID_INTC_EDX_CX8 |
3269			    CPUID_INTC_EDX_CMOV |
3270			    CPUID_INTC_EDX_MMX;
3271			break;
3272		case X86_TYPE_CYRIX_MediaGX:
3273			break;
3274		case X86_TYPE_CYRIX_MII:
3275		case X86_TYPE_VIA_CYRIX_III:
3276			mask_edx =
3277			    CPUID_INTC_EDX_DE |
3278			    CPUID_INTC_EDX_TSC |
3279			    CPUID_INTC_EDX_MSR |
3280			    CPUID_INTC_EDX_CX8 |
3281			    CPUID_INTC_EDX_PGE |
3282			    CPUID_INTC_EDX_CMOV |
3283			    CPUID_INTC_EDX_MMX;
3284			break;
3285		default:
3286			break;
3287		}
3288		break;
3289	}
3290
3291#if defined(__xpv)
3292	/*
3293	 * Do not support MONITOR/MWAIT under a hypervisor
3294	 */
3295	mask_ecx &= ~CPUID_INTC_ECX_MON;
3296	/*
3297	 * Do not support XSAVE under a hypervisor for now
3298	 */
3299	xsave_force_disable = B_TRUE;
3300
3301#endif	/* __xpv */
3302
3303	if (xsave_force_disable) {
3304		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3305		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3306		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3307		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3308	}
3309
3310	/*
3311	 * Now that we've figured out the masks that determine
3312	 * which bits we choose to believe, apply the masks
3313	 * to the feature words, then map the kernel's view
3314	 * of these feature words into its feature word.
3315	 */
3316	cp->cp_edx &= mask_edx;
3317	cp->cp_ecx &= mask_ecx;
3318
3319	/*
3320	 * apply any platform restrictions (we don't call this
3321	 * immediately after __cpuid_insn here, because we need the
3322	 * workarounds applied above first)
3323	 */
3324	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3325
3326	/*
3327	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
3328	 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3329	 */
3330	if (cpi->cpi_maxeax >= 7) {
3331		struct cpuid_regs *ecp;
3332		ecp = &cpi->cpi_std[7];
3333		ecp->cp_eax = 7;
3334		ecp->cp_ecx = 0;
3335		(void) __cpuid_insn(ecp);
3336
3337		/*
3338		 * If XSAVE has been disabled, just ignore all of the
3339		 * extended-save-area dependent flags here.
3340		 */
3341		if (xsave_force_disable) {
3342			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3343			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3344			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3345			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3346			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3347			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3348			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3349		}
3350
3351		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3352			add_x86_feature(featureset, X86FSET_SMEP);
3353
3354		/*
3355		 * We check disable_smap here in addition to in startup_smap()
3356		 * to ensure CPUs that aren't the boot CPU don't accidentally
3357		 * include it in the feature set and thus generate a mismatched
3358		 * x86 feature set across CPUs.
3359		 */
3360		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3361		    disable_smap == 0)
3362			add_x86_feature(featureset, X86FSET_SMAP);
3363
3364		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3365			add_x86_feature(featureset, X86FSET_RDSEED);
3366
3367		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3368			add_x86_feature(featureset, X86FSET_ADX);
3369
3370		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3371			add_x86_feature(featureset, X86FSET_FSGSBASE);
3372
3373		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3374			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3375
3376		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3377			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3378				add_x86_feature(featureset, X86FSET_INVPCID);
3379
3380			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3381				add_x86_feature(featureset, X86FSET_MPX);
3382
3383			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3384				add_x86_feature(featureset, X86FSET_CLWB);
3385		}
3386	}
3387
3388	/*
3389	 * fold in overrides from the "eeprom" mechanism
3390	 */
3391	cp->cp_edx |= cpuid_feature_edx_include;
3392	cp->cp_edx &= ~cpuid_feature_edx_exclude;
3393
3394	cp->cp_ecx |= cpuid_feature_ecx_include;
3395	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3396
3397	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3398		add_x86_feature(featureset, X86FSET_LARGEPAGE);
3399	}
3400	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3401		add_x86_feature(featureset, X86FSET_TSC);
3402	}
3403	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3404		add_x86_feature(featureset, X86FSET_MSR);
3405	}
3406	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3407		add_x86_feature(featureset, X86FSET_MTRR);
3408	}
3409	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3410		add_x86_feature(featureset, X86FSET_PGE);
3411	}
3412	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3413		add_x86_feature(featureset, X86FSET_CMOV);
3414	}
3415	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3416		add_x86_feature(featureset, X86FSET_MMX);
3417	}
3418	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3419	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3420		add_x86_feature(featureset, X86FSET_MCA);
3421	}
3422	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3423		add_x86_feature(featureset, X86FSET_PAE);
3424	}
3425	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3426		add_x86_feature(featureset, X86FSET_CX8);
3427	}
3428	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3429		add_x86_feature(featureset, X86FSET_CX16);
3430	}
3431	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3432		add_x86_feature(featureset, X86FSET_PAT);
3433	}
3434	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3435		add_x86_feature(featureset, X86FSET_SEP);
3436	}
3437	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3438		/*
3439		 * In our implementation, fxsave/fxrstor
3440		 * are prerequisites before we'll even
3441		 * try to do SSE things.
3442		 */
3443		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3444			add_x86_feature(featureset, X86FSET_SSE);
3445		}
3446		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3447			add_x86_feature(featureset, X86FSET_SSE2);
3448		}
3449		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3450			add_x86_feature(featureset, X86FSET_SSE3);
3451		}
3452		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3453			add_x86_feature(featureset, X86FSET_SSSE3);
3454		}
3455		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3456			add_x86_feature(featureset, X86FSET_SSE4_1);
3457		}
3458		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3459			add_x86_feature(featureset, X86FSET_SSE4_2);
3460		}
3461		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3462			add_x86_feature(featureset, X86FSET_AES);
3463		}
3464		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3465			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3466		}
3467
3468		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3469			add_x86_feature(featureset, X86FSET_SHA);
3470
3471		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3472			add_x86_feature(featureset, X86FSET_UMIP);
3473		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3474			add_x86_feature(featureset, X86FSET_PKU);
3475		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3476			add_x86_feature(featureset, X86FSET_OSPKE);
3477
3478		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3479			add_x86_feature(featureset, X86FSET_XSAVE);
3480
3481			/* We only test AVX & AVX512 when there is XSAVE */
3482
3483			if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3484				add_x86_feature(featureset,
3485				    X86FSET_AVX);
3486
3487				/*
3488				 * Intel says we can't check these without also
3489				 * checking AVX.
3490				 */
3491				if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3492					add_x86_feature(featureset,
3493					    X86FSET_F16C);
3494
3495				if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3496					add_x86_feature(featureset,
3497					    X86FSET_FMA);
3498
3499				if (cpi->cpi_std[7].cp_ebx &
3500				    CPUID_INTC_EBX_7_0_BMI1)
3501					add_x86_feature(featureset,
3502					    X86FSET_BMI1);
3503
3504				if (cpi->cpi_std[7].cp_ebx &
3505				    CPUID_INTC_EBX_7_0_BMI2)
3506					add_x86_feature(featureset,
3507					    X86FSET_BMI2);
3508
3509				if (cpi->cpi_std[7].cp_ebx &
3510				    CPUID_INTC_EBX_7_0_AVX2)
3511					add_x86_feature(featureset,
3512					    X86FSET_AVX2);
3513			}
3514
3515			if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3516			    (cpi->cpi_std[7].cp_ebx &
3517			    CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3518				add_x86_feature(featureset, X86FSET_AVX512F);
3519
3520				if (cpi->cpi_std[7].cp_ebx &
3521				    CPUID_INTC_EBX_7_0_AVX512DQ)
3522					add_x86_feature(featureset,
3523					    X86FSET_AVX512DQ);
3524				if (cpi->cpi_std[7].cp_ebx &
3525				    CPUID_INTC_EBX_7_0_AVX512IFMA)
3526					add_x86_feature(featureset,
3527					    X86FSET_AVX512FMA);
3528				if (cpi->cpi_std[7].cp_ebx &
3529				    CPUID_INTC_EBX_7_0_AVX512PF)
3530					add_x86_feature(featureset,
3531					    X86FSET_AVX512PF);
3532				if (cpi->cpi_std[7].cp_ebx &
3533				    CPUID_INTC_EBX_7_0_AVX512ER)
3534					add_x86_feature(featureset,
3535					    X86FSET_AVX512ER);
3536				if (cpi->cpi_std[7].cp_ebx &
3537				    CPUID_INTC_EBX_7_0_AVX512CD)
3538					add_x86_feature(featureset,
3539					    X86FSET_AVX512CD);
3540				if (cpi->cpi_std[7].cp_ebx &
3541				    CPUID_INTC_EBX_7_0_AVX512BW)
3542					add_x86_feature(featureset,
3543					    X86FSET_AVX512BW);
3544				if (cpi->cpi_std[7].cp_ebx &
3545				    CPUID_INTC_EBX_7_0_AVX512VL)
3546					add_x86_feature(featureset,
3547					    X86FSET_AVX512VL);
3548
3549				if (cpi->cpi_std[7].cp_ecx &
3550				    CPUID_INTC_ECX_7_0_AVX512VBMI)
3551					add_x86_feature(featureset,
3552					    X86FSET_AVX512VBMI);
3553				if (cpi->cpi_std[7].cp_ecx &
3554				    CPUID_INTC_ECX_7_0_AVX512VNNI)
3555					add_x86_feature(featureset,
3556					    X86FSET_AVX512VNNI);
3557				if (cpi->cpi_std[7].cp_ecx &
3558				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3559					add_x86_feature(featureset,
3560					    X86FSET_AVX512VPOPCDQ);
3561
3562				if (cpi->cpi_std[7].cp_edx &
3563				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
3564					add_x86_feature(featureset,
3565					    X86FSET_AVX512NNIW);
3566				if (cpi->cpi_std[7].cp_edx &
3567				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3568					add_x86_feature(featureset,
3569					    X86FSET_AVX512FMAPS);
3570			}
3571		}
3572	}
3573
3574	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3575		if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3576			add_x86_feature(featureset, X86FSET_PCID);
3577		}
3578	}
3579
3580	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3581		add_x86_feature(featureset, X86FSET_X2APIC);
3582	}
3583	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3584		add_x86_feature(featureset, X86FSET_DE);
3585	}
3586#if !defined(__xpv)
3587	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3588
3589		/*
3590		 * We require the CLFLUSH instruction for the erratum workaround
3591		 * needed to use MONITOR/MWAIT.
3592		 */
3593		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3594			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3595			add_x86_feature(featureset, X86FSET_MWAIT);
3596		} else {
3597			extern int idle_cpu_assert_cflush_monitor;
3598
3599			/*
3600			 * All processors we are aware of which have
3601			 * MONITOR/MWAIT also have CLFLUSH.
3602			 */
3603			if (idle_cpu_assert_cflush_monitor) {
3604				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3605				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3606			}
3607		}
3608	}
3609#endif	/* __xpv */
3610
3611	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3612		add_x86_feature(featureset, X86FSET_VMX);
3613	}
3614
3615	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3616		add_x86_feature(featureset, X86FSET_RDRAND);
3617
3618	/*
3619	 * We only need this the first time; the rest of the CPUs will follow
3620	 * suit. We only capture this for the boot CPU.
3621	 */
3622	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3623		add_x86_feature(featureset, X86FSET_CLFSH);
3624		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3625	}
3626	if (is_x86_feature(featureset, X86FSET_PAE))
3627		cpi->cpi_pabits = 36;
3628
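	/*
	 * Leaf 0xD, sub-leaf 1 enumerates the XSAVEOPT, XSAVEC, and XSAVES
	 * extensions to the xsave state management instructions.
	 */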
3629	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3630		struct cpuid_regs r, *ecp;
3631
3632		ecp = &r;
3633		ecp->cp_eax = 0xD;
3634		ecp->cp_ecx = 1;
3635		ecp->cp_edx = ecp->cp_ebx = 0;
3636		(void) __cpuid_insn(ecp);
3637
3638		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3639			add_x86_feature(featureset, X86FSET_XSAVEOPT);
3640		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3641			add_x86_feature(featureset, X86FSET_XSAVEC);
3642		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3643			add_x86_feature(featureset, X86FSET_XSAVES);
3644	}
3645
3646	/*
3647	 * Work on the "extended" feature information, doing
3648	 * some basic initialization for cpuid_pass2()
3649	 */
3650	xcpuid = 0;
3651	switch (cpi->cpi_vendor) {
3652	case X86_VENDOR_Intel:
3653		/*
3654		 * On KVM we know we will have proper support for extended
3655		 * cpuid.
3656		 */
3657		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3658		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3659		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3660			xcpuid++;
3661		break;
3662	case X86_VENDOR_AMD:
3663		if (cpi->cpi_family > 5 ||
3664		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3665			xcpuid++;
3666		break;
3667	case X86_VENDOR_Cyrix:
3668		/*
3669		 * Only these Cyrix CPUs are -known- to support
3670		 * extended cpuid operations.
3671		 */
3672		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3673		    x86_type == X86_TYPE_CYRIX_GXm)
3674			xcpuid++;
3675		break;
3676	case X86_VENDOR_Centaur:
3677	case X86_VENDOR_TM:
3678	default:
3679		xcpuid++;
3680		break;
3681	}
3682
3683	if (xcpuid) {
3684		cp = &cpi->cpi_extd[0];
3685		cp->cp_eax = CPUID_LEAF_EXT_0;
3686		cpi->cpi_xmaxeax = __cpuid_insn(cp);
3687	}
3688
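	/*
	 * A valid extended maximum leaf has the 0x80000000 bit set; anything
	 * else means the extended range is not actually implemented.
	 */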
3689	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3690
3691		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3692			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3693
3694		switch (cpi->cpi_vendor) {
3695		case X86_VENDOR_Intel:
3696		case X86_VENDOR_AMD:
3697			if (cpi->cpi_xmaxeax < 0x80000001)
3698				break;
3699			cp = &cpi->cpi_extd[1];
3700			cp->cp_eax = 0x80000001;
3701			(void) __cpuid_insn(cp);
3702
3703			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3704			    cpi->cpi_family == 5 &&
3705			    cpi->cpi_model == 6 &&
3706			    cpi->cpi_step == 6) {
3707				/*
3708				 * K6 model 6 uses bit 10 to indicate SYSC.
3709				 * Later models use bit 11. Fix it here.
3710				 */
3711				if (cp->cp_edx & 0x400) {
3712					cp->cp_edx &= ~0x400;
3713					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3714				}
3715			}
3716
3717			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3718
3719			/*
3720			 * Compute the additions to the kernel's feature word.
3721			 */
3722			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3723				add_x86_feature(featureset, X86FSET_NX);
3724			}
3725
3726			/*
3727			 * Regardless of whether or not we boot 64-bit,
3728			 * we should have a way to identify whether
3729			 * the CPU is capable of running 64-bit.
3730			 */
3731			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3732				add_x86_feature(featureset, X86FSET_64);
3733			}
3734
3735			/* 1 GB large page - enable only for 64 bit kernel */
3736			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3737				add_x86_feature(featureset, X86FSET_1GPG);
3738			}
3739
3740			if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3741			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3742			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3743				add_x86_feature(featureset, X86FSET_SSE4A);
3744			}
3745
3746			/*
3747			 * It's really tricky to support syscall/sysret in
3748			 * the i386 kernel; we rely on sysenter/sysexit
3749			 * instead.  In the amd64 kernel, things are -way-
3750			 * better.
3751			 */
3752			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3753				add_x86_feature(featureset, X86FSET_ASYSC);
3754			}
3755
3756			/*
3757			 * While we're thinking about system calls, note
3758			 * that AMD processors don't support sysenter
3759			 * in long mode at all, so don't try to program them.
3760			 */
3761			if (x86_vendor == X86_VENDOR_AMD) {
3762				remove_x86_feature(featureset, X86FSET_SEP);
3763			}
3764
3765			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3766				add_x86_feature(featureset, X86FSET_TSCP);
3767			}
3768
3769			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3770				add_x86_feature(featureset, X86FSET_SVM);
3771			}
3772
3773			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3774				add_x86_feature(featureset, X86FSET_TOPOEXT);
3775			}
3776
3777			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3778				add_x86_feature(featureset, X86FSET_AMD_PCEC);
3779			}
3780
3781			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3782				add_x86_feature(featureset, X86FSET_XOP);
3783			}
3784
3785			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3786				add_x86_feature(featureset, X86FSET_FMA4);
3787			}
3788
3789			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3790				add_x86_feature(featureset, X86FSET_TBM);
3791			}
3792
3793			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3794				add_x86_feature(featureset, X86FSET_MONITORX);
3795			}
3796			break;
3797		default:
3798			break;
3799		}
3800
3801		/*
3802		 * Get CPUID data about processor cores and hyperthreads.
3803		 */
3804		switch (cpi->cpi_vendor) {
3805		case X86_VENDOR_Intel:
3806			if (cpi->cpi_maxeax >= 4) {
3807				cp = &cpi->cpi_std[4];
3808				cp->cp_eax = 4;
3809				cp->cp_ecx = 0;
3810				(void) __cpuid_insn(cp);
3811				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3812			}
3813			/*FALLTHROUGH*/
3814		case X86_VENDOR_AMD:
3815			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3816				break;
3817			cp = &cpi->cpi_extd[8];
3818			cp->cp_eax = CPUID_LEAF_EXT_8;
3819			(void) __cpuid_insn(cp);
3820			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3821			    cp);
3822
3823			/*
3824			 * AMD uses ebx for some extended functions.
3825			 */
3826			if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3827				/*
3828				 * While we're here, check for the AMD "Error
3829				 * Pointer Zero/Restore" feature. This can be
3830				 * used to set up the FP save handlers
3831				 * appropriately.
3832				 */
3833				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3834					cpi->cpi_fp_amd_save = 0;
3835				} else {
3836					cpi->cpi_fp_amd_save = 1;
3837				}
3838
3839				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3840					add_x86_feature(featureset,
3841					    X86FSET_CLZERO);
3842				}
3843			}
3844
3845			/*
3846			 * Virtual and physical address limits from
3847			 * cpuid override previously guessed values.
3848			 */
3849			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3850			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3851			break;
3852		default:
3853			break;
3854		}
3855
3856		/*
3857		 * Get CPUID data about TSC Invariance in Deep C-State.
3858		 */
3859		switch (cpi->cpi_vendor) {
3860		case X86_VENDOR_Intel:
3861		case X86_VENDOR_AMD:
3862			if (cpi->cpi_maxeax >= 7) {
3863				cp = &cpi->cpi_extd[7];
3864				cp->cp_eax = 0x80000007;
3865				cp->cp_ecx = 0;
3866				(void) __cpuid_insn(cp);
3867			}
3868			break;
3869		default:
3870			break;
3871		}
3872	}
3873
3874	cpuid_pass1_topology(cpu, featureset);
3875	cpuid_pass1_thermal(cpu, featureset);
3876
3877	/*
3878	 * Synthesize chip "revision" and socket type
3879	 */
3880	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3881	    cpi->cpi_model, cpi->cpi_step);
3882	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3883	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3884	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3885	    cpi->cpi_model, cpi->cpi_step);
3886
3887	if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3888		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3889		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3890			/* Special handling for AMD FP not necessary. */
3891			cpi->cpi_fp_amd_save = 0;
3892		} else {
3893			cpi->cpi_fp_amd_save = 1;
3894		}
3895	}
3896
3897	/*
3898	 * Check the processor leaves that are used for security features.
3899	 */
3900	cpuid_scan_security(cpu, featureset);
3901
3902pass1_done:
3903	cpi->cpi_pass = 1;
3904}
3905
3906/*
3907 * Make copies of the cpuid table entries we depend on, in
3908 * part for ease of parsing now, in part so that we have only
3909 * one place to correct any of it, in part for ease of
3910 * later export to userland, and in part so we can look at
3911 * this stuff in a crash dump.
3912 */
3913
3914/*ARGSUSED*/
3915void
3916cpuid_pass2(cpu_t *cpu)
3917{
3918	uint_t n, nmax;
3919	int i;
3920	struct cpuid_regs *cp;
3921	uint8_t *dp;
3922	uint32_t *iptr;
3923	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3924
3925	ASSERT(cpi->cpi_pass == 1);
3926
3927	if (cpi->cpi_maxeax < 1)
3928		goto pass2_done;
3929
3930	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3931		nmax = NMAX_CPI_STD;
3932	/*
3933	 * (We already handled n == 0 and n == 1 in pass 1)
3934	 */
3935	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3936		/*
3937		 * leaves 6 and 7 were handled in pass 1
3938		 */
3939		if (n == 6 || n == 7)
3940			continue;
3941
3942		cp->cp_eax = n;
3943
3944		/*
3945		 * CPUID function 4 expects %ecx to be initialized
3946		 * with an index which indicates which cache to return
3947		 * information about. The OS is expected to call function 4
3948		 * with %ecx set to 0, 1, 2, ... until it returns with
3949		 * EAX[4:0] set to 0, which indicates there are no more
3950		 * caches.
3951		 *
3952		 * Here, populate cpi_std[4] with the information returned by
3953		 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3954		 * when dynamic memory allocation becomes available.
3955		 *
3956		 * Note: we need to explicitly initialize %ecx here, since
3957		 * function 4 may have been previously invoked.
3958		 */
3959		if (n == 4)
3960			cp->cp_ecx = 0;
3961
3962		(void) __cpuid_insn(cp);
3963		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3964		switch (n) {
3965		case 2:
3966			/*
3967			 * "the lower 8 bits of the %eax register
3968			 * contain a value that identifies the number
3969			 * of times the cpuid [instruction] has to be
3970			 * executed to obtain a complete image of the
3971			 * processor's caching systems."
3972			 *
3973			 * How *do* they make this stuff up?
3974			 */
3975			cpi->cpi_ncache = sizeof (*cp) *
3976			    BITX(cp->cp_eax, 7, 0);
3977			if (cpi->cpi_ncache == 0)
3978				break;
3979			cpi->cpi_ncache--;	/* skip count byte */
3980
3981			/*
3982			 * Well, for now, rather than attempt to implement
3983			 * this slightly dubious algorithm, we just look
3984			 * at the first 15 ..
3985			 */
3986			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3987				cpi->cpi_ncache = sizeof (*cp) - 1;
3988
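			/*
			 * Each register whose high bit is clear holds valid
			 * one-byte cache descriptors; copy out the non-zero
			 * ones, skipping the count byte in %al.
			 */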
3989			dp = cpi->cpi_cacheinfo;
3990			if (BITX(cp->cp_eax, 31, 31) == 0) {
3991				uint8_t *p = (void *)&cp->cp_eax;
3992				for (i = 1; i < 4; i++)
3993					if (p[i] != 0)
3994						*dp++ = p[i];
3995			}
3996			if (BITX(cp->cp_ebx, 31, 31) == 0) {
3997				uint8_t *p = (void *)&cp->cp_ebx;
3998				for (i = 0; i < 4; i++)
3999					if (p[i] != 0)
4000						*dp++ = p[i];
4001			}
4002			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4003				uint8_t *p = (void *)&cp->cp_ecx;
4004				for (i = 0; i < 4; i++)
4005					if (p[i] != 0)
4006						*dp++ = p[i];
4007			}
4008			if (BITX(cp->cp_edx, 31, 31) == 0) {
4009				uint8_t *p = (void *)&cp->cp_edx;
4010				for (i = 0; i < 4; i++)
4011					if (p[i] != 0)
4012						*dp++ = p[i];
4013			}
4014			break;
4015
4016		case 3:	/* Processor serial number, if PSN supported */
4017			break;
4018
4019		case 4:	/* Deterministic cache parameters */
4020			break;
4021
4022		case 5:	/* Monitor/Mwait parameters */
4023		{
4024			size_t mwait_size;
4025
4026			/*
4027			 * check cpi_mwait.support which was set in cpuid_pass1
4028			 */
4029			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4030				break;
4031
4032			/*
4033			 * Protect ourselves from an insane mwait line size.
4034			 * Workaround for incomplete hardware emulator(s).
4035			 */
4036			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4037			if (mwait_size < sizeof (uint32_t) ||
4038			    !ISP2(mwait_size)) {
4039#if DEBUG
4040				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4041				    "size %ld", cpu->cpu_id, (long)mwait_size);
4042#endif
4043				break;
4044			}
4045
4046			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4047			cpi->cpi_mwait.mon_max = mwait_size;
4048			if (MWAIT_EXTENSION(cpi)) {
4049				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4050				if (MWAIT_INT_ENABLE(cpi))
4051					cpi->cpi_mwait.support |=
4052					    MWAIT_ECX_INT_ENABLE;
4053			}
4054			break;
4055		}
4056		default:
4057			break;
4058		}
4059	}
4060
4061	/*
4062	 * XSAVE enumeration
4063	 */
4064	if (cpi->cpi_maxeax >= 0xD) {
4065		struct cpuid_regs regs;
4066		boolean_t cpuid_d_valid = B_TRUE;
4067
4068		cp = &regs;
4069		cp->cp_eax = 0xD;
4070		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4071
4072		(void) __cpuid_insn(cp);
4073
4074		/*
4075		 * Sanity checks for debug
4076		 */
4077		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4078		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4079			cpuid_d_valid = B_FALSE;
4080		}
4081
4082		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4083		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4084		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4085
4086		/*
4087		 * If the hw supports AVX, get the size and offset in the save
4088		 * area for the ymm state.
4089		 */
4090		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4091			cp->cp_eax = 0xD;
4092			cp->cp_ecx = 2;
4093			cp->cp_edx = cp->cp_ebx = 0;
4094
4095			(void) __cpuid_insn(cp);
4096
4097			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4098			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4099				cpuid_d_valid = B_FALSE;
4100			}
4101
4102			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4103			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4104		}
4105
4106		/*
4107		 * If the hw supports MPX, get the size and offset in the
4108		 * save area for BNDREGS and BNDCSR.
4109		 */
4110		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4111			cp->cp_eax = 0xD;
4112			cp->cp_ecx = 3;
4113			cp->cp_edx = cp->cp_ebx = 0;
4114
4115			(void) __cpuid_insn(cp);
4116
4117			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4118			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4119
4120			cp->cp_eax = 0xD;
4121			cp->cp_ecx = 4;
4122			cp->cp_edx = cp->cp_ebx = 0;
4123
4124			(void) __cpuid_insn(cp);
4125
4126			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4127			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4128		}
4129
4130		/*
4131		 * If the hw supports AVX512, get the size and offset in the
4132		 * save area for the opmask registers and zmm state.
4133		 */
4134		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4135			cp->cp_eax = 0xD;
4136			cp->cp_ecx = 5;
4137			cp->cp_edx = cp->cp_ebx = 0;
4138
4139			(void) __cpuid_insn(cp);
4140
4141			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4142			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4143
4144			cp->cp_eax = 0xD;
4145			cp->cp_ecx = 6;
4146			cp->cp_edx = cp->cp_ebx = 0;
4147
4148			(void) __cpuid_insn(cp);
4149
4150			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4151			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4152
4153			cp->cp_eax = 0xD;
4154			cp->cp_ecx = 7;
4155			cp->cp_edx = cp->cp_ebx = 0;
4156
4157			(void) __cpuid_insn(cp);
4158
4159			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4160			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4161		}
4162
4163		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4164			xsave_state_size = 0;
4165		} else if (cpuid_d_valid) {
4166			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4167		} else {
4168			/* Broken CPUID 0xD, probably in HVM */
4169			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4170			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4171			    ", ymm_size = %d, ymm_offset = %d\n",
4172			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4173			    cpi->cpi_xsave.xsav_hw_features_high,
4174			    (int)cpi->cpi_xsave.xsav_max_size,
4175			    (int)cpi->cpi_xsave.ymm_size,
4176			    (int)cpi->cpi_xsave.ymm_offset);
4177
4178			if (xsave_state_size != 0) {
4179				/*
4180				 * This must be a non-boot CPU. We cannot
4181				 * continue, because the boot CPU has already
4182				 * enabled XSAVE.
4183				 */
4184				ASSERT(cpu->cpu_id != 0);
4185				cmn_err(CE_PANIC, "cpu%d: we have already "
4186				    "enabled XSAVE on boot cpu, cannot "
4187				    "continue.", cpu->cpu_id);
4188			} else {
4189				/*
4190				 * If we reached here on the boot CPU, it's also
4191				 * almost certain that we'll reach here on the
4192				 * non-boot CPUs. When we're here on the boot CPU
4193				 * we should disable the feature; on a non-boot
4194				 * CPU we need to confirm that we already have.
4195				 */
4196				if (cpu->cpu_id == 0) {
4197					remove_x86_feature(x86_featureset,
4198					    X86FSET_XSAVE);
4199					remove_x86_feature(x86_featureset,
4200					    X86FSET_AVX);
4201					remove_x86_feature(x86_featureset,
4202					    X86FSET_F16C);
4203					remove_x86_feature(x86_featureset,
4204					    X86FSET_BMI1);
4205					remove_x86_feature(x86_featureset,
4206					    X86FSET_BMI2);
4207					remove_x86_feature(x86_featureset,
4208					    X86FSET_FMA);
4209					remove_x86_feature(x86_featureset,
4210					    X86FSET_AVX2);
4211					remove_x86_feature(x86_featureset,
4212					    X86FSET_MPX);
4213					remove_x86_feature(x86_featureset,
4214					    X86FSET_AVX512F);
4215					remove_x86_feature(x86_featureset,
4216					    X86FSET_AVX512DQ);
4217					remove_x86_feature(x86_featureset,
4218					    X86FSET_AVX512PF);
4219					remove_x86_feature(x86_featureset,
4220					    X86FSET_AVX512ER);
4221					remove_x86_feature(x86_featureset,
4222					    X86FSET_AVX512CD);
4223					remove_x86_feature(x86_featureset,
4224					    X86FSET_AVX512BW);
4225					remove_x86_feature(x86_featureset,
4226					    X86FSET_AVX512VL);
4227					remove_x86_feature(x86_featureset,
4228					    X86FSET_AVX512FMA);
4229					remove_x86_feature(x86_featureset,
4230					    X86FSET_AVX512VBMI);
4231					remove_x86_feature(x86_featureset,
4232					    X86FSET_AVX512VNNI);
4233					remove_x86_feature(x86_featureset,
4234					    X86FSET_AVX512VPOPCDQ);
4235					remove_x86_feature(x86_featureset,
4236					    X86FSET_AVX512NNIW);
4237					remove_x86_feature(x86_featureset,
4238					    X86FSET_AVX512FMAPS);
4239
4240					CPI_FEATURES_ECX(cpi) &=
4241					    ~CPUID_INTC_ECX_XSAVE;
4242					CPI_FEATURES_ECX(cpi) &=
4243					    ~CPUID_INTC_ECX_AVX;
4244					CPI_FEATURES_ECX(cpi) &=
4245					    ~CPUID_INTC_ECX_F16C;
4246					CPI_FEATURES_ECX(cpi) &=
4247					    ~CPUID_INTC_ECX_FMA;
4248					CPI_FEATURES_7_0_EBX(cpi) &=
4249					    ~CPUID_INTC_EBX_7_0_BMI1;
4250					CPI_FEATURES_7_0_EBX(cpi) &=
4251					    ~CPUID_INTC_EBX_7_0_BMI2;
4252					CPI_FEATURES_7_0_EBX(cpi) &=
4253					    ~CPUID_INTC_EBX_7_0_AVX2;
4254					CPI_FEATURES_7_0_EBX(cpi) &=
4255					    ~CPUID_INTC_EBX_7_0_MPX;
4256					CPI_FEATURES_7_0_EBX(cpi) &=
4257					    ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4258
4259					CPI_FEATURES_7_0_ECX(cpi) &=
4260					    ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4261
4262					CPI_FEATURES_7_0_EDX(cpi) &=
4263					    ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4264
4265					xsave_force_disable = B_TRUE;
4266				} else {
4267					VERIFY(is_x86_feature(x86_featureset,
4268					    X86FSET_XSAVE) == B_FALSE);
4269				}
4270			}
4271		}
4272	}
4273
4274
4275	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4276		goto pass2_done;
4277
4278	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4279		nmax = NMAX_CPI_EXTD;
4280	/*
4281	 * Copy the extended properties, fixing them as we go.
4282	 * (We already handled n == 0 and n == 1 in pass 1)
4283	 */
4284	iptr = (void *)cpi->cpi_brandstr;
4285	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4286		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4287		(void) __cpuid_insn(cp);
4288		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4289		    cp);
4290		switch (n) {
4291		case 2:
4292		case 3:
4293		case 4:
4294			/*
4295			 * Extract the brand string
4296			 */
4297			*iptr++ = cp->cp_eax;
4298			*iptr++ = cp->cp_ebx;
4299			*iptr++ = cp->cp_ecx;
4300			*iptr++ = cp->cp_edx;
4301			break;
4302		case 5:
4303			switch (cpi->cpi_vendor) {
4304			case X86_VENDOR_AMD:
4305				/*
4306				 * The Athlon and Duron were the first
4307				 * parts to report the sizes of the
4308				 * TLB for large pages. Before then,
4309				 * we don't trust the data.
4310				 */
4311				if (cpi->cpi_family < 6 ||
4312				    (cpi->cpi_family == 6 &&
4313				    cpi->cpi_model < 1))
4314					cp->cp_eax = 0;
4315				break;
4316			default:
4317				break;
4318			}
4319			break;
4320		case 6:
4321			switch (cpi->cpi_vendor) {
4322			case X86_VENDOR_AMD:
4323				/*
4324				 * The Athlon and Duron were the first
4325				 * AMD parts with L2 TLBs.
4326				 * Before then, don't trust the data.
4327				 */
4328				if (cpi->cpi_family < 6 ||
4329				    (cpi->cpi_family == 6 &&
4330				    cpi->cpi_model < 1))
4331					cp->cp_eax = cp->cp_ebx = 0;
4332				/*
4333				 * AMD Duron rev A0 reports L2
4334				 * cache size incorrectly as 1K
4335				 * when it is really 64K
4336				 */
4337				if (cpi->cpi_family == 6 &&
4338				    cpi->cpi_model == 3 &&
4339				    cpi->cpi_step == 0) {
4340					cp->cp_ecx &= 0xffff;
4341					cp->cp_ecx |= 0x400000;
4342				}
4343				break;
4344			case X86_VENDOR_Cyrix:	/* VIA C3 */
4345				/*
4346				 * VIA C3 processors are a bit messed
4347				 * up w.r.t. encoding cache sizes in %ecx
4348				 */
4349				if (cpi->cpi_family != 6)
4350					break;
4351				/*
4352				 * models 7 and 8 were incorrectly encoded
4353				 *
4354				 * xxx is model 8 really broken?
4355				 */
4356				if (cpi->cpi_model == 7 ||
4357				    cpi->cpi_model == 8)
4358					cp->cp_ecx =
4359					    BITX(cp->cp_ecx, 31, 24) << 16 |
4360					    BITX(cp->cp_ecx, 23, 16) << 12 |
4361					    BITX(cp->cp_ecx, 15, 8) << 8 |
4362					    BITX(cp->cp_ecx, 7, 0);
4363				/*
4364				 * model 9 stepping 1 has wrong associativity
4365				 */
4366				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4367					cp->cp_ecx |= 8 << 12;
4368				break;
4369			case X86_VENDOR_Intel:
4370				/*
4371				 * Extended L2 Cache features function.
4372				 * First appeared on Prescott.
4373				 */
4374			default:
4375				break;
4376			}
4377			break;
4378		default:
4379			break;
4380		}
4381	}
4382
4383pass2_done:
4384	cpi->cpi_pass = 2;
4385}
4386
4387static const char *
4388intel_cpubrand(const struct cpuid_info *cpi)
4389{
4390	int i;
4391
4392	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4393	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4394		return ("i486");
4395
4396	switch (cpi->cpi_family) {
4397	case 5:
4398		return ("Intel Pentium(r)");
4399	case 6:
4400		switch (cpi->cpi_model) {
4401			uint_t celeron, xeon;
4402			const struct cpuid_regs *cp;
4403		case 0:
4404		case 1:
4405		case 2:
4406			return ("Intel Pentium(r) Pro");
4407		case 3:
4408		case 4:
4409			return ("Intel Pentium(r) II");
4410		case 6:
4411			return ("Intel Celeron(r)");
4412		case 5:
4413		case 7:
4414			celeron = xeon = 0;
4415			cp = &cpi->cpi_std[2];	/* cache info */
4416
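			/*
			 * Scan the leaf 2 cache descriptors: descriptor 0x40
			 * votes for Celeron, 0x44-0x45 vote for Xeon.
			 */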
4417			for (i = 1; i < 4; i++) {
4418				uint_t tmp;
4419
4420				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4421				if (tmp == 0x40)
4422					celeron++;
4423				if (tmp >= 0x44 && tmp <= 0x45)
4424					xeon++;
4425			}
4426
4427			for (i = 0; i < 2; i++) {
4428				uint_t tmp;
4429
4430				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4431				if (tmp == 0x40)
4432					celeron++;
4433				else if (tmp >= 0x44 && tmp <= 0x45)
4434					xeon++;
4435			}
4436
4437			for (i = 0; i < 4; i++) {
4438				uint_t tmp;
4439
4440				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4441				if (tmp == 0x40)
4442					celeron++;
4443				else if (tmp >= 0x44 && tmp <= 0x45)
4444					xeon++;
4445			}
4446
4447			for (i = 0; i < 4; i++) {
4448				uint_t tmp;
4449
4450				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4451				if (tmp == 0x40)
4452					celeron++;
4453				else if (tmp >= 0x44 && tmp <= 0x45)
4454					xeon++;
4455			}
4456
4457			if (celeron)
4458				return ("Intel Celeron(r)");
4459			if (xeon)
4460				return (cpi->cpi_model == 5 ?
4461				    "Intel Pentium(r) II Xeon(tm)" :
4462				    "Intel Pentium(r) III Xeon(tm)");
4463			return (cpi->cpi_model == 5 ?
4464			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4465			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4466		default:
4467			break;
4468		}
4469	default:
4470		break;
4471	}
4472
4473	/* BrandID is present if the field is nonzero */
4474	if (cpi->cpi_brandid != 0) {
4475		static const struct {
4476			uint_t bt_bid;
4477			const char *bt_str;
4478		} brand_tbl[] = {
4479			{ 0x1,	"Intel(r) Celeron(r)" },
4480			{ 0x2,	"Intel(r) Pentium(r) III" },
4481			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
4482			{ 0x4,	"Intel(r) Pentium(r) III" },
4483			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
4484			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
4485			{ 0x8,	"Intel(r) Pentium(r) 4" },
4486			{ 0x9,	"Intel(r) Pentium(r) 4" },
4487			{ 0xa,	"Intel(r) Celeron(r)" },
4488			{ 0xb,	"Intel(r) Xeon(tm)" },
4489			{ 0xc,	"Intel(r) Xeon(tm) MP" },
4490			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
4491			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
4492			{ 0x11, "Mobile Genuine Intel(r)" },
4493			{ 0x12, "Intel(r) Celeron(r) M" },
4494			{ 0x13, "Mobile Intel(r) Celeron(r)" },
4495			{ 0x14, "Intel(r) Celeron(r)" },
4496			{ 0x15, "Mobile Genuine Intel(r)" },
4497			{ 0x16,	"Intel(r) Pentium(r) M" },
4498			{ 0x17, "Mobile Intel(r) Celeron(r)" }
4499		};
4500		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4501		uint_t sgn;
4502
4503		sgn = (cpi->cpi_family << 8) |
4504		    (cpi->cpi_model << 4) | cpi->cpi_step;
4505
4506		for (i = 0; i < btblmax; i++)
4507			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4508				break;
4509		if (i < btblmax) {
4510			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4511				return ("Intel(r) Celeron(r)");
4512			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4513				return ("Intel(r) Xeon(tm) MP");
4514			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4515				return ("Intel(r) Xeon(tm)");
4516			return (brand_tbl[i].bt_str);
4517		}
4518	}
4519
4520	return (NULL);
4521}
4522
4523static const char *
4524amd_cpubrand(const struct cpuid_info *cpi)
4525{
4526	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4527	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4528		return ("i486 compatible");
4529
4530	switch (cpi->cpi_family) {
4531	case 5:
4532		switch (cpi->cpi_model) {
4533		case 0:
4534		case 1:
4535		case 2:
4536		case 3:
4537		case 4:
4538		case 5:
4539			return ("AMD-K5(r)");
4540		case 6:
4541		case 7:
4542			return ("AMD-K6(r)");
4543		case 8:
4544			return ("AMD-K6(r)-2");
4545		case 9:
4546			return ("AMD-K6(r)-III");
4547		default:
4548			return ("AMD (family 5)");
4549		}
4550	case 6:
4551		switch (cpi->cpi_model) {
4552		case 1:
4553			return ("AMD-K7(tm)");
4554		case 0:
4555		case 2:
4556		case 4:
4557			return ("AMD Athlon(tm)");
4558		case 3:
4559		case 7:
4560			return ("AMD Duron(tm)");
4561		case 6:
4562		case 8:
4563		case 10:
4564			/*
4565			 * Use the L2 cache size to distinguish
4566			 */
4567			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4568			    "AMD Athlon(tm)" : "AMD Duron(tm)");
4569		default:
4570			return ("AMD (family 6)");
4571		}
4572	default:
4573		break;
4574	}
4575
4576	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4577	    cpi->cpi_brandid != 0) {
4578		switch (BITX(cpi->cpi_brandid, 7, 5)) {
4579		case 3:
4580			return ("AMD Opteron(tm) UP 1xx");
4581		case 4:
4582			return ("AMD Opteron(tm) DP 2xx");
4583		case 5:
4584			return ("AMD Opteron(tm) MP 8xx");
4585		default:
4586			return ("AMD Opteron(tm)");
4587		}
4588	}
4589
4590	return (NULL);
4591}
4592
4593static const char *
4594cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4595{
4596	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4597	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4598	    type == X86_TYPE_CYRIX_486)
4599		return ("i486 compatible");
4600
4601	switch (type) {
4602	case X86_TYPE_CYRIX_6x86:
4603		return ("Cyrix 6x86");
4604	case X86_TYPE_CYRIX_6x86L:
4605		return ("Cyrix 6x86L");
4606	case X86_TYPE_CYRIX_6x86MX:
4607		return ("Cyrix 6x86MX");
4608	case X86_TYPE_CYRIX_GXm:
4609		return ("Cyrix GXm");
4610	case X86_TYPE_CYRIX_MediaGX:
4611		return ("Cyrix MediaGX");
4612	case X86_TYPE_CYRIX_MII:
4613		return ("Cyrix M2");
4614	case X86_TYPE_VIA_CYRIX_III:
4615		return ("VIA Cyrix M3");
4616	default:
4617		/*
4618		 * Have another wild guess ..
4619		 */
4620		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4621			return ("Cyrix 5x86");
4622		else if (cpi->cpi_family == 5) {
4623			switch (cpi->cpi_model) {
4624			case 2:
4625				return ("Cyrix 6x86");	/* Cyrix M1 */
4626			case 4:
4627				return ("Cyrix MediaGX");
4628			default:
4629				break;
4630			}
4631		} else if (cpi->cpi_family == 6) {
4632			switch (cpi->cpi_model) {
4633			case 0:
4634				return ("Cyrix 6x86MX"); /* Cyrix M2? */
4635			case 5:
4636			case 6:
4637			case 7:
4638			case 8:
4639			case 9:
4640				return ("VIA C3");
4641			default:
4642				break;
4643			}
4644		}
4645		break;
4646	}
4647	return (NULL);
4648}
4649
4650/*
 * This only gets called when the CPU's extended brand string leaves
 * (0x80000002, 0x80000003, 0x80000004) aren't available, or contain
 * null bytes for some reason.
4654 */
4655static void
4656fabricate_brandstr(struct cpuid_info *cpi)
4657{
4658	const char *brand = NULL;
4659
4660	switch (cpi->cpi_vendor) {
4661	case X86_VENDOR_Intel:
4662		brand = intel_cpubrand(cpi);
4663		break;
4664	case X86_VENDOR_AMD:
4665		brand = amd_cpubrand(cpi);
4666		break;
4667	case X86_VENDOR_Cyrix:
4668		brand = cyrix_cpubrand(cpi, x86_type);
4669		break;
4670	case X86_VENDOR_NexGen:
4671		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4672			brand = "NexGen Nx586";
4673		break;
4674	case X86_VENDOR_Centaur:
4675		if (cpi->cpi_family == 5)
4676			switch (cpi->cpi_model) {
4677			case 4:
4678				brand = "Centaur C6";
4679				break;
4680			case 8:
4681				brand = "Centaur C2";
4682				break;
4683			case 9:
4684				brand = "Centaur C3";
4685				break;
4686			default:
4687				break;
4688			}
4689		break;
4690	case X86_VENDOR_Rise:
4691		if (cpi->cpi_family == 5 &&
4692		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4693			brand = "Rise mP6";
4694		break;
4695	case X86_VENDOR_SiS:
4696		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4697			brand = "SiS 55x";
4698		break;
4699	case X86_VENDOR_TM:
4700		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4701			brand = "Transmeta Crusoe TM3x00 or TM5x00";
4702		break;
4703	case X86_VENDOR_NSC:
4704	case X86_VENDOR_UMC:
4705	default:
4706		break;
4707	}
4708	if (brand) {
4709		(void) strcpy((char *)cpi->cpi_brandstr, brand);
4710		return;
4711	}
4712
4713	/*
4714	 * If all else fails ...
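	 * The fallback is just "<vendor> <family>.<model>.<step>",
	 * e.g. "GenuineIntel 6.15.2".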
4715	 */
4716	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4717	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4718	    cpi->cpi_model, cpi->cpi_step);
4719}
4720
4721/*
4722 * This routine is called just after kernel memory allocation
4723 * becomes available on cpu0, and as part of mp_startup() on
4724 * the other cpus.
4725 *
4726 * Fixup the brand string, and collect any information from cpuid
4727 * that requires dynamically allocated storage to represent.
4728 */
4729/*ARGSUSED*/
4730void
4731cpuid_pass3(cpu_t *cpu)
4732{
4733	int	i, max, shft, level, size;
4734	struct cpuid_regs regs;
4735	struct cpuid_regs *cp;
4736	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4737
4738	ASSERT(cpi->cpi_pass == 2);
4739
4740	/*
4741	 * Deterministic cache parameters
4742	 *
4743	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4744	 * values that are present are currently defined to be the same. This
4745	 * means we can use the same logic to parse it as long as we use the
4746	 * appropriate leaf to get the data. If you're updating this, make sure
4747	 * you're careful about which vendor supports which aspect.
4748	 *
4749	 * Take this opportunity to detect the number of threads sharing the
4750	 * last level cache, and construct a corresponding cache id. The
4751	 * respective cpuid_info members are initialized to the default case of
4752	 * "no last level cache sharing".
4753	 */
4754	cpi->cpi_ncpu_shr_last_cache = 1;
4755	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4756
4757	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4758	    (cpi->cpi_vendor == X86_VENDOR_AMD &&
4759	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4760	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4761		uint32_t leaf;
4762
4763		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4764			leaf = 4;
4765		} else {
4766			leaf = CPUID_LEAF_EXT_1d;
4767		}
4768
4769		/*
4770		 * Find the # of elements (size) returned by the leaf and along
4771		 * the way detect last level cache sharing details.
4772		 */
4773		bzero(&regs, sizeof (regs));
4774		cp = &regs;
4775		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4776			cp->cp_eax = leaf;
4777			cp->cp_ecx = i;
4778
4779			(void) __cpuid_insn(cp);
4780
4781			if (CPI_CACHE_TYPE(cp) == 0)
4782				break;
4783			level = CPI_CACHE_LVL(cp);
4784			if (level > max) {
4785				max = level;
4786				cpi->cpi_ncpu_shr_last_cache =
4787				    CPI_NTHR_SHR_CACHE(cp) + 1;
4788			}
4789		}
4790		cpi->cpi_cache_leaf_size = size = i;
4791
4792		/*
4793		 * Allocate the cpi_cache_leaves array. The first element
4794		 * references the regs for the corresponding leaf with %ecx set
4795		 * to 0. This was gathered in cpuid_pass2().
4796		 */
4797		if (size > 0) {
4798			cpi->cpi_cache_leaves =
4799			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
4800			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4801				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4802			} else {
4803				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4804			}
4805
4806			/*
			 * Allocate storage to hold the additional regs
			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size - 1.
			 *
			 * The regs for %ecx == 0 have already been
			 * allocated as indicated above.
4812			 */
4813			for (i = 1; i < size; i++) {
4814				cp = cpi->cpi_cache_leaves[i] =
4815				    kmem_zalloc(sizeof (regs), KM_SLEEP);
4816				cp->cp_eax = leaf;
4817				cp->cp_ecx = i;
4818
4819				(void) __cpuid_insn(cp);
4820			}
4821		}
4822		/*
4823		 * Determine the number of bits needed to represent
4824		 * the number of CPUs sharing the last level cache.
4825		 *
4826		 * Shift off that number of bits from the APIC id to
4827		 * derive the cache id.
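		 *
		 * For example, if 8 CPUs share the last level cache, shft
		 * ends up as 3 and the cache id is the APIC id with its
		 * low 3 bits shifted off.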
4828		 */
4829		shft = 0;
4830		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4831			shft++;
4832		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4833	}
4834
4835	/*
4836	 * Now fixup the brand string
4837	 */
4838	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4839		fabricate_brandstr(cpi);
4840	} else {
4841
4842		/*
4843		 * If we successfully extracted a brand string from the cpuid
4844		 * instruction, clean it up by removing leading spaces and
4845		 * similar junk.
4846		 */
4847		if (cpi->cpi_brandstr[0]) {
4848			size_t maxlen = sizeof (cpi->cpi_brandstr);
4849			char *src, *dst;
4850
4851			dst = src = (char *)cpi->cpi_brandstr;
4852			src[maxlen - 1] = '\0';
4853			/*
4854			 * strip leading spaces
4855			 */
4856			while (*src == ' ')
4857				src++;
4858			/*
			 * Remove any "Genuine" or "Authentic" prefixes
4860			 */
4861			if (strncmp(src, "Genuine ", 8) == 0)
4862				src += 8;
4863			if (strncmp(src, "Authentic ", 10) == 0)
4864				src += 10;
4865
4866			/*
4867			 * Now do an in-place copy.
4868			 * Map (R) to (r) and (TM) to (tm).
4869			 * The era of teletypes is long gone, and there's
4870			 * -really- no need to shout.
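			 * e.g. "Intel(R) Xeon(TM)" becomes "Intel(r) Xeon(tm)".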
4871			 */
4872			while (*src != '\0') {
4873				if (src[0] == '(') {
4874					if (strncmp(src + 1, "R)", 2) == 0) {
4875						(void) strncpy(dst, "(r)", 3);
4876						src += 3;
4877						dst += 3;
4878						continue;
4879					}
4880					if (strncmp(src + 1, "TM)", 3) == 0) {
4881						(void) strncpy(dst, "(tm)", 4);
4882						src += 4;
4883						dst += 4;
4884						continue;
4885					}
4886				}
4887				*dst++ = *src++;
4888			}
4889			*dst = '\0';
4890
4891			/*
4892			 * Finally, remove any trailing spaces
4893			 */
4894			while (--dst > cpi->cpi_brandstr)
4895				if (*dst == ' ')
4896					*dst = '\0';
4897				else
4898					break;
4899		} else
4900			fabricate_brandstr(cpi);
4901	}
4902	cpi->cpi_pass = 3;
4903}
4904
4905/*
4906 * This routine is called out of bind_hwcap() much later in the life
4907 * of the kernel (post_startup()).  The job of this routine is to resolve
4908 * the hardware feature support and kernel support for those features into
4909 * what we're actually going to tell applications via the aux vector.
4910 */
4911void
4912cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4913{
4914	struct cpuid_info *cpi;
4915	uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4916
4917	if (cpu == NULL)
4918		cpu = CPU;
4919	cpi = cpu->cpu_m.mcpu_cpi;
4920
4921	ASSERT(cpi->cpi_pass == 3);
4922
4923	if (cpi->cpi_maxeax >= 1) {
4924		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4925		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4926		uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4927
4928		*edx = CPI_FEATURES_EDX(cpi);
4929		*ecx = CPI_FEATURES_ECX(cpi);
4930		*ebx = CPI_FEATURES_7_0_EBX(cpi);
4931
4932		/*
4933		 * [these require explicit kernel support]
4934		 */
4935		if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4936			*edx &= ~CPUID_INTC_EDX_SEP;
4937
4938		if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4939			*edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4940		if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4941			*edx &= ~CPUID_INTC_EDX_SSE2;
4942
4943		if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4944			*edx &= ~CPUID_INTC_EDX_HTT;
4945
4946		if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4947			*ecx &= ~CPUID_INTC_ECX_SSE3;
4948
4949		if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4950			*ecx &= ~CPUID_INTC_ECX_SSSE3;
4951		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4952			*ecx &= ~CPUID_INTC_ECX_SSE4_1;
4953		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4954			*ecx &= ~CPUID_INTC_ECX_SSE4_2;
4955		if (!is_x86_feature(x86_featureset, X86FSET_AES))
4956			*ecx &= ~CPUID_INTC_ECX_AES;
4957		if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4958			*ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4959		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4960			*ecx &= ~(CPUID_INTC_ECX_XSAVE |
4961			    CPUID_INTC_ECX_OSXSAVE);
4962		if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4963			*ecx &= ~CPUID_INTC_ECX_AVX;
4964		if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4965			*ecx &= ~CPUID_INTC_ECX_F16C;
4966		if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4967			*ecx &= ~CPUID_INTC_ECX_FMA;
4968		if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4969			*ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4970		if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4971			*ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4972		if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4973			*ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4974		if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4975			*ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4976		if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4977			*ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4978
4979		/*
4980		 * [no explicit support required beyond x87 fp context]
4981		 */
4982		if (!fpu_exists)
4983			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4984
4985		/*
4986		 * Now map the supported feature vector to things that we
4987		 * think userland will care about.
4988		 */
4989		if (*edx & CPUID_INTC_EDX_SEP)
4990			hwcap_flags |= AV_386_SEP;
4991		if (*edx & CPUID_INTC_EDX_SSE)
4992			hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4993		if (*edx & CPUID_INTC_EDX_SSE2)
4994			hwcap_flags |= AV_386_SSE2;
4995		if (*ecx & CPUID_INTC_ECX_SSE3)
4996			hwcap_flags |= AV_386_SSE3;
4997		if (*ecx & CPUID_INTC_ECX_SSSE3)
4998			hwcap_flags |= AV_386_SSSE3;
4999		if (*ecx & CPUID_INTC_ECX_SSE4_1)
5000			hwcap_flags |= AV_386_SSE4_1;
5001		if (*ecx & CPUID_INTC_ECX_SSE4_2)
5002			hwcap_flags |= AV_386_SSE4_2;
5003		if (*ecx & CPUID_INTC_ECX_MOVBE)
5004			hwcap_flags |= AV_386_MOVBE;
5005		if (*ecx & CPUID_INTC_ECX_AES)
5006			hwcap_flags |= AV_386_AES;
5007		if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5008			hwcap_flags |= AV_386_PCLMULQDQ;
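		/*
		 * Only advertise XSAVE (and the AVX family gated on it
		 * below) when both the hardware supports it and the OS has
		 * enabled it, which is what the OSXSAVE bit reflects.
		 */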
5009		if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5010		    (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5011			hwcap_flags |= AV_386_XSAVE;
5012
5013			if (*ecx & CPUID_INTC_ECX_AVX) {
5014				uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5015				uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5016
5017				hwcap_flags |= AV_386_AVX;
5018				if (*ecx & CPUID_INTC_ECX_F16C)
5019					hwcap_flags_2 |= AV_386_2_F16C;
5020				if (*ecx & CPUID_INTC_ECX_FMA)
5021					hwcap_flags_2 |= AV_386_2_FMA;
5022
5023				if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5024					hwcap_flags_2 |= AV_386_2_BMI1;
5025				if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5026					hwcap_flags_2 |= AV_386_2_BMI2;
5027				if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5028					hwcap_flags_2 |= AV_386_2_AVX2;
5029				if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5030					hwcap_flags_2 |= AV_386_2_AVX512F;
5031				if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5032					hwcap_flags_2 |= AV_386_2_AVX512DQ;
5033				if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5034					hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5035				if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5036					hwcap_flags_2 |= AV_386_2_AVX512PF;
5037				if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5038					hwcap_flags_2 |= AV_386_2_AVX512ER;
5039				if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5040					hwcap_flags_2 |= AV_386_2_AVX512CD;
5041				if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5042					hwcap_flags_2 |= AV_386_2_AVX512BW;
5043				if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5044					hwcap_flags_2 |= AV_386_2_AVX512VL;
5045
5046				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5047					hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5048				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5049					hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5050				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5051					hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5052
5053				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5054					hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5055				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5056					hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5057			}
5058		}
5059		if (*ecx & CPUID_INTC_ECX_VMX)
5060			hwcap_flags |= AV_386_VMX;
5061		if (*ecx & CPUID_INTC_ECX_POPCNT)
5062			hwcap_flags |= AV_386_POPCNT;
5063		if (*edx & CPUID_INTC_EDX_FPU)
5064			hwcap_flags |= AV_386_FPU;
5065		if (*edx & CPUID_INTC_EDX_MMX)
5066			hwcap_flags |= AV_386_MMX;
5067
5068		if (*edx & CPUID_INTC_EDX_TSC)
5069			hwcap_flags |= AV_386_TSC;
5070		if (*edx & CPUID_INTC_EDX_CX8)
5071			hwcap_flags |= AV_386_CX8;
5072		if (*edx & CPUID_INTC_EDX_CMOV)
5073			hwcap_flags |= AV_386_CMOV;
5074		if (*ecx & CPUID_INTC_ECX_CX16)
5075			hwcap_flags |= AV_386_CX16;
5076
5077		if (*ecx & CPUID_INTC_ECX_RDRAND)
5078			hwcap_flags_2 |= AV_386_2_RDRAND;
5079		if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5080			hwcap_flags_2 |= AV_386_2_ADX;
5081		if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5082			hwcap_flags_2 |= AV_386_2_RDSEED;
5083		if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5084			hwcap_flags_2 |= AV_386_2_SHA;
5085		if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5086			hwcap_flags_2 |= AV_386_2_FSGSBASE;
5087		if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5088			hwcap_flags_2 |= AV_386_2_CLWB;
5089		if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5090			hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5091
5092	}
5093	/*
	 * Check a few miscellaneous features.
5095	 */
5096	if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5097		hwcap_flags_2 |= AV_386_2_CLZERO;
5098
5099	if (cpi->cpi_xmaxeax < 0x80000001)
5100		goto pass4_done;
5101
5102	switch (cpi->cpi_vendor) {
5103		struct cpuid_regs cp;
5104		uint32_t *edx, *ecx;
5105
5106	case X86_VENDOR_Intel:
5107		/*
		 * Seems like Intel duplicated what was necessary
		 * here to make the initial crop of 64-bit OSes work.
		 * Hopefully, those are the only "extended" bits
		 * they'll add.
5112		 */
5113		/*FALLTHROUGH*/
5114
5115	case X86_VENDOR_AMD:
5116		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5117		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5118
5119		*edx = CPI_FEATURES_XTD_EDX(cpi);
5120		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5121
5122		/*
5123		 * [these features require explicit kernel support]
5124		 */
5125		switch (cpi->cpi_vendor) {
5126		case X86_VENDOR_Intel:
5127			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5128				*edx &= ~CPUID_AMD_EDX_TSCP;
5129			break;
5130
5131		case X86_VENDOR_AMD:
5132			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5133				*edx &= ~CPUID_AMD_EDX_TSCP;
5134			if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5135				*ecx &= ~CPUID_AMD_ECX_SSE4A;
5136			break;
5137
5138		default:
5139			break;
5140		}
5141
5142		/*
5143		 * [no explicit support required beyond
5144		 * x87 fp context and exception handlers]
5145		 */
5146		if (!fpu_exists)
5147			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5148			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5149
5150		if (!is_x86_feature(x86_featureset, X86FSET_NX))
5151			*edx &= ~CPUID_AMD_EDX_NX;
5152#if !defined(__amd64)
5153		*edx &= ~CPUID_AMD_EDX_LM;
5154#endif
5155		/*
5156		 * Now map the supported feature vector to
5157		 * things that we think userland will care about.
5158		 */
5159#if defined(__amd64)
5160		if (*edx & CPUID_AMD_EDX_SYSC)
5161			hwcap_flags |= AV_386_AMD_SYSC;
5162#endif
5163		if (*edx & CPUID_AMD_EDX_MMXamd)
5164			hwcap_flags |= AV_386_AMD_MMX;
5165		if (*edx & CPUID_AMD_EDX_3DNow)
5166			hwcap_flags |= AV_386_AMD_3DNow;
5167		if (*edx & CPUID_AMD_EDX_3DNowx)
5168			hwcap_flags |= AV_386_AMD_3DNowx;
5169		if (*ecx & CPUID_AMD_ECX_SVM)
5170			hwcap_flags |= AV_386_AMD_SVM;
5171
5172		switch (cpi->cpi_vendor) {
5173		case X86_VENDOR_AMD:
5174			if (*edx & CPUID_AMD_EDX_TSCP)
5175				hwcap_flags |= AV_386_TSCP;
5176			if (*ecx & CPUID_AMD_ECX_AHF64)
5177				hwcap_flags |= AV_386_AHF;
5178			if (*ecx & CPUID_AMD_ECX_SSE4A)
5179				hwcap_flags |= AV_386_AMD_SSE4A;
5180			if (*ecx & CPUID_AMD_ECX_LZCNT)
5181				hwcap_flags |= AV_386_AMD_LZCNT;
5182			if (*ecx & CPUID_AMD_ECX_MONITORX)
5183				hwcap_flags_2 |= AV_386_2_MONITORX;
5184			break;
5185
5186		case X86_VENDOR_Intel:
5187			if (*edx & CPUID_AMD_EDX_TSCP)
5188				hwcap_flags |= AV_386_TSCP;
5189			if (*ecx & CPUID_AMD_ECX_LZCNT)
5190				hwcap_flags |= AV_386_AMD_LZCNT;
5191			/*
5192			 * Aarrgh.
5193			 * Intel uses a different bit in the same word.
5194			 */
5195			if (*ecx & CPUID_INTC_ECX_AHF64)
5196				hwcap_flags |= AV_386_AHF;
5197			break;
5198
5199		default:
5200			break;
5201		}
5202		break;
5203
5204	case X86_VENDOR_TM:
5205		cp.cp_eax = 0x80860001;
5206		(void) __cpuid_insn(&cp);
5207		cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5208		break;
5209
5210	default:
5211		break;
5212	}
5213
5214pass4_done:
5215	cpi->cpi_pass = 4;
5216	if (hwcap_out != NULL) {
5217		hwcap_out[0] = hwcap_flags;
5218		hwcap_out[1] = hwcap_flags_2;
5219	}
5220}
5221
5222
5223/*
5224 * Simulate the cpuid instruction using the data we previously
5225 * captured about this CPU.  We try our best to return the truth
5226 * about the hardware, independently of kernel support.
5227 */
5228uint32_t
5229cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5230{
5231	struct cpuid_info *cpi;
5232	struct cpuid_regs *xcp;
5233
5234	if (cpu == NULL)
5235		cpu = CPU;
5236	cpi = cpu->cpu_m.mcpu_cpi;
5237
5238	ASSERT(cpuid_checkpass(cpu, 3));
5239
5240	/*
5241	 * CPUID data is cached in two separate places: cpi_std for standard
	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
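	 *
	 * Note that %ecx is not consulted when a leaf is satisfied from the
	 * cache, so for leaves that take a sub-leaf index only the sub-leaf
	 * captured during the earlier passes is returned.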
5243	 */
5244	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5245		xcp = &cpi->cpi_std[cp->cp_eax];
5246	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5247	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5248	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5249		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5250	} else {
5251		/*
5252		 * The caller is asking for data from an input parameter which
5253		 * the kernel has not cached.  In this case we go fetch from
5254		 * the hardware and return the data directly to the user.
5255		 */
5256		return (__cpuid_insn(cp));
5257	}
5258
5259	cp->cp_eax = xcp->cp_eax;
5260	cp->cp_ebx = xcp->cp_ebx;
5261	cp->cp_ecx = xcp->cp_ecx;
5262	cp->cp_edx = xcp->cp_edx;
5263	return (cp->cp_eax);
5264}
5265
5266int
5267cpuid_checkpass(cpu_t *cpu, int pass)
5268{
5269	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5270	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5271}
5272
5273int
5274cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5275{
5276	ASSERT(cpuid_checkpass(cpu, 3));
5277
5278	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5279}
5280
5281int
5282cpuid_is_cmt(cpu_t *cpu)
5283{
5284	if (cpu == NULL)
5285		cpu = CPU;
5286
5287	ASSERT(cpuid_checkpass(cpu, 1));
5288
5289	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5290}
5291
5292/*
5293 * AMD and Intel both implement the 64-bit variant of the syscall
5294 * instruction (syscallq), so if there's -any- support for syscall,
5295 * cpuid currently says "yes, we support this".
5296 *
5297 * However, Intel decided to -not- implement the 32-bit variant of the
5298 * syscall instruction, so we provide a predicate to allow our caller
5299 * to test that subtlety here.
5300 *
5301 * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5302 *	even in the case where the hardware would in fact support it.
5303 */
5304/*ARGSUSED*/
5305int
5306cpuid_syscall32_insn(cpu_t *cpu)
5307{
5308	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5309
5310#if !defined(__xpv)
5311	if (cpu == NULL)
5312		cpu = CPU;
5313
5314	/*CSTYLED*/
5315	{
5316		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5317
5318		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5319		    cpi->cpi_xmaxeax >= 0x80000001 &&
5320		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5321			return (1);
5322	}
5323#endif
5324	return (0);
5325}
5326
5327int
5328cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5329{
5330	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5331
5332	static const char fmt[] =
5333	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5334	static const char fmt_ht[] =
5335	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5336
5337	ASSERT(cpuid_checkpass(cpu, 1));
5338
5339	if (cpuid_is_cmt(cpu))
5340		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5341		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5342		    cpi->cpi_family, cpi->cpi_model,
5343		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5344	return (snprintf(s, n, fmt,
5345	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5346	    cpi->cpi_family, cpi->cpi_model,
5347	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5348}
5349
5350const char *
5351cpuid_getvendorstr(cpu_t *cpu)
5352{
5353	ASSERT(cpuid_checkpass(cpu, 1));
5354	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5355}
5356
5357uint_t
5358cpuid_getvendor(cpu_t *cpu)
5359{
5360	ASSERT(cpuid_checkpass(cpu, 1));
5361	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5362}
5363
5364uint_t
5365cpuid_getfamily(cpu_t *cpu)
5366{
5367	ASSERT(cpuid_checkpass(cpu, 1));
5368	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5369}
5370
5371uint_t
5372cpuid_getmodel(cpu_t *cpu)
5373{
5374	ASSERT(cpuid_checkpass(cpu, 1));
5375	return (cpu->cpu_m.mcpu_cpi->cpi_model);
5376}
5377
5378uint_t
5379cpuid_get_ncpu_per_chip(cpu_t *cpu)
5380{
5381	ASSERT(cpuid_checkpass(cpu, 1));
5382	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5383}
5384
5385uint_t
5386cpuid_get_ncore_per_chip(cpu_t *cpu)
5387{
5388	ASSERT(cpuid_checkpass(cpu, 1));
5389	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5390}
5391
5392uint_t
5393cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5394{
5395	ASSERT(cpuid_checkpass(cpu, 2));
5396	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5397}
5398
5399id_t
5400cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5401{
5402	ASSERT(cpuid_checkpass(cpu, 2));
5403	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5404}
5405
5406uint_t
5407cpuid_getstep(cpu_t *cpu)
5408{
5409	ASSERT(cpuid_checkpass(cpu, 1));
5410	return (cpu->cpu_m.mcpu_cpi->cpi_step);
5411}
5412
5413uint_t
5414cpuid_getsig(struct cpu *cpu)
5415{
5416	ASSERT(cpuid_checkpass(cpu, 1));
5417	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5418}
5419
5420uint32_t
5421cpuid_getchiprev(struct cpu *cpu)
5422{
5423	ASSERT(cpuid_checkpass(cpu, 1));
5424	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5425}
5426
5427const char *
5428cpuid_getchiprevstr(struct cpu *cpu)
5429{
5430	ASSERT(cpuid_checkpass(cpu, 1));
5431	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5432}
5433
5434uint32_t
5435cpuid_getsockettype(struct cpu *cpu)
5436{
5437	ASSERT(cpuid_checkpass(cpu, 1));
5438	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5439}
5440
5441const char *
5442cpuid_getsocketstr(cpu_t *cpu)
5443{
5444	static const char *socketstr = NULL;
5445	struct cpuid_info *cpi;
5446
5447	ASSERT(cpuid_checkpass(cpu, 1));
5448	cpi = cpu->cpu_m.mcpu_cpi;
5449
5450	/* Assume that socket types are the same across the system */
5451	if (socketstr == NULL)
5452		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
		    cpi->cpi_model, cpi->cpi_step);

5456	return (socketstr);
5457}
5458
5459int
5460cpuid_get_chipid(cpu_t *cpu)
5461{
5462	ASSERT(cpuid_checkpass(cpu, 1));
5463
5464	if (cpuid_is_cmt(cpu))
5465		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5466	return (cpu->cpu_id);
5467}
5468
5469id_t
5470cpuid_get_coreid(cpu_t *cpu)
5471{
5472	ASSERT(cpuid_checkpass(cpu, 1));
5473	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5474}
5475
5476int
5477cpuid_get_pkgcoreid(cpu_t *cpu)
5478{
5479	ASSERT(cpuid_checkpass(cpu, 1));
5480	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5481}
5482
5483int
5484cpuid_get_clogid(cpu_t *cpu)
5485{
5486	ASSERT(cpuid_checkpass(cpu, 1));
5487	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5488}
5489
5490int
5491cpuid_get_cacheid(cpu_t *cpu)
5492{
5493	ASSERT(cpuid_checkpass(cpu, 1));
5494	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5495}
5496
5497uint_t
5498cpuid_get_procnodeid(cpu_t *cpu)
5499{
5500	ASSERT(cpuid_checkpass(cpu, 1));
5501	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5502}
5503
5504uint_t
5505cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5506{
5507	ASSERT(cpuid_checkpass(cpu, 1));
5508	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5509}
5510
5511uint_t
5512cpuid_get_compunitid(cpu_t *cpu)
5513{
5514	ASSERT(cpuid_checkpass(cpu, 1));
5515	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5516}
5517
5518uint_t
5519cpuid_get_cores_per_compunit(cpu_t *cpu)
5520{
5521	ASSERT(cpuid_checkpass(cpu, 1));
5522	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5523}
5524
5525/*ARGSUSED*/
5526int
5527cpuid_have_cr8access(cpu_t *cpu)
5528{
5529#if defined(__amd64)
5530	return (1);
5531#else
5532	struct cpuid_info *cpi;
5533
5534	ASSERT(cpu != NULL);
5535	cpi = cpu->cpu_m.mcpu_cpi;
5536	if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5537	    (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5538		return (1);
5539	return (0);
5540#endif
5541}
5542
5543uint32_t
5544cpuid_get_apicid(cpu_t *cpu)
5545{
5546	ASSERT(cpuid_checkpass(cpu, 1));
5547	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5548		return (UINT32_MAX);
5549	} else {
5550		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5551	}
5552}
5553
5554void
5555cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5556{
5557	struct cpuid_info *cpi;
5558
5559	if (cpu == NULL)
5560		cpu = CPU;
5561	cpi = cpu->cpu_m.mcpu_cpi;
5562
5563	ASSERT(cpuid_checkpass(cpu, 1));
5564
5565	if (pabits)
5566		*pabits = cpi->cpi_pabits;
5567	if (vabits)
5568		*vabits = cpi->cpi_vabits;
5569}
5570
5571size_t
5572cpuid_get_xsave_size()
5573{
5574	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5575	    sizeof (struct xsave_state)));
5576}
5577
5578/*
5579 * Return true if the CPUs on this system require 'pointer clearing' for the
5580 * floating point error pointer exception handling. In the past, this has been
5581 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5582 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5583 * feature bit and is reflected in the cpi_fp_amd_save member.
5584 */
5585boolean_t
5586cpuid_need_fp_excp_handling()
5587{
5588	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5589	    cpuid_info0.cpi_fp_amd_save != 0);
5590}
5591
5592/*
5593 * Returns the number of data TLB entries for a corresponding
5594 * pagesize.  If it can't be computed, or isn't known, the
5595 * routine returns zero.  If you ask about an architecturally
5596 * impossible pagesize, the routine will panic (so that the
 * hat implementor knows that things are inconsistent).
5598 */
5599uint_t
5600cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5601{
5602	struct cpuid_info *cpi;
5603	uint_t dtlb_nent = 0;
5604
5605	if (cpu == NULL)
5606		cpu = CPU;
5607	cpi = cpu->cpu_m.mcpu_cpi;
5608
5609	ASSERT(cpuid_checkpass(cpu, 1));
5610
5611	/*
5612	 * Check the L2 TLB info
5613	 */
5614	if (cpi->cpi_xmaxeax >= 0x80000006) {
5615		struct cpuid_regs *cp = &cpi->cpi_extd[6];
5616
5617		switch (pagesize) {
5618
5619		case 4 * 1024:
5620			/*
5621			 * All zero in the top 16 bits of the register
5622			 * indicates a unified TLB. Size is in low 16 bits.
5623			 */
5624			if ((cp->cp_ebx & 0xffff0000) == 0)
5625				dtlb_nent = cp->cp_ebx & 0x0000ffff;
5626			else
5627				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5628			break;
5629
5630		case 2 * 1024 * 1024:
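			/*
			 * Same encoding as the 4K case above: all zero in
			 * the top 16 bits indicates a unified TLB with the
			 * size in the low 16 bits; otherwise the d-TLB
			 * count is in bits 27:16.
			 */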
5631			if ((cp->cp_eax & 0xffff0000) == 0)
5632				dtlb_nent = cp->cp_eax & 0x0000ffff;
5633			else
5634				dtlb_nent = BITX(cp->cp_eax, 27, 16);
5635			break;
5636
5637		default:
5638			panic("unknown L2 pagesize");
5639			/*NOTREACHED*/
5640		}
5641	}
5642
5643	if (dtlb_nent != 0)
5644		return (dtlb_nent);
5645
5646	/*
5647	 * No L2 TLB support for this size, try L1.
5648	 */
5649	if (cpi->cpi_xmaxeax >= 0x80000005) {
5650		struct cpuid_regs *cp = &cpi->cpi_extd[5];
5651
5652		switch (pagesize) {
5653		case 4 * 1024:
5654			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5655			break;
5656		case 2 * 1024 * 1024:
5657			dtlb_nent = BITX(cp->cp_eax, 23, 16);
5658			break;
5659		default:
5660			panic("unknown L1 d-TLB pagesize");
5661			/*NOTREACHED*/
5662		}
5663	}
5664
5665	return (dtlb_nent);
5666}
5667
5668/*
5669 * Return 0 if the erratum is not present or not applicable, positive
5670 * if it is, and negative if the status of the erratum is unknown.
5671 *
5672 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5673 * Processors" #25759, Rev 3.57, August 2005
5674 */
5675int
5676cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5677{
5678	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5679	uint_t eax;
5680
5681	/*
5682	 * Bail out if this CPU isn't an AMD CPU, or if it's
5683	 * a legacy (32-bit) AMD CPU.
5684	 */
5685	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5686	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5687	    cpi->cpi_family == 6) {
5688		return (0);
5689	}
5690
5691	eax = cpi->cpi_std[1].cp_eax;
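	/*
	 * The macros below match the raw leaf 1 %eax signature against the
	 * values for particular silicon revisions (SH-B0, DH-CG, JH-E1, ...)
	 * covered by the revision guide cited above.
	 */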
5692
5693#define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
5694#define	SH_B3(eax)	(eax == 0xf51)
5695#define	B(eax)		(SH_B0(eax) || SH_B3(eax))
5696
5697#define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
5698
5699#define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5700#define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5701#define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
5702#define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5703
5704#define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5705#define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
5706#define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
5707#define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5708
5709#define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5710#define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
5711#define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
5712#define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
5713#define	BH_E4(eax)	(eax == 0x20fb1)
5714#define	SH_E5(eax)	(eax == 0x20f42)
5715#define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
5716#define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
5717#define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5718			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5719			    DH_E6(eax) || JH_E6(eax))
5720
5721#define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5722#define	DR_B0(eax)	(eax == 0x100f20)
5723#define	DR_B1(eax)	(eax == 0x100f21)
5724#define	DR_BA(eax)	(eax == 0x100f2a)
5725#define	DR_B2(eax)	(eax == 0x100f22)
5726#define	DR_B3(eax)	(eax == 0x100f23)
5727#define	RB_C0(eax)	(eax == 0x100f40)
5728
5729	switch (erratum) {
5730	case 1:
5731		return (cpi->cpi_family < 0x10);
5732	case 51:	/* what does the asterisk mean? */
5733		return (B(eax) || SH_C0(eax) || CG(eax));
5734	case 52:
5735		return (B(eax));
5736	case 57:
5737		return (cpi->cpi_family <= 0x11);
5738	case 58:
5739		return (B(eax));
5740	case 60:
5741		return (cpi->cpi_family <= 0x11);
5742	case 61:
5743	case 62:
5744	case 63:
5745	case 64:
5746	case 65:
5747	case 66:
5748	case 68:
5749	case 69:
5750	case 70:
5751	case 71:
5752		return (B(eax));
5753	case 72:
5754		return (SH_B0(eax));
5755	case 74:
5756		return (B(eax));
5757	case 75:
5758		return (cpi->cpi_family < 0x10);
5759	case 76:
5760		return (B(eax));
5761	case 77:
5762		return (cpi->cpi_family <= 0x11);
5763	case 78:
5764		return (B(eax) || SH_C0(eax));
5765	case 79:
5766		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5767	case 80:
5768	case 81:
5769	case 82:
5770		return (B(eax));
5771	case 83:
5772		return (B(eax) || SH_C0(eax) || CG(eax));
5773	case 85:
5774		return (cpi->cpi_family < 0x10);
5775	case 86:
5776		return (SH_C0(eax) || CG(eax));
5777	case 88:
5778#if !defined(__amd64)
5779		return (0);
5780#else
5781		return (B(eax) || SH_C0(eax));
5782#endif
5783	case 89:
5784		return (cpi->cpi_family < 0x10);
5785	case 90:
5786		return (B(eax) || SH_C0(eax) || CG(eax));
5787	case 91:
5788	case 92:
5789		return (B(eax) || SH_C0(eax));
5790	case 93:
5791		return (SH_C0(eax));
5792	case 94:
5793		return (B(eax) || SH_C0(eax) || CG(eax));
5794	case 95:
5795#if !defined(__amd64)
5796		return (0);
5797#else
5798		return (B(eax) || SH_C0(eax));
5799#endif
5800	case 96:
5801		return (B(eax) || SH_C0(eax) || CG(eax));
5802	case 97:
5803	case 98:
5804		return (SH_C0(eax) || CG(eax));
5805	case 99:
5806		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5807	case 100:
5808		return (B(eax) || SH_C0(eax));
5809	case 101:
5810	case 103:
5811		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5812	case 104:
5813		return (SH_C0(eax) || CG(eax) || D0(eax));
5814	case 105:
5815	case 106:
5816	case 107:
5817		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5818	case 108:
5819		return (DH_CG(eax));
5820	case 109:
5821		return (SH_C0(eax) || CG(eax) || D0(eax));
5822	case 110:
5823		return (D0(eax) || EX(eax));
5824	case 111:
5825		return (CG(eax));
5826	case 112:
5827		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5828	case 113:
5829		return (eax == 0x20fc0);
5830	case 114:
5831		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5832	case 115:
5833		return (SH_E0(eax) || JH_E1(eax));
5834	case 116:
5835		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5836	case 117:
5837		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5838	case 118:
5839		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5840		    JH_E6(eax));
5841	case 121:
5842		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5843	case 122:
5844		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5845	case 123:
5846		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5847	case 131:
5848		return (cpi->cpi_family < 0x10);
5849	case 6336786:
5850
5851		/*
5852		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
5853		 * if this is a K8 family or newer processor. We're testing for
5854		 * this 'erratum' to determine whether or not we have a constant
5855		 * TSC.
5856		 *
5857		 * Our current fix for this is to disable the C1-Clock ramping.
5858		 * However, this doesn't work on newer processor families nor
5859		 * does it work when virtualized as those devices don't exist.
5860		 */
5861		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5862			return (0);
5863		}
5864
5865		if (CPI_FAMILY(cpi) == 0xf) {
5866			struct cpuid_regs regs;
5867			regs.cp_eax = 0x80000007;
5868			(void) __cpuid_insn(&regs);
5869			return (!(regs.cp_edx & 0x100));
5870		}
5871		return (0);
5872	case 6323525:
5873		/*
5874		 * This erratum (K8 #147) is not present on family 10 and newer.
5875		 */
5876		if (cpi->cpi_family >= 0x10) {
5877			return (0);
5878		}
5879		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5880		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5881
5882	case 6671130:
5883		/*
5884		 * check for processors (pre-Shanghai) that do not provide
		 * optimal management of 1gb ptes in their tlbs.
5886		 */
5887		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5888
5889	case 298:
5890		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5891		    DR_B2(eax) || RB_C0(eax));
5892
5893	case 721:
5894#if defined(__amd64)
5895		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5896#else
5897		return (0);
5898#endif
5899
5900	default:
5901		return (-1);
5902
5903	}
5904}
5905
5906/*
5907 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5908 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5909 */
5910int
5911osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5912{
5913	struct cpuid_info	*cpi;
5914	uint_t			osvwid;
5915	static int		osvwfeature = -1;
5916	uint64_t		osvwlength;
5917
5918
5919	cpi = cpu->cpu_m.mcpu_cpi;
5920
5921	/* confirm OSVW supported */
5922	if (osvwfeature == -1) {
5923		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5924	} else {
5925		/* assert that osvw feature setting is consistent on all cpus */
5926		ASSERT(osvwfeature ==
5927		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5928	}
5929	if (!osvwfeature)
5930		return (-1);
5931
5932	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5933
5934	switch (erratum) {
5935	case 298:	/* osvwid is 0 */
5936		osvwid = 0;
5937		if (osvwlength <= (uint64_t)osvwid) {
5938			/* osvwid 0 is unknown */
5939			return (-1);
5940		}
5941
5942		/*
5943		 * Check the OSVW STATUS MSR to determine the state
5944		 * of the erratum where:
5945		 *   0 - fixed by HW
5946		 *   1 - BIOS has applied the workaround when BIOS
5947		 *   workaround is available. (Or for other errata,
5948		 *   OS workaround is required.)
5949		 * For a value of 1, caller will confirm that the
5950		 * erratum 298 workaround has indeed been applied by BIOS.
5951		 *
5952		 * A 1 may be set in cpus that have a HW fix
5953		 * in a mixed cpu system. Regarding erratum 298:
5954		 *   In a multiprocessor platform, the workaround above
5955		 *   should be applied to all processors regardless of
5956		 *   silicon revision when an affected processor is
5957		 *   present.
5958		 */
5959
5960		return (rdmsr(MSR_AMD_OSVW_STATUS +
5961		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
5962		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5963
5964	default:
5965		return (-1);
5966	}
5967}
5968
5969static const char assoc_str[] = "associativity";
5970static const char line_str[] = "line-size";
5971static const char size_str[] = "size";
5972
5973static void
5974add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5975    uint32_t val)
5976{
5977	char buf[128];
5978
5979	/*
5980	 * ndi_prop_update_int() is used because it is desirable for
5981	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
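	 *
	 * The property name is formed as "<label>-<type>", e.g. the l2-cache
	 * size ends up as "l2-cache-size".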
5982	 */
5983	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5984		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5985}
5986
5987/*
5988 * Intel-style cache/tlb description
5989 *
5990 * Standard cpuid level 2 gives a randomly ordered
5991 * selection of tags that index into a table that describes
5992 * cache and tlb properties.
5993 */
5994
5995static const char l1_icache_str[] = "l1-icache";
5996static const char l1_dcache_str[] = "l1-dcache";
5997static const char l2_cache_str[] = "l2-cache";
5998static const char l3_cache_str[] = "l3-cache";
5999static const char itlb4k_str[] = "itlb-4K";
6000static const char dtlb4k_str[] = "dtlb-4K";
6001static const char itlb2M_str[] = "itlb-2M";
6002static const char itlb4M_str[] = "itlb-4M";
6003static const char dtlb4M_str[] = "dtlb-4M";
6004static const char dtlb24_str[] = "dtlb0-2M-4M";
6005static const char itlb424_str[] = "itlb-4K-2M-4M";
6006static const char itlb24_str[] = "itlb-2M-4M";
6007static const char dtlb44_str[] = "dtlb-4K-4M";
6008static const char sl1_dcache_str[] = "sectored-l1-dcache";
6009static const char sl2_cache_str[] = "sectored-l2-cache";
6010static const char itrace_str[] = "itrace-cache";
6011static const char sl3_cache_str[] = "sectored-l3-cache";
6012static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6013
6014static const struct cachetab {
6015	uint8_t		ct_code;
6016	uint8_t		ct_assoc;
6017	uint16_t	ct_line_size;
6018	size_t		ct_size;
6019	const char	*ct_label;
6020} intel_ctab[] = {
6021	/*
6022	 * maintain descending order!
6023	 *
6024	 * Codes ignored - Reason
6025	 * ----------------------
6026	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6027	 * f0H/f1H - Currently we do not interpret prefetch size by design
6028	 */
6029	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6030	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6031	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6032	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6033	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6034	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6035	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6036	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6037	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6038	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6039	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6040	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6041	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6042	{ 0xc0, 4, 0, 8, dtlb44_str },
6043	{ 0xba, 4, 0, 64, dtlb4k_str },
6044	{ 0xb4, 4, 0, 256, dtlb4k_str },
6045	{ 0xb3, 4, 0, 128, dtlb4k_str },
6046	{ 0xb2, 4, 0, 64, itlb4k_str },
6047	{ 0xb0, 4, 0, 128, itlb4k_str },
6048	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6049	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6050	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6051	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6052	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6053	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6054	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6055	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6056	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6057	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6058	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6059	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6060	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6061	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6062	{ 0x73, 8, 0, 64*1024, itrace_str},
6063	{ 0x72, 8, 0, 32*1024, itrace_str},
6064	{ 0x71, 8, 0, 16*1024, itrace_str},
6065	{ 0x70, 8, 0, 12*1024, itrace_str},
6066	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6067	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6068	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6069	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6070	{ 0x5d, 0, 0, 256, dtlb44_str},
6071	{ 0x5c, 0, 0, 128, dtlb44_str},
6072	{ 0x5b, 0, 0, 64, dtlb44_str},
6073	{ 0x5a, 4, 0, 32, dtlb24_str},
6074	{ 0x59, 0, 0, 16, dtlb4k_str},
6075	{ 0x57, 4, 0, 16, dtlb4k_str},
6076	{ 0x56, 4, 0, 16, dtlb4M_str},
6077	{ 0x55, 0, 0, 7, itlb24_str},
6078	{ 0x52, 0, 0, 256, itlb424_str},
6079	{ 0x51, 0, 0, 128, itlb424_str},
6080	{ 0x50, 0, 0, 64, itlb424_str},
6081	{ 0x4f, 0, 0, 32, itlb4k_str},
6082	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6083	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6084	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6085	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6086	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6087	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6088	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6089	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6090	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6091	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6092	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6093	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6094	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6095	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6096	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6097	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6098	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6099	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6100	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6101	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6102	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6103	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6104	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6105	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6106	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6107	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6108	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6109	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6110	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6111	{ 0x0b, 4, 0, 4, itlb4M_str},
6112	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6113	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6114	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6115	{ 0x05, 4, 0, 32, dtlb4M_str},
6116	{ 0x04, 4, 0, 8, dtlb4M_str},
6117	{ 0x03, 4, 0, 64, dtlb4k_str},
6118	{ 0x02, 4, 0, 2, itlb4M_str},
6119	{ 0x01, 4, 0, 32, itlb4k_str},
6120	{ 0 }
6121};
6122
6123static const struct cachetab cyrix_ctab[] = {
6124	{ 0x70, 4, 0, 32, "tlb-4K" },
6125	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6126	{ 0 }
6127};
6128
6129/*
6130 * Search a cache table for a matching entry
6131 */
6132static const struct cachetab *
6133find_cacheent(const struct cachetab *ct, uint_t code)
6134{
6135	if (code != 0) {
6136		for (; ct->ct_code != 0; ct++)
6137			if (ct->ct_code <= code)
6138				break;
6139		if (ct->ct_code == code)
6140			return (ct);
6141	}
6142	return (NULL);
6143}
6144
6145/*
6146 * Populate cachetab entry with L2 or L3 cache-information using
6147 * cpuid function 4. This function is called from intel_walk_cacheinfo()
6148 * when descriptor 0x49 is encountered. It returns 0 if no such cache
6149 * information is found.
6150 */
6151static int
6152intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6153{
6154	uint32_t level, i;
6155	int ret = 0;
6156
6157	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6158		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6159
6160		if (level == 2 || level == 3) {
6161			ct->ct_assoc =
6162			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6163			ct->ct_line_size =
6164			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6165			ct->ct_size = ct->ct_assoc *
6166			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6167			    ct->ct_line_size *
6168			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6169
6170			if (level == 2) {
6171				ct->ct_label = l2_cache_str;
6172			} else if (level == 3) {
6173				ct->ct_label = l3_cache_str;
6174			}
6175			ret = 1;
6176		}
6177	}
6178
6179	return (ret);
6180}
6181
6182/*
6183 * Walk the cacheinfo descriptor, applying 'func' to every valid element
6184 * The walk is terminated if the walker returns non-zero.
6185 */
6186static void
6187intel_walk_cacheinfo(struct cpuid_info *cpi,
6188    void *arg, int (*func)(void *, const struct cachetab *))
6189{
6190	const struct cachetab *ct;
6191	struct cachetab des_49_ct, des_b1_ct;
6192	uint8_t *dp;
6193	int i;
6194
6195	if ((dp = cpi->cpi_cacheinfo) == NULL)
6196		return;
6197	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6198		/*
6199		 * For overloaded descriptor 0x49 we use cpuid function 4
6200		 * if supported by the current processor, to create
6201		 * cache information.
6202		 * For overloaded descriptor 0xb1 we use X86_PAE flag
6203		 * to disambiguate the cache information.
6204		 */
6205		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6206		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
6207				ct = &des_49_ct;
6208		} else if (*dp == 0xb1) {
6209			des_b1_ct.ct_code = 0xb1;
6210			des_b1_ct.ct_assoc = 4;
6211			des_b1_ct.ct_line_size = 0;
6212			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6213				des_b1_ct.ct_size = 8;
6214				des_b1_ct.ct_label = itlb2M_str;
6215			} else {
6216				des_b1_ct.ct_size = 4;
6217				des_b1_ct.ct_label = itlb4M_str;
6218			}
6219			ct = &des_b1_ct;
6220		} else {
6221			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6222				continue;
6223			}
6224		}
6225
6226		if (func(arg, ct) != 0) {
6227			break;
6228		}
6229	}
6230}
6231
6232/*
6233 * (Like the Intel one, except for Cyrix CPUs)
6234 */
6235static void
6236cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6237    void *arg, int (*func)(void *, const struct cachetab *))
6238{
6239	const struct cachetab *ct;
6240	uint8_t *dp;
6241	int i;
6242
6243	if ((dp = cpi->cpi_cacheinfo) == NULL)
6244		return;
6245	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6246		/*
6247		 * Search Cyrix-specific descriptor table first ..
6248		 */
6249		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6250			if (func(arg, ct) != 0)
6251				break;
6252			continue;
6253		}
6254		/*
6255		 * .. else fall back to the Intel one
6256		 */
6257		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6258			if (func(arg, ct) != 0)
6259				break;
6260			continue;
6261		}
6262	}
6263}
6264
6265/*
6266 * A cacheinfo walker that adds associativity, line-size, and size properties
6267 * to the devinfo node it is passed as an argument.
6268 */
6269static int
6270add_cacheent_props(void *arg, const struct cachetab *ct)
6271{
6272	dev_info_t *devi = arg;
6273
6274	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6275	if (ct->ct_line_size != 0)
6276		add_cache_prop(devi, ct->ct_label, line_str,
6277		    ct->ct_line_size);
6278	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6279	return (0);
6280}
6281
6282
6283static const char fully_assoc[] = "fully-associative?";
6284
6285/*
6286 * AMD style cache/tlb description
6287 *
6288 * Extended functions 5 and 6 directly describe properties of
6289 * tlbs and various cache levels.
6290 */
6291static void
6292add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6293{
6294	switch (assoc) {
6295	case 0:	/* reserved; ignore */
6296		break;
6297	default:
6298		add_cache_prop(devi, label, assoc_str, assoc);
6299		break;
6300	case 0xff:
6301		add_cache_prop(devi, label, fully_assoc, 1);
6302		break;
6303	}
6304}
6305
6306static void
6307add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6308{
6309	if (size == 0)
6310		return;
6311	add_cache_prop(devi, label, size_str, size);
6312	add_amd_assoc(devi, label, assoc);
6313}
6314
6315static void
6316add_amd_cache(dev_info_t *devi, const char *label,
6317    uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6318{
6319	if (size == 0 || line_size == 0)
6320		return;
6321	add_amd_assoc(devi, label, assoc);
6322	/*
6323	 * Most AMD parts have a sectored cache. Multiple cache lines are
6324	 * associated with each tag. A sector consists of all cache lines
6325	 * associated with a tag. For example, the AMD K6-III has a sector
6326	 * size of 2 cache lines per tag.
6327	 */
6328	if (lines_per_tag != 0)
6329		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6330	add_cache_prop(devi, label, line_str, line_size);
6331	add_cache_prop(devi, label, size_str, size * 1024);
6332}
6333
6334static void
6335add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6336{
6337	switch (assoc) {
6338	case 0:	/* off */
6339		break;
6340	case 1:
6341	case 2:
6342	case 4:
6343		add_cache_prop(devi, label, assoc_str, assoc);
6344		break;
6345	case 6:
6346		add_cache_prop(devi, label, assoc_str, 8);
6347		break;
6348	case 8:
6349		add_cache_prop(devi, label, assoc_str, 16);
6350		break;
6351	case 0xf:
6352		add_cache_prop(devi, label, fully_assoc, 1);
6353		break;
6354	default: /* reserved; ignore */
6355		break;
6356	}
6357}
6358
6359static void
6360add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6361{
6362	if (size == 0 || assoc == 0)
6363		return;
6364	add_amd_l2_assoc(devi, label, assoc);
6365	add_cache_prop(devi, label, size_str, size);
6366}
6367
6368static void
6369add_amd_l2_cache(dev_info_t *devi, const char *label,
6370    uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6371{
6372	if (size == 0 || assoc == 0 || line_size == 0)
6373		return;
6374	add_amd_l2_assoc(devi, label, assoc);
6375	if (lines_per_tag != 0)
6376		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6377	add_cache_prop(devi, label, line_str, line_size);
6378	add_cache_prop(devi, label, size_str, size * 1024);
6379}
6380
6381static void
6382amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6383{
6384	struct cpuid_regs *cp;
6385
6386	if (cpi->cpi_xmaxeax < 0x80000005)
6387		return;
6388	cp = &cpi->cpi_extd[5];
6389
6390	/*
6391	 * 4M/2M L1 TLB configuration
6392	 *
6393	 * We report the size for 2M pages because AMD uses two
6394	 * TLB entries for one 4M page.
6395	 */
6396	add_amd_tlb(devi, "dtlb-2M",
6397	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6398	add_amd_tlb(devi, "itlb-2M",
6399	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6400
6401	/*
6402	 * 4K L1 TLB configuration
6403	 */
6404
6405	switch (cpi->cpi_vendor) {
6406		uint_t nentries;
6407	case X86_VENDOR_TM:
6408		if (cpi->cpi_family >= 5) {
6409			/*
6410			 * Crusoe processors have 256 TLB entries, but
6411			 * cpuid data format constrains them to only
6412			 * reporting 255 of them.
6413			 */
6414			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6415				nentries = 256;
6416			/*
6417			 * Crusoe processors also have a unified TLB
6418			 */
6419			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6420			    nentries);
6421			break;
6422		}
6423		/*FALLTHROUGH*/
6424	default:
6425		add_amd_tlb(devi, itlb4k_str,
6426		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6427		add_amd_tlb(devi, dtlb4k_str,
6428		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6429		break;
6430	}
6431
6432	/*
6433	 * data L1 cache configuration
6434	 */
6435
6436	add_amd_cache(devi, l1_dcache_str,
6437	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6438	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6439
6440	/*
6441	 * code L1 cache configuration
6442	 */
6443
6444	add_amd_cache(devi, l1_icache_str,
6445	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6446	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6447
6448	if (cpi->cpi_xmaxeax < 0x80000006)
6449		return;
6450	cp = &cpi->cpi_extd[6];
6451
6452	/* Check for a unified L2 TLB for large pages */
6453
6454	if (BITX(cp->cp_eax, 31, 16) == 0)
6455		add_amd_l2_tlb(devi, "l2-tlb-2M",
6456		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6457	else {
6458		add_amd_l2_tlb(devi, "l2-dtlb-2M",
6459		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6460		add_amd_l2_tlb(devi, "l2-itlb-2M",
6461		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6462	}
6463
6464	/* Check for a unified L2 TLB for 4K pages */
6465
6466	if (BITX(cp->cp_ebx, 31, 16) == 0) {
6467		add_amd_l2_tlb(devi, "l2-tlb-4K",
6468		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6469	} else {
6470		add_amd_l2_tlb(devi, "l2-dtlb-4K",
6471		    BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
6472		add_amd_l2_tlb(devi, "l2-itlb-4K",
6473		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6474	}
6475
6476	add_amd_l2_cache(devi, l2_cache_str,
6477	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6478	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6479}
6480
6481/*
6482 * There are two basic ways that the x86 world describes its cache
6483 * and tlb architecture - Intel's way and AMD's way.
6484 *
6485 * Return which flavor of cache architecture we should use.
6486 */
6487static int
6488x86_which_cacheinfo(struct cpuid_info *cpi)
6489{
6490	switch (cpi->cpi_vendor) {
6491	case X86_VENDOR_Intel:
6492		if (cpi->cpi_maxeax >= 2)
6493			return (X86_VENDOR_Intel);
6494		break;
6495	case X86_VENDOR_AMD:
6496		/*
6497		 * The K5 model 1 was the first part from AMD that reported
6498		 * cache sizes via extended cpuid functions.
6499		 */
6500		if (cpi->cpi_family > 5 ||
6501		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6502			return (X86_VENDOR_AMD);
6503		break;
6504	case X86_VENDOR_TM:
6505		if (cpi->cpi_family >= 5)
6506			return (X86_VENDOR_AMD);
6507		/*FALLTHROUGH*/
6508	default:
6509		/*
6510		 * If they have extended CPU data for 0x80000005
6511		 * then we assume they have AMD-format cache
6512		 * information.
6513		 *
6514		 * If not, and the vendor happens to be Cyrix,
6515		 * then try our Cyrix-specific handler.
6516		 *
6517		 * If we're not Cyrix, then assume we're using Intel's
6518		 * table-driven format instead.
6519		 */
6520		if (cpi->cpi_xmaxeax >= 0x80000005)
6521			return (X86_VENDOR_AMD);
6522		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6523			return (X86_VENDOR_Cyrix);
6524		else if (cpi->cpi_maxeax >= 2)
6525			return (X86_VENDOR_Intel);
6526		break;
6527	}
6528	return (-1);
6529}
6530
6531void
6532cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6533    struct cpuid_info *cpi)
6534{
6535	dev_info_t *cpu_devi;
6536	int create;
6537
6538	cpu_devi = (dev_info_t *)dip;
6539
6540	/* device_type */
6541	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6542	    "device_type", "cpu");
6543
6544	/* reg */
6545	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6546	    "reg", cpu_id);
6547
6548	/* cpu-mhz, and clock-frequency */
6549	if (cpu_freq > 0) {
6550		long long mul;
6551
6552		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6553		    "cpu-mhz", cpu_freq);
6554		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6555			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6556			    "clock-frequency", (int)mul);
6557	}
6558
6559	if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6560		return;
6561	}
6562
6563	/* vendor-id */
6564	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6565	    "vendor-id", cpi->cpi_vendorstr);
6566
6567	if (cpi->cpi_maxeax == 0) {
6568		return;
6569	}
6570
6571	/*
6572	 * family, model, and step
6573	 */
6574	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6575	    "family", CPI_FAMILY(cpi));
6576	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6577	    "cpu-model", CPI_MODEL(cpi));
6578	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6579	    "stepping-id", CPI_STEP(cpi));
6580
6581	/* type */
6582	switch (cpi->cpi_vendor) {
6583	case X86_VENDOR_Intel:
6584		create = 1;
6585		break;
6586	default:
6587		create = 0;
6588		break;
6589	}
6590	if (create)
6591		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6592		    "type", CPI_TYPE(cpi));
6593
6594	/* ext-family */
6595	switch (cpi->cpi_vendor) {
6596	case X86_VENDOR_Intel:
6597	case X86_VENDOR_AMD:
6598		create = cpi->cpi_family >= 0xf;
6599		break;
6600	default:
6601		create = 0;
6602		break;
6603	}
6604	if (create)
6605		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6606		    "ext-family", CPI_FAMILY_XTD(cpi));
6607
6608	/* ext-model */
6609	switch (cpi->cpi_vendor) {
6610	case X86_VENDOR_Intel:
6611		create = IS_EXTENDED_MODEL_INTEL(cpi);
6612		break;
6613	case X86_VENDOR_AMD:
6614		create = CPI_FAMILY(cpi) == 0xf;
6615		break;
6616	default:
6617		create = 0;
6618		break;
6619	}
6620	if (create)
6621		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6622		    "ext-model", CPI_MODEL_XTD(cpi));
6623
6624	/* generation */
6625	switch (cpi->cpi_vendor) {
6626	case X86_VENDOR_AMD:
6627		/*
6628		 * AMD K5 model 1 was the first part to support this
6629		 */
6630		create = cpi->cpi_xmaxeax >= 0x80000001;
6631		break;
6632	default:
6633		create = 0;
6634		break;
6635	}
6636	if (create)
6637		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6638		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6639
6640	/* brand-id */
6641	switch (cpi->cpi_vendor) {
6642	case X86_VENDOR_Intel:
6643		/*
6644		 * brand id first appeared on Pentium III Xeon model 8 and
6645		 * Celeron model 8 processors, and on Opteron
6646		 */
6647		create = cpi->cpi_family > 6 ||
6648		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6649		break;
6650	case X86_VENDOR_AMD:
6651		create = cpi->cpi_family >= 0xf;
6652		break;
6653	default:
6654		create = 0;
6655		break;
6656	}
6657	if (create && cpi->cpi_brandid != 0) {
6658		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6659		    "brand-id", cpi->cpi_brandid);
6660	}
6661
6662	/* chunks, and apic-id */
6663	switch (cpi->cpi_vendor) {
6664		/*
6665		 * first available on Pentium IV and Opteron (K8)
6666		 */
6667	case X86_VENDOR_Intel:
6668		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6669		break;
6670	case X86_VENDOR_AMD:
6671		create = cpi->cpi_family >= 0xf;
6672		break;
6673	default:
6674		create = 0;
6675		break;
6676	}
6677	if (create) {
6678		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6679		    "chunks", CPI_CHUNKS(cpi));
6680		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6681		    "apic-id", cpi->cpi_apicid);
6682		if (cpi->cpi_chipid >= 0) {
6683			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6684			    "chip#", cpi->cpi_chipid);
6685			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6686			    "clog#", cpi->cpi_clogid);
6687		}
6688	}
6689
6690	/* cpuid-features */
6691	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6692	    "cpuid-features", CPI_FEATURES_EDX(cpi));
6693
6694
6695	/* cpuid-features-ecx */
6696	switch (cpi->cpi_vendor) {
6697	case X86_VENDOR_Intel:
6698		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6699		break;
6700	case X86_VENDOR_AMD:
6701		create = cpi->cpi_family >= 0xf;
6702		break;
6703	default:
6704		create = 0;
6705		break;
6706	}
6707	if (create)
6708		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6709		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6710
6711	/* ext-cpuid-features */
6712	switch (cpi->cpi_vendor) {
6713	case X86_VENDOR_Intel:
6714	case X86_VENDOR_AMD:
6715	case X86_VENDOR_Cyrix:
6716	case X86_VENDOR_TM:
6717	case X86_VENDOR_Centaur:
6718		create = cpi->cpi_xmaxeax >= 0x80000001;
6719		break;
6720	default:
6721		create = 0;
6722		break;
6723	}
6724	if (create) {
6725		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6726		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6727		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6728		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6729	}
6730
6731	/*
6732	 * Brand String first appeared in Intel Pentium IV, AMD K5
6733	 * model 1, and Cyrix GXm.  On earlier models we try to
6734	 * simulate something similar, so this string should always
6735	 * say -something- about the processor, however lame.
6736	 */
6737	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6738	    "brand-string", cpi->cpi_brandstr);
6739
6740	/*
6741	 * Finally, cache and tlb information
6742	 */
6743	switch (x86_which_cacheinfo(cpi)) {
6744	case X86_VENDOR_Intel:
6745		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6746		break;
6747	case X86_VENDOR_Cyrix:
6748		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6749		break;
6750	case X86_VENDOR_AMD:
6751		amd_cache_info(cpi, cpu_devi);
6752		break;
6753	default:
6754		break;
6755	}
6756}
6757
6758struct l2info {
6759	int *l2i_csz;
6760	int *l2i_lsz;
6761	int *l2i_assoc;
6762	int l2i_ret;
6763};
6764
6765/*
6766 * A cacheinfo walker that fetches the size, line-size and associativity
6767 * of the L2 cache
6768 */
6769static int
6770intel_l2cinfo(void *arg, const struct cachetab *ct)
6771{
6772	struct l2info *l2i = arg;
6773	int *ip;
6774
6775	if (ct->ct_label != l2_cache_str &&
6776	    ct->ct_label != sl2_cache_str)
6777		return (0);	/* not an L2 -- keep walking */
6778
6779	if ((ip = l2i->l2i_csz) != NULL)
6780		*ip = ct->ct_size;
6781	if ((ip = l2i->l2i_lsz) != NULL)
6782		*ip = ct->ct_line_size;
6783	if ((ip = l2i->l2i_assoc) != NULL)
6784		*ip = ct->ct_assoc;
6785	l2i->l2i_ret = ct->ct_size;
6786	return (1);		/* was an L2 -- terminate walk */
6787}
6788
6789/*
6790 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6791 *
6792 *	Unlike the L1 cache and tlb, where the 8 bit cpuid field directly
6793 *	gives the associativity, the associativity of the L2 cache and
6794 *	tlb is encoded as shown in the table below. The 4 bit L2 value
6795 *	serves as an index into the amd_afd[] array to determine the
6796 *	associativity. -1 is undefined. 0 is fully associative.
6797 */
6798
6799static int amd_afd[] =
6800	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
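
/*
 * For example, an encoded associativity field of 0x6 indexes amd_afd[6],
 * i.e. an 8-way set associative cache or tlb, while an encoding of 0xf
 * (amd_afd[15] == 0) denotes a fully associative one.
 */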
6801
6802static void
6803amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6804{
6805	struct cpuid_regs *cp;
6806	uint_t size, assoc;
6807	int i;
6808	int *ip;
6809
6810	if (cpi->cpi_xmaxeax < 0x80000006)
6811		return;
6812	cp = &cpi->cpi_extd[6];
6813
6814	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6815	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6816		uint_t cachesz = size * 1024;
6817		assoc = amd_afd[i];
6818
6819		ASSERT(assoc != -1);
6820
6821		if ((ip = l2i->l2i_csz) != NULL)
6822			*ip = cachesz;
6823		if ((ip = l2i->l2i_lsz) != NULL)
6824			*ip = BITX(cp->cp_ecx, 7, 0);
6825		if ((ip = l2i->l2i_assoc) != NULL)
6826			*ip = assoc;
6827		l2i->l2i_ret = cachesz;
6828	}
6829}
6830
6831int
6832getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6833{
6834	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6835	struct l2info __l2info, *l2i = &__l2info;
6836
6837	l2i->l2i_csz = csz;
6838	l2i->l2i_lsz = lsz;
6839	l2i->l2i_assoc = assoc;
6840	l2i->l2i_ret = -1;
6841
6842	switch (x86_which_cacheinfo(cpi)) {
6843	case X86_VENDOR_Intel:
6844		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6845		break;
6846	case X86_VENDOR_Cyrix:
6847		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6848		break;
6849	case X86_VENDOR_AMD:
6850		amd_l2cacheinfo(cpi, l2i);
6851		break;
6852	default:
6853		break;
6854	}
6855	return (l2i->l2i_ret);
6856}
6857
6858#if !defined(__xpv)
6859
6860uint32_t *
6861cpuid_mwait_alloc(cpu_t *cpu)
6862{
6863	uint32_t	*ret;
6864	size_t		mwait_size;
6865
6866	ASSERT(cpuid_checkpass(CPU, 2));
6867
6868	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6869	if (mwait_size == 0)
6870		return (NULL);
6871
6872	/*
6873	 * kmem_alloc() returns cache line size aligned data for mwait_size
6874	 * allocations.  mwait_size is currently cache line sized.  Neither
6875	 * of these implementation details is guaranteed to hold in the
6876	 * future.
6877	 *
6878	 * First try allocating mwait_size, as kmem_alloc() currently returns
6879	 * correctly aligned memory.  If that allocation is not mwait_size
6880	 * aligned, allocate twice mwait_size and round up within the buffer.
6881	 *
6882	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6883	 * decide to free this memory.
6884	 */
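	/*
	 * Illustrative example (assumes a 64-byte monitor line): if the
	 * first kmem_zalloc() happens to return a pointer that is not
	 * 64-byte aligned, the fallback path allocates 128 bytes and
	 * P2ROUNDUP() advances the returned pointer to the next 64-byte
	 * boundary, so a fully aligned monitor line always fits inside the
	 * buffer recorded in buf_actual.
	 */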
6885	ret = kmem_zalloc(mwait_size, KM_SLEEP);
6886	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6887		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6888		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6889		*ret = MWAIT_RUNNING;
6890		return (ret);
6891	} else {
6892		kmem_free(ret, mwait_size);
6893		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6894		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6895		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6896		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6897		*ret = MWAIT_RUNNING;
6898		return (ret);
6899	}
6900}
6901
6902void
6903cpuid_mwait_free(cpu_t *cpu)
6904{
6905	if (cpu->cpu_m.mcpu_cpi == NULL) {
6906		return;
6907	}
6908
6909	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6910	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6911		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6912		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6913	}
6914
6915	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6916	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6917}
6918
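/*
 * Overwrite the body of tsc_read() with the variant that matches this
 * processor's capabilities: the _no_rdtsc stub when there is no usable TSC,
 * an mfence- or lfence-serialized rdtsc, or rdtscp.  The paired
 * *_start/*_end symbols delimit each replacement routine in the kernel text.
 */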
6919void
6920patch_tsc_read(int flag)
6921{
6922	size_t cnt;
6923
6924	switch (flag) {
6925	case TSC_NONE:
6926		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6927		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6928		break;
6929	case TSC_RDTSC_MFENCE:
6930		cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6931		(void) memcpy((void *)tsc_read,
6932		    (void *)&_tsc_mfence_start, cnt);
6933		break;
6934	case TSC_RDTSC_LFENCE:
6935		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6936		(void) memcpy((void *)tsc_read,
6937		    (void *)&_tsc_lfence_start, cnt);
6938		break;
6939	case TSC_TSCP:
6940		cnt = &_tscp_end - &_tscp_start;
6941		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6942		break;
6943	default:
6944		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
6945		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
6946		break;
6947	}
6948	tsc_type = flag;
6949}
6950
6951int
6952cpuid_deep_cstates_supported(void)
6953{
6954	struct cpuid_info *cpi;
6955	struct cpuid_regs regs;
6956
6957	ASSERT(cpuid_checkpass(CPU, 1));
6958
6959	cpi = CPU->cpu_m.mcpu_cpi;
6960
6961	if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6962		return (0);
6963
6964	switch (cpi->cpi_vendor) {
6965	case X86_VENDOR_Intel:
6966		if (cpi->cpi_xmaxeax < 0x80000007)
6967			return (0);
6968
6969		/*
6970		 * Does the TSC run at a constant rate in all ACPI C-states?
6971		 */
6972		regs.cp_eax = 0x80000007;
6973		(void) __cpuid_insn(&regs);
6974		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6975
6976	default:
6977		return (0);
6978	}
6979}
6980
6981#endif	/* !__xpv */
6982
6983void
6984post_startup_cpu_fixups(void)
6985{
6986#ifndef __xpv
6987	/*
6988	 * Some AMD processors support C1E state. Entering this state will
6989	 * cause the local APIC timer to stop, which we can't deal with at
6990	 * this time.
6991	 */
6992	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6993		on_trap_data_t otd;
6994		uint64_t reg;
6995
6996		if (!on_trap(&otd, OT_DATA_ACCESS)) {
6997			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6998			/* Disable C1E state if it is enabled by BIOS */
6999			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
7000			    AMD_ACTONCMPHALT_MASK) {
7001				reg &= ~(AMD_ACTONCMPHALT_MASK <<
7002				    AMD_ACTONCMPHALT_SHIFT);
7003				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
7004			}
7005		}
7006		no_trap();
7007	}
7008#endif	/* !__xpv */
7009}
7010
7011void
7012enable_pcid(void)
7013{
7014	if (x86_use_pcid == -1)
7015		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
7016
7017	if (x86_use_invpcid == -1) {
7018		x86_use_invpcid = is_x86_feature(x86_featureset,
7019		    X86FSET_INVPCID);
7020	}
7021
7022	if (!x86_use_pcid)
7023		return;
7024
7025	/*
7026	 * Intel says that on setting PCIDE, the CPU immediately starts using the
7027	 * PCID bits; better make sure there's nothing there.
7028	 */
7029	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
7030
7031	setcr4(getcr4() | CR4_PCIDE);
7032}
7033
7034/*
7035 * Setup necessary registers to enable XSAVE feature on this processor.
7036 * This function needs to be called early enough, so that no xsave/xrstor
7037 * ops will execute on the processor before the MSRs are properly set up.
7038 *
7039 * Current implementation has the following assumption:
7040 * - cpuid_pass1() is done, so that X86 features are known.
7041 * - fpu_probe() is done, so that fp_save_mech is chosen.
7042 */
7043void
7044xsave_setup_msr(cpu_t *cpu)
7045{
7046	ASSERT(fp_save_mech == FP_XSAVE);
7047	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
7048
7049	/* Enable OSXSAVE in CR4. */
7050	setcr4(getcr4() | CR4_OSXSAVE);
7051	/*
7052	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
7053	 * correct value.
7054	 */
7055	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
7056	setup_xfem();
7057}
7058
7059/*
7060 * Starting with the Westmere processor the local
7061 * Starting with the Westmere processor, the local
7062 * APIC timer continues running in all C-states,
7063 */
7064int
7065cpuid_arat_supported(void)
7066{
7067	struct cpuid_info *cpi;
7068	struct cpuid_regs regs;
7069
7070	ASSERT(cpuid_checkpass(CPU, 1));
7071	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7072
7073	cpi = CPU->cpu_m.mcpu_cpi;
7074
7075	switch (cpi->cpi_vendor) {
7076	case X86_VENDOR_Intel:
7077		/*
7078		 * Always-running Local APIC Timer is
7079		 * indicated by CPUID.6.EAX[2].
7080		 */
7081		if (cpi->cpi_maxeax >= 6) {
7082			regs.cp_eax = 6;
7083			(void) cpuid_insn(NULL, &regs);
7084			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
7085		} else {
7086			return (0);
7087		}
7088	default:
7089		return (0);
7090	}
7091}
7092
7093/*
7094 * Check support for Intel ENERGY_PERF_BIAS feature
7095 */
7096int
7097cpuid_iepb_supported(struct cpu *cp)
7098{
7099	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
7100	struct cpuid_regs regs;
7101
7102	ASSERT(cpuid_checkpass(cp, 1));
7103
7104	if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
7105	    !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
7106		return (0);
7107	}
7108
7109	/*
7110	 * Intel ENERGY_PERF_BIAS MSR is indicated by
7111	 * capability bit CPUID.6.ECX.3
7112	 */
7113	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
7114		return (0);
7115
7116	regs.cp_eax = 0x6;
7117	(void) cpuid_insn(NULL, &regs);
7118	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
7119}
7120
7121/*
7122 * Check support for TSC deadline timer
7123 *
7124 * TSC deadline timer provides a superior software programming
7125 * model over local APIC timer that eliminates "time drifts".
7126 * Instead of specifying a relative time, software specifies an
7127 * absolute time as the target at which the processor should
7128 * generate a timer event.
7129 */
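/*
 * (Informational:) in deadline mode, software arms the timer by writing an
 * absolute TSC value to the IA32_TSC_DEADLINE MSR instead of programming a
 * relative count in the local APIC's initial-count register.
 */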
7130int
7131cpuid_deadline_tsc_supported(void)
7132{
7133	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
7134	struct cpuid_regs regs;
7135
7136	ASSERT(cpuid_checkpass(CPU, 1));
7137	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
7138
7139	switch (cpi->cpi_vendor) {
7140	case X86_VENDOR_Intel:
7141		if (cpi->cpi_maxeax >= 1) {
7142			regs.cp_eax = 1;
7143			(void) cpuid_insn(NULL, &regs);
7144			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
7145		} else {
7146			return (0);
7147		}
7148	default:
7149		return (0);
7150	}
7151}
7152
7153#if defined(__amd64) && !defined(__xpv)
7154/*
7155 * Patch in versions of bcopy for high-performance Intel Nehalem (Nhm)
7156 * processors and later...
7157 */
7158void
7159patch_memops(uint_t vendor)
7160{
7161	size_t cnt, i;
7162	caddr_t to, from;
7163
7164	if ((vendor == X86_VENDOR_Intel) &&
7165	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
7166		cnt = &bcopy_patch_end - &bcopy_patch_start;
7167		to = &bcopy_ck_size;
7168		from = &bcopy_patch_start;
7169		for (i = 0; i < cnt; i++) {
7170			*to++ = *from++;
7171		}
7172	}
7173}
7174#endif  /* __amd64 && !__xpv */
7175
7176/*
7177 * We're being asked to tell the system how many bits are required to represent
7178 * the various thread and strand IDs. While it's tempting to derive this based
7179 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
7180 * correct. Instead, this needs to be based on the number of bits that the APIC
7181 * allows for these different configurations. We only update these to a larger
7182 * value if we find one.
7183 */
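/*
 * For example, a package with only six of eight possible cores enabled
 * still reserves three APIC ID bits for the core number; deriving the
 * width from the enabled-core count alone could under-count, which is why
 * we rely on the bit widths cpuid reports for the APIC ID layout.
 */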
7184void
7185cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
7186{
7187	struct cpuid_info *cpi;
7188
7189	VERIFY(cpuid_checkpass(CPU, 1));
7190	cpi = cpu->cpu_m.mcpu_cpi;
7191
7192	if (cpi->cpi_ncore_bits > *core_nbits) {
7193		*core_nbits = cpi->cpi_ncore_bits;
7194	}
7195
7196	if (cpi->cpi_nthread_bits > *strand_nbits) {
7197		*strand_nbits = cpi->cpi_nthread_bits;
7198	}
7199}
7200
7201void
7202cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
7203{
7204	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7205	struct cpuid_regs cp;
7206
7207	/*
7208	 * Reread the CPUID portions that we need for various security
7209	 * information.
7210	 */
7211	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
7212		/*
7213		 * Check if we now have leaf 7 available to us.
7214		 */
7215		if (cpi->cpi_maxeax < 7) {
7216			bzero(&cp, sizeof (cp));