1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 * Copyright 2020 Joyent, Inc.
27 */
28/*
29 * Copyright (c) 2010, Intel Corporation.
30 * All rights reserved.
31 */
32/*
33 * Portions Copyright 2009 Advanced Micro Devices, Inc.
34 */
35
36/*
37 * CPU Identification logic
38 *
39 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
40 * with the identification of CPUs, their features, and their topologies. More
41 * specifically, this file helps drive the following:
42 *
43 * 1. Enumeration of features of the processor which are used by the kernel to
 *    determine what features to enable or disable. These may be instruction
 *    set enhancements or other capabilities that the kernel makes use of.
46 *
47 * 2. Enumeration of instruction set architecture (ISA) additions that userland
48 *    will be told about through the auxiliary vector.
49 *
50 * 3. Understanding the physical topology of the CPU such as the number of
 *    caches, how many cores it has, whether or not it supports simultaneous
 *    multi-threading (SMT), etc.
53 *
54 * ------------------------
55 * CPUID History and Basics
56 * ------------------------
57 *
58 * The cpuid instruction was added by Intel roughly around the time that the
 * original Pentium was introduced. The purpose of cpuid was to provide, in a
 * programmatic fashion, information about the CPU that previously had to be
 * guessed at. For example, an important part of cpuid is that we can know what
62 * extensions to the ISA exist. If you use an invalid opcode you would get a
63 * #UD, so this method allows a program (whether a user program or the kernel)
64 * to determine what exists without crashing or getting a SIGILL. Of course,
65 * this was also during the era of the clones and the AMD Am5x86. The vendor
66 * name shows up first in cpuid for a reason.
67 *
68 * cpuid information is broken down into ranges called a 'leaf'. Each leaf puts
69 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
70 * its own meaning. The different leaves are broken down into different regions:
71 *
72 *	[ 0, 7fffffff ]			This region is called the 'basic'
73 *					region. This region is generally defined
74 *					by Intel, though some of the original
75 *					portions have different meanings based
76 *					on the manufacturer. These days, Intel
77 *					adds most new features to this region.
78 *					AMD adds non-Intel compatible
79 *					information in the third, extended
80 *					region. Intel uses this for everything
81 *					including ISA extensions, CPU
82 *					features, cache information, topology,
83 *					and more.
84 *
85 *					There is a hole carved out of this
86 *					region which is reserved for
87 *					hypervisors.
88 *
89 *	[ 40000000, 4fffffff ]		This region, which is found in the
90 *					middle of the previous region, is
91 *					explicitly promised to never be used by
92 *					CPUs. Instead, it is used by hypervisors
93 *					to communicate information about
94 *					themselves to the operating system. The
95 *					values and details are unique for each
96 *					hypervisor.
97 *
98 *	[ 80000000, ffffffff ]		This region is called the 'extended'
99 *					region. Some of the low leaves mirror
100 *					parts of the basic leaves. This region
101 *					has generally been used by AMD for
102 *					various extensions. For example, AMD-
103 *					specific information about caches,
104 *					features, and topology are found in this
105 *					region.
106 *
 * To read a given leaf, you place the desired leaf into %eax, zero %ebx, %ecx,
108 * and %edx, and then issue the cpuid instruction. At the first leaf in each of
109 * the ranges, one of the primary things returned is the maximum valid leaf in
110 * that range. This allows for discovery of what range of CPUID is valid.
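 *
 * As a brief illustration, the following userland-style sketch (this is not
 * the kernel's own wrapper; cpuid_query() is a hypothetical helper that later
 * sketches in this comment also assume) issues the instruction and discovers
 * the maximum valid basic and extended leaves:
 *
 *	#include <stdint.h>
 *
 *	static inline void
 *	cpuid_query(uint32_t leaf, uint32_t subleaf, uint32_t *eax,
 *	    uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 *	{
 *		__asm__ __volatile__("cpuid"
 *		    : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
 *		    : "a" (leaf), "c" (subleaf));
 *	}
 *
 *	static void
 *	max_leaves(uint32_t *max_basic, uint32_t *max_extended)
 *	{
 *		uint32_t ebx, ecx, edx;
 *
 *		cpuid_query(0x0, 0, max_basic, &ebx, &ecx, &edx);
 *		cpuid_query(0x80000000, 0, max_extended, &ebx, &ecx, &edx);
 *	}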
111 *
112 * The CPUs have potentially surprising behavior when using an invalid leaf or
113 * unimplemented leaf. If the requested leaf is within the valid basic or
114 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
115 * set to zero. However, if you specify a leaf that is outside of a valid range,
116 * then instead it will be filled with the last valid _basic_ leaf. For example,
117 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
118 * an invalid extended leaf will return the information for leaf 3.
119 *
120 * Some leaves are broken down into sub-leaves. This means that the value
121 * depends on both the leaf asked for in %eax and a secondary register. For
122 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
123 * additional information. Or when getting topology information in leaf 0xb, the
124 * initial value in %ecx changes which level of the topology that you are
125 * getting information about.
126 *
127 * cpuid values are always kept to 32 bits regardless of whether or not the
128 * program is in 64-bit mode. When executing in 64-bit mode, the upper
 * 32 bits of the register are always set to zero so that the values are the
 * same regardless of execution mode.
131 *
132 * ----------------------
133 * Identifying Processors
134 * ----------------------
135 *
136 * We can identify a processor in two steps. The first step looks at cpuid leaf
 * 0. Leaf 0 contains the processor's vendor information as a 12 character
 * ASCII string split across %ebx, %edx, and %ecx (in that order). On AMD, it is
 * 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
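 *
 * As a hedged sketch (cpuid_vendor_string() and its buffer are illustrative,
 * not kernel interfaces), the string can be reassembled from the three
 * registers as follows:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void
 *	cpuid_vendor_string(uint32_t ebx, uint32_t ecx, uint32_t edx,
 *	    char buf[13])
 *	{
 *		(void) memcpy(buf, &ebx, sizeof (ebx));
 *		(void) memcpy(buf + 4, &edx, sizeof (edx));
 *		(void) memcpy(buf + 8, &ecx, sizeof (ecx));
 *		buf[12] = '\0';
 *	}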
140 *
141 * From there, a processor is identified by a combination of three different
142 * values:
143 *
144 *  1. Family
145 *  2. Model
146 *  3. Stepping
147 *
148 * Each vendor uses the family and model to uniquely identify a processor. The
149 * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since the
151 * Pentium Pro/Pentium II era, often called the P6. The model is used to
152 * identify the exact processor. Different models are often used for the client
153 * (consumer) and server parts. Even though each processor often has major
154 * architectural differences, they still are considered the same family by
155 * Intel.
156 *
157 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer is family 0x15, and Zen is
 * family 0x17. Within a family, the model number is used to help identify
 * specific processors.
160 *
161 * The stepping is used to refer to a revision of a specific microprocessor. The
162 * term comes from equipment used to produce masks that are used to create
163 * integrated circuits.
164 *
165 * The information is present in leaf 1, %eax. In technical documentation you
 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. If the base family value is
 * 0xf, then the extended family field, which uses previously reserved bits, is
 * added to it to form the full family; similarly, the extended model field
 * supplies the upper four bits of the full model on processors that use it.
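 *
 * As a hedged sketch of that combination (decode_fms() is illustrative only;
 * the exact conditions under which the extended model applies differ slightly
 * between vendors), the fields of leaf 1 %eax can be decoded as follows:
 *
 *	#include <stdint.h>
 *
 *	static void
 *	decode_fms(uint32_t eax, uint32_t *family, uint32_t *model,
 *	    uint32_t *stepping)
 *	{
 *		uint32_t base_family = (eax >> 8) & 0xf;
 *		uint32_t base_model = (eax >> 4) & 0xf;
 *		uint32_t ext_family = (eax >> 20) & 0xff;
 *		uint32_t ext_model = (eax >> 16) & 0xf;
 *
 *		*stepping = eax & 0xf;
 *		*family = base_family;
 *		*model = base_model;
 *
 *		if (base_family == 0xf)
 *			*family += ext_family;
 *		if (base_family == 0xf || base_family == 0x6)
 *			*model += ext_model << 4;
 *	}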
171 *
172 * When we process this information, we store the full family, model, and
173 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
174 * cpi_step, respectively. Whenever you are performing comparisons with the
175 * family, model, and stepping, you should use these members and not the raw
176 * values from cpuid. If you must use the raw values from cpuid directly, you
177 * must make sure that you add the extended model and family to the base model
178 * and family.
179 *
180 * In general, we do not use information about the family, model, and stepping
181 * to determine whether or not a feature is present; that is generally driven by
182 * specific leaves. However, when something we care about on the processor is
183 * not considered 'architectural' meaning that it is specific to a set of
184 * processors and not promised in the architecture model to be consistent from
185 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata
 * in the processor, are dealing with processor-specific features such as CPU
188 * performance counters, or we want to provide additional information for things
189 * such as fault management.
190 *
 * While processors also have a brand string, which is the name that people
 * are familiar with when buying the processor, it is not meant for
 * programmatic consumption. That is what the family, model, and stepping are
194 * for.
195 *
196 * ------------
197 * CPUID Passes
198 * ------------
199 *
200 * As part of performing feature detection, we break this into several different
201 * passes. The passes are as follows:
202 *
203 *	Pass 0		This is a primordial pass done in locore.s to deal with
204 *			Cyrix CPUs that don't support cpuid. The reality is that
205 *			we likely don't run on them any more, but there is still
206 *			logic for handling them.
207 *
208 *	Pass 1		This is the primary pass and is responsible for doing a
209 *			large number of different things:
210 *
 *			1. Determining which vendor manufactured the CPU and
 *			the family, model, and stepping information.
213 *
214 *			2. Gathering a large number of feature flags to
 *			determine which features the CPU supports and which
216 *			indicate things that we need to do other work in the OS
217 *			to enable. Features detected this way are added to the
218 *			x86_featureset which can be queried to
219 *			determine what we should do. This includes processing
220 *			all of the basic and extended CPU features that we care
221 *			about.
222 *
223 *			3. Determining the CPU's topology. This includes
224 *			information about how many cores and threads are present
225 *			in the package. It also is responsible for figuring out
226 *			which logical CPUs are potentially part of the same core
227 *			and what other resources they might share. For more
228 *			information see the 'Topology' section.
229 *
230 *			4. Determining the set of CPU security-specific features
 *			that we need to worry about and determining the
232 *			appropriate set of workarounds.
233 *
234 *			Pass 1 on the boot CPU occurs before KMDB is started.
235 *
236 *	Pass 2		The second pass is done after startup(). Here, we check
237 *			other miscellaneous features. Most of this is gathering
238 *			additional basic and extended features that we'll use in
239 *			later passes or for debugging support.
240 *
241 *	Pass 3		The third pass occurs after the kernel memory allocator
242 *			has been fully initialized. This gathers information
243 *			where we might need dynamic memory available for our
244 *			uses. This includes several varying width leaves that
245 *			have cache information and the processor's brand string.
246 *
247 *	Pass 4		The fourth and final normal pass is performed after the
248 *			kernel has brought most everything online. This is
249 *			invoked from post_startup(). In this pass, we go through
250 *			the set of features that we have enabled and turn that
251 *			into the hardware auxiliary vector features that
252 *			userland receives. This is used by userland, primarily
253 *			by the run-time link-editor (RTLD), though userland
254 *			software could also refer to it directly.
255 *
256 *	Microcode	After a microcode update, we do a selective rescan of
257 *			the cpuid leaves to determine what features have
258 *			changed. Microcode updates can provide more details
259 *			about security related features to deal with issues like
260 *			Spectre and L1TF. On occasion, vendors have violated
261 *			their contract and removed bits. However, we don't try
262 *			to detect that because that puts us in a situation that
 *			we really can't deal with. As such, the only things we
 *			rescan today are security-related features. See
265 *			cpuid_pass_ucode().
266 *
267 * All of the passes (except pass 0) are run on all CPUs. However, for the most
268 * part we only care about what the boot CPU says about this information and use
269 * the other CPUs as a rough guide to sanity check that we have the same feature
270 * set.
271 *
 * We do not support running multiple logical CPUs with different, let alone
 * disjoint, feature sets.
274 *
275 * ------------------
276 * Processor Topology
277 * ------------------
278 *
279 * One of the important things that we need to do is to understand the topology
280 * of the underlying processor. When we say topology in this case, we're trying
281 * to understand the relationship between the logical CPUs that the operating
282 * system sees and the underlying physical layout. Different logical CPUs may
283 * share different resources which can have important consequences for the
284 * performance of the system. For example, they may share caches, execution
285 * units, and more.
286 *
287 * The topology of the processor changes from generation to generation and
288 * vendor to vendor.  Along with that, different vendors use different
289 * terminology, and the operating system itself uses occasionally overlapping
290 * terminology. It's important to understand what this topology looks like so
291 * one can understand the different things that we try to calculate and
292 * determine.
293 *
294 * To get started, let's talk about a little bit of terminology that we've used
295 * so far, is used throughout this file, and is fairly generic across multiple
296 * vendors:
297 *
298 * CPU
299 *	A central processing unit (CPU) refers to a logical and/or virtual
300 *	entity that the operating system can execute instructions on. The
301 *	underlying resources for this CPU may be shared between multiple
302 *	entities; however, to the operating system it is a discrete unit.
303 *
304 * PROCESSOR and PACKAGE
305 *
306 *	Generally, when we use the term 'processor' on its own, we are referring
307 *	to the physical entity that one buys and plugs into a board. However,
308 *	because processor has been overloaded and one might see it used to mean
309 *	multiple different levels, we will instead use the term 'package' for
310 *	the rest of this file. The term package comes from the electrical
311 *	engineering side and refers to the physical entity that encloses the
312 *	electronics inside. Strictly speaking the package can contain more than
313 *	just the CPU, for example, on many processors it may also have what's
 *	called an 'integrated graphics processing unit (GPU)'. Because the
315 *	package can encapsulate multiple units, it is the largest physical unit
316 *	that we refer to.
317 *
318 * SOCKET
319 *
 *	A socket refers to a unit on a system board (generally the motherboard)
321 *	that can receive a package. A single package, or processor, is plugged
 *	into a single socket. A system may have multiple sockets. Oftentimes,
 *	the term socket is used interchangeably with package, referring to the
 *	electrical component that is plugged in rather than the receptacle
 *	itself.
325 *
326 * CORE
327 *
328 *	A core refers to the physical instantiation of a CPU, generally, with a
329 *	full set of hardware resources available to it. A package may contain
330 *	multiple cores inside of it or it may just have a single one. A
331 *	processor with more than one core is often referred to as 'multi-core'.
332 *	In illumos, we will use the feature X86FSET_CMP to refer to a system
333 *	that has 'multi-core' processors.
334 *
335 *	A core may expose a single logical CPU to the operating system, or it
336 *	may expose multiple CPUs, which we call threads, defined below.
337 *
338 *	Some resources may still be shared by cores in the same package. For
339 *	example, many processors will share the level 3 cache between cores.
340 *	Some AMD generations share hardware resources between cores. For more
341 *	information on that see the section 'AMD Topology'.
342 *
343 * THREAD and STRAND
344 *
 *	In this file, generally a thread refers to a hardware resource and not
346 *	the operating system's logical abstraction. A thread is always exposed
347 *	as an independent logical CPU to the operating system. A thread belongs
348 *	to a specific core. A core may have more than one thread. When that is
349 *	the case, the threads that are part of the same core are often referred
350 *	to as 'siblings'.
351 *
352 *	When multiple threads exist, this is generally referred to as
353 *	simultaneous multi-threading (SMT). When Intel introduced this in their
354 *	processors they called it hyper-threading (HT). When multiple threads
355 *	are active in a core, they split the resources of the core. For example,
356 *	two threads may share the same set of hardware execution units.
357 *
358 *	The operating system often uses the term 'strand' to refer to a thread.
359 *	This helps disambiguate it from the software concept.
360 *
361 * CHIP
362 *
363 *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
364 *	base meaning, it is used to refer to a single integrated circuit, which
365 *	may or may not be the only thing in the package. In illumos, when you
366 *	see the term 'chip' it is almost always referring to the same thing as
367 *	the 'package'. However, many vendors may use chip to refer to one of
368 *	many integrated circuits that have been placed in the package. As an
369 *	example, see the subsequent definition.
370 *
371 *	To try and keep things consistent, we will only use chip when referring
372 *	to the entire integrated circuit package, with the exception of the
373 *	definition of multi-chip module (because it is in the name) and use the
374 *	term 'die' when we want the more general, potential sub-component
375 *	definition.
376 *
377 * DIE
378 *
379 *	A die refers to an integrated circuit. Inside of the package there may
380 *	be a single die or multiple dies. This is sometimes called a 'chip' in
381 *	vendor's parlance, but in this file, we use the term die to refer to a
382 *	subcomponent.
383 *
384 * MULTI-CHIP MODULE
385 *
386 *	A multi-chip module (MCM) refers to putting multiple distinct chips that
387 *	are connected together in the same package. When a multi-chip design is
388 *	used, generally each chip is manufactured independently and then joined
389 *	together in the package. For example, on AMD's Zen microarchitecture
390 *	(family 0x17), the package contains several dies (the second meaning of
391 *	chip from above) that are connected together.
392 *
393 * CACHE
394 *
395 *	A cache is a part of the processor that maintains copies of recently
396 *	accessed memory. Caches are split into levels and then into types.
397 *	Commonly there are one to three levels, called level one, two, and
398 *	three. The lower the level, the smaller it is, the closer it is to the
399 *	execution units of the CPU, and the faster it is to access. The layout
400 *	and design of the cache come in many different flavors, consult other
401 *	resources for a discussion of those.
402 *
403 *	Caches are generally split into two types, the instruction and data
404 *	cache. The caches contain what their names suggest, the instruction
405 *	cache has executable program text, while the data cache has all other
406 *	memory that the processor accesses. As of this writing, data is kept
407 *	coherent between all of the caches on x86, so if one modifies program
408 *	text before it is executed, that will be in the data cache, and the
409 *	instruction cache will be synchronized with that change when the
410 *	processor actually executes those instructions. This coherency also
411 *	covers the fact that data could show up in multiple caches.
412 *
413 *	Generally, the lowest level caches are specific to a core. However, the
 *	last level cache is shared between some number of cores. The number of
415 *	CPUs sharing this last level cache is important. This has implications
416 *	for the choices that the scheduler makes, as accessing memory that might
417 *	be in a remote cache after thread migration can be quite expensive.
418 *
419 *	Sometimes, the word cache is abbreviated with a '$', because in US
420 *	English the word cache is pronounced the same as cash. So L1D$ refers to
421 *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
422 *	in the rest of this theory statement for clarity.
423 *
424 * MEMORY CONTROLLER
425 *
426 *	The memory controller is a component that provides access to DRAM. Each
427 *	memory controller can access a set number of DRAM channels. Each channel
428 *	can have a number of DIMMs (sticks of memory) associated with it. A
429 *	given package may have more than one memory controller. The association
430 *	of the memory controller to a group of cores is important as it is
431 *	cheaper to access memory on the controller that you are associated with.
432 *
433 * NUMA
434 *
435 *	NUMA or non-uniform memory access, describes a way that systems are
436 *	built. On x86, any processor core can address all of the memory in the
 *	system. However, when using multiple sockets or possibly within a
438 *	multi-chip module, some of that memory is physically closer and some of
439 *	it is further. Memory that is further away is more expensive to access.
440 *	Consider the following image of multiple sockets with memory:
441 *
442 *	+--------+                                                +--------+
443 *	| DIMM A |         +----------+      +----------+         | DIMM D |
444 *	+--------+-+       |          |      |          |       +-+------+-+
445 *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
446 *	  +--------+-+     |          |      |          |     +-+------+-+
447 *	    | DIMM C |     +----------+      +----------+     | DIMM F |
448 *	    +--------+                                        +--------+
449 *
450 *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
451 *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
452 *	access DIMMs A-C and more expensive to access D-F as it has to go
453 *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
454 *	D-F are cheaper than A-C. While the socket form is the most common, when
455 *	using multi-chip modules, this can also sometimes occur. For another
456 *	example of this that's more involved, see the AMD topology section.
457 *
458 *
459 * Intel Topology
460 * --------------
461 *
 * Most Intel processors since Nehalem (as of this writing, the current
 * generation is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU
 * portion of
464 * the package is a single monolithic die. MCMs currently aren't used. Most
465 * parts have three levels of caches, with the L3 cache being shared between
466 * all of the cores on the package. The L1/L2 cache is generally specific to
467 * an individual core. The following image shows at a simplified level what
 * this looks like. The memory controller is commonly part of something called
 * the 'Uncore', functionality that used to live in separate physical chips
 * outside the package but is now part of the same die.
471 *
472 *  +-----------------------------------------------------------------------+
473 *  | Package                                                               |
474 *  |  +-------------------+  +-------------------+  +-------------------+  |
475 *  |  | Core              |  | Core              |  | Core              |  |
476 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
477 *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
478 *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
479 *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
480 *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
481 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
482 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
483 *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
484 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
485 *  |  +-------------------+  +-------------------+  +-------------------+  |
486 *  | +-------------------------------------------------------------------+ |
487 *  | |                         Shared L3 Cache                           | |
488 *  | +-------------------------------------------------------------------+ |
489 *  | +-------------------------------------------------------------------+ |
490 *  | |                        Memory Controller                          | |
491 *  | +-------------------------------------------------------------------+ |
492 *  +-----------------------------------------------------------------------+
493 *
494 * A side effect of this current architecture is that what we care about from a
 * scheduling and topology perspective is simplified. In general, we care about
496 * understanding which logical CPUs are part of the same core and socket.
497 *
498 * To determine the relationship between threads and cores, Intel initially used
499 * the identifier in the advanced programmable interrupt controller (APIC). They
500 * also added cpuid leaf 4 to give additional information about the number of
501 * threads and CPUs in the processor. With the addition of x2apic (which
 * increased the number of addressable logical CPUs from 8 bits to 32 bits), an
503 * additional cpuid topology leaf 0xB was added.
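 *
 * As a hedged sketch of how leaf 0xB can be used (this reuses the hypothetical
 * cpuid_query() helper from the earlier sketch and is not the kernel's own
 * code), the shift counts that relate the x2APIC ID to cores and packages can
 * be found by walking the sub-leaves:
 *
 *	#include <stdint.h>
 *
 *	extern void cpuid_query(uint32_t, uint32_t, uint32_t *, uint32_t *,
 *	    uint32_t *, uint32_t *);
 *
 *	static void
 *	x2apic_topo_shifts(uint32_t *smt_shift, uint32_t *core_shift)
 *	{
 *		uint32_t eax, ebx, ecx, edx, level, type;
 *
 *		*smt_shift = *core_shift = 0;
 *		for (level = 0; ; level++) {
 *			cpuid_query(0xb, level, &eax, &ebx, &ecx, &edx);
 *			type = (ecx >> 8) & 0xff;
 *			if (type == 0)
 *				break;
 *			if (type == 1)
 *				*smt_shift = eax & 0x1f;
 *			else if (type == 2)
 *				*core_shift = eax & 0x1f;
 *		}
 *	}
 *
 * Shifting a logical CPU's x2APIC ID (returned in %edx of each sub-leaf) right
 * by the SMT shift identifies its core; shifting it right by the core shift
 * identifies its package.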
504 *
505 * AMD Topology
506 * ------------
507 *
508 * When discussing AMD topology, we want to break this into three distinct
509 * generations of topology. There's the basic topology that has been used in
510 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
511 * with family 0x15 (Bulldozer), and there's the topology that was introduced
512 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
513 * talking about.
514 *
515 * Until the introduction of family 0x17 (Zen), AMD did not implement something
516 * that they considered SMT. Whether or not the AMD processors have SMT
517 * influences many things including scheduling and reliability, availability,
518 * and serviceability (RAS) features.
519 *
520 * NODE
521 *
522 *	AMD uses the term node to refer to a die that contains a number of cores
523 *	and I/O resources. Depending on the processor family and model, more
524 *	than one node can be present in the package. When there is more than one
525 *	node this indicates a multi-chip module. Usually each node has its own
526 *	access to memory and I/O devices. This is important and generally
527 *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
528 *	result, we track this relationship in the operating system.
529 *
530 *	In processors with an L3 cache, the L3 cache is generally shared across
531 *	the entire node, though the way this is carved up varies from generation
532 *	to generation.
533 *
534 * BULLDOZER
535 *
536 *	Starting with the Bulldozer family (0x15) and continuing until the
537 *	introduction of the Zen microarchitecture, AMD introduced the idea of a
538 *	compute unit. In a compute unit, two traditional cores share a number of
539 *	hardware resources. Critically, they share the FPU, L1 instruction
540 *	cache, and the L2 cache. Several compute units were then combined inside
541 *	of a single node.  Because the integer execution units, L1 data cache,
542 *	and some other resources were not shared between the cores, AMD never
543 *	considered this to be SMT.
544 *
545 * ZEN
546 *
 *	The Zen family (0x17) uses a multi-chip module (MCM) design built from
 *	dies called Zeppelin. These dies are similar to the idea of nodes used
 *	previously. Each of these nodes has two DRAM channels which all of the
550 *	cores in the node can access uniformly. These nodes are linked together
551 *	in the package, creating a NUMA environment.
552 *
553 *	The Zeppelin die itself contains two different 'core complexes'. Each
554 *	core complex consists of four cores which each have two threads, for a
555 *	total of 8 logical CPUs per complex. Unlike other generations,
556 *	where all the logical CPUs in a given node share the L3 cache, here each
557 *	core complex has its own shared L3 cache.
558 *
559 *	A further thing that we need to consider is that in some configurations,
560 *	particularly with the Threadripper line of processors, not every die
561 *	actually has its memory controllers wired up to actual memory channels.
562 *	This means that some cores have memory attached to them and others
563 *	don't.
564 *
565 *	To put Zen in perspective, consider the following images:
566 *
567 *      +--------------------------------------------------------+
568 *      | Core Complex                                           |
569 *      | +-------------------+    +-------------------+  +---+  |
570 *      | | Core       +----+ |    | Core       +----+ |  |   |  |
571 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
572 *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
573 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
574 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
575 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
576 *      | +-------------------+    +-------------------+  | C |  |
577 *      | +-------------------+    +-------------------+  | a |  |
578 *      | | Core       +----+ |    | Core       +----+ |  | c |  |
579 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
580 *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
581 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
582 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
583 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
584 *      | +-------------------+    +-------------------+  +---+  |
585 *      |                                                        |
586 *	+--------------------------------------------------------+
587 *
588 *  This first image represents a single Zen core complex that consists of four
589 *  cores.
590 *
591 *
592 *	+--------------------------------------------------------+
593 *	| Zeppelin Die                                           |
594 *	|  +--------------------------------------------------+  |
595 *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
596 *	|  +--------------------------------------------------+  |
597 *      |                           HH                           |
598 *	|          +-----------+    HH    +-----------+          |
599 *	|          |           |    HH    |           |          |
600 *	|          |    Core   |==========|    Core   |          |
601 *	|          |  Complex  |==========|  Complex  |          |
602 *	|          |           |    HH    |           |          |
603 *	|          +-----------+    HH    +-----------+          |
604 *      |                           HH                           |
605 *	|  +--------------------------------------------------+  |
606 *	|  |                Memory Controller                 |  |
607 *	|  +--------------------------------------------------+  |
608 *      |                                                        |
609 *	+--------------------------------------------------------+
610 *
611 *  This image represents a single Zeppelin Die. Note how both cores are
612 *  connected to the same memory controller and I/O units. While each core
613 *  complex has its own L3 cache as seen in the first image, they both have
614 *  uniform access to memory.
615 *
616 *
617 *                      PP                     PP
618 *                      PP                     PP
619 *           +----------PP---------------------PP---------+
620 *           |          PP                     PP         |
621 *           |    +-----------+          +-----------+    |
622 *           |    |           |          |           |    |
623 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
624 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
625 *           |    |           |          |           |    |
626 *           |    +-----------+ooo    ...+-----------+    |
627 *           |          HH      ooo  ...       HH         |
628 *           |          HH        oo..         HH         |
629 *           |          HH        ..oo         HH         |
630 *           |          HH      ...  ooo       HH         |
631 *           |    +-----------+...    ooo+-----------+    |
632 *           |    |           |          |           |    |
633 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
634 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
635 *           |    |           |          |           |    |
636 *           |    +-----------+          +-----------+    |
637 *           |          PP                     PP         |
638 *           +----------PP---------------------PP---------+
639 *                      PP                     PP
640 *                      PP                     PP
641 *
642 *  This image represents a single Zen package. In this example, it has four
643 *  Zeppelin dies, though some configurations only have a single one. In this
644 *  example, each die is directly connected to the next. Also, each die is
645 *  represented as being connected to memory by the 'M' character and connected
646 *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
647 *  die is made up of two core complexes, we have multiple different NUMA
648 *  domains that we care about for these systems.
649 *
650 * CPUID LEAVES
651 *
652 * There are a few different CPUID leaves that we can use to try and understand
653 * the actual state of the world. As part of the introduction of family 0xf, AMD
 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
 * processors that are in the package. Because families before Zen didn't have
 * SMT, this was always the number of cores in the package. However, it
 * should always be thought of as the number of logical threads to be consistent
658 * between generations. In addition we also get the size of the APIC ID that is
659 * used to represent the number of logical processors. This is important for
660 * deriving topology information.
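 *
 * A hedged sketch of the %ecx decoding (field positions per AMD's
 * documentation; the function name is illustrative):
 *
 *	#include <stdint.h>
 *
 *	static void
 *	amd_ncpu_info(uint32_t ecx, uint32_t *nthreads, uint32_t *apic_bits)
 *	{
 *		*nthreads = (ecx & 0xff) + 1;
 *		*apic_bits = (ecx >> 12) & 0xf;
 *	}
 *
 * Bits 7:0 hold the number of logical processors minus one, while bits 15:12
 * hold the number of APIC ID bits used to represent them; a value of zero in
 * the latter means the logical processor count must be used instead.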
661 *
662 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
663 * bit between Bulldozer and later families, but it is quite useful in
664 * determining the topology information. Because this information has changed
 * across family generations, it's worth calling out what these mean
 * explicitly; a sketch of decoding them follows the register list. The
 * registers have the following meanings:
667 *
668 *	%eax	The APIC ID. The entire register is defined to have a 32-bit
669 *		APIC ID, even though on systems without x2apic support, it will
670 *		be limited to 8 bits.
671 *
672 *	%ebx	On Bulldozer-era systems this contains information about the
673 *		number of cores that are in a compute unit (cores that share
674 *		resources). It also contains a per-package compute unit ID that
675 *		identifies which compute unit the logical CPU is a part of.
676 *
677 *		On Zen-era systems this instead contains the number of threads
678 *		per core and the ID of the core that the logical CPU is a part
679 *		of. Note, this ID is unique only to the package, it is not
680 *		globally unique across the entire system.
681 *
682 *	%ecx	This contains the number of nodes that exist in the package. It
683 *		also contains an ID that identifies which node the logical CPU
684 *		is a part of.
685 *
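 * As a hedged sketch of decoding these on a Zen-era part (field positions per
 * AMD's documentation; Bulldozer-era parts instead report the compute unit ID
 * and cores per compute unit in %ebx):
 *
 *	#include <stdint.h>
 *
 *	static void
 *	amd_zen_topo(uint32_t eax, uint32_t ebx, uint32_t ecx,
 *	    uint32_t *apicid, uint32_t *coreid, uint32_t *threads_per_core,
 *	    uint32_t *nodeid, uint32_t *nodes_per_pkg)
 *	{
 *		*apicid = eax;
 *		*coreid = ebx & 0xff;
 *		*threads_per_core = ((ebx >> 8) & 0xff) + 1;
 *		*nodeid = ecx & 0xff;
 *		*nodes_per_pkg = ((ecx >> 8) & 0x7) + 1;
 *	}
 *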
686 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
687 * cache layout to determine which logical CPUs are sharing which caches.
688 *
689 * illumos Topology
690 * ----------------
691 *
692 * Based on the above we synthesize the information into several different
693 * variables that we store in the 'struct cpuid_info'. We'll go into the details
694 * of what each member is supposed to represent and their uniqueness. In
695 * general, there are two levels of uniqueness that we care about. We care about
696 * an ID that is globally unique. That means that it will be unique across all
697 * entities in the system. For example, the default logical CPU ID is globally
698 * unique. On the other hand, there is some information that we only care about
699 * being unique within the context of a single package / socket. Here are the
700 * variables that we keep track of and their meaning.
701 *
 * Several of the values that represent identifiers, with the exception of
 * cpi_apicid, are allowed to be synthetic.
704 *
705 *
706 * cpi_apicid
707 *
708 *	This is the value of the CPU's APIC id. This should be the full 32-bit
709 *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
710 *	APIC ID. This value is globally unique between all logical CPUs across
711 *	all packages. This is usually required by the APIC.
712 *
713 * cpi_chipid
714 *
715 *	This value indicates the ID of the package that the logical CPU is a
716 *	part of. This value is allowed to be synthetic. It is usually derived by
717 *	taking the CPU's APIC ID and determining how many bits are used to
718 *	represent CPU cores in the package. All logical CPUs that are part of
719 *	the same package must have the same value.
720 *
721 * cpi_coreid
722 *
723 *	This represents the ID of a CPU core. Two logical CPUs should only have
724 *	the same cpi_coreid value if they are part of the same core. These
725 *	values may be synthetic. On systems that support SMT, this value is
726 *	usually derived from the APIC ID, otherwise it is often synthetic and
727 *	just set to the value of the cpu_id in the cpu_t.
728 *
729 * cpi_pkgcoreid
730 *
731 *	This is similar to the cpi_coreid in that logical CPUs that are part of
732 *	the same core should have the same ID. The main difference is that these
733 *	values are only required to be unique to a given socket.
734 *
735 * cpi_clogid
736 *
737 *	This represents the logical ID of a logical CPU. This value should be
738 *	unique within a given socket for each logical CPU. This is allowed to be
 *	synthetic, though it is usually based on the CPU's APIC ID. The broader
 *	system expects that logical CPUs that are part of the same core have
 *	contiguous numbers. For example, if there were two threads per core,
 *	then the siblings' IDs divided by two should be equal, and modulo two
 *	the first should be zero and the second one. Thus, IDs 4 and 5
744 *	indicate two logical CPUs that are part of the same core. But IDs 5 and
745 *	6 represent two logical CPUs that are part of different cores.
746 *
747 *	While it is common for the cpi_coreid and the cpi_clogid to be derived
748 *	from the same source, strictly speaking, they don't have to be and the
749 *	two values should be considered logically independent. One should not
750 *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
751 *	some kind of relationship. While this is tempting, we've seen cases on
752 *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
753 *
754 * cpi_ncpu_per_chip
755 *
756 *	This value indicates the total number of logical CPUs that exist in the
757 *	physical package. Critically, this is not the number of logical CPUs
758 *	that exist for just the single core.
759 *
760 *	This value should be the same for all logical CPUs in the same package.
761 *
762 * cpi_ncore_per_chip
763 *
764 *	This value indicates the total number of physical CPU cores that exist
765 *	in the package. The system compares this value with cpi_ncpu_per_chip to
766 *	determine if simultaneous multi-threading (SMT) is enabled. When
767 *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
768 *	the X86FSET_HTT feature is not set. If this value is greater than one,
 *	then we consider the processor to have the feature X86FSET_CMP, to
770 *	indicate that there is support for more than one core.
771 *
772 *	This value should be the same for all logical CPUs in the same package.
773 *
774 * cpi_procnodes_per_pkg
775 *
776 *	This value indicates the number of 'nodes' that exist in the package.
777 *	When processors are actually a multi-chip module, this represents the
778 *	number of such modules that exist in the package. Currently, on Intel
779 *	based systems this member is always set to 1.
780 *
781 *	This value should be the same for all logical CPUs in the same package.
782 *
783 * cpi_procnodeid
784 *
785 *	This value indicates the ID of the node that the logical CPU is a part
786 *	of. All logical CPUs that are in the same node must have the same value
787 *	here. This value must be unique across all of the packages in the
788 *	system.  On Intel based systems, this is currently set to the value in
789 *	cpi_chipid because there is only one node.
790 *
791 * cpi_cores_per_compunit
792 *
793 *	This value indicates the number of cores that are part of a compute
794 *	unit. See the AMD topology section for this. This member only has real
795 *	meaning currently for AMD Bulldozer family processors. For all other
796 *	processors, this should currently be set to 1.
797 *
798 * cpi_compunitid
799 *
800 *	This indicates the compute unit that the logical CPU belongs to. For
801 *	processors without AMD Bulldozer-style compute units this should be set
802 *	to the value of cpi_coreid.
803 *
804 * cpi_ncpu_shr_last_cache
805 *
806 *	This indicates the number of logical CPUs that are sharing the same last
807 *	level cache. This value should be the same for all CPUs that are sharing
808 *	that cache. The last cache refers to the cache that is closest to memory
809 *	and furthest away from the CPU.
810 *
811 * cpi_last_lvl_cacheid
812 *
813 *	This indicates the ID of the last cache that the logical CPU uses. This
814 *	cache is often shared between multiple logical CPUs and is the cache
815 *	that is closest to memory and furthest away from the CPU. This value
816 *	should be the same for a group of logical CPUs only if they actually
817 *	share the same last level cache. IDs should not overlap between
818 *	packages.
819 *
820 * cpi_ncore_bits
821 *
822 *	This indicates the number of bits that are required to represent all of
823 *	the cores in the system. As cores are derived based on their APIC IDs,
824 *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
825 *	this value to be larger than the actual number of IDs that are present
826 *	in the system. This is used to size tables by the CMI framework. It is
827 *	only filled in for Intel and AMD CPUs.
828 *
829 * cpi_nthread_bits
830 *
831 *	This indicates the number of bits required to represent all of the IDs
832 *	that cover the logical CPUs that exist on a given core. It's OK for this
833 *	value to be larger than the actual number of IDs that are present in the
 *	system. This is used to size tables by the CMI framework. It is only
 *	filled in for Intel and AMD CPUs. A sketch of deriving such a width from
 *	a count follows this list.
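 *
 * As a hedged illustration of what 'number of bits required to represent'
 * means (id_bits() is illustrative only; the kernel derives these widths from
 * the CPUID leaves described above rather than from a simple count):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	id_bits(uint32_t nids)
 *	{
 *		uint32_t bits = 0;
 *
 *		while ((1U << bits) < nids)
 *			bits++;
 *		return (bits);
 *	}
 *
 * For example, a package with six cores needs three core bits, even though
 * only six of the eight possible IDs are then in use.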
836 *
837 * -----------
838 * Hypervisors
839 * -----------
840 *
841 * If trying to manage the differences between vendors wasn't bad enough, it can
842 * get worse thanks to our friend hardware virtualization. Hypervisors are given
843 * the ability to interpose on all cpuid instructions and change them to suit
844 * their purposes. In general, this is necessary as the hypervisor wants to be
845 * able to present a more uniform set of features or not necessarily give the
846 * guest operating system kernel knowledge of all features so it can be
847 * more easily migrated between systems.
848 *
849 * When it comes to trying to determine topology information, this can be a
850 * double edged sword. When a hypervisor doesn't actually implement a cpuid
 * leaf, it'll often return all zeros. Because of that, you'll often see
 * various checks scattered about that verify fields are non-zero before we
 * assume we can use them.
854 *
855 * When it comes to topology information, the hypervisor is often incentivized
856 * to lie to you about topology. This is because it doesn't always actually
857 * guarantee that topology at all. The topology path we take in the system
858 * depends on how the CPU advertises itself. If it advertises itself as an Intel
 * or AMD CPU, then we basically follow our normal path. However, when the
 * hypervisor doesn't present an actual vendor, we usually end up enumerating
 * multiple one-core CPUs that often appear to be on different sockets. The
 * actual behavior depends greatly on what the hypervisor actually exposes to
 * us.
863 *
864 * --------------------
865 * Exposing Information
866 * --------------------
867 *
868 * We expose CPUID information in three different forms in the system.
869 *
870 * The first is through the x86_featureset variable. This is used in conjunction
871 * with the is_x86_feature() function. This is queried by x86-specific functions
872 * to determine which features are or aren't present in the system and to make
873 * decisions based upon them. For example, users of this include everything from
874 * parts of the system dedicated to reliability, availability, and
875 * serviceability (RAS), to making decisions about how to handle security
876 * mitigations, to various x86-specific drivers. General purpose or
877 * architecture independent drivers should never be calling this function.
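 *
 * As a hedged sketch of such a consumer (can_use_avx() is a hypothetical
 * wrapper; x86_featureset, is_x86_feature(), and X86FSET_AVX are the
 * interfaces described here):
 *
 *	#include <sys/types.h>
 *	#include <sys/x86_archext.h>
 *
 *	static boolean_t
 *	can_use_avx(void)
 *	{
 *		return (is_x86_feature(x86_featureset, X86FSET_AVX));
 *	}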
878 *
879 * The second means is through the auxiliary vector. The auxiliary vector is a
880 * series of tagged data that the kernel passes down to a user program when it
881 * begins executing. This information is used to indicate to programs what
882 * instruction set extensions are present. For example, information about the
883 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
884 * since user programs cannot make use of it. However, things like the AVX
885 * instruction sets are. Programs use this information to make run-time
886 * decisions about what features they should use. As an example, the run-time
887 * link-editor (rtld) can relocate different functions depending on the hardware
888 * support available.
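 *
 * As a hedged userland sketch (have_avx() is illustrative; getisax(2) and the
 * AV_386_* flags are the interfaces involved):
 *
 *	#include <sys/auxv.h>
 *	#include <stdint.h>
 *
 *	static int
 *	have_avx(void)
 *	{
 *		uint32_t hw[2] = { 0, 0 };
 *
 *		(void) getisax(hw, 2);
 *		return ((hw[0] & AV_386_AVX) != 0);
 *	}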
889 *
890 * The final form is through a series of accessor functions that all have the
891 * form cpuid_get*. This is used by a number of different subsystems in the
892 * kernel to determine more detailed information about what we're running on,
893 * topology information, etc. Some of these subsystems include processor groups
 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
895 * microcode, and performance monitoring. These functions all ASSERT that the
896 * CPU they're being called on has reached a certain cpuid pass. If the passes
897 * are rearranged, then this needs to be adjusted.
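 *
 * As a hedged sketch of such a consumer (is_amd_family_17h() is hypothetical;
 * the accessors must only be called once the relevant pass has completed):
 *
 *	#include <sys/cpuvar.h>
 *	#include <sys/x86_archext.h>
 *
 *	static boolean_t
 *	is_amd_family_17h(cpu_t *cp)
 *	{
 *		return (cpuid_getvendor(cp) == X86_VENDOR_AMD &&
 *		    cpuid_getfamily(cp) == 0x17);
 *	}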
898 *
899 * -----------------------------------------------
900 * Speculative Execution CPU Side Channel Security
901 * -----------------------------------------------
902 *
903 * With the advent of the Spectre and Meltdown attacks which exploit speculative
904 * execution in the CPU to create side channels there have been a number of
905 * different attacks and corresponding issues that the operating system needs to
 * mitigate against. The following is a common, but not exhaustive, list of
 * issues that we know about and for which we have either done some work or
 * still need to do more work in the system to mitigate:
909 *
910 *   - Spectre v1
911 *   - swapgs (Spectre v1 variant)
912 *   - Spectre v2
913 *   - Meltdown (Spectre v3)
914 *   - Rogue Register Read (Spectre v3a)
915 *   - Speculative Store Bypass (Spectre v4)
916 *   - ret2spec, SpectreRSB
917 *   - L1 Terminal Fault (L1TF)
918 *   - Microarchitectural Data Sampling (MDS)
919 *
920 * Each of these requires different sets of mitigations and has different attack
921 * surfaces. For the most part, this discussion is about protecting the kernel
922 * from non-kernel executing environments such as user processes and hardware
923 * virtual machines. Unfortunately, there are a number of user vs. user
924 * scenarios that exist with these. The rest of this section will describe the
925 * overall approach that the system has taken to address these as well as their
926 * shortcomings. Unfortunately, not all of the above have been handled today.
927 *
928 * SPECTRE v2, ret2spec, SpectreRSB
929 *
930 * The second variant of the spectre attack focuses on performing branch target
931 * injection. This generally impacts indirect call instructions in the system.
932 * There are three different ways to mitigate this issue that are commonly
933 * described today:
934 *
935 *  1. Using Indirect Branch Restricted Speculation (IBRS).
936 *  2. Using Retpolines and RSB Stuffing
937 *  3. Using Enhanced Indirect Branch Restricted Speculation (EIBRS)
938 *
939 * IBRS uses a feature added to microcode to restrict speculation, among other
940 * things. This form of mitigation has not been used as it has been generally
941 * seen as too expensive and requires reactivation upon various transitions in
942 * the system.
943 *
944 * As a less impactful alternative to IBRS, retpolines were developed by
945 * Google. These basically require one to replace indirect calls with a specific
946 * trampoline that will cause speculation to fail and break the attack.
947 * Retpolines require compiler support. We always build with retpolines in the
948 * external thunk mode. This means that a traditional indirect call is replaced
949 * with a call to one of the __x86_indirect_thunk_<reg> functions. A side effect
950 * of this is that all indirect function calls are performed through a register.
951 *
 * We have to use a common external location of the thunk and not inline it
 * into the callsite so that we have a single place to patch these functions.
954 * As it turns out, we actually have three different forms of retpolines that
955 * exist in the system:
956 *
957 *  1. A full retpoline
958 *  2. An AMD-specific optimized retpoline
959 *  3. A no-op version
960 *
961 * The first one is used in the general case. The second one is used if we can
962 * determine that we're on an AMD system and we can successfully toggle the
963 * lfence serializing MSR that exists on the platform. Basically with this
964 * present, an lfence is sufficient and we don't need to do anywhere near as
965 * complicated a dance to successfully use retpolines.
966 *
967 * The third form described above is the most curious. It turns out that the way
968 * that retpolines are implemented is that they rely on how speculation is
969 * performed on a 'ret' instruction. Intel has continued to optimize this
970 * process (which is partly why we need to have return stack buffer stuffing,
971 * but more on that in a bit) and in processors starting with Cascade Lake
972 * on the server side, it's dangerous to rely on retpolines. Instead, a new
973 * mechanism has been introduced called Enhanced IBRS (EIBRS).
974 *
975 * Unlike IBRS, EIBRS is designed to be enabled once at boot and left on each
976 * physical core. However, if this is the case, we don't want to use retpolines
977 * any more. Therefore if EIBRS is present, we end up turning each retpoline
978 * function (called a thunk) into a jmp instruction. This means that we're still
979 * paying the cost of an extra jump to the external thunk, but it gives us
980 * flexibility and the ability to have a single kernel image that works across a
981 * wide variety of systems and hardware features.
982 *
983 * Unfortunately, this alone is insufficient. First, Skylake systems have
984 * additional speculation for the Return Stack Buffer (RSB) which is used to
985 * return from call instructions which retpolines take advantage of. However,
986 * this problem is not just limited to Skylake and is actually more pernicious.
987 * The SpectreRSB paper introduces several more problems that can arise with
988 * dealing with this. The RSB can be poisoned just like the indirect branch
989 * predictor. This means that one needs to clear the RSB when transitioning
990 * between two different privilege domains. Some examples include:
991 *
992 *  - Switching between two different user processes
993 *  - Going between user land and the kernel
994 *  - Returning to the kernel from a hardware virtual machine
995 *
996 * Mitigating this involves combining a couple of different things. The first is
997 * SMEP (supervisor mode execution protection) which was introduced in Ivy
998 * Bridge. When an RSB entry refers to a user address and we're executing in the
999 * kernel, speculation through it will be stopped when SMEP is enabled. This
1000 * protects against a number of the different cases that we would normally be
1001 * worried about such as when we enter the kernel from user land.
1002 *
1003 * To prevent against additional manipulation of the RSB from other contexts
1004 * such as a non-root VMX context attacking the kernel we first look to enhanced
1005 * IBRS. When EIBRS is present and enabled, then there is nothing else that we
1006 * need to do to protect the kernel at this time.
1007 *
1008 * On CPUs without EIBRS we need to manually overwrite the contents of the
1009 * return stack buffer. We do this through the x86_rsb_stuff() function.
1010 * Currently this is employed on context switch. The x86_rsb_stuff() function is
1011 * disabled when enhanced IBRS is present because Intel claims on such systems
1012 * it will be ineffective. Stuffing the RSB in context switch helps prevent user
1013 * to user attacks via the RSB.
1014 *
1015 * If SMEP is not present, then we would have to stuff the RSB every time we
1016 * transitioned from user mode to the kernel, which isn't very practical right
1017 * now.
1018 *
1019 * To fully protect user to user and vmx to vmx attacks from these classes of
1020 * issues, we would also need to allow them to opt into performing an Indirect
1021 * Branch Prediction Barrier (IBPB) on switch. This is not currently wired up.
1022 *
1023 * By default, the system will enable RSB stuffing and the required variant of
1024 * retpolines and store that information in the x86_spectrev2_mitigation value.
1025 * This will be evaluated after a microcode update as well, though it is
1026 * expected that microcode updates will not take away features. This may mean
1027 * that a late loaded microcode may not end up in the optimal configuration
1028 * (though this should be rare).
1029 *
1030 * Currently we do not build kmdb with retpolines or perform any additional side
1031 * channel security mitigations for it. One complication with kmdb is that it
1032 * requires its own retpoline thunks and it would need to adjust itself based on
1033 * what the kernel does. The threat model of kmdb is more limited and therefore
1034 * it may make more sense to investigate using prediction barriers as the whole
1035 * system is only executing a single instruction at a time while in kmdb.
1036 *
1037 * SPECTRE v1, v4
1038 *
1039 * The v1 and v4 variants of spectre are not currently mitigated in the
1040 * system and require other classes of changes to occur in the code.
1041 *
1042 * SPECTRE v1 (SWAPGS VARIANT)
1043 *
 * The class of Spectre v1 vulnerabilities isn't limited to bounds checks; it
 * can generally affect any branch-dependent code. The swapgs issue is one
1046 * variant of this. If we are coming in from userspace, we can have code like
1047 * this:
1048 *
1049 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
1050 *	je	1f
1051 *	movq	$0, REGOFF_SAVFP(%rsp)
1052 *	swapgs
1053 *	1:
1054 *	movq	%gs:CPU_THREAD, %rax
1055 *
1056 * If an attacker can cause a mis-speculation of the branch here, we could skip
1057 * the needed swapgs, and use the /user/ %gsbase as the base of the %gs-based
1058 * load. If subsequent code can act as the usual Spectre cache gadget, this
1059 * would potentially allow KPTI bypass. To fix this, we need an lfence prior to
1060 * any use of the %gs override.
1061 *
1062 * The other case is also an issue: if we're coming into a trap from kernel
1063 * space, we could mis-speculate and swapgs the user %gsbase back in prior to
1064 * using it. AMD systems are not vulnerable to this version, as a swapgs is
1065 * serializing with respect to subsequent uses. But as AMD /does/ need the other
1066 * case, and the fix is the same in both cases (an lfence at the branch target
1067 * 1: in this example), we'll just do it unconditionally.
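 *
 * With the mitigation in place, the same entry sequence simply gains an
 * lfence at the point where the two paths join (a sketch of the pattern, not
 * the exact kernel code):
 *
 *	cmpw	$KCS_SEL, REGOFF_CS(%rsp)
 *	je	1f
 *	movq	$0, REGOFF_SAVFP(%rsp)
 *	swapgs
 *	1:
 *	lfence
 *	movq	%gs:CPU_THREAD, %rax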
1068 *
 * Note that we don't enable user-space "wrgsbase" via CR4_FSGSBASE, which
 * makes it harder for user-space to actually set a useful %gsbase value.
 * It might still be feasible via lwp_setprivate(), though, so we mitigate
 * anyway.
1073 *
1074 * MELTDOWN
1075 *
 * Meltdown, or Spectre v3, allowed a user process to read any data mapped in
 * its address space regardless of whether or not the page tables in question
 * permitted the user to read it. The solution to Meltdown
1079 * is kernel page table isolation. In this world, there are two page tables that
1080 * are used for a process, one in user land and one in the kernel. To implement
1081 * this we use per-CPU page tables and switch between the user and kernel
1082 * variants when entering and exiting the kernel.  For more information about
1083 * this process and how the trampolines work, please see the big theory
1084 * statements and additional comments in:
1085 *
1086 *  - uts/i86pc/ml/kpti_trampolines.s
1087 *  - uts/i86pc/vm/hat_i86.c
1088 *
 * While Meltdown only impacted Intel systems, and newer Intel systems have
 * Meltdown fixed in hardware (Intel's name for the issue is Rogue Data Cache
 * Load, and its absence is enumerated as RDCL_NO), we always have kernel page
 * table isolation enabled. While this may at first seem weird, an
1092 * important thing to remember is that you can't speculatively read an address
1093 * if it's never in your page table at all. Having user processes without kernel
1094 * pages present provides us with an important layer of defense in the kernel
1095 * against any other side channel attacks that exist and have yet to be
1096 * discovered. As such, kernel page table isolation (KPTI) is always enabled by
1097 * default, no matter the x86 system.
1098 *
1099 * L1 TERMINAL FAULT
1100 *
1101 * L1 Terminal Fault (L1TF) takes advantage of an issue in how speculative
1102 * execution uses page table entries. Effectively, it is two different problems.
1103 * The first is that it ignores the not present bit in the page table entries
1104 * when performing speculative execution. This means that something can
1105 * speculatively read the listed physical address if it's present in the L1
1106 * cache under certain conditions (see Intel's documentation for the full set of
1107 * conditions). Secondly, this can be used to bypass hardware virtualization
1108 * extended page tables (EPT) that are part of Intel's hardware virtual machine
1109 * instructions.
1110 *
1111 * For the non-hardware virtualized case, this is relatively easy to deal with.
1112 * We must make sure that all unmapped pages have an address of zero. This means
1113 * that they could read the first 4k of physical memory; however, we never use
1114 * that first page in the operating system and always skip putting it in our
 * memory map, even if firmware tells us we can use it. While
1116 * other systems try to put extra metadata in the address and reserved bits,
1117 * which led to this being problematic in those cases, we do not.
1118 *
1119 * For hardware virtual machines things are more complicated. Because they can
1120 * construct their own page tables, it isn't hard for them to perform this
1121 * attack against any physical address. The one wrinkle is that this physical
1122 * address must be in the L1 data cache. Thus Intel added an MSR that we can use
1123 * to flush the L1 data cache. We wrap this up in the function
1124 * spec_uarch_flush(). This function is also used in the mitigation of
1125 * microarchitectural data sampling (MDS) discussed later on. Kernel based
1126 * hypervisors such as KVM or bhyve are responsible for performing this before
1127 * entering the guest.
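 *
 * When the flush microcode is loaded, performing the flush itself is a single
 * MSR write (this is what spec_uarch_flush_msr() later in this file does):
 *
 *	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);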
1128 *
1129 * Because this attack takes place in the L1 cache, there's another wrinkle
1130 * here. The L1 cache is shared between all logical CPUs in a core in most Intel
1131 * designs. This means that when a thread enters a hardware virtualized context
1132 * and flushes the L1 data cache, the other thread on the processor may then go
1133 * ahead and put new data in it that can be potentially attacked. While one
1134 * solution is to disable SMT on the system, another option that is available is
1135 * to use a feature for hardware virtualization called 'SMT exclusion'. This
 * goes through and makes sure that if an HVM is being scheduled on one
 * thread, then whatever is on the sibling thread is from the same hardware
 * virtual machine.
1138 * If an interrupt comes in or the guest exits to the broader system, then the
1139 * other SMT thread will be kicked out.
1140 *
1141 * L1TF can be fully mitigated by hardware. If the RDCL_NO feature is set in the
1142 * architecture capabilities MSR (MSR_IA32_ARCH_CAPABILITIES), then we will not
1143 * perform L1TF related mitigations.
1144 *
1145 * MICROARCHITECTURAL DATA SAMPLING
1146 *
 * Microarchitectural data sampling (MDS) is a combination of four discrete
 * but related vulnerabilities affecting various parts of the CPU's
1149 * microarchitectural implementation around load, store, and fill buffers.
1150 * Specifically it is made up of the following subcomponents:
1151 *
1152 *  1. Microarchitectural Store Buffer Data Sampling (MSBDS)
1153 *  2. Microarchitectural Fill Buffer Data Sampling (MFBDS)
1154 *  3. Microarchitectural Load Port Data Sampling (MLPDS)
1155 *  4. Microarchitectural Data Sampling Uncacheable Memory (MDSUM)
1156 *
1157 * To begin addressing these, Intel has introduced another feature in microcode
 * called MD_CLEAR. This changes the verw instruction so that, in addition to
 * its architectural behavior, it overwrites the state of the affected
 * buffers. This allows us to execute verw at the right points to flush that
 * state. The L1TF L1D flush mechanism is also updated when this microcode is
 * present to flush this state.
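 *
 * A minimal sketch of that flush (illustrative only; the kernel's real
 * x86_md_clear() lives in assembly and is enabled by cpuid_update_md_clear()
 * below) is a verw whose operand is any valid, writable data segment
 * selector, for example the current %ds:
 *
 *	uint16_t sel;
 *	__asm__ __volatile__("movw %%ds, %0; verw %0"
 *	    : "+m" (sel) : : "cc", "memory");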
1162 *
1163 * Primarily we need to flush this state whenever we transition from the kernel
1164 * to a less privileged context such as user mode or an HVM guest. MSBDS is a
1165 * little bit different. Here the structures are statically sized when a logical
1166 * CPU is in use and resized when it goes to sleep. Therefore, we also need to
 * flush the microarchitectural state before the CPU goes idle by calling hlt,
1168 * mwait, or another ACPI method. To perform these flushes, we call
1169 * x86_md_clear() at all of these transition points.
1170 *
1171 * If hardware enumerates RDCL_NO, indicating that it is not vulnerable to L1TF,
1172 * then we change the spec_uarch_flush() function to point to x86_md_clear(). If
1173 * MDS_NO has been set, then this is fully mitigated and x86_md_clear() becomes
1174 * a no-op.
1175 *
1176 * Unfortunately, with this issue hyperthreading rears its ugly head. In
1177 * particular, everything we've discussed above is only valid for a single
1178 * thread executing on a core. In the case where you have hyper-threading
1179 * present, this attack can be performed between threads. The theoretical fix
1180 * for this is to ensure that both threads are always in the same security
1181 * domain. This means that they are executing in the same ring and mutually
1182 * trust each other. Practically speaking, this would mean that a system call
1183 * would have to issue an inter-processor interrupt (IPI) to the other thread.
1184 * Rather than implement this, we recommend that one disables hyper-threading
1185 * through the use of psradm -aS.
1186 *
1187 * TSX ASYNCHRONOUS ABORT
1188 *
1189 * TSX Asynchronous Abort (TAA) is another side-channel vulnerability that
1190 * behaves like MDS, but leverages Intel's transactional instructions as another
1191 * vector. Effectively, when a transaction hits one of these cases (unmapped
1192 * page, various cache snoop activity, etc.) then the same data can be exposed
1193 * as in the case of MDS. This means that you can attack your twin.
1194 *
1195 * Intel has described that there are two different ways that we can mitigate
1196 * this problem on affected processors:
1197 *
1198 *   1) We can use the same techniques used to deal with MDS. Flushing the
1199 *      microarchitectural buffers and disabling hyperthreading will mitigate
1200 *      this in the same way.
1201 *
1202 *   2) Using microcode to disable TSX.
1203 *
1204 * Now, most processors that are subject to MDS (as in they don't have MDS_NO in
1205 * the IA32_ARCH_CAPABILITIES MSR) will not receive microcode to disable TSX.
1206 * That's OK as we're already doing all such mitigations. On the other hand,
1207 * processors with MDS_NO are all supposed to receive microcode updates that
1208 * enumerate support for disabling TSX. In general, we'd rather use this method
1209 * when available as it doesn't require disabling hyperthreading to be
 * effective. Currently we are relying on microcode for processors that
 * enumerate MDS_NO.
1212 *
1213 * The microcode features are enumerated as part of the IA32_ARCH_CAPABILITIES.
1214 * When bit 7 (IA32_ARCH_CAP_TSX_CTRL) is present, then we are given two
1215 * different powers. The first allows us to cause all transactions to
1216 * immediately abort. The second gives us a means of disabling TSX completely,
1217 * which includes removing it from cpuid. If we have support for this in
1218 * microcode during the first cpuid pass, then we'll disable TSX completely such
1219 * that user land never has a chance to observe the bit. However, if we are late
1220 * loading the microcode, then we must use the functionality to cause
1221 * transactions to automatically abort. This is necessary for user land's sake.
1222 * Once a program sees a cpuid bit, it must not be taken away.
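 *
 * Mechanically, applying either mitigation is a read-modify-write of the TSX
 * control MSR: set the force-abort bit, and additionally set the cpuid-clear
 * bit on the early pass to hide the feature. A hedged sketch (the MSR and bit
 * names here are assumptions; the real logic lives in cpuid_apply_tsx()):
 *
 *	uint64_t val = rdmsr(MSR_IA32_TSX_CTRL);
 *	val |= IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_CPUID_CLEAR;
 *	wrmsr(MSR_IA32_TSX_CTRL, val);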
1223 *
1224 * We track whether or not we should do this based on what cpuid pass we're in.
1225 * Whenever we hit cpuid_scan_security() on the boot CPU and we're still on pass
1226 * 1 of the cpuid logic, then we can completely turn off TSX. Notably this
1227 * should happen twice. Once in the normal cpuid_pass1() code and then a second
1228 * time after we do the initial microcode update.  As a result we need to be
1229 * careful in cpuid_apply_tsx() to only use the MSR if we've loaded a suitable
1230 * microcode on the current CPU (which happens prior to cpuid_pass_ucode()).
1231 *
1232 * If TAA has been fixed, then it will be enumerated in IA32_ARCH_CAPABILITIES
1233 * as TAA_NO. In such a case, we will still disable TSX: it's proven to be an
1234 * unfortunate feature in a number of ways, and taking the opportunity to
1235 * finally be able to turn it off is likely to be of benefit in the future.
1236 *
1237 * SUMMARY
1238 *
1239 * The following table attempts to summarize the mitigations for various issues
1240 * and what's done in various places:
1241 *
1242 *  - Spectre v1: Not currently mitigated
1243 *  - swapgs: lfences after swapgs paths
1244 *  - Spectre v2: Retpolines/RSB Stuffing or EIBRS if HW support
1245 *  - Meltdown: Kernel Page Table Isolation
1246 *  - Spectre v3a: Updated CPU microcode
1247 *  - Spectre v4: Not currently mitigated
1248 *  - SpectreRSB: SMEP and RSB Stuffing
1249 *  - L1TF: spec_uarch_flush, SMT exclusion, requires microcode
1250 *  - MDS: x86_md_clear, requires microcode, disabling SMT
1251 *  - TAA: x86_md_clear and disabling SMT OR microcode and disabling TSX
1252 *
1253 * The following table indicates the x86 feature set bits that indicate that a
1254 * given problem has been solved or a notable feature is present:
1255 *
1256 *  - RDCL_NO: Meltdown, L1TF, MSBDS subset of MDS
1257 *  - MDS_NO: All forms of MDS
1258 *  - TAA_NO: TAA
1259 */
1260
1261#include <sys/types.h>
1262#include <sys/archsystm.h>
1263#include <sys/x86_archext.h>
1264#include <sys/kmem.h>
1265#include <sys/systm.h>
1266#include <sys/cmn_err.h>
1267#include <sys/sunddi.h>
1268#include <sys/sunndi.h>
1269#include <sys/cpuvar.h>
1270#include <sys/processor.h>
1271#include <sys/sysmacros.h>
1272#include <sys/pg.h>
1273#include <sys/fp.h>
1274#include <sys/controlregs.h>
1275#include <sys/bitmap.h>
1276#include <sys/auxv_386.h>
1277#include <sys/memnode.h>
1278#include <sys/pci_cfgspace.h>
1279#include <sys/comm_page.h>
1280#include <sys/mach_mmu.h>
1281#include <sys/ucode.h>
1282#include <sys/tsc.h>
1283#include <sys/kobj.h>
1284#include <sys/asm_misc.h>
1285
1286#ifdef __xpv
1287#include <sys/hypervisor.h>
1288#else
1289#include <sys/ontrap.h>
1290#endif
1291
1292uint_t x86_vendor = X86_VENDOR_IntelClone;
1293uint_t x86_type = X86_TYPE_OTHER;
1294uint_t x86_clflush_size = 0;
1295
1296#if defined(__xpv)
1297int x86_use_pcid = 0;
1298int x86_use_invpcid = 0;
1299#else
1300int x86_use_pcid = -1;
1301int x86_use_invpcid = -1;
1302#endif
1303
1304typedef enum {
1305	X86_SPECTREV2_RETPOLINE,
1306	X86_SPECTREV2_RETPOLINE_AMD,
1307	X86_SPECTREV2_ENHANCED_IBRS,
1308	X86_SPECTREV2_DISABLED
1309} x86_spectrev2_mitigation_t;
1310
1311uint_t x86_disable_spectrev2 = 0;
1312static x86_spectrev2_mitigation_t x86_spectrev2_mitigation =
1313    X86_SPECTREV2_RETPOLINE;
1314
1315/*
1316 * The mitigation status for TAA:
1317 * X86_TAA_NOTHING -- no mitigation available for TAA side-channels
1318 * X86_TAA_DISABLED -- mitigation disabled via x86_disable_taa
1319 * X86_TAA_MD_CLEAR -- MDS mitigation also suffices for TAA
1320 * X86_TAA_TSX_FORCE_ABORT -- transactions are forced to abort
1321 * X86_TAA_TSX_DISABLE -- force abort transactions and hide from CPUID
1322 * X86_TAA_HW_MITIGATED -- TSX potentially active but H/W not TAA-vulnerable
1323 */
1324typedef enum {
1325	X86_TAA_NOTHING,
1326	X86_TAA_DISABLED,
1327	X86_TAA_MD_CLEAR,
1328	X86_TAA_TSX_FORCE_ABORT,
1329	X86_TAA_TSX_DISABLE,
1330	X86_TAA_HW_MITIGATED
1331} x86_taa_mitigation_t;
1332
1333uint_t x86_disable_taa = 0;
1334static x86_taa_mitigation_t x86_taa_mitigation = X86_TAA_NOTHING;
1335
1336uint_t pentiumpro_bug4046376;
1337
1338uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
1339
1340static char *x86_feature_names[NUM_X86_FEATURES] = {
1341	"lgpg",
1342	"tsc",
1343	"msr",
1344	"mtrr",
1345	"pge",
1346	"de",
1347	"cmov",
1348	"mmx",
1349	"mca",
1350	"pae",
1351	"cv8",
1352	"pat",
1353	"sep",
1354	"sse",
1355	"sse2",
1356	"htt",
1357	"asysc",
1358	"nx",
1359	"sse3",
1360	"cx16",
1361	"cmp",
1362	"tscp",
1363	"mwait",
1364	"sse4a",
1365	"cpuid",
1366	"ssse3",
1367	"sse4_1",
1368	"sse4_2",
1369	"1gpg",
1370	"clfsh",
1371	"64",
1372	"aes",
1373	"pclmulqdq",
1374	"xsave",
1375	"avx",
1376	"vmx",
1377	"svm",
1378	"topoext",
1379	"f16c",
1380	"rdrand",
1381	"x2apic",
1382	"avx2",
1383	"bmi1",
1384	"bmi2",
1385	"fma",
1386	"smep",
1387	"smap",
1388	"adx",
1389	"rdseed",
1390	"mpx",
1391	"avx512f",
1392	"avx512dq",
1393	"avx512pf",
1394	"avx512er",
1395	"avx512cd",
1396	"avx512bw",
1397	"avx512vl",
1398	"avx512fma",
1399	"avx512vbmi",
1400	"avx512_vpopcntdq",
1401	"avx512_4vnniw",
1402	"avx512_4fmaps",
1403	"xsaveopt",
1404	"xsavec",
1405	"xsaves",
1406	"sha",
1407	"umip",
1408	"pku",
1409	"ospke",
1410	"pcid",
1411	"invpcid",
1412	"ibrs",
1413	"ibpb",
1414	"stibp",
1415	"ssbd",
1416	"ssbd_virt",
1417	"rdcl_no",
1418	"ibrs_all",
1419	"rsba",
1420	"ssb_no",
1421	"stibp_all",
1422	"flush_cmd",
1423	"l1d_vmentry_no",
1424	"fsgsbase",
1425	"clflushopt",
1426	"clwb",
1427	"monitorx",
1428	"clzero",
1429	"xop",
1430	"fma4",
1431	"tbm",
1432	"avx512_vnni",
1433	"amd_pcec",
	"md_clear",
1435	"mds_no",
1436	"core_thermal",
1437	"pkg_thermal",
1438	"tsx_ctrl",
1439	"taa_no",
1440	"ppin"
1441};
1442
1443boolean_t
1444is_x86_feature(void *featureset, uint_t feature)
1445{
1446	ASSERT(feature < NUM_X86_FEATURES);
1447	return (BT_TEST((ulong_t *)featureset, feature));
1448}
1449
1450void
1451add_x86_feature(void *featureset, uint_t feature)
1452{
1453	ASSERT(feature < NUM_X86_FEATURES);
1454	BT_SET((ulong_t *)featureset, feature);
1455}
1456
1457void
1458remove_x86_feature(void *featureset, uint_t feature)
1459{
1460	ASSERT(feature < NUM_X86_FEATURES);
1461	BT_CLEAR((ulong_t *)featureset, feature);
1462}
1463
1464boolean_t
1465compare_x86_featureset(void *setA, void *setB)
1466{
1467	/*
1468	 * We assume that the unused bits of the bitmap are always zero.
1469	 */
1470	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1471		return (B_TRUE);
1472	} else {
1473		return (B_FALSE);
1474	}
1475}
1476
1477void
1478print_x86_featureset(void *featureset)
1479{
1480	uint_t i;
1481
1482	for (i = 0; i < NUM_X86_FEATURES; i++) {
1483		if (is_x86_feature(featureset, i)) {
1484			cmn_err(CE_CONT, "?x86_feature: %s\n",
1485			    x86_feature_names[i]);
1486		}
1487	}
1488}
1489
1490/* Note: This is the maximum size for the CPU, not the size of the structure. */
1491static size_t xsave_state_size = 0;
1492uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1493boolean_t xsave_force_disable = B_FALSE;
1494extern int disable_smap;
1495
1496/*
1497 * This is set to platform type we are running on.
1498 */
1499static int platform_type = -1;
1500
1501#if !defined(__xpv)
1502/*
1503 * Variable to patch if hypervisor platform detection needs to be
1504 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1505 */
1506int enable_platform_detection = 1;
1507#endif
1508
1509/*
1510 * monitor/mwait info.
1511 *
1512 * size_actual and buf_actual are the real address and size allocated to get
 * proper mwait_buf alignment.  buf_actual and size_actual should be passed
 * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
 * processor cache-line alignment, but this is not guaranteed in the future.
1516 */
1517struct mwait_info {
1518	size_t		mon_min;	/* min size to avoid missed wakeups */
1519	size_t		mon_max;	/* size to avoid false wakeups */
1520	size_t		size_actual;	/* size actually allocated */
1521	void		*buf_actual;	/* memory actually allocated */
1522	uint32_t	support;	/* processor support of monitor/mwait */
1523};
1524
1525/*
1526 * xsave/xrestor info.
1527 *
1528 * This structure contains HW feature bits and the size of the xsave save area.
1529 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1530 * (xsave_state) to describe the xsave layout. However, at runtime the
1531 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1532 * xsave_state structure simply represents the legacy layout of the beginning
1533 * of the xsave area.
1534 */
1535struct xsave_info {
1536	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1537	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1538	size_t		xsav_max_size;  /* max size save area for HW features */
1539	size_t		ymm_size;	/* AVX: size of ymm save area */
1540	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1541	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1542	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1543	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1544	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1545	size_t		opmask_size;	/* AVX512: size of opmask save */
1546	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1547	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1548	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1549	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1550	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1551};
1552
1553
1554/*
 * These constants determine how many of the cpuid leaves we cache in the
 * cpuid_info data structure; the remaining leaves are accessible via the
 * cpuid instruction.
1558 */
1559
1560#define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1561#define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1562
1563/*
1564 * See the big theory statement for a more detailed explanation of what some of
1565 * these members mean.
1566 */
1567struct cpuid_info {
1568	uint_t cpi_pass;		/* last pass completed */
1569	/*
1570	 * standard function information
1571	 */
1572	uint_t cpi_maxeax;		/* fn 0: %eax */
1573	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1574	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1575
1576	uint_t cpi_family;		/* fn 1: extended family */
1577	uint_t cpi_model;		/* fn 1: extended model */
1578	uint_t cpi_step;		/* fn 1: stepping */
1579	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1580					/*		AMD: package/socket # */
1581	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1582	int cpi_clogid;			/* fn 1: %ebx: thread # */
1583	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1584	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1585	uint_t cpi_ncache;		/* fn 2: number of elements */
1586	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1587	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1588	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1589					/* Intel fn: 4, AMD fn: 8000001d */
	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1591	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1592	/*
1593	 * extended function information
1594	 */
1595	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1596	char cpi_brandstr[49];		/* fn 0x8000000[234] */
	uint8_t cpi_pabits;		/* fn 0x80000008: %eax */
	uint8_t	cpi_vabits;		/* fn 0x80000008: %eax */
1599	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1600	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1601
1602	id_t cpi_coreid;		/* same coreid => strands share core */
1603	int cpi_pkgcoreid;		/* core number within single package */
1604	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1605					/* Intel: fn 4: %eax[31-26] */
1606
1607	/*
1608	 * These values represent the number of bits that are required to store
1609	 * information about the number of cores and threads.
1610	 */
1611	uint_t cpi_ncore_bits;
1612	uint_t cpi_nthread_bits;
1613	/*
1614	 * supported feature information
1615	 */
1616	uint32_t cpi_support[6];
1617#define	STD_EDX_FEATURES	0
1618#define	AMD_EDX_FEATURES	1
1619#define	TM_EDX_FEATURES		2
1620#define	STD_ECX_FEATURES	3
1621#define	AMD_ECX_FEATURES	4
1622#define	STD_EBX_FEATURES	5
1623	/*
1624	 * Synthesized information, where known.
1625	 */
1626	uint32_t cpi_chiprev;		/* See X86_CHIPREV_* in x86_archext.h */
1627	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1628	uint32_t cpi_socket;		/* Chip package/socket type */
1629
1630	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1631	uint32_t cpi_apicid;
1632	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1633	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1634					/* Intel: 1 */
1635	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1636	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1637
1638	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1639};
1640
1641
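/*
 * The cpuid_info for the boot CPU. It is statically allocated because the
 * boot CPU is set up before kmem allocation is available; all other CPUs get
 * theirs from cpuid_alloc_space().
 */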
1642static struct cpuid_info cpuid_info0;
1643
1644/*
1645 * These bit fields are defined by the Intel Application Note AP-485
1646 * "Intel Processor Identification and the CPUID Instruction"
1647 */
1648#define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1649#define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1650#define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1651#define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1652#define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1653#define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
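
/*
 * The extended fields above combine with the base fields in the conventional
 * way described in both vendors' manuals: when the base family is 0xf the
 * extended family is added to it, and for family 0x6 or 0xf the extended
 * model forms the high nibble of the model. A sketch of that combination
 * (the kernel's real logic is per-vendor and lives in the pass 1 code):
 *
 *	family = CPI_FAMILY(cpi);
 *	if (family == 0xf)
 *		family += CPI_FAMILY_XTD(cpi);
 *	model = CPI_MODEL(cpi);
 *	if (family == 0x6 || family >= 0xf)
 *		model += CPI_MODEL_XTD(cpi) << 4;
 */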
1654
1655#define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1656#define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1657#define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1658#define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1659#define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1660#define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1661#define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1662
1663#define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1664#define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1665#define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1666#define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1667
1668#define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1669#define	CPI_XMAXEAX_MAX		0x80000100
1670#define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1671#define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1672
1673/*
1674 * Function 4 (Deterministic Cache Parameters) macros
1675 * Defined by Intel Application Note AP-485
1676 */
1677#define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1678#define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1679#define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1680#define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1681#define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1682#define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1683#define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1684
1685#define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1686#define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1687#define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1688
1689#define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1690
1691#define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
1692
1693
1694/*
1695 * A couple of shorthand macros to identify "later" P6-family chips
1696 * like the Pentium M and Core.  First, the "older" P6-based stuff
1697 * (loosely defined as "pre-Pentium-4"):
1698 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1699 */
1700#define	IS_LEGACY_P6(cpi) (			\
1701	cpi->cpi_family == 6 &&			\
1702		(cpi->cpi_model == 1 ||		\
1703		cpi->cpi_model == 3 ||		\
1704		cpi->cpi_model == 5 ||		\
1705		cpi->cpi_model == 6 ||		\
1706		cpi->cpi_model == 7 ||		\
1707		cpi->cpi_model == 8 ||		\
1708		cpi->cpi_model == 0xA ||	\
1709		cpi->cpi_model == 0xB)		\
1710)
1711
1712/* A "new F6" is everything with family 6 that's not the above */
1713#define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1714
1715/* Extended family/model support */
1716#define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1717	cpi->cpi_family >= 0xf)
1718
1719/*
1720 * Info for monitor/mwait idle loop.
1721 *
1722 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1723 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1724 * 2006.
1725 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1726 * Documentation Updates" #33633, Rev 2.05, December 2006.
1727 */
1728#define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
#define	MWAIT_EXTENSIONS	(0x00000002)	/* extensions supported */
1730#define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1731#define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1732#define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1733#define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1734#define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1735#define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1736/*
1737 * Number of sub-cstates for a given c-state.
1738 */
1739#define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1740	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
1741
1742/*
1743 * XSAVE leaf 0xD enumeration
1744 */
1745#define	CPUID_LEAFD_2_YMM_OFFSET	576
1746#define	CPUID_LEAFD_2_YMM_SIZE		256
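
/*
 * Each enabled extended state component i (i >= 2) is described by leaf 0xD,
 * sub-leaf i: %eax holds the size of that component's save area and %ebx its
 * offset from the start of the xsave area. A hedged sketch of querying the
 * AVX (ymm) component, which is expected to match the constants above:
 *
 *	struct cpuid_regs cp;
 *
 *	cp.cp_eax = 0xD;
 *	cp.cp_ecx = 2;
 *	cp.cp_ebx = cp.cp_edx = 0;
 *	(void) __cpuid_insn(&cp);
 *	VERIFY3U(cp.cp_eax, ==, CPUID_LEAFD_2_YMM_SIZE);
 *	VERIFY3U(cp.cp_ebx, ==, CPUID_LEAFD_2_YMM_OFFSET);
 */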
1747
1748/*
1749 * Common extended leaf names to cut down on typos.
1750 */
1751#define	CPUID_LEAF_EXT_0		0x80000000
1752#define	CPUID_LEAF_EXT_8		0x80000008
1753#define	CPUID_LEAF_EXT_1d		0x8000001d
1754#define	CPUID_LEAF_EXT_1e		0x8000001e
1755
1756/*
 * Functions we consume from cpuid_subr.c; don't publish these in a header
 * file, to try and keep people using the expected cpuid_* interfaces.
1759 */
1760extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1761extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1762extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1763extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1764extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1765
1766/*
1767 * Apply up various platform-dependent restrictions where the
1768 * underlying platform restrictions mean the CPU can be marked
1769 * as less capable than its cpuid instruction would imply.
1770 */
1771#if defined(__xpv)
1772static void
1773platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1774{
1775	switch (eax) {
1776	case 1: {
1777		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1778		    0 : CPUID_INTC_EDX_MCA;
1779		cp->cp_edx &=
1780		    ~(mcamask |
1781		    CPUID_INTC_EDX_PSE |
1782		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1783		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1784		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1785		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1786		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1787		break;
1788	}
1789
1790	case 0x80000001:
1791		cp->cp_edx &=
1792		    ~(CPUID_AMD_EDX_PSE |
1793		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1794		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1795		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1796		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1797		    CPUID_AMD_EDX_TSCP);
1798		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1799		break;
1800	default:
1801		break;
1802	}
1803
1804	switch (vendor) {
1805	case X86_VENDOR_Intel:
1806		switch (eax) {
1807		case 4:
1808			/*
1809			 * Zero out the (ncores-per-chip - 1) field
1810			 */
			cp->cp_eax &= 0x03ffffff;
1812			break;
1813		default:
1814			break;
1815		}
1816		break;
1817	case X86_VENDOR_AMD:
1818		switch (eax) {
1819
1820		case 0x80000001:
1821			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1822			break;
1823
1824		case CPUID_LEAF_EXT_8:
1825			/*
1826			 * Zero out the (ncores-per-chip - 1) field
1827			 */
1828			cp->cp_ecx &= 0xffffff00;
1829			break;
1830		default:
1831			break;
1832		}
1833		break;
1834	default:
1835		break;
1836	}
1837}
1838#else
1839#define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
1840#endif
1841
1842/*
1843 *  Some undocumented ways of patching the results of the cpuid
1844 *  instruction to permit running Solaris 10 on future cpus that
1845 *  we don't currently support.  Could be set to non-zero values
1846 *  via settings in eeprom.
1847 */
1848
1849uint32_t cpuid_feature_ecx_include;
1850uint32_t cpuid_feature_ecx_exclude;
1851uint32_t cpuid_feature_edx_include;
1852uint32_t cpuid_feature_edx_exclude;
1853
1854/*
1855 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1856 */
1857void
1858cpuid_alloc_space(cpu_t *cpu)
1859{
1860	/*
1861	 * By convention, cpu0 is the boot cpu, which is set up
1862	 * before memory allocation is available.  All other cpus get
1863	 * their cpuid_info struct allocated here.
1864	 */
1865	ASSERT(cpu->cpu_id != 0);
1866	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1867	cpu->cpu_m.mcpu_cpi =
1868	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1869}
1870
1871void
1872cpuid_free_space(cpu_t *cpu)
1873{
1874	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1875	int i;
1876
1877	ASSERT(cpi != NULL);
1878	ASSERT(cpi != &cpuid_info0);
1879
1880	/*
1881	 * Free up any cache leaf related dynamic storage. The first entry was
1882	 * cached from the standard cpuid storage, so we should not free it.
1883	 */
1884	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1885		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1886	if (cpi->cpi_cache_leaf_size > 0)
1887		kmem_free(cpi->cpi_cache_leaves,
1888		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1889
1890	kmem_free(cpi, sizeof (*cpi));
1891	cpu->cpu_m.mcpu_cpi = NULL;
1892}
1893
1894#if !defined(__xpv)
1895/*
1896 * Determine the type of the underlying platform. This is used to customize
1897 * initialization of various subsystems (e.g. TSC). determine_platform() must
1898 * only ever be called once to prevent two processors from seeing different
1899 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1900 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1901 */
1902void
1903determine_platform(void)
1904{
1905	struct cpuid_regs cp;
1906	uint32_t base;
1907	uint32_t regs[4];
1908	char *hvstr = (char *)regs;
1909
1910	ASSERT(platform_type == -1);
1911
1912	platform_type = HW_NATIVE;
1913
1914	if (!enable_platform_detection)
1915		return;
1916
1917	/*
1918	 * If Hypervisor CPUID bit is set, try to determine hypervisor
1919	 * vendor signature, and set platform type accordingly.
1920	 *
1921	 * References:
1922	 * http://lkml.org/lkml/2008/10/1/246
1923	 * http://kb.vmware.com/kb/1009458
1924	 */
1925	cp.cp_eax = 0x1;
1926	(void) __cpuid_insn(&cp);
1927	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1928		cp.cp_eax = 0x40000000;
1929		(void) __cpuid_insn(&cp);
1930		regs[0] = cp.cp_ebx;
1931		regs[1] = cp.cp_ecx;
1932		regs[2] = cp.cp_edx;
1933		regs[3] = 0;
1934		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1935			platform_type = HW_XEN_HVM;
1936			return;
1937		}
1938		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1939			platform_type = HW_VMWARE;
1940			return;
1941		}
1942		if (strcmp(hvstr, HVSIG_KVM) == 0) {
1943			platform_type = HW_KVM;
1944			return;
1945		}
1946		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1947			platform_type = HW_BHYVE;
1948			return;
1949		}
1950		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1951			platform_type = HW_MICROSOFT;
1952	} else {
1953		/*
		 * Check older VMware hardware versions. The VMware hypervisor
		 * is detected by performing an IN operation on the VMware
		 * hypervisor port and checking that the value returned in
		 * %ebx is the VMware hypervisor magic value.
1958		 *
1959		 * References: http://kb.vmware.com/kb/1009458
1960		 */
1961		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1962		if (regs[1] == VMWARE_HVMAGIC) {
1963			platform_type = HW_VMWARE;
1964			return;
1965		}
1966	}
1967
1968	/*
1969	 * Check Xen hypervisor. In a fully virtualized domain,
1970	 * Xen's pseudo-cpuid function returns a string representing the
1971	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1972	 * supported cpuid function. We need at least a (base + 2) leaf value
1973	 * to do what we want to do. Try different base values, since the
1974	 * hypervisor might use a different one depending on whether Hyper-V
1975	 * emulation is switched on by default or not.
1976	 */
1977	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1978		cp.cp_eax = base;
1979		(void) __cpuid_insn(&cp);
1980		regs[0] = cp.cp_ebx;
1981		regs[1] = cp.cp_ecx;
1982		regs[2] = cp.cp_edx;
1983		regs[3] = 0;
1984		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1985		    cp.cp_eax >= (base + 2)) {
1986			platform_type &= ~HW_NATIVE;
1987			platform_type |= HW_XEN_HVM;
1988			return;
1989		}
1990	}
1991}
1992
1993int
1994get_hwenv(void)
1995{
1996	ASSERT(platform_type != -1);
1997	return (platform_type);
1998}
1999
2000int
2001is_controldom(void)
2002{
2003	return (0);
2004}
2005
2006#else
2007
2008int
2009get_hwenv(void)
2010{
2011	return (HW_XEN_PV);
2012}
2013
2014int
2015is_controldom(void)
2016{
2017	return (DOMAIN_IS_INITDOMAIN(xen_info));
2018}
2019
2020#endif	/* __xpv */
2021
2022/*
2023 * Make sure that we have gathered all of the CPUID leaves that we might need to
2024 * determine topology. We assume that the standard leaf 1 has already been done
2025 * and that xmaxeax has already been calculated.
2026 */
2027static void
2028cpuid_gather_amd_topology_leaves(cpu_t *cpu)
2029{
2030	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2031
2032	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2033		struct cpuid_regs *cp;
2034
2035		cp = &cpi->cpi_extd[8];
2036		cp->cp_eax = CPUID_LEAF_EXT_8;
2037		(void) __cpuid_insn(cp);
2038		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
2039	}
2040
2041	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2042	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2043		struct cpuid_regs *cp;
2044
2045		cp = &cpi->cpi_extd[0x1e];
2046		cp->cp_eax = CPUID_LEAF_EXT_1e;
2047		(void) __cpuid_insn(cp);
2048	}
2049}
2050
2051/*
2052 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
2053 * it to everything else. If not, and we're on an AMD system where 8000001e is
 * valid, then we use that. Otherwise, we fall back to the default value for the
2055 * APIC ID in leaf 1.
2056 */
2057static uint32_t
2058cpuid_gather_apicid(struct cpuid_info *cpi)
2059{
2060	/*
	 * Leaf B changes based on the arguments to it. Because we don't cache
2062	 * it, we need to gather it again.
2063	 */
2064	if (cpi->cpi_maxeax >= 0xB) {
2065		struct cpuid_regs regs;
2066		struct cpuid_regs *cp;
2067
2068		cp = &regs;
2069		cp->cp_eax = 0xB;
2070		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2071		(void) __cpuid_insn(cp);
2072
2073		if (cp->cp_ebx != 0) {
2074			return (cp->cp_edx);
2075		}
2076	}
2077
2078	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2079	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2080	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2081		return (cpi->cpi_extd[0x1e].cp_eax);
2082	}
2083
2084	return (CPI_APIC_ID(cpi));
2085}
2086
2087/*
2088 * For AMD processors, attempt to calculate the number of chips and cores that
2089 * exist. The way that we do this varies based on the generation, because the
2090 * generations themselves have changed dramatically.
2091 *
2092 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
2093 * However, with the advent of family 17h (Zen) it actually tells us the number
2094 * of threads, so we need to look at leaf 0x8000001e if available to determine
2095 * its value. Otherwise, for all prior families, the number of enabled cores is
2096 * the same as threads.
2097 *
2098 * If we do not have leaf 0x80000008, then we assume that this processor does
2099 * not have anything. AMD's older CPUID specification says there's no reason to
2100 * fall back to leaf 1.
2101 *
2102 * In some virtualization cases we will not have leaf 8000001e or it will be
2103 * zero. When that happens we assume the number of threads is one.
2104 */
2105static void
2106cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2107{
2108	uint_t nthreads, nthread_per_core;
2109
2110	nthreads = nthread_per_core = 1;
2111
2112	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2113		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
2114	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2115		nthreads = CPI_CPU_COUNT(cpi);
2116	}
2117
2118	/*
	 * For us to have SMT threads, and to know about them, we have to be
	 * at least family 17h and have the cpuid bit that says we have
	 * extended topology.
2122	 */
2123	if (cpi->cpi_family >= 0x17 &&
2124	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2125	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2126		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2127	}
2128
2129	*ncpus = nthreads;
2130	*ncores = nthreads / nthread_per_core;
2131}
2132
2133/*
2134 * Seed the initial values for the cores and threads for an Intel based
2135 * processor. These values will be overwritten if we detect that the processor
2136 * supports CPUID leaf 0xb.
2137 */
2138static void
2139cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
2140{
2141	/*
2142	 * Only seed the number of physical cores from the first level leaf 4
	 * information. The number of threads there indicates how many share the
2144	 * L1 cache, which may or may not have anything to do with the number of
2145	 * logical CPUs per core.
2146	 */
2147	if (cpi->cpi_maxeax >= 4) {
2148		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
2149	} else {
2150		*ncores = 1;
2151	}
2152
2153	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2154		*ncpus = CPI_CPU_COUNT(cpi);
2155	} else {
2156		*ncpus = *ncores;
2157	}
2158}
2159
2160static boolean_t
2161cpuid_leafB_getids(cpu_t *cpu)
2162{
2163	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2164	struct cpuid_regs regs;
2165	struct cpuid_regs *cp;
2166
2167	if (cpi->cpi_maxeax < 0xB)
2168		return (B_FALSE);
2169
2170	cp = &regs;
2171	cp->cp_eax = 0xB;
2172	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
2173
2174	(void) __cpuid_insn(cp);
2175
2176	/*
2177	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
2178	 * indicates that the extended topology enumeration leaf is
2179	 * available.
2180	 */
2181	if (cp->cp_ebx != 0) {
2182		uint32_t x2apic_id = 0;
2183		uint_t coreid_shift = 0;
2184		uint_t ncpu_per_core = 1;
2185		uint_t chipid_shift = 0;
2186		uint_t ncpu_per_chip = 1;
2187		uint_t i;
2188		uint_t level;
2189
2190		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
2191			cp->cp_eax = 0xB;
2192			cp->cp_ecx = i;
2193
2194			(void) __cpuid_insn(cp);
2195			level = CPI_CPU_LEVEL_TYPE(cp);
2196
2197			if (level == 1) {
2198				x2apic_id = cp->cp_edx;
2199				coreid_shift = BITX(cp->cp_eax, 4, 0);
2200				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
2201			} else if (level == 2) {
2202				x2apic_id = cp->cp_edx;
2203				chipid_shift = BITX(cp->cp_eax, 4, 0);
2204				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
2205			}
2206		}
2207
2208		/*
2209		 * cpi_apicid is taken care of in cpuid_gather_apicid.
2210		 */
2211		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
2212		cpi->cpi_ncore_per_chip = ncpu_per_chip /
2213		    ncpu_per_core;
2214		cpi->cpi_chipid = x2apic_id >> chipid_shift;
2215		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
2216		cpi->cpi_coreid = x2apic_id >> coreid_shift;
2217		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2218		cpi->cpi_procnodeid = cpi->cpi_chipid;
2219		cpi->cpi_compunitid = cpi->cpi_coreid;
2220
2221		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
2222			cpi->cpi_nthread_bits = coreid_shift;
2223			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
2224		}
2225
2226		return (B_TRUE);
2227	} else {
2228		return (B_FALSE);
2229	}
2230}
2231
2232static void
2233cpuid_intel_getids(cpu_t *cpu, void *feature)
2234{
2235	uint_t i;
2236	uint_t chipid_shift = 0;
2237	uint_t coreid_shift = 0;
2238	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2239
2240	/*
2241	 * There are no compute units or processor nodes currently on Intel.
2242	 * Always set these to one.
2243	 */
2244	cpi->cpi_procnodes_per_pkg = 1;
2245	cpi->cpi_cores_per_compunit = 1;
2246
2247	/*
2248	 * If cpuid Leaf B is present, use that to try and get this information.
2249	 * It will be the most accurate for Intel CPUs.
2250	 */
2251	if (cpuid_leafB_getids(cpu))
2252		return;
2253
2254	/*
2255	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
2256	 * and ncore_per_chip. These represent the largest power of two values
2257	 * that we need to cover all of the IDs in the system. Therefore, we use
2258	 * those values to seed the number of bits needed to cover information
2259	 * in the case when leaf B is not available. These values will probably
2260	 * be larger than required, but that's OK.
2261	 */
2262	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
2263	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
2264
2265	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
2266		chipid_shift++;
2267
2268	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
2269	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
2270
2271	if (is_x86_feature(feature, X86FSET_CMP)) {
2272		/*
2273		 * Multi-core (and possibly multi-threaded)
2274		 * processors.
2275		 */
2276		uint_t ncpu_per_core = 0;
2277
2278		if (cpi->cpi_ncore_per_chip == 1)
2279			ncpu_per_core = cpi->cpi_ncpu_per_chip;
2280		else if (cpi->cpi_ncore_per_chip > 1)
2281			ncpu_per_core = cpi->cpi_ncpu_per_chip /
2282			    cpi->cpi_ncore_per_chip;
2283		/*
2284		 * 8bit APIC IDs on dual core Pentiums
2285		 * look like this:
2286		 *
2287		 * +-----------------------+------+------+
2288		 * | Physical Package ID   |  MC  |  HT  |
2289		 * +-----------------------+------+------+
2290		 * <------- chipid -------->
2291		 * <------- coreid --------------->
2292		 *			   <--- clogid -->
2293		 *			   <------>
2294		 *			   pkgcoreid
2295		 *
2296		 * Where the number of bits necessary to
		 * represent the MC and HT fields together equals
		 * the minimum number of bits necessary to
2299		 * store the value of cpi->cpi_ncpu_per_chip.
2300		 * Of those bits, the MC part uses the number
2301		 * of bits necessary to store the value of
2302		 * cpi->cpi_ncore_per_chip.
2303		 */
2304		for (i = 1; i < ncpu_per_core; i <<= 1)
2305			coreid_shift++;
2306		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
2307		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
2308	} else if (is_x86_feature(feature, X86FSET_HTT)) {
2309		/*
2310		 * Single-core multi-threaded processors.
2311		 */
2312		cpi->cpi_coreid = cpi->cpi_chipid;
2313		cpi->cpi_pkgcoreid = 0;
2314	} else {
2315		/*
2316		 * Single-core single-thread processors.
2317		 */
2318		cpi->cpi_coreid = cpu->cpu_id;
2319		cpi->cpi_pkgcoreid = 0;
2320	}
2321	cpi->cpi_procnodeid = cpi->cpi_chipid;
2322	cpi->cpi_compunitid = cpi->cpi_coreid;
2323}
2324
2325/*
2326 * Historically, AMD has had CMP chips with only a single thread per core.
2327 * However, starting in family 17h (Zen), this has changed and they now have
2328 * multiple threads. Our internal core id needs to be a unique value.
2329 *
2330 * To determine the core id of an AMD system, if we're from a family before 17h,
2331 * then we just use the cpu id, as that gives us a good value that will be
2332 * unique for each core. If instead, we're on family 17h or later, then we need
 * to do something more complicated. CPUID leaf 0x8000001e can tell us how
 * many threads share a core. Based on that, we'll shift the APIC ID.
2335 * We can't use the normal core id in that leaf as it's only unique within the
2336 * socket, which is perfect for cpi_pkgcoreid, but not us.
2337 */
2338static id_t
2339cpuid_amd_get_coreid(cpu_t *cpu)
2340{
2341	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2342
2343	if (cpi->cpi_family >= 0x17 &&
2344	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2345	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2346		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2347		if (nthreads > 1) {
2348			VERIFY3U(nthreads, ==, 2);
2349			return (cpi->cpi_apicid >> 1);
2350		}
2351	}
2352
2353	return (cpu->cpu_id);
2354}
2355
2356/*
 * Constructing IDs on AMD is a more challenging task. This is notable because
 * of the following two facts:
2359 *
2360 *  1. Before family 0x17 (Zen), there was no support for SMT and there was
2361 *     also no way to get an actual unique core id from the system. As such, we
2362 *     synthesize this case by using cpu->cpu_id.  This scheme does not,
2363 *     however, guarantee that sibling cores of a chip will have sequential
2364 *     coreids starting at a multiple of the number of cores per chip - that is
2365 *     usually the case, but if the ACPI MADT table is presented in a different
2366 *     order then we need to perform a few more gymnastics for the pkgcoreid.
2367 *
 *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
2369 *     called compute units. These compute units share the L1I cache, L2 cache,
2370 *     and the FPU. To deal with this, a new topology leaf was added in
2371 *     0x8000001e. However, parts of this leaf have different meanings
2372 *     once we get to family 0x17.
2373 */
2374
2375static void
2376cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
2377{
2378	int i, first_half, coreidsz;
2379	uint32_t nb_caps_reg;
2380	uint_t node2_1;
2381	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2382	struct cpuid_regs *cp;
2383
2384	/*
2385	 * Calculate the core id (this comes from hardware in family 0x17 if it
2386	 * hasn't been stripped by virtualization). We always set the compute
2387	 * unit id to the same value. Also, initialize the default number of
2388	 * cores per compute unit and nodes per package. This will be
2389	 * overwritten when we know information about a particular family.
2390	 */
2391	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
2392	cpi->cpi_compunitid = cpi->cpi_coreid;
2393	cpi->cpi_cores_per_compunit = 1;
2394	cpi->cpi_procnodes_per_pkg = 1;
2395
2396	/*
2397	 * To construct the logical ID, we need to determine how many APIC IDs
2398	 * are dedicated to the cores and threads. This is provided for us in
2399	 * 0x80000008. However, if it's not present (say due to virtualization),
2400	 * then we assume it's one. This should be present on all 64-bit AMD
2401	 * processors.  It was added in family 0xf (Hammer).
2402	 */
2403	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2404		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2405
2406		/*
2407		 * In AMD parlance chip is really a node while illumos
2408		 * uses chip as equivalent to socket/package.
2409		 */
2410		if (coreidsz == 0) {
2411			/* Use legacy method */
2412			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2413				coreidsz++;
2414			if (coreidsz == 0)
2415				coreidsz = 1;
2416		}
2417	} else {
2418		/* Assume single-core part */
2419		coreidsz = 1;
2420	}
2421	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2422
2423	/*
2424	 * The package core ID varies depending on the family. While it may be
2425	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2426	 * this value is the core id in the given node. For non-virtualized
2427	 * family 17h, we need to take the logical core id and shift off the
2428	 * threads like we do when getting the core id.  Otherwise, we can use
 * the clogid as is. When family 17h is virtualized and we don't have valid
 * data in the leaf, then we won't think we have SMT, in which case the
 * cpi_clogid alone is sufficient.
2433	 */
2434	if (cpi->cpi_family >= 0x17 &&
2435	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2436	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2437	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2438		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2439		if (nthreads > 1) {
2440			VERIFY3U(nthreads, ==, 2);
2441			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2442		} else {
2443			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2444		}
2445	} else {
2446		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2447	}
2448
2449	/*
2450	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2451	 * (bulldozer) or newer, then we can derive all of this from leaf
2452	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2453	 */
2454	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2455	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2456		cp = &cpi->cpi_extd[0x1e];
2457
2458		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2459		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2460
2461		/*
2462		 * For Bulldozer-era CPUs, recalculate the compute unit
2463		 * information.
2464		 */
2465		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2466			cpi->cpi_cores_per_compunit =
2467			    BITX(cp->cp_ebx, 15, 8) + 1;
2468			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2469			    (cpi->cpi_ncore_per_chip /
2470			    cpi->cpi_cores_per_compunit) *
2471			    (cpi->cpi_procnodeid /
2472			    cpi->cpi_procnodes_per_pkg);
2473		}
2474	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2475		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2476	} else if (cpi->cpi_family == 0x10) {
2477		/*
2478		 * See if we are a multi-node processor.
2479		 * All processors in the system have the same number of nodes
2480		 */
2481		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2482		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2483			/* Single-node */
2484			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2485			    coreidsz);
2486		} else {
2487
2488			/*
2489			 * Multi-node revision D (2 nodes per package
2490			 * are supported)
2491			 */
2492			cpi->cpi_procnodes_per_pkg = 2;
2493
2494			first_half = (cpi->cpi_pkgcoreid <=
2495			    (cpi->cpi_ncore_per_chip/2 - 1));
2496
2497			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2498				/* We are BSP */
2499				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2500			} else {
2501
2502				/* We are AP */
2503				/* NodeId[2:1] bits to use for reading F3xe8 */
2504				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2505
2506				nb_caps_reg =
2507				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2508
2509				/*
2510				 * Check IntNodeNum bit (31:30, but bit 31 is
2511				 * always 0 on dual-node processors)
2512				 */
2513				if (BITX(nb_caps_reg, 30, 30) == 0)
2514					cpi->cpi_procnodeid = node2_1 +
2515					    !first_half;
2516				else
2517					cpi->cpi_procnodeid = node2_1 +
2518					    first_half;
2519			}
2520		}
2521	} else {
2522		cpi->cpi_procnodeid = 0;
2523	}
2524
2525	cpi->cpi_chipid =
2526	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2527
2528	cpi->cpi_ncore_bits = coreidsz;
2529	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2530	    cpi->cpi_ncore_per_chip);
2531}
2532
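/*
 * The default micro-architectural flush: do nothing. This is used both when
 * no flushing is required and when no mitigation is available; see
 * cpuid_update_l1d_flush() below for how the choice is made.
 */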
2533static void
2534spec_uarch_flush_noop(void)
2535{
2536}
2537
2538/*
 * When microcode is present that mitigates MDS, this wrmsr will also flush the
 * MDS-related micro-architectural state that would otherwise require calling
 * x86_md_clear().
2542 */
2543static void
2544spec_uarch_flush_msr(void)
2545{
2546	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2547}
2548
2549/*
 * This function pointer selects the routine that will flush certain
 * micro-architectural state on the processor. The flush is used to mitigate
 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. It can
 * point to one of three functions:
 *
 * - A noop, either because we are vulnerable but do not have microcode
 *   available to help deal with a fix, or because we aren't vulnerable at
 *   all.
2558 *
2559 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2560 *   mitigate MDS is present, also perform the equivalent of the MDS flush;
 *   however, it only flushes the MDS-related micro-architectural state on the
 *   current hyperthread; it does not do anything for the twin.
2563 *
2564 * - x86_md_clear which will flush the MDS related state. This is done when we
2565 *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2566 *   (RDCL_NO is set).
2567 */
2568void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2569
2570static void
2571cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2572{
2573	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2574
2575	/*
2576	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2577	 * has been fixed in hardware, it doesn't cover everything related to
2578	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2579	 * need to mitigate this.
2580	 */
2581	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2582	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2583		return;
2584	}
2585
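	/*
	 * x86_md_clear() is expected to begin life as a bare return. When the
	 * MD_CLEAR microcode capability is present, we overwrite that first
	 * byte with a nop so that the verw-based flush sequence behind it
	 * actually runs.
	 */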
2586	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2587		const uint8_t nop = NOP_INSTR;
2588		uint8_t *md = (uint8_t *)x86_md_clear;
2589
2590		*md = nop;
2591	}
2592
2593	membar_producer();
2594}
2595
2596static void
2597cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2598{
2599	boolean_t need_l1d, need_mds;
2600	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2601
2602	/*
2603	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2604	 * hardware, then there's nothing left for us to do for enabling the
2605	 * flush. We can also go ahead and say that SMT exclusion is
2606	 * unnecessary.
2607	 */
2608	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2609	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2610	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2611		extern int smt_exclusion;
2612		smt_exclusion = 0;
2613		spec_uarch_flush = spec_uarch_flush_noop;
2614		membar_producer();
2615		return;
2616	}
2617
2618	/*
	 * The locations where we need to perform an L1D flush are required for
	 * mitigating both L1TF and MDS. When verw support is present in
	 * microcode, the L1D flush will take care of doing that as well.
2622	 * However, if we have a system where RDCL_NO is present, but we don't
2623	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2624	 * L1D flush.
2625	 */
2626	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2627	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2628	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2629		need_l1d = B_TRUE;
2630	} else {
2631		need_l1d = B_FALSE;
2632	}
2633
2634	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2635	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2636		need_mds = B_TRUE;
2637	} else {
2638		need_mds = B_FALSE;
2639	}
2640
2641	if (need_l1d) {
2642		spec_uarch_flush = spec_uarch_flush_msr;
2643	} else if (need_mds) {
2644		spec_uarch_flush = x86_md_clear;
2645	} else {
2646		/*
2647		 * We have no hardware mitigations available to us.
2648		 */
2649		spec_uarch_flush = spec_uarch_flush_noop;
2650	}
2651	membar_producer();
2652}
2653
/*
 * We default to enabling RSB (Return Stack Buffer) mitigations. They are
 * disabled by patching a return over x86_rsb_stuff when the chosen Spectre v2
 * mitigation makes RSB stuffing unnecessary (enhanced IBRS) or when
 * mitigations have been disabled entirely.
 */
2657static void
2658cpuid_patch_rsb(x86_spectrev2_mitigation_t mit)
2659{
2660	const uint8_t ret = RET_INSTR;
2661	uint8_t *stuff = (uint8_t *)x86_rsb_stuff;
2662
2663	switch (mit) {
2664	case X86_SPECTREV2_ENHANCED_IBRS:
2665	case X86_SPECTREV2_DISABLED:
2666		*stuff = ret;
2667		break;
2668	default:
2669		break;
2670	}
2671}
2672
2673static void
2674cpuid_patch_retpolines(x86_spectrev2_mitigation_t mit)
2675{
2676	const char *thunks[] = { "_rax", "_rbx", "_rcx", "_rdx", "_rdi",
2677	    "_rsi", "_rbp", "_r8", "_r9", "_r10", "_r11", "_r12", "_r13",
2678	    "_r14", "_r15" };
2679	const uint_t nthunks = ARRAY_SIZE(thunks);
2680	const char *type;
2681	uint_t i;
2682
2683	if (mit == x86_spectrev2_mitigation)
2684		return;
2685
2686	switch (mit) {
2687	case X86_SPECTREV2_RETPOLINE:
2688		type = "gen";
2689		break;
2690	case X86_SPECTREV2_RETPOLINE_AMD:
2691		type = "amd";
2692		break;
2693	case X86_SPECTREV2_ENHANCED_IBRS:
2694	case X86_SPECTREV2_DISABLED:
2695		type = "jmp";
2696		break;
2697	default:
		panic("asked to update retpoline state with unknown state!");
2699	}
2700
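	/*
	 * Patch each of the kernel's __x86_indirect_thunk_<reg> entry points
	 * by copying in the selected implementation: the generic retpoline,
	 * the AMD lfence-based variant, or the plain "jmp" variant used when
	 * retpolines are not needed.
	 */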
2701	for (i = 0; i < nthunks; i++) {
2702		uintptr_t source, dest;
2703		int ssize, dsize;
2704		char sourcebuf[64], destbuf[64];
2705		size_t len;
2706
2707		(void) snprintf(destbuf, sizeof (destbuf),
2708		    "__x86_indirect_thunk%s", thunks[i]);
2709		(void) snprintf(sourcebuf, sizeof (sourcebuf),
2710		    "__x86_indirect_thunk_%s%s", type, thunks[i]);
2711
2712		source = kobj_getelfsym(sourcebuf, NULL, &ssize);
2713		dest = kobj_getelfsym(destbuf, NULL, &dsize);
2714		VERIFY3U(source, !=, 0);
2715		VERIFY3U(dest, !=, 0);
2716		VERIFY3S(dsize, >=, ssize);
2717		bcopy((void *)source, (void *)dest, ssize);
2718	}
2719}
2720
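/*
 * Enable enhanced IBRS on this CPU by setting the IBRS bit in
 * MSR_IA32_SPEC_CTRL.
 */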
2721static void
2722cpuid_enable_enhanced_ibrs(void)
2723{
2724	uint64_t val;
2725
2726	val = rdmsr(MSR_IA32_SPEC_CTRL);
2727	val |= IA32_SPEC_CTRL_IBRS;
2728	wrmsr(MSR_IA32_SPEC_CTRL, val);
2729}
2730
2731#ifndef __xpv
2732/*
2733 * Determine whether or not we can use the AMD optimized retpoline
2734 * functionality. We use this when we know we're on an AMD system and we can
2735 * successfully verify that lfence is dispatch serializing.
2736 */
2737static boolean_t
2738cpuid_use_amd_retpoline(struct cpuid_info *cpi)
2739{
2740	uint64_t val;
2741	on_trap_data_t otd;
2742
2743	if (cpi->cpi_vendor != X86_VENDOR_AMD)
2744		return (B_FALSE);
2745
2746	/*
2747	 * We need to determine whether or not lfence is serializing. It always
2748	 * is on families 0xf and 0x11. On others, it's controlled by
2749	 * MSR_AMD_DECODE_CONFIG (MSRC001_1029). If some hypervisor gives us a
	 * crazy old family, don't try to do anything.
2751	 */
2752	if (cpi->cpi_family < 0xf)
2753		return (B_FALSE);
2754	if (cpi->cpi_family == 0xf || cpi->cpi_family == 0x11)
2755		return (B_TRUE);
2756
2757	/*
	 * While it may be tempting to use get_hwenv(), there are no promises
	 * that a hypervisor will actually declare itself in a friendly way.
	 * As such, try to read and set the MSR. If we can then read back the
	 * value we set (it wasn't silently dropped to zero), then we go for it.
2763	 */
2764	if (!on_trap(&otd, OT_DATA_ACCESS)) {
2765		val = rdmsr(MSR_AMD_DECODE_CONFIG);
2766		val |= AMD_DECODE_CONFIG_LFENCE_DISPATCH;
2767		wrmsr(MSR_AMD_DECODE_CONFIG, val);
2768		val = rdmsr(MSR_AMD_DECODE_CONFIG);
2769	} else {
2770		val = 0;
2771	}
2772	no_trap();
2773
2774	if ((val & AMD_DECODE_CONFIG_LFENCE_DISPATCH) != 0)
2775		return (B_TRUE);
2776	return (B_FALSE);
2777}
2778#endif	/* !__xpv */
2779
2780/*
2781 * Determine how we should mitigate TAA or if we need to. Regardless of TAA, if
2782 * we can disable TSX, we do so.
2783 *
2784 * This determination is done only on the boot CPU, potentially after loading
2785 * updated microcode.
2786 */
2787static void
2788cpuid_update_tsx(cpu_t *cpu, uchar_t *featureset)
2789{
2790	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2791
2792	VERIFY(cpu->cpu_id == 0);
2793
2794	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2795		x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2796		return;
2797	}
2798
2799	if (x86_disable_taa) {
2800		x86_taa_mitigation = X86_TAA_DISABLED;
2801		return;
2802	}
2803
2804	/*
2805	 * If we do not have the ability to disable TSX, then our only
2806	 * mitigation options are in hardware (TAA_NO), or by using our existing
2807	 * MDS mitigation as described above.  The latter relies upon us having
	 * configured MDS mitigations correctly! This includes disabling SMT if
	 * we want cross-CPU-thread protection.
2810	 */
2811	if (!is_x86_feature(featureset, X86FSET_TSX_CTRL)) {
2812		/*
		 * It's not clear whether any parts will enumerate TAA_NO
		 * *without* TSX_CTRL, but treat the CPU as hardware-mitigated
		 * if we see this.
2815		 */
2816		if (is_x86_feature(featureset, X86FSET_TAA_NO)) {
2817			x86_taa_mitigation = X86_TAA_HW_MITIGATED;
2818			return;
2819		}
2820
2821		if (is_x86_feature(featureset, X86FSET_MD_CLEAR) &&
2822		    !is_x86_feature(featureset, X86FSET_MDS_NO)) {
2823			x86_taa_mitigation = X86_TAA_MD_CLEAR;
2824		} else {
2825			x86_taa_mitigation = X86_TAA_NOTHING;
2826		}
2827		return;
2828	}
2829
2830	/*
2831	 * We have TSX_CTRL, but we can only fully disable TSX if we're early
2832	 * enough in boot.
2833	 *
2834	 * Otherwise, we'll fall back to causing transactions to abort as our
2835	 * mitigation. TSX-using code will always take the fallback path.
2836	 */
2837	if (cpi->cpi_pass < 4) {
2838		x86_taa_mitigation = X86_TAA_TSX_DISABLE;
2839	} else {
2840		x86_taa_mitigation = X86_TAA_TSX_FORCE_ABORT;
2841	}
2842}
2843
2844/*
 * As mentioned, we should only touch the MSR when suitable microcode has been
 * loaded on this CPU.
2847 */
2848static void
2849cpuid_apply_tsx(x86_taa_mitigation_t taa, uchar_t *featureset)
2850{
2851	uint64_t val;
2852
2853	switch (taa) {
2854	case X86_TAA_TSX_DISABLE:
2855		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2856			return;
2857		val = rdmsr(MSR_IA32_TSX_CTRL);
2858		val |= IA32_TSX_CTRL_CPUID_CLEAR | IA32_TSX_CTRL_RTM_DISABLE;
2859		wrmsr(MSR_IA32_TSX_CTRL, val);
2860		break;
2861	case X86_TAA_TSX_FORCE_ABORT:
2862		if (!is_x86_feature(featureset, X86FSET_TSX_CTRL))
2863			return;
2864		val = rdmsr(MSR_IA32_TSX_CTRL);
2865		val |= IA32_TSX_CTRL_RTM_DISABLE;
2866		wrmsr(MSR_IA32_TSX_CTRL, val);
2867		break;
2868	case X86_TAA_HW_MITIGATED:
2869	case X86_TAA_MD_CLEAR:
2870	case X86_TAA_DISABLED:
2871	case X86_TAA_NOTHING:
2872		break;
2873	}
2874}
2875
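/*
 * Scan the cpuid leaves and MSRs that describe speculative-execution
 * vulnerabilities and their mitigations, set the corresponding feature bits,
 * and, on the boot CPU, select and apply the system-wide mitigation strategy
 * for Spectre v2, L1TF, MDS, and TAA.
 */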
2876static void
2877cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2878{
2879	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2880	x86_spectrev2_mitigation_t v2mit;
2881
2882	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2883	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2884		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2885			add_x86_feature(featureset, X86FSET_IBPB);
2886		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2887			add_x86_feature(featureset, X86FSET_IBRS);
2888		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2889			add_x86_feature(featureset, X86FSET_STIBP);
2890		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2891			add_x86_feature(featureset, X86FSET_STIBP_ALL);
2892		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2893			add_x86_feature(featureset, X86FSET_SSBD);
2894		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2895			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2896		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2897			add_x86_feature(featureset, X86FSET_SSB_NO);
2898		/*
		 * Don't enable enhanced IBRS unless we're told that we should
		 * prefer it and that it has the same semantics as Intel's. AMD
		 * splits this into two bits rather than a single one.
2902		 */
2903		if ((cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS) &&
2904		    (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)) {
2905			add_x86_feature(featureset, X86FSET_IBRS_ALL);
2906		}
2907
2908	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2909	    cpi->cpi_maxeax >= 7) {
2910		struct cpuid_regs *ecp;
2911		ecp = &cpi->cpi_std[7];
2912
2913		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2914			add_x86_feature(featureset, X86FSET_MD_CLEAR);
2915		}
2916
2917		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2918			add_x86_feature(featureset, X86FSET_IBRS);
2919			add_x86_feature(featureset, X86FSET_IBPB);
2920		}
2921
2922		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2923			add_x86_feature(featureset, X86FSET_STIBP);
2924		}
2925
2926		/*
2927		 * Don't read the arch caps MSR on xpv where we lack the
2928		 * on_trap().
2929		 */
2930#ifndef __xpv
2931		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2932			on_trap_data_t otd;
2933
2934			/*
2935			 * Be paranoid and assume we'll get a #GP.
2936			 */
2937			if (!on_trap(&otd, OT_DATA_ACCESS)) {
2938				uint64_t reg;
2939
2940				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2941				if (reg & IA32_ARCH_CAP_RDCL_NO) {
2942					add_x86_feature(featureset,
2943					    X86FSET_RDCL_NO);
2944				}
2945				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2946					add_x86_feature(featureset,
2947					    X86FSET_IBRS_ALL);
2948				}
2949				if (reg & IA32_ARCH_CAP_RSBA) {
2950					add_x86_feature(featureset,
2951					    X86FSET_RSBA);
2952				}
2953				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2954					add_x86_feature(featureset,
2955					    X86FSET_L1D_VM_NO);
2956				}
2957				if (reg & IA32_ARCH_CAP_SSB_NO) {
2958					add_x86_feature(featureset,
2959					    X86FSET_SSB_NO);
2960				}
2961				if (reg & IA32_ARCH_CAP_MDS_NO) {
2962					add_x86_feature(featureset,
2963					    X86FSET_MDS_NO);
2964				}
2965				if (reg & IA32_ARCH_CAP_TSX_CTRL) {
2966					add_x86_feature(featureset,
2967					    X86FSET_TSX_CTRL);
2968				}
2969				if (reg & IA32_ARCH_CAP_TAA_NO) {
2970					add_x86_feature(featureset,
2971					    X86FSET_TAA_NO);
2972				}
2973			}
2974			no_trap();
2975		}
2976#endif	/* !__xpv */
2977
2978		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2979			add_x86_feature(featureset, X86FSET_SSBD);
2980
2981		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2982			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2983	}
2984
2985	/*
2986	 * Take care of certain mitigations on the non-boot CPU. The boot CPU
2987	 * will have already run this function and determined what we need to
2988	 * do. This gives us a hook for per-HW thread mitigations such as
2989	 * enhanced IBRS, or disabling TSX.
2990	 */
2991	if (cpu->cpu_id != 0) {
2992		if (x86_spectrev2_mitigation == X86_SPECTREV2_ENHANCED_IBRS) {
2993			cpuid_enable_enhanced_ibrs();
2994		}
2995
2996		cpuid_apply_tsx(x86_taa_mitigation, featureset);
2997		return;
2998	}
2999
3000	/*
3001	 * Go through and initialize various security mechanisms that we should
3002	 * only do on a single CPU. This includes Spectre V2, L1TF, MDS, and
3003	 * TAA.
3004	 */
3005
3006	/*
3007	 * By default we've come in with retpolines enabled. Check whether we
3008	 * should disable them or enable enhanced IBRS. RSB stuffing is enabled
3009	 * by default, but disabled if we are using enhanced IBRS.
3010	 */
3011	if (x86_disable_spectrev2 != 0) {
3012		v2mit = X86_SPECTREV2_DISABLED;
3013	} else if (is_x86_feature(featureset, X86FSET_IBRS_ALL)) {
3014		cpuid_enable_enhanced_ibrs();
3015		v2mit = X86_SPECTREV2_ENHANCED_IBRS;
3016#ifndef __xpv
3017	} else if (cpuid_use_amd_retpoline(cpi)) {
3018		v2mit = X86_SPECTREV2_RETPOLINE_AMD;
3019#endif	/* !__xpv */
3020	} else {
3021		v2mit = X86_SPECTREV2_RETPOLINE;
3022	}
3023
3024	cpuid_patch_retpolines(v2mit);
3025	cpuid_patch_rsb(v2mit);
3026	x86_spectrev2_mitigation = v2mit;
3027	membar_producer();
3028
3029	/*
3030	 * We need to determine what changes are required for mitigating L1TF
3031	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
3032	 * is required.
3033	 *
3034	 * If any of these are present, then we need to flush u-arch state at
3035	 * various points. For MDS, we need to do so whenever we change to a
3036	 * lesser privilege level or we are halting the CPU. For L1TF we need to
3037	 * flush the L1D cache at VM entry. When we have microcode that handles
3038	 * MDS, the L1D flush also clears the other u-arch state that the
3039	 * md_clear does.
3040	 */
3041
3042	/*
3043	 * Update whether or not we need to be taking explicit action against
3044	 * MDS.
3045	 */
3046	cpuid_update_md_clear(cpu, featureset);
3047
3048	/*
3049	 * Determine whether SMT exclusion is required and whether or not we
3050	 * need to perform an l1d flush.
3051	 */
3052	cpuid_update_l1d_flush(cpu, featureset);
3053
3054	/*
3055	 * Determine what our mitigation strategy should be for TAA and then
3056	 * also apply TAA mitigations.
3057	 */
3058	cpuid_update_tsx(cpu, featureset);
3059	cpuid_apply_tsx(x86_taa_mitigation, featureset);
3060}
3061
3062/*
 * Set up the XFeature_Enabled_Mask register. Required by the xsave feature.
3064 */
3065void
3066setup_xfem(void)
3067{
3068	uint64_t flags = XFEATURE_LEGACY_FP;
3069
3070	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
3071
3072	if (is_x86_feature(x86_featureset, X86FSET_SSE))
3073		flags |= XFEATURE_SSE;
3074
3075	if (is_x86_feature(x86_featureset, X86FSET_AVX))
3076		flags |= XFEATURE_AVX;
3077
3078	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
3079		flags |= XFEATURE_AVX512;
3080
3081	set_xcr(XFEATURE_ENABLED_MASK, flags);
3082
3083	xsave_bv_all = flags;
3084}
3085
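/*
 * Establish the basic topology of this CPU: the number of cores and threads
 * per chip and the chip, core, and logical IDs derived from that information.
 */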
3086static void
3087cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
3088{
3089	struct cpuid_info *cpi;
3090
3091	cpi = cpu->cpu_m.mcpu_cpi;
3092
3093	if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3094		cpuid_gather_amd_topology_leaves(cpu);
3095	}
3096
3097	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
3098
3099	/*
3100	 * Before we can calculate the IDs that we should assign to this
3101	 * processor, we need to understand how many cores and threads it has.
3102	 */
3103	switch (cpi->cpi_vendor) {
3104	case X86_VENDOR_Intel:
3105		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3106		    &cpi->cpi_ncore_per_chip);
3107		break;
3108	case X86_VENDOR_AMD:
3109		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
3110		    &cpi->cpi_ncore_per_chip);
3111		break;
3112	default:
3113		/*
		 * If we have some other x86-compatible chip, it's not clear
		 * how it would behave. The most common case today is
		 * virtualization, though there are also 64-bit VIA chips.
		 * Assume that all we can get is the basic Leaf 1 HTT
		 * information.
3118		 */
3119		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
3120			cpi->cpi_ncore_per_chip = 1;
3121			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
3122		}
3123		break;
3124	}
3125
3126	/*
3127	 * Based on the calculated number of threads and cores, potentially
3128	 * assign the HTT and CMT features.
3129	 */
3130	if (cpi->cpi_ncore_per_chip > 1) {
3131		add_x86_feature(featureset, X86FSET_CMP);
3132	}
3133
3134	if (cpi->cpi_ncpu_per_chip > 1 &&
3135	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
3136		add_x86_feature(featureset, X86FSET_HTT);
3137	}
3138
3139	/*
	 * Now that the core and thread counts have been set up, calculate the
	 * rest of the topology parameters. If we think the CPU has neither SMT
	 * (HTT) nor CMP, then we basically fake up the remaining information.
	 * The most likely case for this is virtualization, where we often only
	 * have partial topology information.
3145	 */
3146	if (!is_x86_feature(featureset, X86FSET_HTT) &&
3147	    !is_x86_feature(featureset, X86FSET_CMP)) {
3148		/*
3149		 * This is a single core, single-threaded processor.
3150		 */
3151		cpi->cpi_procnodes_per_pkg = 1;
3152		cpi->cpi_cores_per_compunit = 1;
3153		cpi->cpi_compunitid = 0;
3154		cpi->cpi_chipid = -1;
3155		cpi->cpi_clogid = 0;
3156		cpi->cpi_coreid = cpu->cpu_id;
3157		cpi->cpi_pkgcoreid = 0;
3158		if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3159			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
3160		} else {
3161			cpi->cpi_procnodeid = cpi->cpi_chipid;
3162		}
3163	} else {
3164		switch (cpi->cpi_vendor) {
3165		case X86_VENDOR_Intel:
3166			cpuid_intel_getids(cpu, featureset);
3167			break;
3168		case X86_VENDOR_AMD:
3169			cpuid_amd_getids(cpu, featureset);
3170			break;
3171		default:
3172			/*
			 * In this case, it's hard to say what we should do.
			 * We model these CPUs to the OS as single-core threads
			 * that all live on a single chip. We don't have a good
			 * identifier for them, so we just use the cpu id.
3178			 *
3179			 * This case has historically been different from the
3180			 * case above where we don't have HTT or CMP. While they
3181			 * could be combined, we've opted to keep it separate to
3182			 * minimize the risk of topology changes in weird cases.
3183			 */
3184			cpi->cpi_procnodes_per_pkg = 1;
3185			cpi->cpi_cores_per_compunit = 1;
3186			cpi->cpi_chipid = 0;
3187			cpi->cpi_coreid = cpu->cpu_id;
3188			cpi->cpi_clogid = cpu->cpu_id;
3189			cpi->cpi_pkgcoreid = cpu->cpu_id;
3190			cpi->cpi_procnodeid = cpi->cpi_chipid;
3191			cpi->cpi_compunitid = cpi->cpi_coreid;
3192			break;
3193		}
3194	}
3195}
3196
3197/*
 * Gather relevant CPU features from leaf 6, which covers thermal information.
 * We always gather leaf 6 if it's supported; however, we only look for
 * features on Intel systems, as AMD does not currently define any of the
 * features we look for below.
3202 */
3203static void
3204cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
3205{
3206	struct cpuid_regs *cp;
3207	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3208
3209	if (cpi->cpi_maxeax < 6) {
3210		return;
3211	}
3212
3213	cp = &cpi->cpi_std[6];
3214	cp->cp_eax = 6;
3215	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
3216	(void) __cpuid_insn(cp);
3217	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
3218
3219	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
3220		return;
3221	}
3222
3223	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
3224		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
3225	}
3226
3227	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
3228		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
3229	}
3230}
3231
3232/*
3233 * PPIN is the protected processor inventory number. On AMD this is an actual
 * feature bit. However, on Intel systems we instead need to read the platform
 * information MSR, and only specific models support it.
3236 */
3237#if !defined(__xpv)
3238static void
3239cpuid_pass1_ppin(cpu_t *cpu, uchar_t *featureset)
3240{
3241	on_trap_data_t otd;
3242	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3243
3244	switch (cpi->cpi_vendor) {
3245	case X86_VENDOR_AMD:
3246		/*
3247		 * This leaf will have already been gathered in the topology
3248		 * functions.
3249		 */
3250		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
3251			if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PPIN) {
3252				add_x86_feature(featureset, X86FSET_PPIN);
3253			}
3254		}
3255		break;
3256	case X86_VENDOR_Intel:
3257		if (cpi->cpi_family != 6)
3258			break;
3259		switch (cpi->cpi_model) {
3260		case INTC_MODEL_IVYBRIDGE_XEON:
3261		case INTC_MODEL_HASWELL_XEON:
3262		case INTC_MODEL_BROADWELL_XEON:
3263		case INTC_MODEL_BROADWELL_XEON_D:
3264		case INTC_MODEL_SKYLAKE_XEON:
3265			if (!on_trap(&otd, OT_DATA_ACCESS)) {
3266				uint64_t value;
3267
3268				value = rdmsr(MSR_PLATFORM_INFO);
3269				if ((value & MSR_PLATFORM_INFO_PPIN) != 0) {
3270					add_x86_feature(featureset,
3271					    X86FSET_PPIN);
3272				}
3273			}
3274			no_trap();
3275			break;
3276		default:
3277			break;
3278		}
3279		break;
3280	default:
3281		break;
3282	}
3283}
3284#endif	/* ! __xpv */
3285
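/*
 * Pass 1 of CPU identification: gather the basic and extended cpuid leaves we
 * depend on, apply vendor- and platform-specific workarounds, and populate
 * the feature set for this CPU.
 */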
3286void
3287cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
3288{
3289	uint32_t mask_ecx, mask_edx;
3290	struct cpuid_info *cpi;
3291	struct cpuid_regs *cp;
3292	int xcpuid;
3293#if !defined(__xpv)
3294	extern int idle_cpu_prefer_mwait;
3295#endif
3296
3297	/*
3298	 * Space statically allocated for BSP, ensure pointer is set
3299	 */
3300	if (cpu->cpu_id == 0) {
3301		if (cpu->cpu_m.mcpu_cpi == NULL)
3302			cpu->cpu_m.mcpu_cpi = &cpuid_info0;
3303	}
3304
3305	add_x86_feature(featureset, X86FSET_CPUID);
3306
3307	cpi = cpu->cpu_m.mcpu_cpi;
3308	ASSERT(cpi != NULL);
3309	cp = &cpi->cpi_std[0];
3310	cp->cp_eax = 0;
3311	cpi->cpi_maxeax = __cpuid_insn(cp);
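	/*
	 * The 12-byte vendor string is returned in %ebx, %edx, and %ecx, in
	 * that order (e.g. "GenuineIntel" or "AuthenticAMD").
	 */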
3312	{
3313		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
3314		*iptr++ = cp->cp_ebx;
3315		*iptr++ = cp->cp_edx;
3316		*iptr++ = cp->cp_ecx;
3317		*(char *)&cpi->cpi_vendorstr[12] = '\0';
3318	}
3319
3320	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
3321	x86_vendor = cpi->cpi_vendor; /* for compatibility */
3322
3323	/*
3324	 * Limit the range in case of weird hardware
3325	 */
3326	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
3327		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
3328	if (cpi->cpi_maxeax < 1)
3329		goto pass1_done;
3330
3331	cp = &cpi->cpi_std[1];
3332	cp->cp_eax = 1;
3333	(void) __cpuid_insn(cp);
3334
3335	/*
3336	 * Extract identifying constants for easy access.
3337	 */
3338	cpi->cpi_model = CPI_MODEL(cpi);
3339	cpi->cpi_family = CPI_FAMILY(cpi);
3340
3341	if (cpi->cpi_family == 0xf)
3342		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
3343
3344	/*
3345	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
3346	 * Intel, and presumably everyone else, uses model == 0xf, as
3347	 * one would expect (max value means possible overflow).  Sigh.
3348	 */
3349
3350	switch (cpi->cpi_vendor) {
3351	case X86_VENDOR_Intel:
3352		if (IS_EXTENDED_MODEL_INTEL(cpi))
3353			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3354		break;
3355	case X86_VENDOR_AMD:
3356		if (CPI_FAMILY(cpi) == 0xf)
3357			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3358		break;
3359	default:
3360		if (cpi->cpi_model == 0xf)
3361			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
3362		break;
3363	}
3364
3365	cpi->cpi_step = CPI_STEP(cpi);
3366	cpi->cpi_brandid = CPI_BRANDID(cpi);
3367
3368	/*
3369	 * *default* assumptions:
3370	 * - believe %edx feature word
3371	 * - ignore %ecx feature word
3372	 * - 32-bit virtual and physical addressing
3373	 */
3374	mask_edx = 0xffffffff;
3375	mask_ecx = 0;
3376
3377	cpi->cpi_pabits = cpi->cpi_vabits = 32;
3378
3379	switch (cpi->cpi_vendor) {
3380	case X86_VENDOR_Intel:
3381		if (cpi->cpi_family == 5)
3382			x86_type = X86_TYPE_P5;
3383		else if (IS_LEGACY_P6(cpi)) {
3384			x86_type = X86_TYPE_P6;
3385			pentiumpro_bug4046376 = 1;
3386			/*
3387			 * Clear the SEP bit when it was set erroneously
3388			 */
3389			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
3390				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
3391		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
3392			x86_type = X86_TYPE_P4;
3393			/*
3394			 * We don't currently depend on any of the %ecx
3395			 * features until Prescott, so we'll only check
3396			 * this from P4 onwards.  We might want to revisit
3397			 * that idea later.
3398			 */
3399			mask_ecx = 0xffffffff;
3400		} else if (cpi->cpi_family > 0xf)
3401			mask_ecx = 0xffffffff;
3402		/*
3403		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3404		 * to obtain the monitor linesize.
3405		 */
3406		if (cpi->cpi_maxeax < 5)
3407			mask_ecx &= ~CPUID_INTC_ECX_MON;
3408		break;
3409	case X86_VENDOR_IntelClone:
3410	default:
3411		break;
3412	case X86_VENDOR_AMD:
3413#if defined(OPTERON_ERRATUM_108)
3414		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
3415			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
3416			cpi->cpi_model = 0xc;
3417		} else
3418#endif
3419		if (cpi->cpi_family == 5) {
3420			/*
3421			 * AMD K5 and K6
3422			 *
3423			 * These CPUs have an incomplete implementation
3424			 * of MCA/MCE which we mask away.
3425			 */
3426			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
3427
3428			/*
3429			 * Model 0 uses the wrong (APIC) bit
3430			 * to indicate PGE.  Fix it here.
3431			 */
3432			if (cpi->cpi_model == 0) {
3433				if (cp->cp_edx & 0x200) {
3434					cp->cp_edx &= ~0x200;
3435					cp->cp_edx |= CPUID_INTC_EDX_PGE;
3436				}
3437			}
3438
3439			/*
3440			 * Early models had problems w/ MMX; disable.
3441			 */
3442			if (cpi->cpi_model < 6)
3443				mask_edx &= ~CPUID_INTC_EDX_MMX;
3444		}
3445
3446		/*
3447		 * For newer families, SSE3 and CX16, at least, are valid;
3448		 * enable all
3449		 */
3450		if (cpi->cpi_family >= 0xf)
3451			mask_ecx = 0xffffffff;
3452		/*
3453		 * We don't support MONITOR/MWAIT if leaf 5 is not available
3454		 * to obtain the monitor linesize.
3455		 */
3456		if (cpi->cpi_maxeax < 5)
3457			mask_ecx &= ~CPUID_INTC_ECX_MON;
3458
3459#if !defined(__xpv)
3460		/*
3461		 * AMD has not historically used MWAIT in the CPU's idle loop.
		 * Pre-family-10h Opterons do not have the MWAIT instruction.
		 * We know for certain that in at least family 17h, per AMD,
		 * mwait is preferred. For the families in between, the
		 * guidance is less clear.
3465		 */
3466		if (cpi->cpi_family < 0x17) {
3467			idle_cpu_prefer_mwait = 0;
3468		}
3469#endif
3470
3471		break;
3472	case X86_VENDOR_TM:
3473		/*
3474		 * workaround the NT workaround in CMS 4.1
3475		 */
3476		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
3477		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
3478			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3479		break;
3480	case X86_VENDOR_Centaur:
3481		/*
3482		 * workaround the NT workarounds again
3483		 */
3484		if (cpi->cpi_family == 6)
3485			cp->cp_edx |= CPUID_INTC_EDX_CX8;
3486		break;
3487	case X86_VENDOR_Cyrix:
3488		/*
3489		 * We rely heavily on the probing in locore
3490		 * to actually figure out what parts, if any,
3491		 * of the Cyrix cpuid instruction to believe.
3492		 */
3493		switch (x86_type) {
3494		case X86_TYPE_CYRIX_486:
3495			mask_edx = 0;
3496			break;
3497		case X86_TYPE_CYRIX_6x86:
3498			mask_edx = 0;
3499			break;
3500		case X86_TYPE_CYRIX_6x86L:
3501			mask_edx =
3502			    CPUID_INTC_EDX_DE |
3503			    CPUID_INTC_EDX_CX8;
3504			break;
3505		case X86_TYPE_CYRIX_6x86MX:
3506			mask_edx =
3507			    CPUID_INTC_EDX_DE |
3508			    CPUID_INTC_EDX_MSR |
3509			    CPUID_INTC_EDX_CX8 |
3510			    CPUID_INTC_EDX_PGE |
3511			    CPUID_INTC_EDX_CMOV |
3512			    CPUID_INTC_EDX_MMX;
3513			break;
3514		case X86_TYPE_CYRIX_GXm:
3515			mask_edx =
3516			    CPUID_INTC_EDX_MSR |
3517			    CPUID_INTC_EDX_CX8 |
3518			    CPUID_INTC_EDX_CMOV |
3519			    CPUID_INTC_EDX_MMX;
3520			break;
3521		case X86_TYPE_CYRIX_MediaGX:
3522			break;
3523		case X86_TYPE_CYRIX_MII:
3524		case X86_TYPE_VIA_CYRIX_III:
3525			mask_edx =
3526			    CPUID_INTC_EDX_DE |
3527			    CPUID_INTC_EDX_TSC |
3528			    CPUID_INTC_EDX_MSR |
3529			    CPUID_INTC_EDX_CX8 |
3530			    CPUID_INTC_EDX_PGE |
3531			    CPUID_INTC_EDX_CMOV |
3532			    CPUID_INTC_EDX_MMX;
3533			break;
3534		default:
3535			break;
3536		}
3537		break;
3538	}
3539
3540#if defined(__xpv)
3541	/*
3542	 * Do not support MONITOR/MWAIT under a hypervisor
3543	 */
3544	mask_ecx &= ~CPUID_INTC_ECX_MON;
3545	/*
3546	 * Do not support XSAVE under a hypervisor for now
3547	 */
3548	xsave_force_disable = B_TRUE;
3549
3550#endif	/* __xpv */
3551
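	/*
	 * If XSAVE has been forcibly disabled, also mask off the leaf 1 %ecx
	 * features that depend on the extended save area.
	 */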
3552	if (xsave_force_disable) {
3553		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
3554		mask_ecx &= ~CPUID_INTC_ECX_AVX;
3555		mask_ecx &= ~CPUID_INTC_ECX_F16C;
3556		mask_ecx &= ~CPUID_INTC_ECX_FMA;
3557	}
3558
3559	/*
3560	 * Now we've figured out the masks that determine
3561	 * which bits we choose to believe, apply the masks
3562	 * to the feature words, then map the kernel's view
3563	 * of these feature words into its feature word.
3564	 */
3565	cp->cp_edx &= mask_edx;
3566	cp->cp_ecx &= mask_ecx;
3567
3568	/*
3569	 * apply any platform restrictions (we don't call this
3570	 * immediately after __cpuid_insn here, because we need the
3571	 * workarounds applied above first)
3572	 */
3573	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
3574
3575	/*
	 * In addition to ecx and edx, Intel and AMD store a bunch of
3577	 * instruction set extensions in leaf 7's ebx, ecx, and edx.
3578	 */
3579	if (cpi->cpi_maxeax >= 7) {
3580		struct cpuid_regs *ecp;
3581		ecp = &cpi->cpi_std[7];
3582		ecp->cp_eax = 7;
3583		ecp->cp_ecx = 0;
3584		(void) __cpuid_insn(ecp);
3585
3586		/*
3587		 * If XSAVE has been disabled, just ignore all of the
3588		 * extended-save-area dependent flags here.
3589		 */
3590		if (xsave_force_disable) {
3591			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
3592			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
3593			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
3594			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
3595			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3596			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3597			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3598		}
3599
3600		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
3601			add_x86_feature(featureset, X86FSET_SMEP);
3602
3603		/*
3604		 * We check disable_smap here in addition to in startup_smap()
3605		 * to ensure CPUs that aren't the boot CPU don't accidentally
3606		 * include it in the feature set and thus generate a mismatched
3607		 * x86 feature set across CPUs.
3608		 */
3609		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
3610		    disable_smap == 0)
3611			add_x86_feature(featureset, X86FSET_SMAP);
3612
3613		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
3614			add_x86_feature(featureset, X86FSET_RDSEED);
3615
3616		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
3617			add_x86_feature(featureset, X86FSET_ADX);
3618
3619		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
3620			add_x86_feature(featureset, X86FSET_FSGSBASE);
3621
3622		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
3623			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
3624
3625		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3626			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
3627				add_x86_feature(featureset, X86FSET_INVPCID);
3628
3629			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
3630				add_x86_feature(featureset, X86FSET_MPX);
3631
3632			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
3633				add_x86_feature(featureset, X86FSET_CLWB);
3634		}
3635	}
3636
3637	/*
3638	 * fold in overrides from the "eeprom" mechanism
3639	 */
3640	cp->cp_edx |= cpuid_feature_edx_include;
3641	cp->cp_edx &= ~cpuid_feature_edx_exclude;
3642
3643	cp->cp_ecx |= cpuid_feature_ecx_include;
3644	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
3645
3646	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
3647		add_x86_feature(featureset, X86FSET_LARGEPAGE);
3648	}
3649	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
3650		add_x86_feature(featureset, X86FSET_TSC);
3651	}
3652	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
3653		add_x86_feature(featureset, X86FSET_MSR);
3654	}
3655	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
3656		add_x86_feature(featureset, X86FSET_MTRR);
3657	}
3658	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
3659		add_x86_feature(featureset, X86FSET_PGE);
3660	}
3661	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
3662		add_x86_feature(featureset, X86FSET_CMOV);
3663	}
3664	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
3665		add_x86_feature(featureset, X86FSET_MMX);
3666	}
3667	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
3668	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
3669		add_x86_feature(featureset, X86FSET_MCA);
3670	}
3671	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
3672		add_x86_feature(featureset, X86FSET_PAE);
3673	}
3674	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
3675		add_x86_feature(featureset, X86FSET_CX8);
3676	}
3677	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
3678		add_x86_feature(featureset, X86FSET_CX16);
3679	}
3680	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
3681		add_x86_feature(featureset, X86FSET_PAT);
3682	}
3683	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
3684		add_x86_feature(featureset, X86FSET_SEP);
3685	}
3686	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
3687		/*
3688		 * In our implementation, fxsave/fxrstor
3689		 * are prerequisites before we'll even
3690		 * try and do SSE things.
3691		 */
3692		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
3693			add_x86_feature(featureset, X86FSET_SSE);
3694		}
3695		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
3696			add_x86_feature(featureset, X86FSET_SSE2);
3697		}
3698		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
3699			add_x86_feature(featureset, X86FSET_SSE3);
3700		}
3701		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
3702			add_x86_feature(featureset, X86FSET_SSSE3);
3703		}
3704		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
3705			add_x86_feature(featureset, X86FSET_SSE4_1);
3706		}
3707		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
3708			add_x86_feature(featureset, X86FSET_SSE4_2);
3709		}
3710		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
3711			add_x86_feature(featureset, X86FSET_AES);
3712		}
3713		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
3714			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
3715		}
3716
3717		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
3718			add_x86_feature(featureset, X86FSET_SHA);
3719
3720		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
3721			add_x86_feature(featureset, X86FSET_UMIP);
3722		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
3723			add_x86_feature(featureset, X86FSET_PKU);
3724		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
3725			add_x86_feature(featureset, X86FSET_OSPKE);
3726
3727		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
3728			add_x86_feature(featureset, X86FSET_XSAVE);
3729
3730			/* We only test AVX & AVX512 when there is XSAVE */
3731
3732			if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
3733				add_x86_feature(featureset,
3734				    X86FSET_AVX);
3735
3736				/*
3737				 * Intel says we can't check these without also
3738				 * checking AVX.
3739				 */
3740				if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3741					add_x86_feature(featureset,
3742					    X86FSET_F16C);
3743
3744				if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3745					add_x86_feature(featureset,
3746					    X86FSET_FMA);
3747
3748				if (cpi->cpi_std[7].cp_ebx &
3749				    CPUID_INTC_EBX_7_0_BMI1)
3750					add_x86_feature(featureset,
3751					    X86FSET_BMI1);
3752
3753				if (cpi->cpi_std[7].cp_ebx &
3754				    CPUID_INTC_EBX_7_0_BMI2)
3755					add_x86_feature(featureset,
3756					    X86FSET_BMI2);
3757
3758				if (cpi->cpi_std[7].cp_ebx &
3759				    CPUID_INTC_EBX_7_0_AVX2)
3760					add_x86_feature(featureset,
3761					    X86FSET_AVX2);
3762			}
3763
3764			if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3765			    (cpi->cpi_std[7].cp_ebx &
3766			    CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3767				add_x86_feature(featureset, X86FSET_AVX512F);
3768
3769				if (cpi->cpi_std[7].cp_ebx &
3770				    CPUID_INTC_EBX_7_0_AVX512DQ)
3771					add_x86_feature(featureset,
3772					    X86FSET_AVX512DQ);
3773				if (cpi->cpi_std[7].cp_ebx &
3774				    CPUID_INTC_EBX_7_0_AVX512IFMA)
3775					add_x86_feature(featureset,
3776					    X86FSET_AVX512FMA);
3777				if (cpi->cpi_std[7].cp_ebx &
3778				    CPUID_INTC_EBX_7_0_AVX512PF)
3779					add_x86_feature(featureset,
3780					    X86FSET_AVX512PF);
3781				if (cpi->cpi_std[7].cp_ebx &
3782				    CPUID_INTC_EBX_7_0_AVX512ER)
3783					add_x86_feature(featureset,
3784					    X86FSET_AVX512ER);
3785				if (cpi->cpi_std[7].cp_ebx &
3786				    CPUID_INTC_EBX_7_0_AVX512CD)
3787					add_x86_feature(featureset,
3788					    X86FSET_AVX512CD);
3789				if (cpi->cpi_std[7].cp_ebx &
3790				    CPUID_INTC_EBX_7_0_AVX512BW)
3791					add_x86_feature(featureset,
3792					    X86FSET_AVX512BW);
3793				if (cpi->cpi_std[7].cp_ebx &
3794				    CPUID_INTC_EBX_7_0_AVX512VL)
3795					add_x86_feature(featureset,
3796					    X86FSET_AVX512VL);
3797
3798				if (cpi->cpi_std[7].cp_ecx &
3799				    CPUID_INTC_ECX_7_0_AVX512VBMI)
3800					add_x86_feature(featureset,
3801					    X86FSET_AVX512VBMI);
3802				if (cpi->cpi_std[7].cp_ecx &
3803				    CPUID_INTC_ECX_7_0_AVX512VNNI)
3804					add_x86_feature(featureset,
3805					    X86FSET_AVX512VNNI);
3806				if (cpi->cpi_std[7].cp_ecx &
3807				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3808					add_x86_feature(featureset,
3809					    X86FSET_AVX512VPOPCDQ);
3810
3811				if (cpi->cpi_std[7].cp_edx &
3812				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
3813					add_x86_feature(featureset,
3814					    X86FSET_AVX512NNIW);
3815				if (cpi->cpi_std[7].cp_edx &
3816				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3817					add_x86_feature(featureset,
3818					    X86FSET_AVX512FMAPS);
3819			}
3820		}
3821	}
3822
3823	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3824		if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3825			add_x86_feature(featureset, X86FSET_PCID);
3826		}
3827	}
3828
3829	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3830		add_x86_feature(featureset, X86FSET_X2APIC);
3831	}
3832	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3833		add_x86_feature(featureset, X86FSET_DE);
3834	}
3835#if !defined(__xpv)
3836	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3837
3838		/*
3839		 * We require the CLFLUSH instruction for erratum workaround
3840		 * to use MONITOR/MWAIT.
3841		 */
3842		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3843			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3844			add_x86_feature(featureset, X86FSET_MWAIT);
3845		} else {
3846			extern int idle_cpu_assert_cflush_monitor;
3847
3848			/*
3849			 * All processors we are aware of which have
3850			 * MONITOR/MWAIT also have CLFLUSH.
3851			 */
3852			if (idle_cpu_assert_cflush_monitor) {
3853				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3854				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3855			}
3856		}
3857	}
3858#endif	/* __xpv */
3859
3860	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3861		add_x86_feature(featureset, X86FSET_VMX);
3862	}
3863
3864	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3865		add_x86_feature(featureset, X86FSET_RDRAND);
3866
3867	/*
	 * CPUID leaf 1 %ebx bits 15:8 report the CLFLUSH line size in units of
	 * 8 bytes. We only need to capture this once, for the boot CPU; the
	 * other CPUs are expected to match.
3870	 */
3871	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3872		add_x86_feature(featureset, X86FSET_CLFSH);
3873		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3874	}
3875	if (is_x86_feature(featureset, X86FSET_PAE))
3876		cpi->cpi_pabits = 36;
3877
3878	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
3879		struct cpuid_regs r, *ecp;
3880
3881		ecp = &r;
3882		ecp->cp_eax = 0xD;
3883		ecp->cp_ecx = 1;
3884		ecp->cp_edx = ecp->cp_ebx = 0;
3885		(void) __cpuid_insn(ecp);
3886
3887		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3888			add_x86_feature(featureset, X86FSET_XSAVEOPT);
3889		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3890			add_x86_feature(featureset, X86FSET_XSAVEC);
3891		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3892			add_x86_feature(featureset, X86FSET_XSAVES);
3893	}
3894
3895	/*
3896	 * Work on the "extended" feature information, doing
3897	 * some basic initialization for cpuid_pass2()
3898	 */
3899	xcpuid = 0;
3900	switch (cpi->cpi_vendor) {
3901	case X86_VENDOR_Intel:
3902		/*
3903		 * On KVM we know we will have proper support for extended
3904		 * cpuid.
3905		 */
3906		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3907		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3908		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3909			xcpuid++;
3910		break;
3911	case X86_VENDOR_AMD:
3912		if (cpi->cpi_family > 5 ||
3913		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3914			xcpuid++;
3915		break;
3916	case X86_VENDOR_Cyrix:
3917		/*
3918		 * Only these Cyrix CPUs are -known- to support
3919		 * extended cpuid operations.
3920		 */
3921		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3922		    x86_type == X86_TYPE_CYRIX_GXm)
3923			xcpuid++;
3924		break;
3925	case X86_VENDOR_Centaur:
3926	case X86_VENDOR_TM:
3927	default:
3928		xcpuid++;
3929		break;
3930	}
3931
3932	if (xcpuid) {
3933		cp = &cpi->cpi_extd[0];
3934		cp->cp_eax = CPUID_LEAF_EXT_0;
3935		cpi->cpi_xmaxeax = __cpuid_insn(cp);
3936	}
3937
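	/*
	 * The returned maximum extended leaf is only meaningful if it has the
	 * CPUID_LEAF_EXT_0 (0x80000000) bit set; otherwise extended cpuid is
	 * not actually supported.
	 */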
3938	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3939
3940		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3941			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3942
3943		switch (cpi->cpi_vendor) {
3944		case X86_VENDOR_Intel:
3945		case X86_VENDOR_AMD:
3946			if (cpi->cpi_xmaxeax < 0x80000001)
3947				break;
3948			cp = &cpi->cpi_extd[1];
3949			cp->cp_eax = 0x80000001;
3950			(void) __cpuid_insn(cp);
3951
3952			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3953			    cpi->cpi_family == 5 &&
3954			    cpi->cpi_model == 6 &&
3955			    cpi->cpi_step == 6) {
3956				/*
				 * K6 model 6 uses bit 10 to indicate SYSC;
3958				 * Later models use bit 11. Fix it here.
3959				 */
3960				if (cp->cp_edx & 0x400) {
3961					cp->cp_edx &= ~0x400;
3962					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3963				}
3964			}
3965
3966			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3967
3968			/*
3969			 * Compute the additions to the kernel's feature word.
3970			 */
3971			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3972				add_x86_feature(featureset, X86FSET_NX);
3973			}
3974
3975			/*
			 * Regardless of whether or not we boot 64-bit,
3977			 * we should have a way to identify whether
3978			 * the CPU is capable of running 64-bit.
3979			 */
3980			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3981				add_x86_feature(featureset, X86FSET_64);
3982			}
3983
3984			/* 1 GB large page - enable only for 64 bit kernel */
3985			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3986				add_x86_feature(featureset, X86FSET_1GPG);
3987			}
3988
3989			if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3990			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3991			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3992				add_x86_feature(featureset, X86FSET_SSE4A);
3993			}
3994
3995			/*
3996			 * It's really tricky to support syscall/sysret in
3997			 * the i386 kernel; we rely on sysenter/sysexit
3998			 * instead.  In the amd64 kernel, things are -way-
3999			 * better.
4000			 */
4001			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
4002				add_x86_feature(featureset, X86FSET_ASYSC);
4003			}
4004
4005			/*
4006			 * While we're thinking about system calls, note
4007			 * that AMD processors don't support sysenter
4008			 * in long mode at all, so don't try to program them.
4009			 */
4010			if (x86_vendor == X86_VENDOR_AMD) {
4011				remove_x86_feature(featureset, X86FSET_SEP);
4012			}
4013
4014			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
4015				add_x86_feature(featureset, X86FSET_TSCP);
4016			}
4017
4018			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
4019				add_x86_feature(featureset, X86FSET_SVM);
4020			}
4021
4022			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
4023				add_x86_feature(featureset, X86FSET_TOPOEXT);
4024			}
4025
4026			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
4027				add_x86_feature(featureset, X86FSET_AMD_PCEC);
4028			}
4029
4030			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
4031				add_x86_feature(featureset, X86FSET_XOP);
4032			}
4033
4034			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
4035				add_x86_feature(featureset, X86FSET_FMA4);
4036			}
4037
4038			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
4039				add_x86_feature(featureset, X86FSET_TBM);
4040			}
4041
4042			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
4043				add_x86_feature(featureset, X86FSET_MONITORX);
4044			}
4045			break;
4046		default:
4047			break;
4048		}
4049
4050		/*
4051		 * Get CPUID data about processor cores and hyperthreads.
4052		 */
4053		switch (cpi->cpi_vendor) {
4054		case X86_VENDOR_Intel:
4055			if (cpi->cpi_maxeax >= 4) {
4056				cp = &cpi->cpi_std[4];
4057				cp->cp_eax = 4;
4058				cp->cp_ecx = 0;
4059				(void) __cpuid_insn(cp);
4060				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
4061			}
4062			/*FALLTHROUGH*/
4063		case X86_VENDOR_AMD:
4064			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
4065				break;
4066			cp = &cpi->cpi_extd[8];
4067			cp->cp_eax = CPUID_LEAF_EXT_8;
4068			(void) __cpuid_insn(cp);
4069			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
4070			    cp);
4071
4072			/*
4073			 * AMD uses ebx for some extended functions.
4074			 */
4075			if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4076				/*
4077				 * While we're here, check for the AMD "Error
4078				 * Pointer Zero/Restore" feature. This can be
				 * used to set up the FP save handlers
4080				 * appropriately.
4081				 */
4082				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4083					cpi->cpi_fp_amd_save = 0;
4084				} else {
4085					cpi->cpi_fp_amd_save = 1;
4086				}
4087
4088				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
4089					add_x86_feature(featureset,
4090					    X86FSET_CLZERO);
4091				}
4092			}
4093
4094			/*
4095			 * Virtual and physical address limits from
4096			 * cpuid override previously guessed values.
4097			 */
4098			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
4099			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
4100			break;
4101		default:
4102			break;
4103		}
4104
4105		/*
4106		 * Get CPUID data about TSC Invariance in Deep C-State.
4107		 */
4108		switch (cpi->cpi_vendor) {
4109		case X86_VENDOR_Intel:
4110		case X86_VENDOR_AMD:
4111			if (cpi->cpi_maxeax >= 7) {
4112				cp = &cpi->cpi_extd[7];
4113				cp->cp_eax = 0x80000007;
4114				cp->cp_ecx = 0;
4115				(void) __cpuid_insn(cp);
4116			}
4117			break;
4118		default:
4119			break;
4120		}
4121	}
4122
4123	/*
4124	 * cpuid_pass1_ppin assumes that cpuid_pass1_topology has already been
4125	 * run and thus gathered some of its dependent leaves.
4126	 */
4127	cpuid_pass1_topology(cpu, featureset);
4128	cpuid_pass1_thermal(cpu, featureset);
4129#if !defined(__xpv)
4130	cpuid_pass1_ppin(cpu, featureset);
4131#endif
4132
4133	/*
4134	 * Synthesize chip "revision" and socket type
4135	 */
4136	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
4137	    cpi->cpi_model, cpi->cpi_step);
4138	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
4139	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
4140	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
4141	    cpi->cpi_model, cpi->cpi_step);
4142
4143	if (cpi->cpi_vendor == X86_VENDOR_AMD) {
4144		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
4145		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
4146			/* Special handling for AMD FP not necessary. */
4147			cpi->cpi_fp_amd_save = 0;
4148		} else {
4149			cpi->cpi_fp_amd_save = 1;
4150		}
4151	}
4152
4153	/*
4154	 * Check the processor leaves that are used for security features.
4155	 */
4156	cpuid_scan_security(cpu, featureset);
4157
4158pass1_done:
4159	cpi->cpi_pass = 1;
4160}
4161
4162/*
4163 * Make copies of the cpuid table entries we depend on, in
4164 * part for ease of parsing now, in part so that we have only
4165 * one place to correct any of it, in part for ease of
4166 * later export to userland, and in part so we can look at
4167 * this stuff in a crash dump.
4168 */
4169
4170/*ARGSUSED*/
4171void
4172cpuid_pass2(cpu_t *cpu)
4173{
4174	uint_t n, nmax;
4175	int i;
4176	struct cpuid_regs *cp;
4177	uint8_t *dp;
4178	uint32_t *iptr;
4179	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4180
4181	ASSERT(cpi->cpi_pass == 1);
4182
4183	if (cpi->cpi_maxeax < 1)
4184		goto pass2_done;
4185
4186	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
4187		nmax = NMAX_CPI_STD;
4188	/*
4189	 * (We already handled n == 0 and n == 1 in pass 1)
4190	 */
4191	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
4192		/*
4193		 * leaves 6 and 7 were handled in pass 1
4194		 */
4195		if (n == 6 || n == 7)
4196			continue;
4197
4198		cp->cp_eax = n;
4199
4200		/*
4201		 * CPUID function 4 expects %ecx to be initialized
4202		 * with an index which indicates which cache to return
4203		 * information about. The OS is expected to call function 4
4204		 * with %ecx set to 0, 1, 2, ... until it returns with
4205		 * EAX[4:0] set to 0, which indicates there are no more
4206		 * caches.
4207		 *
4208		 * Here, populate cpi_std[4] with the information returned by
4209		 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
4210		 * when dynamic memory allocation becomes available.
4211		 *
4212		 * Note: we need to explicitly initialize %ecx here, since
4213		 * function 4 may have been previously invoked.
4214		 */
4215		if (n == 4)
4216			cp->cp_ecx = 0;
4217
4218		(void) __cpuid_insn(cp);
4219		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
4220		switch (n) {
4221		case 2:
4222			/*
4223			 * "the lower 8 bits of the %eax register
4224			 * contain a value that identifies the number
4225			 * of times the cpuid [instruction] has to be
4226			 * executed to obtain a complete image of the
4227			 * processor's caching systems."
4228			 *
4229			 * How *do* they make this stuff up?
4230			 */
4231			cpi->cpi_ncache = sizeof (*cp) *
4232			    BITX(cp->cp_eax, 7, 0);
4233			if (cpi->cpi_ncache == 0)
4234				break;
4235			cpi->cpi_ncache--;	/* skip count byte */
4236
4237			/*
4238			 * Well, for now, rather than attempt to implement
4239			 * this slightly dubious algorithm, we just look
			 * at the first 15 descriptors.
4241			 */
4242			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
4243				cpi->cpi_ncache = sizeof (*cp) - 1;
4244
4245			dp = cpi->cpi_cacheinfo;
4246			if (BITX(cp->cp_eax, 31, 31) == 0) {
4247				uint8_t *p = (void *)&cp->cp_eax;
4248				for (i = 1; i < 4; i++)
4249					if (p[i] != 0)
4250						*dp++ = p[i];
4251			}
4252			if (BITX(cp->cp_ebx, 31, 31) == 0) {
4253				uint8_t *p = (void *)&cp->cp_ebx;
4254				for (i = 0; i < 4; i++)
4255					if (p[i] != 0)
4256						*dp++ = p[i];
4257			}
4258			if (BITX(cp->cp_ecx, 31, 31) == 0) {
4259				uint8_t *p = (void *)&cp->cp_ecx;
4260				for (i = 0; i < 4; i++)
4261					if (p[i] != 0)
4262						*dp++ = p[i];
4263			}
4264			if (BITX(cp->cp_edx, 31, 31) == 0) {
4265				uint8_t *p = (void *)&cp->cp_edx;
4266				for (i = 0; i < 4; i++)
4267					if (p[i] != 0)
4268						*dp++ = p[i];
4269			}
4270			break;
4271
4272		case 3:	/* Processor serial number, if PSN supported */
4273			break;
4274
4275		case 4:	/* Deterministic cache parameters */
4276			break;
4277
4278		case 5:	/* Monitor/Mwait parameters */
4279		{
4280			size_t mwait_size;
4281
4282			/*
4283			 * check cpi_mwait.support which was set in cpuid_pass1
4284			 */
4285			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
4286				break;
4287
4288			/*
			 * Protect ourselves from an insane mwait line size.
4290			 * Workaround for incomplete hardware emulator(s).
4291			 */
4292			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
4293			if (mwait_size < sizeof (uint32_t) ||
4294			    !ISP2(mwait_size)) {
4295#if DEBUG
4296				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
4297				    "size %ld", cpu->cpu_id, (long)mwait_size);
4298#endif
4299				break;
4300			}
4301
4302			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
4303			cpi->cpi_mwait.mon_max = mwait_size;
4304			if (MWAIT_EXTENSION(cpi)) {
4305				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
4306				if (MWAIT_INT_ENABLE(cpi))
4307					cpi->cpi_mwait.support |=
4308					    MWAIT_ECX_INT_ENABLE;
4309			}
4310			break;
4311		}
4312		default:
4313			break;
4314		}
4315	}
4316
4317	/*
4318	 * XSAVE enumeration
4319	 */
4320	if (cpi->cpi_maxeax >= 0xD) {
4321		struct cpuid_regs regs;
4322		boolean_t cpuid_d_valid = B_TRUE;
4323
4324		cp = &regs;
4325		cp->cp_eax = 0xD;
4326		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
4327
4328		(void) __cpuid_insn(cp);
4329
4330		/*
4331		 * Sanity checks for debug
4332		 */
4333		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
4334		    (cp->cp_eax & XFEATURE_SSE) == 0) {
4335			cpuid_d_valid = B_FALSE;
4336		}
4337
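		/*
		 * Sub-leaf 0 of leaf 0xD enumerates the supported XCR0 bits in
		 * %eax (low) and %edx (high); %ecx reports the maximum save
		 * area size needed for all supported state components.
		 */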
4338		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
4339		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
4340		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
4341
4342		/*
4343		 * If the hw supports AVX, get the size and offset in the save
4344		 * area for the ymm state.
4345		 */
4346		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
4347			cp->cp_eax = 0xD;
4348			cp->cp_ecx = 2;
4349			cp->cp_edx = cp->cp_ebx = 0;
4350
4351			(void) __cpuid_insn(cp);
4352
4353			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
4354			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
4355				cpuid_d_valid = B_FALSE;
4356			}
4357
4358			cpi->cpi_xsave.ymm_size = cp->cp_eax;
4359			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
4360		}
4361
4362		/*
4363		 * If the hw supports MPX, get the size and offset in the
4364		 * save area for BNDREGS and BNDCSR.
4365		 */
4366		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
4367			cp->cp_eax = 0xD;
4368			cp->cp_ecx = 3;
4369			cp->cp_edx = cp->cp_ebx = 0;
4370
4371			(void) __cpuid_insn(cp);
4372
4373			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
4374			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
4375
4376			cp->cp_eax = 0xD;
4377			cp->cp_ecx = 4;
4378			cp->cp_edx = cp->cp_ebx = 0;
4379
4380			(void) __cpuid_insn(cp);
4381
4382			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
4383			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
4384		}
4385
4386		/*
4387		 * If the hw supports AVX512, get the size and offset in the
4388		 * save area for the opmask registers and zmm state.
4389		 */
4390		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
4391			cp->cp_eax = 0xD;
4392			cp->cp_ecx = 5;
4393			cp->cp_edx = cp->cp_ebx = 0;
4394
4395			(void) __cpuid_insn(cp);
4396
4397			cpi->cpi_xsave.opmask_size = cp->cp_eax;
4398			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
4399
4400			cp->cp_eax = 0xD;
4401			cp->cp_ecx = 6;
4402			cp->cp_edx = cp->cp_ebx = 0;
4403
4404			(void) __cpuid_insn(cp);
4405
4406			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
4407			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
4408
4409			cp->cp_eax = 0xD;
4410			cp->cp_ecx = 7;
4411			cp->cp_edx = cp->cp_ebx = 0;
4412
4413			(void) __cpuid_insn(cp);
4414
4415			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
4416			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
4417		}
4418
4419		if (is_x86_feature(x86_featureset, X86FSET_XSAVE)) {
4420			xsave_state_size = 0;
4421		} else if (cpuid_d_valid) {
4422			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
4423		} else {
4424			/* Broken CPUID 0xD, probably in HVM */
4425			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
4426			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
4427			    ", ymm_size = %d, ymm_offset = %d\n",
4428			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
4429			    cpi->cpi_xsave.xsav_hw_features_high,
4430			    (int)cpi->cpi_xsave.xsav_max_size,
4431			    (int)cpi->cpi_xsave.ymm_size,
4432			    (int)cpi->cpi_xsave.ymm_offset);
4433
4434			if (xsave_state_size != 0) {
4435				/*
4436				 * This must be a non-boot CPU. We cannot
				 * continue, because the boot CPU has already
4438				 * enabled XSAVE.
4439				 */
4440				ASSERT(cpu->cpu_id != 0);
4441				cmn_err(CE_PANIC, "cpu%d: we have already "
4442				    "enabled XSAVE on boot cpu, cannot "
4443				    "continue.", cpu->cpu_id);
4444			} else {
4445				/*
				 * If we reached here on the boot CPU, it's
				 * almost certain that we'll also reach here on
				 * the non-boot CPUs. On the boot CPU we disable
				 * the feature; on a non-boot CPU we verify that
				 * it has already been disabled.
4451				 */
4452				if (cpu->cpu_id == 0) {
4453					remove_x86_feature(x86_featureset,
4454					    X86FSET_XSAVE);
4455					remove_x86_feature(x86_featureset,
4456					    X86FSET_AVX);
4457					remove_x86_feature(x86_featureset,
4458					    X86FSET_F16C);
4459					remove_x86_feature(x86_featureset,
4460					    X86FSET_BMI1);
4461					remove_x86_feature(x86_featureset,
4462					    X86FSET_BMI2);
4463					remove_x86_feature(x86_featureset,
4464					    X86FSET_FMA);
4465					remove_x86_feature(x86_featureset,
4466					    X86FSET_AVX2);
4467					remove_x86_feature(x86_featureset,
4468					    X86FSET_MPX);
4469					remove_x86_feature(x86_featureset,
4470					    X86FSET_AVX512F);
4471					remove_x86_feature(x86_featureset,
4472					    X86FSET_AVX512DQ);
4473					remove_x86_feature(x86_featureset,
4474					    X86FSET_AVX512PF);
4475					remove_x86_feature(x86_featureset,
4476					    X86FSET_AVX512ER);
4477					remove_x86_feature(x86_featureset,
4478					    X86FSET_AVX512CD);
4479					remove_x86_feature(x86_featureset,
4480					    X86FSET_AVX512BW);
4481					remove_x86_feature(x86_featureset,
4482					    X86FSET_AVX512VL);
4483					remove_x86_feature(x86_featureset,
4484					    X86FSET_AVX512FMA);
4485					remove_x86_feature(x86_featureset,
4486					    X86FSET_AVX512VBMI);
4487					remove_x86_feature(x86_featureset,
4488					    X86FSET_AVX512VNNI);
4489					remove_x86_feature(x86_featureset,
4490					    X86FSET_AVX512VPOPCDQ);
4491					remove_x86_feature(x86_featureset,
4492					    X86FSET_AVX512NNIW);
4493					remove_x86_feature(x86_featureset,
4494					    X86FSET_AVX512FMAPS);
4495
4496					CPI_FEATURES_ECX(cpi) &=
4497					    ~CPUID_INTC_ECX_XSAVE;
4498					CPI_FEATURES_ECX(cpi) &=
4499					    ~CPUID_INTC_ECX_AVX;
4500					CPI_FEATURES_ECX(cpi) &=
4501					    ~CPUID_INTC_ECX_F16C;
4502					CPI_FEATURES_ECX(cpi) &=
4503					    ~CPUID_INTC_ECX_FMA;
4504					CPI_FEATURES_7_0_EBX(cpi) &=
4505					    ~CPUID_INTC_EBX_7_0_BMI1;
4506					CPI_FEATURES_7_0_EBX(cpi) &=
4507					    ~CPUID_INTC_EBX_7_0_BMI2;
4508					CPI_FEATURES_7_0_EBX(cpi) &=
4509					    ~CPUID_INTC_EBX_7_0_AVX2;
4510					CPI_FEATURES_7_0_EBX(cpi) &=
4511					    ~CPUID_INTC_EBX_7_0_MPX;
4512					CPI_FEATURES_7_0_EBX(cpi) &=
4513					    ~CPUID_INTC_EBX_7_0_ALL_AVX512;
4514
4515					CPI_FEATURES_7_0_ECX(cpi) &=
4516					    ~CPUID_INTC_ECX_7_0_ALL_AVX512;
4517
4518					CPI_FEATURES_7_0_EDX(cpi) &=
4519					    ~CPUID_INTC_EDX_7_0_ALL_AVX512;
4520
4521					xsave_force_disable = B_TRUE;
4522				} else {
4523					VERIFY(is_x86_feature(x86_featureset,
4524					    X86FSET_XSAVE) == B_FALSE);
4525				}
4526			}
4527		}
4528	}
4529
4530
4531	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
4532		goto pass2_done;
4533
4534	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
4535		nmax = NMAX_CPI_EXTD;
4536	/*
4537	 * Copy the extended properties, fixing them as we go.
4538	 * (We already handled n == 0 and n == 1 in pass 1)
4539	 */
4540	iptr = (void *)cpi->cpi_brandstr;
4541	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
4542		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
4543		(void) __cpuid_insn(cp);
4544		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
4545		    cp);
4546		switch (n) {
4547		case 2:
4548		case 3:
4549		case 4:
4550			/*
4551			 * Extract the brand string
4552			 */
4553			*iptr++ = cp->cp_eax;
4554			*iptr++ = cp->cp_ebx;
4555			*iptr++ = cp->cp_ecx;
4556			*iptr++ = cp->cp_edx;
4557			break;
4558		case 5:
4559			switch (cpi->cpi_vendor) {
4560			case X86_VENDOR_AMD:
4561				/*
4562				 * The Athlon and Duron were the first
4563				 * parts to report the sizes of the
4564				 * TLB for large pages. Before then,
4565				 * we don't trust the data.
4566				 */
4567				if (cpi->cpi_family < 6 ||
4568				    (cpi->cpi_family == 6 &&
4569				    cpi->cpi_model < 1))
4570					cp->cp_eax = 0;
4571				break;
4572			default:
4573				break;
4574			}
4575			break;
4576		case 6:
4577			switch (cpi->cpi_vendor) {
4578			case X86_VENDOR_AMD:
4579				/*
4580				 * The Athlon and Duron were the first
4581				 * AMD parts with L2 TLB's.
4582				 * Before then, don't trust the data.
4583				 */
4584				if (cpi->cpi_family < 6 ||
4585				    (cpi->cpi_family == 6 &&
4586				    cpi->cpi_model < 1))
4587					cp->cp_eax = cp->cp_ebx = 0;
4588				/*
4589				 * AMD Duron rev A0 reports L2
4590				 * cache size incorrectly as 1K
4591				 * when it is really 64K
4592				 */
4593				if (cpi->cpi_family == 6 &&
4594				    cpi->cpi_model == 3 &&
4595				    cpi->cpi_step == 0) {
4596					cp->cp_ecx &= 0xffff;
4597					cp->cp_ecx |= 0x400000;
4598				}
4599				break;
4600			case X86_VENDOR_Cyrix:	/* VIA C3 */
4601				/*
4602				 * VIA C3 processors are a bit messed
4603				 * up w.r.t. encoding cache sizes in %ecx
4604				 */
4605				if (cpi->cpi_family != 6)
4606					break;
4607				/*
4608				 * model 7 and 8 were incorrectly encoded
4609				 *
4610				 * xxx is model 8 really broken?
4611				 */
4612				if (cpi->cpi_model == 7 ||
4613				    cpi->cpi_model == 8)
4614					cp->cp_ecx =
4615					    BITX(cp->cp_ecx, 31, 24) << 16 |
4616					    BITX(cp->cp_ecx, 23, 16) << 12 |
4617					    BITX(cp->cp_ecx, 15, 8) << 8 |
4618					    BITX(cp->cp_ecx, 7, 0);
4619				/*
4620				 * model 9 stepping 1 has wrong associativity
4621				 */
4622				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
4623					cp->cp_ecx |= 8 << 12;
4624				break;
4625			case X86_VENDOR_Intel:
4626				/*
4627				 * Extended L2 Cache features function.
4628				 * First appeared on Prescott.
4629				 */
4630			default:
4631				break;
4632			}
4633			break;
4634		default:
4635			break;
4636		}
4637	}
4638
4639pass2_done:
4640	cpi->cpi_pass = 2;
4641}
4642
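/*
 * Construct a brand string for older Intel processors that lack the
 * extended brand string leaves, based on the family/model, the leaf 2
 * cache descriptors, and (when non-zero) the brand ID from leaf 1.
 */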
4643static const char *
4644intel_cpubrand(const struct cpuid_info *cpi)
4645{
4646	int i;
4647
4648	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4649	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4650		return ("i486");
4651
4652	switch (cpi->cpi_family) {
4653	case 5:
4654		return ("Intel Pentium(r)");
4655	case 6:
4656		switch (cpi->cpi_model) {
4657			uint_t celeron, xeon;
4658			const struct cpuid_regs *cp;
4659		case 0:
4660		case 1:
4661		case 2:
4662			return ("Intel Pentium(r) Pro");
4663		case 3:
4664		case 4:
4665			return ("Intel Pentium(r) II");
4666		case 6:
4667			return ("Intel Celeron(r)");
4668		case 5:
4669		case 7:
4670			celeron = xeon = 0;
4671			cp = &cpi->cpi_std[2];	/* cache info */
4672
4673			for (i = 1; i < 4; i++) {
4674				uint_t tmp;
4675
4676				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
4677				if (tmp == 0x40)
4678					celeron++;
4679				if (tmp >= 0x44 && tmp <= 0x45)
4680					xeon++;
4681			}
4682
4683			for (i = 0; i < 2; i++) {
4684				uint_t tmp;
4685
4686				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
4687				if (tmp == 0x40)
4688					celeron++;
4689				else if (tmp >= 0x44 && tmp <= 0x45)
4690					xeon++;
4691			}
4692
4693			for (i = 0; i < 4; i++) {
4694				uint_t tmp;
4695
4696				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
4697				if (tmp == 0x40)
4698					celeron++;
4699				else if (tmp >= 0x44 && tmp <= 0x45)
4700					xeon++;
4701			}
4702
4703			for (i = 0; i < 4; i++) {
4704				uint_t tmp;
4705
4706				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
4707				if (tmp == 0x40)
4708					celeron++;
4709				else if (tmp >= 0x44 && tmp <= 0x45)
4710					xeon++;
4711			}
4712
4713			if (celeron)
4714				return ("Intel Celeron(r)");
4715			if (xeon)
4716				return (cpi->cpi_model == 5 ?
4717				    "Intel Pentium(r) II Xeon(tm)" :
4718				    "Intel Pentium(r) III Xeon(tm)");
4719			return (cpi->cpi_model == 5 ?
4720			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
4721			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
4722		default:
4723			break;
4724		}
4725	default:
4726		break;
4727	}
4728
4729	/* BrandID is present if the field is nonzero */
4730	if (cpi->cpi_brandid != 0) {
4731		static const struct {
4732			uint_t bt_bid;
4733			const char *bt_str;
4734		} brand_tbl[] = {
4735			{ 0x1,	"Intel(r) Celeron(r)" },
4736			{ 0x2,	"Intel(r) Pentium(r) III" },
4737			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
4738			{ 0x4,	"Intel(r) Pentium(r) III" },
4739			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
4740			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
4741			{ 0x8,	"Intel(r) Pentium(r) 4" },
4742			{ 0x9,	"Intel(r) Pentium(r) 4" },
4743			{ 0xa,	"Intel(r) Celeron(r)" },
4744			{ 0xb,	"Intel(r) Xeon(tm)" },
4745			{ 0xc,	"Intel(r) Xeon(tm) MP" },
4746			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
4747			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
4748			{ 0x11, "Mobile Genuine Intel(r)" },
4749			{ 0x12, "Intel(r) Celeron(r) M" },
4750			{ 0x13, "Mobile Intel(r) Celeron(r)" },
4751			{ 0x14, "Intel(r) Celeron(r)" },
4752			{ 0x15, "Mobile Genuine Intel(r)" },
4753			{ 0x16,	"Intel(r) Pentium(r) M" },
4754			{ 0x17, "Mobile Intel(r) Celeron(r)" }
4755		};
4756		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4757		uint_t sgn;
4758
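		/*
		 * Compose the processor signature (family/model/stepping);
		 * a few brand IDs are overloaded and are disambiguated by
		 * the signature of the processor reporting them.
		 */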
4759		sgn = (cpi->cpi_family << 8) |
4760		    (cpi->cpi_model << 4) | cpi->cpi_step;
4761
4762		for (i = 0; i < btblmax; i++)
4763			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4764				break;
4765		if (i < btblmax) {
4766			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4767				return ("Intel(r) Celeron(r)");
4768			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4769				return ("Intel(r) Xeon(tm) MP");
4770			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4771				return ("Intel(r) Xeon(tm)");
4772			return (brand_tbl[i].bt_str);
4773		}
4774	}
4775
4776	return (NULL);
4777}
4778
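/*
 * As above, but for older AMD processors: map the family/model (and, for
 * early Opterons, the brand ID) to a human-readable brand string.
 */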
4779static const char *
4780amd_cpubrand(const struct cpuid_info *cpi)
4781{
4782	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4783	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4784		return ("i486 compatible");
4785
4786	switch (cpi->cpi_family) {
4787	case 5:
4788		switch (cpi->cpi_model) {
4789		case 0:
4790		case 1:
4791		case 2:
4792		case 3:
4793		case 4:
4794		case 5:
4795			return ("AMD-K5(r)");
4796		case 6:
4797		case 7:
4798			return ("AMD-K6(r)");
4799		case 8:
4800			return ("AMD-K6(r)-2");
4801		case 9:
4802			return ("AMD-K6(r)-III");
4803		default:
4804			return ("AMD (family 5)");
4805		}
4806	case 6:
4807		switch (cpi->cpi_model) {
4808		case 1:
4809			return ("AMD-K7(tm)");
4810		case 0:
4811		case 2:
4812		case 4:
4813			return ("AMD Athlon(tm)");
4814		case 3:
4815		case 7:
4816			return ("AMD Duron(tm)");
4817		case 6:
4818		case 8:
4819		case 10:
4820			/*
4821			 * Use the L2 cache size to distinguish
4822			 */
4823			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4824			    "AMD Athlon(tm)" : "AMD Duron(tm)");
4825		default:
4826			return ("AMD (family 6)");
4827		}
4828	default:
4829		break;
4830	}
4831
4832	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4833	    cpi->cpi_brandid != 0) {
4834		switch (BITX(cpi->cpi_brandid, 7, 5)) {
4835		case 3:
4836			return ("AMD Opteron(tm) UP 1xx");
4837		case 4:
4838			return ("AMD Opteron(tm) DP 2xx");
4839		case 5:
4840			return ("AMD Opteron(tm) MP 8xx");
4841		default:
4842			return ("AMD Opteron(tm)");
4843		}
4844	}
4845
4846	return (NULL);
4847}
4848
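/*
 * As above, but for Cyrix and VIA parts; these are largely distinguished
 * by the x86_type determined earlier rather than by cpuid alone.
 */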
4849static const char *
4850cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4851{
4852	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4853	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4854	    type == X86_TYPE_CYRIX_486)
4855		return ("i486 compatible");
4856
4857	switch (type) {
4858	case X86_TYPE_CYRIX_6x86:
4859		return ("Cyrix 6x86");
4860	case X86_TYPE_CYRIX_6x86L:
4861		return ("Cyrix 6x86L");
4862	case X86_TYPE_CYRIX_6x86MX:
4863		return ("Cyrix 6x86MX");
4864	case X86_TYPE_CYRIX_GXm:
4865		return ("Cyrix GXm");
4866	case X86_TYPE_CYRIX_MediaGX:
4867		return ("Cyrix MediaGX");
4868	case X86_TYPE_CYRIX_MII:
4869		return ("Cyrix M2");
4870	case X86_TYPE_VIA_CYRIX_III:
4871		return ("VIA Cyrix M3");
4872	default:
4873		/*
4874		 * Have another wild guess ..
4875		 */
4876		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4877			return ("Cyrix 5x86");
4878		else if (cpi->cpi_family == 5) {
4879			switch (cpi->cpi_model) {
4880			case 2:
4881				return ("Cyrix 6x86");	/* Cyrix M1 */
4882			case 4:
4883				return ("Cyrix MediaGX");
4884			default:
4885				break;
4886			}
4887		} else if (cpi->cpi_family == 6) {
4888			switch (cpi->cpi_model) {
4889			case 0:
4890				return ("Cyrix 6x86MX"); /* Cyrix M2? */
4891			case 5:
4892			case 6:
4893			case 7:
4894			case 8:
4895			case 9:
4896				return ("VIA C3");
4897			default:
4898				break;
4899			}
4900		}
4901		break;
4902	}
4903	return (NULL);
4904}
4905
4906/*
4907 * This only gets called when the CPU extended feature brand
4908 * string leaves (0x80000002, 0x80000003, 0x80000004) aren't
4909 * available, or contain null bytes for some reason.
4910 */
4911static void
4912fabricate_brandstr(struct cpuid_info *cpi)
4913{
4914	const char *brand = NULL;
4915
4916	switch (cpi->cpi_vendor) {
4917	case X86_VENDOR_Intel:
4918		brand = intel_cpubrand(cpi);
4919		break;
4920	case X86_VENDOR_AMD:
4921		brand = amd_cpubrand(cpi);
4922		break;
4923	case X86_VENDOR_Cyrix:
4924		brand = cyrix_cpubrand(cpi, x86_type);
4925		break;
4926	case X86_VENDOR_NexGen:
4927		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4928			brand = "NexGen Nx586";
4929		break;
4930	case X86_VENDOR_Centaur:
4931		if (cpi->cpi_family == 5)
4932			switch (cpi->cpi_model) {
4933			case 4:
4934				brand = "Centaur C6";
4935				break;
4936			case 8:
4937				brand = "Centaur C2";
4938				break;
4939			case 9:
4940				brand = "Centaur C3";
4941				break;
4942			default:
4943				break;
4944			}
4945		break;
4946	case X86_VENDOR_Rise:
4947		if (cpi->cpi_family == 5 &&
4948		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4949			brand = "Rise mP6";
4950		break;
4951	case X86_VENDOR_SiS:
4952		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4953			brand = "SiS 55x";
4954		break;
4955	case X86_VENDOR_TM:
4956		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4957			brand = "Transmeta Crusoe TM3x00 or TM5x00";
4958		break;
4959	case X86_VENDOR_NSC:
4960	case X86_VENDOR_UMC:
4961	default:
4962		break;
4963	}
4964	if (brand) {
4965		(void) strcpy((char *)cpi->cpi_brandstr, brand);
4966		return;
4967	}
4968
4969	/*
4970	 * If all else fails ...
4971	 */
4972	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4973	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4974	    cpi->cpi_model, cpi->cpi_step);
4975}
4976
4977/*
4978 * This routine is called just after kernel memory allocation
4979 * becomes available on cpu0, and as part of mp_startup() on
4980 * the other cpus.
4981 *
4982 * Fixup the brand string, and collect any information from cpuid
4983 * that requires dynamically allocated storage to represent.
4984 */
4985/*ARGSUSED*/
4986void
4987cpuid_pass3(cpu_t *cpu)
4988{
4989	int	i, max, shft, level, size;
4990	struct cpuid_regs regs;
4991	struct cpuid_regs *cp;
4992	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4993
4994	ASSERT(cpi->cpi_pass == 2);
4995
4996	/*
4997	 * Deterministic cache parameters
4998	 *
4999	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
5000	 * values that are present are currently defined to be the same. This
5001	 * means we can use the same logic to parse it as long as we use the
5002	 * appropriate leaf to get the data. If you're updating this, make sure
5003	 * you're careful about which vendor supports which aspect.
5004	 *
5005	 * Take this opportunity to detect the number of threads sharing the
5006	 * last level cache, and construct a corresponding cache id. The
5007	 * respective cpuid_info members are initialized to the default case of
5008	 * "no last level cache sharing".
5009	 */
5010	cpi->cpi_ncpu_shr_last_cache = 1;
5011	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
5012
5013	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
5014	    (cpi->cpi_vendor == X86_VENDOR_AMD &&
5015	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
5016	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
5017		uint32_t leaf;
5018
5019		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5020			leaf = 4;
5021		} else {
5022			leaf = CPUID_LEAF_EXT_1d;
5023		}
5024
5025		/*
5026		 * Find the # of elements (size) returned by the leaf and along
5027		 * the way detect last level cache sharing details.
5028		 */
5029		bzero(&regs, sizeof (regs));
5030		cp = &regs;
5031		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
5032			cp->cp_eax = leaf;
5033			cp->cp_ecx = i;
5034
5035			(void) __cpuid_insn(cp);
5036
5037			if (CPI_CACHE_TYPE(cp) == 0)
5038				break;
5039			level = CPI_CACHE_LVL(cp);
5040			if (level > max) {
5041				max = level;
5042				cpi->cpi_ncpu_shr_last_cache =
5043				    CPI_NTHR_SHR_CACHE(cp) + 1;
5044			}
5045		}
5046		cpi->cpi_cache_leaf_size = size = i;
5047
5048		/*
5049		 * Allocate the cpi_cache_leaves array. The first element
5050		 * references the regs for the corresponding leaf with %ecx set
5051		 * to 0. This was gathered in cpuid_pass2().
5052		 */
5053		if (size > 0) {
5054			cpi->cpi_cache_leaves =
5055			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
5056			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
5057				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
5058			} else {
5059				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
5060			}
5061
5062			/*
5063			 * Allocate storage to hold the additional regs
5064			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
5065			 *
5066			 * The regs for the leaf, %ecx == 0 has already
5067			 * been allocated as indicated above.
5068			 */
5069			for (i = 1; i < size; i++) {
5070				cp = cpi->cpi_cache_leaves[i] =
5071				    kmem_zalloc(sizeof (regs), KM_SLEEP);
5072				cp->cp_eax = leaf;
5073				cp->cp_ecx = i;
5074
5075				(void) __cpuid_insn(cp);
5076			}
5077		}
5078		/*
5079		 * Determine the number of bits needed to represent
5080		 * the number of CPUs sharing the last level cache.
5081		 *
5082		 * Shift off that number of bits from the APIC id to
5083		 * derive the cache id.
5084		 */
5085		shft = 0;
5086		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
5087			shft++;
5088		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
5089	}
5090
5091	/*
5092	 * Now fixup the brand string
5093	 */
5094	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
5095		fabricate_brandstr(cpi);
5096	} else {
5097
5098		/*
5099		 * If we successfully extracted a brand string from the cpuid
5100		 * instruction, clean it up by removing leading spaces and
5101		 * similar junk.
5102		 */
5103		if (cpi->cpi_brandstr[0]) {
5104			size_t maxlen = sizeof (cpi->cpi_brandstr);
5105			char *src, *dst;
5106
5107			dst = src = (char *)cpi->cpi_brandstr;
5108			src[maxlen - 1] = '\0';
5109			/*
5110			 * strip leading spaces
5111			 */
5112			while (*src == ' ')
5113				src++;
5114			/*
5115			 * Remove any 'Genuine' or "Authentic" prefixes
5116			 */
5117			if (strncmp(src, "Genuine ", 8) == 0)
5118				src += 8;
5119			if (strncmp(src, "Authentic ", 10) == 0)
5120				src += 10;
5121
5122			/*
5123			 * Now do an in-place copy.
5124			 * Map (R) to (r) and (TM) to (tm).
5125			 * The era of teletypes is long gone, and there's
5126			 * -really- no need to shout.
5127			 */
5128			while (*src != '\0') {
5129				if (src[0] == '(') {
5130					if (strncmp(src + 1, "R)", 2) == 0) {
5131						(void) strncpy(dst, "(r)", 3);
5132						src += 3;
5133						dst += 3;
5134						continue;
5135					}
5136					if (strncmp(src + 1, "TM)", 3) == 0) {
5137						(void) strncpy(dst, "(tm)", 4);
5138						src += 4;
5139						dst += 4;
5140						continue;
5141					}
5142				}
5143				*dst++ = *src++;
5144			}
5145			*dst = '\0';
5146
5147			/*
5148			 * Finally, remove any trailing spaces
5149			 */
5150			while (--dst > cpi->cpi_brandstr)
5151				if (*dst == ' ')
5152					*dst = '\0';
5153				else
5154					break;
5155		} else
5156			fabricate_brandstr(cpi);
5157	}
5158	cpi->cpi_pass = 3;
5159}
5160
5161/*
5162 * This routine is called out of bind_hwcap() much later in the life
5163 * of the kernel (post_startup()).  The job of this routine is to resolve
5164 * the hardware feature support and kernel support for those features into
5165 * what we're actually going to tell applications via the aux vector.
5166 */
5167void
5168cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
5169{
5170	struct cpuid_info *cpi;
5171	uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
5172
5173	if (cpu == NULL)
5174		cpu = CPU;
5175	cpi = cpu->cpu_m.mcpu_cpi;
5176
5177	ASSERT(cpi->cpi_pass == 3);
5178
5179	if (cpi->cpi_maxeax >= 1) {
5180		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
5181		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
5182		uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
5183
5184		*edx = CPI_FEATURES_EDX(cpi);
5185		*ecx = CPI_FEATURES_ECX(cpi);
5186		*ebx = CPI_FEATURES_7_0_EBX(cpi);
5187
5188		/*
5189		 * [these require explicit kernel support]
5190		 */
5191		if (!is_x86_feature(x86_featureset, X86FSET_SEP))
5192			*edx &= ~CPUID_INTC_EDX_SEP;
5193
5194		if (!is_x86_feature(x86_featureset, X86FSET_SSE))
5195			*edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
5196		if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
5197			*edx &= ~CPUID_INTC_EDX_SSE2;
5198
5199		if (!is_x86_feature(x86_featureset, X86FSET_HTT))
5200			*edx &= ~CPUID_INTC_EDX_HTT;
5201
5202		if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
5203			*ecx &= ~CPUID_INTC_ECX_SSE3;
5204
5205		if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
5206			*ecx &= ~CPUID_INTC_ECX_SSSE3;
5207		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
5208			*ecx &= ~CPUID_INTC_ECX_SSE4_1;
5209		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
5210			*ecx &= ~CPUID_INTC_ECX_SSE4_2;
5211		if (!is_x86_feature(x86_featureset, X86FSET_AES))
5212			*ecx &= ~CPUID_INTC_ECX_AES;
5213		if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
5214			*ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
5215		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
5216			*ecx &= ~(CPUID_INTC_ECX_XSAVE |
5217			    CPUID_INTC_ECX_OSXSAVE);
5218		if (!is_x86_feature(x86_featureset, X86FSET_AVX))
5219			*ecx &= ~CPUID_INTC_ECX_AVX;
5220		if (!is_x86_feature(x86_featureset, X86FSET_F16C))
5221			*ecx &= ~CPUID_INTC_ECX_F16C;
5222		if (!is_x86_feature(x86_featureset, X86FSET_FMA))
5223			*ecx &= ~CPUID_INTC_ECX_FMA;
5224		if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
5225			*ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
5226		if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
5227			*ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
5228		if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
5229			*ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
5230		if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
5231			*ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
5232		if (!is_x86_feature(x86_featureset, X86FSET_ADX))
5233			*ebx &= ~CPUID_INTC_EBX_7_0_ADX;
5234
5235		/*
5236		 * [no explicit support required beyond x87 fp context]
5237		 */
5238		if (!fpu_exists)
5239			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
5240
5241		/*
5242		 * Now map the supported feature vector to things that we
5243		 * think userland will care about.
5244		 */
5245		if (*edx & CPUID_INTC_EDX_SEP)
5246			hwcap_flags |= AV_386_SEP;
5247		if (*edx & CPUID_INTC_EDX_SSE)
5248			hwcap_flags |= AV_386_FXSR | AV_386_SSE;
5249		if (*edx & CPUID_INTC_EDX_SSE2)
5250			hwcap_flags |= AV_386_SSE2;
5251		if (*ecx & CPUID_INTC_ECX_SSE3)
5252			hwcap_flags |= AV_386_SSE3;
5253		if (*ecx & CPUID_INTC_ECX_SSSE3)
5254			hwcap_flags |= AV_386_SSSE3;
5255		if (*ecx & CPUID_INTC_ECX_SSE4_1)
5256			hwcap_flags |= AV_386_SSE4_1;
5257		if (*ecx & CPUID_INTC_ECX_SSE4_2)
5258			hwcap_flags |= AV_386_SSE4_2;
5259		if (*ecx & CPUID_INTC_ECX_MOVBE)
5260			hwcap_flags |= AV_386_MOVBE;
5261		if (*ecx & CPUID_INTC_ECX_AES)
5262			hwcap_flags |= AV_386_AES;
5263		if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
5264			hwcap_flags |= AV_386_PCLMULQDQ;
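		/*
		 * XSAVE-managed features (AVX and friends) are only
		 * advertised when the hardware supports XSAVE and the OS
		 * has enabled it, as indicated by OSXSAVE.
		 */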
5265		if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
5266		    (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
5267			hwcap_flags |= AV_386_XSAVE;
5268
5269			if (*ecx & CPUID_INTC_ECX_AVX) {
5270				uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
5271				uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
5272
5273				hwcap_flags |= AV_386_AVX;
5274				if (*ecx & CPUID_INTC_ECX_F16C)
5275					hwcap_flags_2 |= AV_386_2_F16C;
5276				if (*ecx & CPUID_INTC_ECX_FMA)
5277					hwcap_flags_2 |= AV_386_2_FMA;
5278
5279				if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
5280					hwcap_flags_2 |= AV_386_2_BMI1;
5281				if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
5282					hwcap_flags_2 |= AV_386_2_BMI2;
5283				if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
5284					hwcap_flags_2 |= AV_386_2_AVX2;
5285				if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
5286					hwcap_flags_2 |= AV_386_2_AVX512F;
5287				if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
5288					hwcap_flags_2 |= AV_386_2_AVX512DQ;
5289				if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
5290					hwcap_flags_2 |= AV_386_2_AVX512IFMA;
5291				if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
5292					hwcap_flags_2 |= AV_386_2_AVX512PF;
5293				if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
5294					hwcap_flags_2 |= AV_386_2_AVX512ER;
5295				if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
5296					hwcap_flags_2 |= AV_386_2_AVX512CD;
5297				if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
5298					hwcap_flags_2 |= AV_386_2_AVX512BW;
5299				if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
5300					hwcap_flags_2 |= AV_386_2_AVX512VL;
5301
5302				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
5303					hwcap_flags_2 |= AV_386_2_AVX512VBMI;
5304				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
5305					hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
5306				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
5307					hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
5308
5309				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
5310					hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
5311				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
5312					hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
5313			}
5314		}
5315		if (*ecx & CPUID_INTC_ECX_VMX)
5316			hwcap_flags |= AV_386_VMX;
5317		if (*ecx & CPUID_INTC_ECX_POPCNT)
5318			hwcap_flags |= AV_386_POPCNT;
5319		if (*edx & CPUID_INTC_EDX_FPU)
5320			hwcap_flags |= AV_386_FPU;
5321		if (*edx & CPUID_INTC_EDX_MMX)
5322			hwcap_flags |= AV_386_MMX;
5323
5324		if (*edx & CPUID_INTC_EDX_TSC)
5325			hwcap_flags |= AV_386_TSC;
5326		if (*edx & CPUID_INTC_EDX_CX8)
5327			hwcap_flags |= AV_386_CX8;
5328		if (*edx & CPUID_INTC_EDX_CMOV)
5329			hwcap_flags |= AV_386_CMOV;
5330		if (*ecx & CPUID_INTC_ECX_CX16)
5331			hwcap_flags |= AV_386_CX16;
5332
5333		if (*ecx & CPUID_INTC_ECX_RDRAND)
5334			hwcap_flags_2 |= AV_386_2_RDRAND;
5335		if (*ebx & CPUID_INTC_EBX_7_0_ADX)
5336			hwcap_flags_2 |= AV_386_2_ADX;
5337		if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
5338			hwcap_flags_2 |= AV_386_2_RDSEED;
5339		if (*ebx & CPUID_INTC_EBX_7_0_SHA)
5340			hwcap_flags_2 |= AV_386_2_SHA;
5341		if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
5342			hwcap_flags_2 |= AV_386_2_FSGSBASE;
5343		if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
5344			hwcap_flags_2 |= AV_386_2_CLWB;
5345		if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
5346			hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
5347
5348	}
5349	/*
5350	 * Check a few miscellaneous features.
5351	 */
5352	if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
5353		hwcap_flags_2 |= AV_386_2_CLZERO;
5354
5355	if (cpi->cpi_xmaxeax < 0x80000001)
5356		goto pass4_done;
5357
5358	switch (cpi->cpi_vendor) {
5359		struct cpuid_regs cp;
5360		uint32_t *edx, *ecx;
5361
5362	case X86_VENDOR_Intel:
5363		/*
5364		 * Seems like Intel duplicated what was necessary
5365		 * here to make the initial crop of 64-bit OS's work.
5366		 * Hopefully, those are the only "extended" bits
5367		 * they'll add.
5368		 */
5369		/*FALLTHROUGH*/
5370
5371	case X86_VENDOR_AMD:
5372		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
5373		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
5374
5375		*edx = CPI_FEATURES_XTD_EDX(cpi);
5376		*ecx = CPI_FEATURES_XTD_ECX(cpi);
5377
5378		/*
5379		 * [these features require explicit kernel support]
5380		 */
5381		switch (cpi->cpi_vendor) {
5382		case X86_VENDOR_Intel:
5383			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5384				*edx &= ~CPUID_AMD_EDX_TSCP;
5385			break;
5386
5387		case X86_VENDOR_AMD:
5388			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
5389				*edx &= ~CPUID_AMD_EDX_TSCP;
5390			if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
5391				*ecx &= ~CPUID_AMD_ECX_SSE4A;
5392			break;
5393
5394		default:
5395			break;
5396		}
5397
5398		/*
5399		 * [no explicit support required beyond
5400		 * x87 fp context and exception handlers]
5401		 */
5402		if (!fpu_exists)
5403			*edx &= ~(CPUID_AMD_EDX_MMXamd |
5404			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
5405
5406		if (!is_x86_feature(x86_featureset, X86FSET_NX))
5407			*edx &= ~CPUID_AMD_EDX_NX;
5408#if !defined(__amd64)
5409		*edx &= ~CPUID_AMD_EDX_LM;
5410#endif
5411		/*
5412		 * Now map the supported feature vector to
5413		 * things that we think userland will care about.
5414		 */
5415#if defined(__amd64)
5416		if (*edx & CPUID_AMD_EDX_SYSC)
5417			hwcap_flags |= AV_386_AMD_SYSC;
5418#endif
5419		if (*edx & CPUID_AMD_EDX_MMXamd)
5420			hwcap_flags |= AV_386_AMD_MMX;
5421		if (*edx & CPUID_AMD_EDX_3DNow)
5422			hwcap_flags |= AV_386_AMD_3DNow;
5423		if (*edx & CPUID_AMD_EDX_3DNowx)
5424			hwcap_flags |= AV_386_AMD_3DNowx;
5425		if (*ecx & CPUID_AMD_ECX_SVM)
5426			hwcap_flags |= AV_386_AMD_SVM;
5427
5428		switch (cpi->cpi_vendor) {
5429		case X86_VENDOR_AMD:
5430			if (*edx & CPUID_AMD_EDX_TSCP)
5431				hwcap_flags |= AV_386_TSCP;
5432			if (*ecx & CPUID_AMD_ECX_AHF64)
5433				hwcap_flags |= AV_386_AHF;
5434			if (*ecx & CPUID_AMD_ECX_SSE4A)
5435				hwcap_flags |= AV_386_AMD_SSE4A;
5436			if (*ecx & CPUID_AMD_ECX_LZCNT)
5437				hwcap_flags |= AV_386_AMD_LZCNT;
5438			if (*ecx & CPUID_AMD_ECX_MONITORX)
5439				hwcap_flags_2 |= AV_386_2_MONITORX;
5440			break;
5441
5442		case X86_VENDOR_Intel:
5443			if (*edx & CPUID_AMD_EDX_TSCP)
5444				hwcap_flags |= AV_386_TSCP;
5445			if (*ecx & CPUID_AMD_ECX_LZCNT)
5446				hwcap_flags |= AV_386_AMD_LZCNT;
5447			/*
5448			 * Aarrgh.
5449			 * Intel uses a different bit in the same word.
5450			 */
5451			if (*ecx & CPUID_INTC_ECX_AHF64)
5452				hwcap_flags |= AV_386_AHF;
5453			break;
5454
5455		default:
5456			break;
5457		}
5458		break;
5459
5460	case X86_VENDOR_TM:
5461		cp.cp_eax = 0x80860001;
5462		(void) __cpuid_insn(&cp);
5463		cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
5464		break;
5465
5466	default:
5467		break;
5468	}
5469
5470pass4_done:
5471	cpi->cpi_pass = 4;
5472	if (hwcap_out != NULL) {
5473		hwcap_out[0] = hwcap_flags;
5474		hwcap_out[1] = hwcap_flags_2;
5475	}
5476}
5477
5478
5479/*
5480 * Simulate the cpuid instruction using the data we previously
5481 * captured about this CPU.  We try our best to return the truth
5482 * about the hardware, independently of kernel support.
5483 */
5484uint32_t
5485cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
5486{
5487	struct cpuid_info *cpi;
5488	struct cpuid_regs *xcp;
5489
5490	if (cpu == NULL)
5491		cpu = CPU;
5492	cpi = cpu->cpu_m.mcpu_cpi;
5493
5494	ASSERT(cpuid_checkpass(cpu, 3));
5495
5496	/*
5497	 * CPUID data is cached in two separate places: cpi_std for standard
5498	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
5499	 */
5500	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
5501		xcp = &cpi->cpi_std[cp->cp_eax];
5502	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
5503	    cp->cp_eax <= cpi->cpi_xmaxeax &&
5504	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
5505		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
5506	} else {
5507		/*
5508		 * The caller is asking for data from an input parameter which
5509		 * the kernel has not cached.  In this case we go fetch from
5510		 * the hardware and return the data directly to the user.
5511		 */
5512		return (__cpuid_insn(cp));
5513	}
5514
5515	cp->cp_eax = xcp->cp_eax;
5516	cp->cp_ebx = xcp->cp_ebx;
5517	cp->cp_ecx = xcp->cp_ecx;
5518	cp->cp_edx = xcp->cp_edx;
5519	return (cp->cp_eax);
5520}
5521
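/*
 * Return non-zero if the indicated cpuid pass has completed on the given
 * CPU; callers use this to assert that the data they need has been cached.
 */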
5522int
5523cpuid_checkpass(cpu_t *cpu, int pass)
5524{
5525	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
5526	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
5527}
5528
5529int
5530cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
5531{
5532	ASSERT(cpuid_checkpass(cpu, 3));
5533
5534	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
5535}
5536
5537int
5538cpuid_is_cmt(cpu_t *cpu)
5539{
5540	if (cpu == NULL)
5541		cpu = CPU;
5542
5543	ASSERT(cpuid_checkpass(cpu, 1));
5544
5545	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
5546}
5547
5548/*
5549 * AMD and Intel both implement the 64-bit variant of the syscall
5550 * instruction (syscallq), so if there's -any- support for syscall,
5551 * cpuid currently says "yes, we support this".
5552 *
5553 * However, Intel decided to -not- implement the 32-bit variant of the
5554 * syscall instruction, so we provide a predicate to allow our caller
5555 * to test that subtlety here.
5556 *
5557 * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
5558 *	even in the case where the hardware would in fact support it.
5559 */
5560/*ARGSUSED*/
5561int
5562cpuid_syscall32_insn(cpu_t *cpu)
5563{
5564	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
5565
5566#if !defined(__xpv)
5567	if (cpu == NULL)
5568		cpu = CPU;
5569
5570	/*CSTYLED*/
5571	{
5572		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5573
5574		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
5575		    cpi->cpi_xmaxeax >= 0x80000001 &&
5576		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
5577			return (1);
5578	}
5579#endif
5580	return (0);
5581}
5582
5583int
5584cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
5585{
5586	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5587
5588	static const char fmt[] =
5589	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
5590	static const char fmt_ht[] =
5591	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
5592
5593	ASSERT(cpuid_checkpass(cpu, 1));
5594
5595	if (cpuid_is_cmt(cpu))
5596		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
5597		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5598		    cpi->cpi_family, cpi->cpi_model,
5599		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5600	return (snprintf(s, n, fmt,
5601	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
5602	    cpi->cpi_family, cpi->cpi_model,
5603	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
5604}
5605
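/*
 * The accessors below simply return values cached by the cpuid passes;
 * each asserts that the pass which populates its value has already run.
 */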
5606const char *
5607cpuid_getvendorstr(cpu_t *cpu)
5608{
5609	ASSERT(cpuid_checkpass(cpu, 1));
5610	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
5611}
5612
5613uint_t
5614cpuid_getvendor(cpu_t *cpu)
5615{
5616	ASSERT(cpuid_checkpass(cpu, 1));
5617	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
5618}
5619
5620uint_t
5621cpuid_getfamily(cpu_t *cpu)
5622{
5623	ASSERT(cpuid_checkpass(cpu, 1));
5624	return (cpu->cpu_m.mcpu_cpi->cpi_family);
5625}
5626
5627uint_t
5628cpuid_getmodel(cpu_t *cpu)
5629{
5630	ASSERT(cpuid_checkpass(cpu, 1));
5631	return (cpu->cpu_m.mcpu_cpi->cpi_model);
5632}
5633
5634uint_t
5635cpuid_get_ncpu_per_chip(cpu_t *cpu)
5636{
5637	ASSERT(cpuid_checkpass(cpu, 1));
5638	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
5639}
5640
5641uint_t
5642cpuid_get_ncore_per_chip(cpu_t *cpu)
5643{
5644	ASSERT(cpuid_checkpass(cpu, 1));
5645	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
5646}
5647
5648uint_t
5649cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
5650{
5651	ASSERT(cpuid_checkpass(cpu, 2));
5652	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
5653}
5654
5655id_t
5656cpuid_get_last_lvl_cacheid(cpu_t *cpu)
5657{
5658	ASSERT(cpuid_checkpass(cpu, 2));
5659	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5660}
5661
5662uint_t
5663cpuid_getstep(cpu_t *cpu)
5664{
5665	ASSERT(cpuid_checkpass(cpu, 1));
5666	return (cpu->cpu_m.mcpu_cpi->cpi_step);
5667}
5668
5669uint_t
5670cpuid_getsig(struct cpu *cpu)
5671{
5672	ASSERT(cpuid_checkpass(cpu, 1));
5673	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
5674}
5675
5676uint32_t
5677cpuid_getchiprev(struct cpu *cpu)
5678{
5679	ASSERT(cpuid_checkpass(cpu, 1));
5680	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
5681}
5682
5683const char *
5684cpuid_getchiprevstr(struct cpu *cpu)
5685{
5686	ASSERT(cpuid_checkpass(cpu, 1));
5687	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
5688}
5689
5690uint32_t
5691cpuid_getsockettype(struct cpu *cpu)
5692{
5693	ASSERT(cpuid_checkpass(cpu, 1));
5694	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
5695}
5696
5697const char *
5698cpuid_getsocketstr(cpu_t *cpu)
5699{
5700	static const char *socketstr = NULL;
5701	struct cpuid_info *cpi;
5702
5703	ASSERT(cpuid_checkpass(cpu, 1));
5704	cpi = cpu->cpu_m.mcpu_cpi;
5705
5706	/* Assume that socket types are the same across the system */
5707	if (socketstr == NULL)
5708		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
5709		    cpi->cpi_model, cpi->cpi_step);
5710
5711
5712	return (socketstr);
5713}
5714
5715int
5716cpuid_get_chipid(cpu_t *cpu)
5717{
5718	ASSERT(cpuid_checkpass(cpu, 1));
5719
5720	if (cpuid_is_cmt(cpu))
5721		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
5722	return (cpu->cpu_id);
5723}
5724
5725id_t
5726cpuid_get_coreid(cpu_t *cpu)
5727{
5728	ASSERT(cpuid_checkpass(cpu, 1));
5729	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
5730}
5731
5732int
5733cpuid_get_pkgcoreid(cpu_t *cpu)
5734{
5735	ASSERT(cpuid_checkpass(cpu, 1));
5736	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
5737}
5738
5739int
5740cpuid_get_clogid(cpu_t *cpu)
5741{
5742	ASSERT(cpuid_checkpass(cpu, 1));
5743	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5744}
5745
5746int
5747cpuid_get_cacheid(cpu_t *cpu)
5748{
5749	ASSERT(cpuid_checkpass(cpu, 1));
5750	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5751}
5752
5753uint_t
5754cpuid_get_procnodeid(cpu_t *cpu)
5755{
5756	ASSERT(cpuid_checkpass(cpu, 1));
5757	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5758}
5759
5760uint_t
5761cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5762{
5763	ASSERT(cpuid_checkpass(cpu, 1));
5764	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5765}
5766
5767uint_t
5768cpuid_get_compunitid(cpu_t *cpu)
5769{
5770	ASSERT(cpuid_checkpass(cpu, 1));
5771	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5772}
5773
5774uint_t
5775cpuid_get_cores_per_compunit(cpu_t *cpu)
5776{
5777	ASSERT(cpuid_checkpass(cpu, 1));
5778	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5779}
5780
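/*
 * %cr8 (the task priority register) is always accessible in 64-bit mode.
 * On 32-bit kernels it is only reachable on AMD processors that advertise
 * the CR8D (alternate %cr8 access) feature.
 */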
5781/*ARGSUSED*/
5782int
5783cpuid_have_cr8access(cpu_t *cpu)
5784{
5785#if defined(__amd64)
5786	return (1);
5787#else
5788	struct cpuid_info *cpi;
5789
5790	ASSERT(cpu != NULL);
5791	cpi = cpu->cpu_m.mcpu_cpi;
5792	if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5793	    (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5794		return (1);
5795	return (0);
5796#endif
5797}
5798
5799uint32_t
5800cpuid_get_apicid(cpu_t *cpu)
5801{
5802	ASSERT(cpuid_checkpass(cpu, 1));
5803	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5804		return (UINT32_MAX);
5805	} else {
5806		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5807	}
5808}
5809
5810void
5811cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5812{
5813	struct cpuid_info *cpi;
5814
5815	if (cpu == NULL)
5816		cpu = CPU;
5817	cpi = cpu->cpu_m.mcpu_cpi;
5818
5819	ASSERT(cpuid_checkpass(cpu, 1));
5820
5821	if (pabits)
5822		*pabits = cpi->cpi_pabits;
5823	if (vabits)
5824		*vabits = cpi->cpi_vabits;
5825}
5826
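/*
 * Return the buffer size needed for an XSAVE area: the hardware-enumerated
 * maximum from leaf 0xD, but never less than the legacy xsave_state layout.
 */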
5827size_t
5828cpuid_get_xsave_size()
5829{
5830	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5831	    sizeof (struct xsave_state)));
5832}
5833
5834/*
5835 * Return true if the CPUs on this system require 'pointer clearing' for the
5836 * floating point error pointer exception handling. In the past, this has been
5837 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5838 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5839 * feature bit and is reflected in the cpi_fp_amd_save member.
5840 */
5841boolean_t
5842cpuid_need_fp_excp_handling()
5843{
5844	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5845	    cpuid_info0.cpi_fp_amd_save != 0);
5846}
5847
5848/*
5849 * Returns the number of data TLB entries for a corresponding
5850 * pagesize.  If it can't be computed, or isn't known, the
5851 * routine returns zero.  If you ask about an architecturally
5852 * impossible pagesize, the routine will panic (so that the
5853 * hat implementor knows that things are inconsistent.)
5854 */
5855uint_t
5856cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5857{
5858	struct cpuid_info *cpi;
5859	uint_t dtlb_nent = 0;
5860
5861	if (cpu == NULL)
5862		cpu = CPU;
5863	cpi = cpu->cpu_m.mcpu_cpi;
5864
5865	ASSERT(cpuid_checkpass(cpu, 1));
5866
5867	/*
5868	 * Check the L2 TLB info
5869	 */
5870	if (cpi->cpi_xmaxeax >= 0x80000006) {
5871		struct cpuid_regs *cp = &cpi->cpi_extd[6];
5872
5873		switch (pagesize) {
5874
5875		case 4 * 1024:
5876			/*
5877			 * All zero in the top 16 bits of the register
5878			 * indicates a unified TLB. Size is in low 16 bits.
5879			 */
5880			if ((cp->cp_ebx & 0xffff0000) == 0)
5881				dtlb_nent = cp->cp_ebx & 0x0000ffff;
5882			else
5883				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5884			break;
5885
5886		case 2 * 1024 * 1024:
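			/*
			 * Same encoding as the 4K case above, but the
			 * 2M/4M TLB information is reported in %eax.
			 */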
5887			if ((cp->cp_eax & 0xffff0000) == 0)
5888				dtlb_nent = cp->cp_eax & 0x0000ffff;
5889			else
5890				dtlb_nent = BITX(cp->cp_eax, 27, 16);
5891			break;
5892
5893		default:
5894			panic("unknown L2 pagesize");
5895			/*NOTREACHED*/
5896		}
5897	}
5898
5899	if (dtlb_nent != 0)
5900		return (dtlb_nent);
5901
5902	/*
5903	 * No L2 TLB support for this size, try L1.
5904	 */
5905	if (cpi->cpi_xmaxeax >= 0x80000005) {
5906		struct cpuid_regs *cp = &cpi->cpi_extd[5];
5907
5908		switch (pagesize) {
5909		case 4 * 1024:
5910			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5911			break;
5912		case 2 * 1024 * 1024:
5913			dtlb_nent = BITX(cp->cp_eax, 23, 16);
5914			break;
5915		default:
5916			panic("unknown L1 d-TLB pagesize");
5917			/*NOTREACHED*/
5918		}
5919	}
5920
5921	return (dtlb_nent);
5922}
5923
5924/*
5925 * Return 0 if the erratum is not present or not applicable, positive
5926 * if it is, and negative if the status of the erratum is unknown.
5927 *
5928 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5929 * Processors" #25759, Rev 3.57, August 2005
5930 */
5931int
5932cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5933{
5934	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5935	uint_t eax;
5936
5937	/*
5938	 * Bail out if this CPU isn't an AMD CPU, or if it's
5939	 * a legacy (32-bit) AMD CPU.
5940	 */
5941	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5942	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5943	    cpi->cpi_family == 6) {
5944		return (0);
5945	}
5946
5947	eax = cpi->cpi_std[1].cp_eax;
5948
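/*
 * The following macros match specific silicon revisions by the
 * family/model/stepping signature reported in %eax of cpuid leaf 1,
 * using the revision names from the AMD revision guide.
 */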
5949#define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
5950#define	SH_B3(eax)	(eax == 0xf51)
5951#define	B(eax)		(SH_B0(eax) || SH_B3(eax))
5952
5953#define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
5954
5955#define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5956#define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5957#define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
5958#define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5959
5960#define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5961#define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
5962#define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
5963#define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5964
5965#define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5966#define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
5967#define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
5968#define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
5969#define	BH_E4(eax)	(eax == 0x20fb1)
5970#define	SH_E5(eax)	(eax == 0x20f42)
5971#define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
5972#define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
5973#define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5974			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5975			    DH_E6(eax) || JH_E6(eax))
5976
5977#define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5978#define	DR_B0(eax)	(eax == 0x100f20)
5979#define	DR_B1(eax)	(eax == 0x100f21)
5980#define	DR_BA(eax)	(eax == 0x100f2a)
5981#define	DR_B2(eax)	(eax == 0x100f22)
5982#define	DR_B3(eax)	(eax == 0x100f23)
5983#define	RB_C0(eax)	(eax == 0x100f40)
5984
5985	switch (erratum) {
5986	case 1:
5987		return (cpi->cpi_family < 0x10);
5988	case 51:	/* what does the asterisk mean? */
5989		return (B(eax) || SH_C0(eax) || CG(eax));
5990	case 52:
5991		return (B(eax));
5992	case 57:
5993		return (cpi->cpi_family <= 0x11);
5994	case 58:
5995		return (B(eax));
5996	case 60:
5997		return (cpi->cpi_family <= 0x11);
5998	case 61:
5999	case 62:
6000	case 63:
6001	case 64:
6002	case 65:
6003	case 66:
6004	case 68:
6005	case 69:
6006	case 70:
6007	case 71:
6008		return (B(eax));
6009	case 72:
6010		return (SH_B0(eax));
6011	case 74:
6012		return (B(eax));
6013	case 75:
6014		return (cpi->cpi_family < 0x10);
6015	case 76:
6016		return (B(eax));
6017	case 77:
6018		return (cpi->cpi_family <= 0x11);
6019	case 78:
6020		return (B(eax) || SH_C0(eax));
6021	case 79:
6022		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6023	case 80:
6024	case 81:
6025	case 82:
6026		return (B(eax));
6027	case 83:
6028		return (B(eax) || SH_C0(eax) || CG(eax));
6029	case 85:
6030		return (cpi->cpi_family < 0x10);
6031	case 86:
6032		return (SH_C0(eax) || CG(eax));
6033	case 88:
6034#if !defined(__amd64)
6035		return (0);
6036#else
6037		return (B(eax) || SH_C0(eax));
6038#endif
6039	case 89:
6040		return (cpi->cpi_family < 0x10);
6041	case 90:
6042		return (B(eax) || SH_C0(eax) || CG(eax));
6043	case 91:
6044	case 92:
6045		return (B(eax) || SH_C0(eax));
6046	case 93:
6047		return (SH_C0(eax));
6048	case 94:
6049		return (B(eax) || SH_C0(eax) || CG(eax));
6050	case 95:
6051#if !defined(__amd64)
6052		return (0);
6053#else
6054		return (B(eax) || SH_C0(eax));
6055#endif
6056	case 96:
6057		return (B(eax) || SH_C0(eax) || CG(eax));
6058	case 97:
6059	case 98:
6060		return (SH_C0(eax) || CG(eax));
6061	case 99:
6062		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6063	case 100:
6064		return (B(eax) || SH_C0(eax));
6065	case 101:
6066	case 103:
6067		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6068	case 104:
6069		return (SH_C0(eax) || CG(eax) || D0(eax));
6070	case 105:
6071	case 106:
6072	case 107:
6073		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6074	case 108:
6075		return (DH_CG(eax));
6076	case 109:
6077		return (SH_C0(eax) || CG(eax) || D0(eax));
6078	case 110:
6079		return (D0(eax) || EX(eax));
6080	case 111:
6081		return (CG(eax));
6082	case 112:
6083		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6084	case 113:
6085		return (eax == 0x20fc0);
6086	case 114:
6087		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6088	case 115:
6089		return (SH_E0(eax) || JH_E1(eax));
6090	case 116:
6091		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
6092	case 117:
6093		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
6094	case 118:
6095		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
6096		    JH_E6(eax));
6097	case 121:
6098		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
6099	case 122:
6100		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
6101	case 123:
6102		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
6103	case 131:
6104		return (cpi->cpi_family < 0x10);
6105	case 6336786:
6106
6107		/*
6108		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
6109		 * if this is a K8 family or newer processor. We're testing for
6110		 * this 'erratum' to determine whether or not we have a constant
6111		 * TSC.
6112		 *
6113		 * Our current fix for this is to disable the C1-Clock ramping.
6114		 * However, this doesn't work on newer processor families nor
6115		 * does it work when virtualized as those devices don't exist.
6116		 */
6117		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
6118			return (0);
6119		}
6120
6121		if (CPI_FAMILY(cpi) == 0xf) {
6122			struct cpuid_regs regs;
6123			regs.cp_eax = 0x80000007;
6124			(void) __cpuid_insn(&regs);
6125			return (!(regs.cp_edx & 0x100));
6126		}
6127		return (0);
6128	case 6323525:
6129		/*
6130		 * This erratum (K8 #147) is not present on family 10 and newer.
6131		 */
6132		if (cpi->cpi_family >= 0x10) {
6133			return (0);
6134		}
6135		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
6136		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
6137
6138	case 6671130:
6139		/*
6140		 * Check for processors (pre-Shanghai) that do not provide
6141		 * optimal management of 1GB PTEs in their TLB.
6142		 */
6143		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
6144
6145	case 298:
6146		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
6147		    DR_B2(eax) || RB_C0(eax));
6148
6149	case 721:
6150#if defined(__amd64)
6151		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
6152#else
6153		return (0);
6154#endif
6155
6156	default:
6157		return (-1);
6158
6159	}
6160}
6161
6162/*
6163 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
6164 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
6165 */
6166int
6167osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
6168{
6169	struct cpuid_info	*cpi;
6170	uint_t			osvwid;
6171	static int		osvwfeature = -1;
6172	uint64_t		osvwlength;
6173
6174
6175	cpi = cpu->cpu_m.mcpu_cpi;
6176
6177	/* confirm OSVW supported */
6178	if (osvwfeature == -1) {
6179		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
6180	} else {
6181		/* assert that osvw feature setting is consistent on all cpus */
6182		ASSERT(osvwfeature ==
6183		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
6184	}
6185	if (!osvwfeature)
6186		return (-1);
6187
6188	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
6189
6190	switch (erratum) {
6191	case 298:	/* osvwid is 0 */
6192		osvwid = 0;
6193		if (osvwlength <= (uint64_t)osvwid) {
6194			/* osvwid 0 is unknown */
6195			return (-1);
6196		}
6197
6198		/*
6199		 * Check the OSVW STATUS MSR to determine the state
6200		 * of the erratum where:
6201		 *   0 - fixed by HW
6202		 *   1 - BIOS has applied the workaround when BIOS
6203		 *   workaround is available. (Or for other errata,
6204		 *   OS workaround is required.)
6205		 * For a value of 1, caller will confirm that the
6206		 * erratum 298 workaround has indeed been applied by BIOS.
6207		 *
6208		 * A 1 may be set in cpus that have a HW fix
6209		 * in a mixed cpu system. Regarding erratum 298:
6210		 *   In a multiprocessor platform, the workaround above
6211		 *   should be applied to all processors regardless of
6212		 *   silicon revision when an affected processor is
6213		 *   present.
6214		 */
6215
6216		return (rdmsr(MSR_AMD_OSVW_STATUS +
6217		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
6218		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
6219
6220	default:
6221		return (-1);
6222	}
6223}
6224
6225static const char assoc_str[] = "associativity";
6226static const char line_str[] = "line-size";
6227static const char size_str[] = "size";
6228
6229static void
6230add_cache_prop(dev_info_t *devi, const char *label, const char *type,
6231    uint32_t val)
6232{
6233	char buf[128];
6234
6235	/*
6236	 * ndi_prop_update_int() is used because it is desirable for
6237	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
6238	 */
6239	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
6240		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
6241}
6242
6243/*
6244 * Intel-style cache/tlb description
6245 *
6246 * Standard cpuid level 2 gives a randomly ordered
6247 * selection of tags that index into a table that describes
6248 * cache and tlb properties.
6249 */
6250
6251static const char l1_icache_str[] = "l1-icache";
6252static const char l1_dcache_str[] = "l1-dcache";
6253static const char l2_cache_str[] = "l2-cache";
6254static const char l3_cache_str[] = "l3-cache";
6255static const char itlb4k_str[] = "itlb-4K";
6256static const char dtlb4k_str[] = "dtlb-4K";
6257static const char itlb2M_str[] = "itlb-2M";
6258static const char itlb4M_str[] = "itlb-4M";
6259static const char dtlb4M_str[] = "dtlb-4M";
6260static const char dtlb24_str[] = "dtlb0-2M-4M";
6261static const char itlb424_str[] = "itlb-4K-2M-4M";
6262static const char itlb24_str[] = "itlb-2M-4M";
6263static const char dtlb44_str[] = "dtlb-4K-4M";
6264static const char sl1_dcache_str[] = "sectored-l1-dcache";
6265static const char sl2_cache_str[] = "sectored-l2-cache";
6266static const char itrace_str[] = "itrace-cache";
6267static const char sl3_cache_str[] = "sectored-l3-cache";
6268static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
6269
6270static const struct cachetab {
6271	uint8_t		ct_code;
6272	uint8_t		ct_assoc;
6273	uint16_t	ct_line_size;
6274	size_t		ct_size;
6275	const char	*ct_label;
6276} intel_ctab[] = {
6277	/*
6278	 * maintain descending order!
6279	 *
6280	 * Codes ignored - Reason
6281	 * ----------------------
6282	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
6283	 * f0H/f1H - Currently we do not interpret prefetch size by design
6284	 */
6285	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
6286	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
6287	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
6288	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
6289	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
6290	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
6291	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
6292	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
6293	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
6294	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
6295	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
6296	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
6297	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
6298	{ 0xc0, 4, 0, 8, dtlb44_str },
6299	{ 0xba, 4, 0, 64, dtlb4k_str },
6300	{ 0xb4, 4, 0, 256, dtlb4k_str },
6301	{ 0xb3, 4, 0, 128, dtlb4k_str },
6302	{ 0xb2, 4, 0, 64, itlb4k_str },
6303	{ 0xb0, 4, 0, 128, itlb4k_str },
6304	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
6305	{ 0x86, 4, 64, 512*1024, l2_cache_str},
6306	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
6307	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
6308	{ 0x83, 8, 32, 512*1024, l2_cache_str},
6309	{ 0x82, 8, 32, 256*1024, l2_cache_str},
6310	{ 0x80, 8, 64, 512*1024, l2_cache_str},
6311	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
6312	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
6313	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
6314	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
6315	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
6316	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
6317	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
6318	{ 0x73, 8, 0, 64*1024, itrace_str},
6319	{ 0x72, 8, 0, 32*1024, itrace_str},
6320	{ 0x71, 8, 0, 16*1024, itrace_str},
6321	{ 0x70, 8, 0, 12*1024, itrace_str},
6322	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
6323	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
6324	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
6325	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
6326	{ 0x5d, 0, 0, 256, dtlb44_str},
6327	{ 0x5c, 0, 0, 128, dtlb44_str},
6328	{ 0x5b, 0, 0, 64, dtlb44_str},
6329	{ 0x5a, 4, 0, 32, dtlb24_str},
6330	{ 0x59, 0, 0, 16, dtlb4k_str},
6331	{ 0x57, 4, 0, 16, dtlb4k_str},
6332	{ 0x56, 4, 0, 16, dtlb4M_str},
6333	{ 0x55, 0, 0, 7, itlb24_str},
6334	{ 0x52, 0, 0, 256, itlb424_str},
6335	{ 0x51, 0, 0, 128, itlb424_str},
6336	{ 0x50, 0, 0, 64, itlb424_str},
6337	{ 0x4f, 0, 0, 32, itlb4k_str},
6338	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
6339	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
6340	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
6341	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
6342	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
6343	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
6344	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
6345	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
6346	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
6347	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
6348	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
6349	{ 0x43, 4, 32, 512*1024, l2_cache_str},
6350	{ 0x42, 4, 32, 256*1024, l2_cache_str},
6351	{ 0x41, 4, 32, 128*1024, l2_cache_str},
6352	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
6353	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
6354	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
6355	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
6356	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
6357	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
6358	{ 0x30, 8, 64, 32*1024, l1_icache_str},
6359	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
6360	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
6361	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
6362	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
6363	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
6364	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
6365	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
6366	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
6367	{ 0x0b, 4, 0, 4, itlb4M_str},
6368	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
6369	{ 0x08, 4, 32, 16*1024, l1_icache_str},
6370	{ 0x06, 4, 32, 8*1024, l1_icache_str},
6371	{ 0x05, 4, 0, 32, dtlb4M_str},
6372	{ 0x04, 4, 0, 8, dtlb4M_str},
6373	{ 0x03, 4, 0, 64, dtlb4k_str},
6374	{ 0x02, 4, 0, 2, itlb4M_str},
6375	{ 0x01, 4, 0, 32, itlb4k_str},
6376	{ 0 }
6377};
6378
6379static const struct cachetab cyrix_ctab[] = {
6380	{ 0x70, 4, 0, 32, "tlb-4K" },
6381	{ 0x80, 4, 16, 16*1024, "l1-cache" },
6382	{ 0 }
6383};
6384
6385/*
6386 * Search a cache table for a matching entry
6387 */
6388static const struct cachetab *
6389find_cacheent(const struct cachetab *ct, uint_t code)
6390{
6391	if (code != 0) {
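		/*
		 * The descriptor tables are sorted in descending order of
		 * ct_code, so the first entry less than or equal to the
		 * requested code is either an exact match or proof that
		 * no match exists.
		 */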
6392		for (; ct->ct_code != 0; ct++)
6393			if (ct->ct_code <= code)
6394				break;
6395		if (ct->ct_code == code)
6396			return (ct);
6397	}
6398	return (NULL);
6399}
6400
6401/*
 * Populate a cachetab entry with L2 or L3 cache information using
 * cpuid function 4. This function is called from intel_walk_cacheinfo()
 * when descriptor 0x49 is encountered. It returns 1 if L2/L3 cache
 * information was found and 0 otherwise.
6406 */
6407static int
6408intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
6409{
6410	uint32_t level, i;
6411	int ret = 0;
6412
6413	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
6414		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
6415
6416		if (level == 2 || level == 3) {
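			/*
			 * Leaf 4 encodes ways, partitions, line size, and
			 * set count each as one less than the actual value;
			 * the total cache size is the product of the four.
			 */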
6417			ct->ct_assoc =
6418			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
6419			ct->ct_line_size =
6420			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
6421			ct->ct_size = ct->ct_assoc *
6422			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
6423			    ct->ct_line_size *
6424			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
6425
6426			if (level == 2) {
6427				ct->ct_label = l2_cache_str;
6428			} else if (level == 3) {
6429				ct->ct_label = l3_cache_str;
6430			}
6431			ret = 1;
6432		}
6433	}
6434
6435	return (ret);
6436}
6437
6438/*
 * Walk the cacheinfo descriptor list, applying 'func' to every valid
 * element.  The walk is terminated if the walker returns non-zero.
6441 */
6442static void
6443intel_walk_cacheinfo(struct cpuid_info *cpi,
6444    void *arg, int (*func)(void *, const struct cachetab *))
6445{
6446	const struct cachetab *ct;
6447	struct cachetab des_49_ct, des_b1_ct;
6448	uint8_t *dp;
6449	int i;
6450
6451	if ((dp = cpi->cpi_cacheinfo) == NULL)
6452		return;
6453	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6454		/*
		 * For overloaded descriptor 0x49 we use cpuid function 4,
		 * if supported by the current processor, to create the
		 * cache information.
		 * For overloaded descriptor 0xb1 we use the PAE feature
		 * flag (X86FSET_PAE) to disambiguate the TLB information.
6460		 */
6461		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
6462		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
			ct = &des_49_ct;
6464		} else if (*dp == 0xb1) {
6465			des_b1_ct.ct_code = 0xb1;
6466			des_b1_ct.ct_assoc = 4;
6467			des_b1_ct.ct_line_size = 0;
6468			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
6469				des_b1_ct.ct_size = 8;
6470				des_b1_ct.ct_label = itlb2M_str;
6471			} else {
6472				des_b1_ct.ct_size = 4;
6473				des_b1_ct.ct_label = itlb4M_str;
6474			}
6475			ct = &des_b1_ct;
6476		} else {
6477			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
6478				continue;
6479			}
6480		}
6481
6482		if (func(arg, ct) != 0) {
6483			break;
6484		}
6485	}
6486}
6487
6488/*
 * Like intel_walk_cacheinfo(), but for Cyrix CPUs.
6490 */
6491static void
6492cyrix_walk_cacheinfo(struct cpuid_info *cpi,
6493    void *arg, int (*func)(void *, const struct cachetab *))
6494{
6495	const struct cachetab *ct;
6496	uint8_t *dp;
6497	int i;
6498
6499	if ((dp = cpi->cpi_cacheinfo) == NULL)
6500		return;
6501	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
6502		/*
		 * Search the Cyrix-specific descriptor table first ..
6504		 */
6505		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
6506			if (func(arg, ct) != 0)
6507				break;
6508			continue;
6509		}
6510		/*
6511		 * .. else fall back to the Intel one
6512		 */
6513		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
6514			if (func(arg, ct) != 0)
6515				break;
6516			continue;
6517		}
6518	}
6519}
6520
6521/*
6522 * A cacheinfo walker that adds associativity, line-size, and size properties
6523 * to the devinfo node it is passed as an argument.
6524 */
6525static int
6526add_cacheent_props(void *arg, const struct cachetab *ct)
6527{
6528	dev_info_t *devi = arg;
6529
6530	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
6531	if (ct->ct_line_size != 0)
6532		add_cache_prop(devi, ct->ct_label, line_str,
6533		    ct->ct_line_size);
6534	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
6535	return (0);
6536}
6537
6538
6539static const char fully_assoc[] = "fully-associative?";
6540
6541/*
 * AMD-style cache/tlb description
 *
 * Extended functions 0x80000005 and 0x80000006 directly describe
 * properties of the TLBs and the various cache levels.
6546 */
6547static void
6548add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6549{
6550	switch (assoc) {
6551	case 0:	/* reserved; ignore */
6552		break;
6553	default:
6554		add_cache_prop(devi, label, assoc_str, assoc);
6555		break;
6556	case 0xff:
6557		add_cache_prop(devi, label, fully_assoc, 1);
6558		break;
6559	}
6560}
6561
6562static void
6563add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6564{
6565	if (size == 0)
6566		return;
6567	add_cache_prop(devi, label, size_str, size);
6568	add_amd_assoc(devi, label, assoc);
6569}
6570
6571static void
6572add_amd_cache(dev_info_t *devi, const char *label,
6573    uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6574{
6575	if (size == 0 || line_size == 0)
6576		return;
6577	add_amd_assoc(devi, label, assoc);
6578	/*
6579	 * Most AMD parts have a sectored cache. Multiple cache lines are
6580	 * associated with each tag. A sector consists of all cache lines
6581	 * associated with a tag. For example, the AMD K6-III has a sector
6582	 * size of 2 cache lines per tag.
6583	 */
6584	if (lines_per_tag != 0)
6585		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6586	add_cache_prop(devi, label, line_str, line_size);
6587	add_cache_prop(devi, label, size_str, size * 1024);
6588}
6589
6590static void
6591add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
6592{
6593	switch (assoc) {
6594	case 0:	/* off */
6595		break;
6596	case 1:
6597	case 2:
6598	case 4:
6599		add_cache_prop(devi, label, assoc_str, assoc);
6600		break;
6601	case 6:
6602		add_cache_prop(devi, label, assoc_str, 8);
6603		break;
6604	case 8:
6605		add_cache_prop(devi, label, assoc_str, 16);
6606		break;
6607	case 0xf:
6608		add_cache_prop(devi, label, fully_assoc, 1);
6609		break;
6610	default: /* reserved; ignore */
6611		break;
6612	}
6613}
6614
6615static void
6616add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
6617{
6618	if (size == 0 || assoc == 0)
6619		return;
6620	add_amd_l2_assoc(devi, label, assoc);
6621	add_cache_prop(devi, label, size_str, size);
6622}
6623
6624static void
6625add_amd_l2_cache(dev_info_t *devi, const char *label,
6626    uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
6627{
6628	if (size == 0 || assoc == 0 || line_size == 0)
6629		return;
6630	add_amd_l2_assoc(devi, label, assoc);
6631	if (lines_per_tag != 0)
6632		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
6633	add_cache_prop(devi, label, line_str, line_size);
6634	add_cache_prop(devi, label, size_str, size * 1024);
6635}
6636
6637static void
6638amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
6639{
6640	struct cpuid_regs *cp;
6641
6642	if (cpi->cpi_xmaxeax < 0x80000005)
6643		return;
6644	cp = &cpi->cpi_extd[5];
6645
6646	/*
6647	 * 4M/2M L1 TLB configuration
6648	 *
6649	 * We report the size for 2M pages because AMD uses two
6650	 * TLB entries for one 4M page.
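	 *
	 * In %eax of extended function 0x80000005, bits 31:24 and 23:16
	 * give the data TLB associativity and entry count, and bits 15:8
	 * and 7:0 give the same for the instruction TLB.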
6651	 */
6652	add_amd_tlb(devi, "dtlb-2M",
6653	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
6654	add_amd_tlb(devi, "itlb-2M",
6655	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
6656
6657	/*
6658	 * 4K L1 TLB configuration
6659	 */
6660
6661	switch (cpi->cpi_vendor) {
6662		uint_t nentries;
6663	case X86_VENDOR_TM:
6664		if (cpi->cpi_family >= 5) {
6665			/*
6666			 * Crusoe processors have 256 TLB entries, but
6667			 * cpuid data format constrains them to only
6668			 * reporting 255 of them.
6669			 */
6670			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
6671				nentries = 256;
6672			/*
6673			 * Crusoe processors also have a unified TLB
6674			 */
6675			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
6676			    nentries);
6677			break;
6678		}
6679		/*FALLTHROUGH*/
6680	default:
6681		add_amd_tlb(devi, itlb4k_str,
6682		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
6683		add_amd_tlb(devi, dtlb4k_str,
6684		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
6685		break;
6686	}
6687
6688	/*
6689	 * data L1 cache configuration
6690	 */
6691
6692	add_amd_cache(devi, l1_dcache_str,
6693	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
6694	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
6695
6696	/*
6697	 * code L1 cache configuration
6698	 */
6699
6700	add_amd_cache(devi, l1_icache_str,
6701	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
6702	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
6703
6704	if (cpi->cpi_xmaxeax < 0x80000006)
6705		return;
6706	cp = &cpi->cpi_extd[6];
6707
6708	/* Check for a unified L2 TLB for large pages */
6709
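	/*
	 * A zero in the upper 16 bits of %eax indicates a unified TLB
	 * described by the lower 16 bits; otherwise the two halves
	 * describe separate data and instruction TLBs.
	 */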
6710	if (BITX(cp->cp_eax, 31, 16) == 0)
6711		add_amd_l2_tlb(devi, "l2-tlb-2M",
6712		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6713	else {
6714		add_amd_l2_tlb(devi, "l2-dtlb-2M",
6715		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
6716		add_amd_l2_tlb(devi, "l2-itlb-2M",
6717		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
6718	}
6719
6720	/* Check for a unified L2 TLB for 4K pages */
6721
6722	if (BITX(cp->cp_ebx, 31, 16) == 0) {
		add_amd_l2_tlb(devi, "l2-tlb-4K",
		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
	} else {
		add_amd_l2_tlb(devi, "l2-dtlb-4K",
		    BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
		add_amd_l2_tlb(devi, "l2-itlb-4K",
		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
6730	}
6731
6732	add_amd_l2_cache(devi, l2_cache_str,
6733	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
6734	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
6735}
6736
6737/*
 * There are two basic ways that the x86 world describes its cache
 * and tlb architecture - Intel's way and AMD's way.
 *
 * Return which flavor of cache architecture we should use.
6742 */
6743static int
6744x86_which_cacheinfo(struct cpuid_info *cpi)
6745{
6746	switch (cpi->cpi_vendor) {
6747	case X86_VENDOR_Intel:
6748		if (cpi->cpi_maxeax >= 2)
6749			return (X86_VENDOR_Intel);
6750		break;
6751	case X86_VENDOR_AMD:
6752		/*
6753		 * The K5 model 1 was the first part from AMD that reported
6754		 * cache sizes via extended cpuid functions.
6755		 */
6756		if (cpi->cpi_family > 5 ||
6757		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6758			return (X86_VENDOR_AMD);
6759		break;
6760	case X86_VENDOR_TM:
6761		if (cpi->cpi_family >= 5)
6762			return (X86_VENDOR_AMD);
6763		/*FALLTHROUGH*/
6764	default:
6765		/*
6766		 * If they have extended CPU data for 0x80000005
6767		 * then we assume they have AMD-format cache
6768		 * information.
6769		 *
6770		 * If not, and the vendor happens to be Cyrix,
		 * then try our Cyrix-specific handler.
6772		 *
6773		 * If we're not Cyrix, then assume we're using Intel's
6774		 * table-driven format instead.
6775		 */
6776		if (cpi->cpi_xmaxeax >= 0x80000005)
6777			return (X86_VENDOR_AMD);
6778		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6779			return (X86_VENDOR_Cyrix);
6780		else if (cpi->cpi_maxeax >= 2)
6781			return (X86_VENDOR_Intel);
6782		break;
6783	}
6784	return (-1);
6785}
6786
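/*
 * Export selected cpuid-derived data (identification, feature words, and
 * cache/tlb geometry) as properties on the CPU's devinfo node.
 */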
6787void
6788cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6789    struct cpuid_info *cpi)
6790{
6791	dev_info_t *cpu_devi;
6792	int create;
6793
6794	cpu_devi = (dev_info_t *)dip;
6795
6796	/* device_type */
6797	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6798	    "device_type", "cpu");
6799
6800	/* reg */
6801	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6802	    "reg", cpu_id);
6803
6804	/* cpu-mhz, and clock-frequency */
6805	if (cpu_freq > 0) {
6806		long long mul;
6807
6808		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6809		    "cpu-mhz", cpu_freq);
6810		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6811			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6812			    "clock-frequency", (int)mul);
6813	}
6814
6815	if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6816		return;
6817	}
6818
6819	/* vendor-id */
6820	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6821	    "vendor-id", cpi->cpi_vendorstr);
6822
6823	if (cpi->cpi_maxeax == 0) {
6824		return;
6825	}
6826
6827	/*
6828	 * family, model, and step
6829	 */
6830	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6831	    "family", CPI_FAMILY(cpi));
6832	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6833	    "cpu-model", CPI_MODEL(cpi));
6834	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6835	    "stepping-id", CPI_STEP(cpi));
6836
6837	/* type */
6838	switch (cpi->cpi_vendor) {
6839	case X86_VENDOR_Intel:
6840		create = 1;
6841		break;
6842	default:
6843		create = 0;
6844		break;
6845	}
6846	if (create)
6847		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6848		    "type", CPI_TYPE(cpi));
6849
6850	/* ext-family */
6851	switch (cpi->cpi_vendor) {
6852	case X86_VENDOR_Intel:
6853	case X86_VENDOR_AMD:
6854		create = cpi->cpi_family >= 0xf;
6855		break;
6856	default:
6857		create = 0;
6858		break;
6859	}
6860	if (create)
6861		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6862		    "ext-family", CPI_FAMILY_XTD(cpi));
6863
6864	/* ext-model */
6865	switch (cpi->cpi_vendor) {
6866	case X86_VENDOR_Intel:
6867		create = IS_EXTENDED_MODEL_INTEL(cpi);
6868		break;
6869	case X86_VENDOR_AMD:
6870		create = CPI_FAMILY(cpi) == 0xf;
6871		break;
6872	default:
6873		create = 0;
6874		break;
6875	}
6876	if (create)
6877		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6878		    "ext-model", CPI_MODEL_XTD(cpi));
6879
6880	/* generation */
6881	switch (cpi->cpi_vendor) {
6882	case X86_VENDOR_AMD:
6883		/*
6884		 * AMD K5 model 1 was the first part to support this
6885		 */
6886		create = cpi->cpi_xmaxeax >= 0x80000001;
6887		break;
6888	default:
6889		create = 0;
6890		break;
6891	}
6892	if (create)
6893		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6894		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6895
6896	/* brand-id */
6897	switch (cpi->cpi_vendor) {
6898	case X86_VENDOR_Intel:
6899		/*
		 * brand id first appeared on Pentium III Xeon model 8
		 * and Celeron model 8 processors, and on Opteron
6902		 */
6903		create = cpi->cpi_family > 6 ||
6904		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6905		break;
6906	case X86_VENDOR_AMD:
6907		create = cpi->cpi_family >= 0xf;
6908		break;
6909	default:
6910		create = 0;
6911		break;
6912	}
6913	if (create && cpi->cpi_brandid != 0) {
6914		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6915		    "brand-id", cpi->cpi_brandid);
6916	}
6917
6918	/* chunks, and apic-id */
6919	switch (cpi->cpi_vendor) {
6920		/*
6921		 * first available on Pentium IV and Opteron (K8)
6922		 */
6923	case X86_VENDOR_Intel:
6924		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6925		break;
6926	case X86_VENDOR_AMD:
6927		create = cpi->cpi_family >= 0xf;
6928		break;
6929	default:
6930		create = 0;
6931		break;
6932	}
6933	if (create) {
6934		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6935		    "chunks", CPI_CHUNKS(cpi));
6936		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6937		    "apic-id", cpi->cpi_apicid);
6938		if (cpi->cpi_chipid >= 0) {
6939			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6940			    "chip#", cpi->cpi_chipid);
6941			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6942			    "clog#", cpi->cpi_clogid);
6943		}
6944	}
6945
6946	/* cpuid-features */
6947	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6948	    "cpuid-features", CPI_FEATURES_EDX(cpi));
6951	/* cpuid-features-ecx */
6952	switch (cpi->cpi_vendor) {
6953	case X86_VENDOR_Intel:
6954		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6955		break;
6956	case X86_VENDOR_AMD:
6957		create = cpi->cpi_family >= 0xf;
6958		break;
6959	default:
6960		create = 0;
6961		break;
6962	}
6963	if (create)
6964		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6965		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6966
6967	/* ext-cpuid-features */
6968	switch (cpi->cpi_vendor) {
6969	case X86_VENDOR_Intel:
6970	case X86_VENDOR_AMD:
6971	case X86_VENDOR_Cyrix:
6972	case X86_VENDOR_TM:
6973	case X86_VENDOR_Centaur:
6974		create = cpi->cpi_xmaxeax >= 0x80000001;
6975		break;
6976	default:
6977		create = 0;
6978		break;
6979	}
6980	if (create) {
6981		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6982		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6983		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6984		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6985	}
6986
6987	/*
	 * Brand String first appeared in Intel Pentium IV, AMD K5
	 * model 1, and Cyrix GXm.  On earlier models we try to
	 * simulate something similar .. so this string should always
	 * say -something- about the processor, however lame.
6992	 */
6993	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6994	    "brand-string", cpi->cpi_brandstr);
6995
6996	/*
6997	 * Finally, cache and tlb information
6998	 */
6999	switch (x86_which_cacheinfo(cpi)) {
7000	case X86_VENDOR_Intel:
7001		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7002		break;
7003	case X86_VENDOR_Cyrix:
7004		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
7005		break;
7006	case X86_VENDOR_AMD:
7007		amd_cache_info(cpi, cpu_devi);
7008		break;
7009	default:
7010		break;
7011	}
7012}
7013
7014struct l2info {
7015	int *l2i_csz;
7016	int *l2i_lsz;
7017	int *l2i_assoc;
7018	int l2i_ret;
7019};
7020
7021/*
7022 * A cacheinfo walker that fetches the size, line-size and associativity
7023 * of the L2 cache
7024 */
7025static int
7026intel_l2cinfo(void *arg, const struct cachetab *ct)
7027{
7028	struct l2info *l2i = arg;
7029	int *ip;
7030
7031	if (ct->ct_label != l2_cache_str &&
7032	    ct->ct_label != sl2_cache_str)
7033		return (0);	/* not an L2 -- keep walking */
7034
7035	if ((ip = l2i->l2i_csz) != NULL)
7036		*ip = ct->ct_size;
7037	if ((ip = l2i->l2i_lsz) != NULL)
7038		*ip = ct->ct_line_size;
7039	if ((ip = l2i->l2i_assoc) != NULL)
7040		*ip = ct->ct_assoc;
7041	l2i->l2i_ret = ct->ct_size;
7042	return (1);		/* was an L2 -- terminate walk */
7043}
7044
7045/*
7046 * AMD L2/L3 Cache and TLB Associativity Field Definition:
7047 *
 *	Unlike the associativity for the L1 cache and tlb, where the 8-bit
 *	value is the associativity, the associativity for the L2 cache and
 *	tlb is encoded in the following table. The 4-bit L2 value serves as
7051 *	an index into the amd_afd[] array to determine the associativity.
7052 *	-1 is undefined. 0 is fully associative.
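 *	For example, an encoding of 0x6 maps to amd_afd[6] == 8 (8-way),
 *	and an encoding of 0xf maps to 0 (fully associative).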
7053 */
7054
7055static int amd_afd[] =
7056	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
7057
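/*
 * Extract the L2 cache size, line size, and associativity from extended
 * function 0x80000006, decoding the 4-bit associativity field through
 * the amd_afd[] table above.
 */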
7058static void
7059amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
7060{
7061	struct cpuid_regs *cp;
7062	uint_t size, assoc;
7063	int i;
7064	int *ip;
7065
7066	if (cpi->cpi_xmaxeax < 0x80000006)
7067		return;
7068	cp = &cpi->cpi_extd[6];
7069
7070	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
7071	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
7072		uint_t cachesz = size * 1024;
7073		assoc = amd_afd[i];
7074
7075		ASSERT(assoc != -1);
7076
7077		if ((ip = l2i->l2i_csz) != NULL)
7078			*ip = cachesz;
7079		if ((ip = l2i->l2i_lsz) != NULL)
7080			*ip = BITX(cp->cp_ecx, 7, 0);
7081		if ((ip = l2i->l2i_assoc) != NULL)
7082			*ip = assoc;
7083		l2i->l2i_ret = cachesz;
7084	}
7085}
7086
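/*
 * Return the size of the L2 cache in bytes (or -1 if it cannot be
 * determined), optionally filling in the size, line size, and
 * associativity through the supplied pointers.
 */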
7087int
7088getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
7089{
7090	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
7091	struct l2info __l2info, *l2i = &__l2info;
7092
7093	l2i->l2i_csz = csz;
7094	l2i->l2i_lsz = lsz;
7095	l2i->l2i_assoc = assoc;
7096	l2i->l2i_ret = -1;
7097
7098	switch (x86_which_cacheinfo(cpi)) {
7099	case X86_VENDOR_Intel:
7100		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7101		break;
7102	case X86_VENDOR_Cyrix:
7103		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
7104		break;
7105	case X86_VENDOR_AMD:
7106		amd_l2cacheinfo(cpi, l2i);
7107		break;
7108	default:
7109		break;
7110	}
7111	return (l2i->l2i_ret);
7112}
7113
7114#if !defined(__xpv)
7115
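/*
 * Allocate the monitor line that this CPU will use as its MWAIT target.
 * The buffer is sized from the largest monitor-line size the processor
 * reports; NULL is returned if no monitor line is reported.
 */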
7116uint32_t *
7117cpuid_mwait_alloc(cpu_t *cpu)
7118{
7119	uint32_t	*ret;
7120	size_t		mwait_size;
7121
7122	ASSERT(cpuid_checkpass(CPU, 2));
7123
7124	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
7125	if (mwait_size == 0)
7126		return (NULL);
7127
7128	/*
7129	 * kmem_alloc() returns cache line size aligned data for mwait_size
	 * allocations.  mwait_size is currently cache line sized.  Neither
	 * of these implementation details is guaranteed to be true in the
	 * future.
	 *
	 * First try allocating mwait_size, as kmem_alloc() currently returns
	 * correctly aligned memory.  If kmem_alloc() does not return
	 * mwait_size-aligned memory, allocate twice mwait_size and round the
	 * pointer up to an mwait_size boundary within that buffer.
7137	 *
7138	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
7139	 * decide to free this memory.
7140	 */
7141	ret = kmem_zalloc(mwait_size, KM_SLEEP);
7142	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
7143		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7144		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
7145		*ret = MWAIT_RUNNING;
7146		return (ret);
7147	} else {
7148		kmem_free(ret, mwait_size);
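		/*
		 * The natural allocation was not mwait_size aligned.
		 * Allocating twice mwait_size guarantees that an aligned
		 * region of mwait_size bytes fits within the buffer; the
		 * working pointer is rounded up to that boundary below.
		 */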
7149		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
7150		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
7151		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
7152		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
7153		*ret = MWAIT_RUNNING;
7154		return (ret);
7155	}
7156}
7157
7158void
7159cpuid_mwait_free(cpu_t *cpu)
7160{
7161	if (cpu->cpu_m.mcpu_cpi == NULL) {
7162		return;
7163	}
7164
7165	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
7166	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
7167		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
7168		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
7169	}
7170
7171	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
7172	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
7173}
7174
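/*
 * Replace the body of tsc_read() with one of the pre-assembled
 * implementations selected by 'flag': a stub when no usable TSC exists,
 * an MFENCE- or LFENCE-serialized RDTSC, or RDTSCP.
 */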
7175void
7176patch_tsc_read(int flag)
7177{
7178	size_t cnt;
7179
7180	switch (flag) {
7181	case TSC_NONE:
7182		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
7183		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
7184		break;
7185	case TSC_RDTSC_MFENCE:
7186		cnt = &_tsc_mfence_end - &_tsc_mfence_start;
7187		(void) memcpy((void *)tsc_read,
7188		    (void *)&_tsc_mfence_start, cnt);
7189		break;
7190	case TSC_RDTSC_LFENCE:
7191		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
7192		(void) memcpy((void *)tsc_read,
7193		    (void *)&_tsc_lfence_start, cnt);
7194		break;
7195	case TSC_TSCP:
7196		cnt = &_tscp_end - &_tscp_start;
7197		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
7198		break;
7199	default:
7200		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
7202		break;
7203	}
7204	tsc_type = flag;
7205}
7206
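/*
 * Determine whether deep ACPI C-states may be used: on Intel this requires
 * the TSC-invariance bit of extended function 0x80000007, indicating that
 * the TSC runs at a constant rate across C-state transitions.
 */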
7207int
7208cpuid_deep_cstates_supported(void)
7209{
7210	struct cpuid_info *cpi;
7211	struct cpuid_regs regs;
7212
7213	ASSERT(cpuid_checkpass(CPU, 1));
7214
7215	cpi = CPU->cpu_m.mcpu_cpi;
7216
7217	if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
7218		return (0);
7219
7220	switch (cpi->cpi_vendor) {
7221	case X86_VENDOR_Intel:
7222		if (cpi->cpi_xmaxeax < 0x80000007)
7223			return (0);
7224
7225		/*
		 * Does the TSC run at a constant rate in all ACPI C-states?
7227		 */
7228		regs.cp_eax = 0x80000007;
7229		(void) __cpuid_insn(&regs);
7230		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
7231
7232	default:
7233		return (0);
7234	}
7235}
7236
7237#endif	/* !__xpv */
7238
7239void
7240post_startu