1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2014 Josef "Jeff" Sipek <jeffpc@josefsipek.net>
26 */
27/*
28 * Copyright (c) 2010, Intel Corporation.
29 * All rights reserved.
30 */
31/*
32 * Portions Copyright 2009 Advanced Micro Devices, Inc.
33 */
34/*
35 * Copyright 2019 Joyent, Inc.
36 */
37
38/*
39 * CPU Identification logic
40 *
41 * The purpose of this file and its companion, cpuid_subr.c, is to help deal
42 * with the identification of CPUs, their features, and their topologies. More
43 * specifically, this file helps drive the following:
44 *
 * 1. Enumeration of features of the processor which are used by the kernel to
 *    determine what features to enable or disable. These may be instruction set
 *    enhancements or other features that the kernel makes use of.
48 *
49 * 2. Enumeration of instruction set architecture (ISA) additions that userland
50 *    will be told about through the auxiliary vector.
51 *
52 * 3. Understanding the physical topology of the CPU such as the number of
 *    caches, how many cores it has, whether or not it supports simultaneous
 *    multi-threading (SMT), etc.
55 *
56 * ------------------------
57 * CPUID History and Basics
58 * ------------------------
59 *
60 * The cpuid instruction was added by Intel roughly around the time that the
 * original Pentium was introduced. The purpose of cpuid was to provide, in a
 * programmatic fashion, information about the CPU that previously had to be
 * guessed at. For example, an important part of cpuid is that we can know what
64 * extensions to the ISA exist. If you use an invalid opcode you would get a
65 * #UD, so this method allows a program (whether a user program or the kernel)
66 * to determine what exists without crashing or getting a SIGILL. Of course,
67 * this was also during the era of the clones and the AMD Am5x86. The vendor
68 * name shows up first in cpuid for a reason.
69 *
 * cpuid information is broken down into units called 'leaves'. Each leaf puts
71 * unique values into the registers %eax, %ebx, %ecx, and %edx and each leaf has
72 * its own meaning. The different leaves are broken down into different regions:
73 *
74 *	[ 0, 7fffffff ]			This region is called the 'basic'
75 *					region. This region is generally defined
76 *					by Intel, though some of the original
77 *					portions have different meanings based
78 *					on the manufacturer. These days, Intel
79 *					adds most new features to this region.
80 *					AMD adds non-Intel compatible
81 *					information in the third, extended
82 *					region. Intel uses this for everything
83 *					including ISA extensions, CPU
84 *					features, cache information, topology,
85 *					and more.
86 *
87 *					There is a hole carved out of this
88 *					region which is reserved for
89 *					hypervisors.
90 *
91 *	[ 40000000, 4fffffff ]		This region, which is found in the
92 *					middle of the previous region, is
93 *					explicitly promised to never be used by
94 *					CPUs. Instead, it is used by hypervisors
95 *					to communicate information about
96 *					themselves to the operating system. The
97 *					values and details are unique for each
98 *					hypervisor.
99 *
100 *	[ 80000000, ffffffff ]		This region is called the 'extended'
101 *					region. Some of the low leaves mirror
102 *					parts of the basic leaves. This region
103 *					has generally been used by AMD for
104 *					various extensions. For example, AMD-
105 *					specific information about caches,
106 *					features, and topology are found in this
107 *					region.
108 *
 * To read a given leaf, you place its number into %eax (and, for leaves with
 * sub-leaves, the sub-leaf number into %ecx) and then issue the cpuid
 * instruction. At the first leaf in each of the ranges, one of the primary
 * things returned is the maximum valid leaf in that range. This allows for
 * discovery of what range of CPUID is valid.
113 *
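 * As an illustrative sketch (outside of the kernel's own helpers), a raw query
 * can be issued with GCC-style inline assembly; the helper name and register
 * array below are purely hypothetical:
 *
 *	static void
 *	cpuid_query(uint32_t leaf, uint32_t subleaf, uint32_t regs[4])
 *	{
 *		__asm__ __volatile__("cpuid"
 *		    : "=a" (regs[0]), "=b" (regs[1]),
 *		      "=c" (regs[2]), "=d" (regs[3])
 *		    : "a" (leaf), "c" (subleaf));
 *	}
 *
 *	uint32_t r[4];
 *	cpuid_query(0, 0, r);
 *
 *	After this returns, r[0] (%eax) holds the maximum valid basic leaf and
 *	the remaining registers hold the vendor string described later on.
 *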
114 * The CPUs have potentially surprising behavior when using an invalid leaf or
115 * unimplemented leaf. If the requested leaf is within the valid basic or
116 * extended range, but is unimplemented, then %eax, %ebx, %ecx, and %edx will be
117 * set to zero. However, if you specify a leaf that is outside of a valid range,
118 * then instead it will be filled with the last valid _basic_ leaf. For example,
119 * if the maximum basic value is on leaf 0x3, then issuing a cpuid for leaf 4 or
120 * an invalid extended leaf will return the information for leaf 3.
121 *
122 * Some leaves are broken down into sub-leaves. This means that the value
123 * depends on both the leaf asked for in %eax and a secondary register. For
124 * example, Intel uses the value in %ecx on leaf 7 to indicate a sub-leaf to get
125 * additional information. Or when getting topology information in leaf 0xb, the
 * initial value in %ecx changes which level of the topology you are getting
 * information about.
128 *
129 * cpuid values are always kept to 32 bits regardless of whether or not the
 * program is in 64-bit mode. When executing in 64-bit mode, the upper 32 bits
 * of each register are set to zero so that the values are the same regardless
 * of execution mode.
133 *
134 * ----------------------
135 * Identifying Processors
136 * ----------------------
137 *
 * We can identify a processor in two steps. The first step looks at cpuid leaf
 * 0. Leaf 0 contains the processor's vendor information. This is done by
 * placing a 12 character string across %ebx, %edx, and %ecx (in that order).
 * On AMD, it is 'AuthenticAMD' and on Intel it is 'GenuineIntel'.
142 *
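 * As a sketch of how that string is assembled (the variable names here are
 * illustrative; this file stores the result in cpi_vendorstr), with ebx, edx,
 * and ecx holding the values returned by cpuid leaf 0:
 *
 *	char vendor[13];
 *
 *	(void) memcpy(&vendor[0], &ebx, sizeof (ebx));
 *	(void) memcpy(&vendor[4], &edx, sizeof (edx));
 *	(void) memcpy(&vendor[8], &ecx, sizeof (ecx));
 *	vendor[12] = '\0';
 *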
143 * From there, a processor is identified by a combination of three different
144 * values:
145 *
146 *  1. Family
147 *  2. Model
148 *  3. Stepping
149 *
150 * Each vendor uses the family and model to uniquely identify a processor. The
151 * way that family and model are changed depends on the vendor. For example,
 * Intel has been using family 0x6 for almost all of their processors since the
153 * Pentium Pro/Pentium II era, often called the P6. The model is used to
154 * identify the exact processor. Different models are often used for the client
155 * (consumer) and server parts. Even though each processor often has major
156 * architectural differences, they still are considered the same family by
157 * Intel.
158 *
159 * On the other hand, each major AMD architecture generally has its own family.
 * For example, the K8 is family 0xf, Bulldozer is family 0x15, and Zen is
 * family 0x17. Within a family, the model number is used to help identify
 * specific processors.
162 *
163 * The stepping is used to refer to a revision of a specific microprocessor. The
164 * term comes from equipment used to produce masks that are used to create
165 * integrated circuits.
166 *
167 * The information is present in leaf 1, %eax. In technical documentation you
 * will see the terms extended model and extended family. The original family,
 * model, and stepping fields were each 4 bits wide. If the base family is 0xf,
 * then the extended family (which uses previously reserved bits) is added to
 * it, and the extended model is combined with the base model (shifted left by
 * four bits) to allow for a larger number of families and models.
173 *
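 * A simplified sketch of that calculation, using the BITX() macros defined
 * later in this file against the leaf 1 %eax value (note that the vendors
 * differ slightly on when the extended model applies; Intel also uses it for
 * family 0x6):
 *
 *	family = BITX(eax, 11, 8);
 *	model = BITX(eax, 7, 4);
 *	if (family == 0xf) {
 *		family += BITX(eax, 27, 20);
 *		model += BITX(eax, 19, 16) << 4;
 *	}
 *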
174 * When we process this information, we store the full family, model, and
175 * stepping in the struct cpuid_info members cpi_family, cpi_model, and
176 * cpi_step, respectively. Whenever you are performing comparisons with the
177 * family, model, and stepping, you should use these members and not the raw
178 * values from cpuid. If you must use the raw values from cpuid directly, you
179 * must make sure that you add the extended model and family to the base model
180 * and family.
181 *
182 * In general, we do not use information about the family, model, and stepping
183 * to determine whether or not a feature is present; that is generally driven by
184 * specific leaves. However, when something we care about on the processor is
 * not considered 'architectural', meaning that it is specific to a set of
 * processors and not promised in the architecture model to be consistent from
 * generation to generation, then we will fall back on this information. The
 * most common cases where this comes up are when we have to work around errata
 * in the processor, are dealing with processor-specific features such as CPU
 * performance counters, or we want to provide additional information for things
 * such as fault management.
192 *
 * While processors also do have a brand string, which is the name that people
 * are familiar with when buying the processor, it is not meant for
 * programmatic consumption. That is what the family, model, and stepping are
 * for.
197 *
198 * ------------
199 * CPUID Passes
200 * ------------
201 *
202 * As part of performing feature detection, we break this into several different
203 * passes. The passes are as follows:
204 *
205 *	Pass 0		This is a primordial pass done in locore.s to deal with
206 *			Cyrix CPUs that don't support cpuid. The reality is that
207 *			we likely don't run on them any more, but there is still
208 *			logic for handling them.
209 *
210 *	Pass 1		This is the primary pass and is responsible for doing a
211 *			large number of different things:
212 *
 *			1. Determining which vendor manufactured the CPU and
 *			the family, model, and stepping information.
215 *
216 *			2. Gathering a large number of feature flags to
 *			determine which features the CPU supports and which
218 *			indicate things that we need to do other work in the OS
219 *			to enable. Features detected this way are added to the
220 *			x86_featureset which can be queried to
221 *			determine what we should do. This includes processing
222 *			all of the basic and extended CPU features that we care
223 *			about.
224 *
225 *			3. Determining the CPU's topology. This includes
226 *			information about how many cores and threads are present
227 *			in the package. It also is responsible for figuring out
228 *			which logical CPUs are potentially part of the same core
229 *			and what other resources they might share. For more
230 *			information see the 'Topology' section.
231 *
232 *			4. Determining the set of CPU security-specific features
233 *			that we need to worry about and determine the
234 *			appropriate set of workarounds.
235 *
236 *			Pass 1 on the boot CPU occurs before KMDB is started.
237 *
238 *	Pass 2		The second pass is done after startup(). Here, we check
239 *			other miscellaneous features. Most of this is gathering
240 *			additional basic and extended features that we'll use in
241 *			later passes or for debugging support.
242 *
243 *	Pass 3		The third pass occurs after the kernel memory allocator
244 *			has been fully initialized. This gathers information
245 *			where we might need dynamic memory available for our
246 *			uses. This includes several varying width leaves that
247 *			have cache information and the processor's brand string.
248 *
249 *	Pass 4		The fourth and final normal pass is performed after the
250 *			kernel has brought most everything online. This is
251 *			invoked from post_startup(). In this pass, we go through
252 *			the set of features that we have enabled and turn that
253 *			into the hardware auxiliary vector features that
254 *			userland receives. This is used by userland, primarily
255 *			by the run-time link-editor (RTLD), though userland
256 *			software could also refer to it directly.
257 *
258 *	Microcode	After a microcode update, we do a selective rescan of
259 *			the cpuid leaves to determine what features have
260 *			changed. Microcode updates can provide more details
261 *			about security related features to deal with issues like
262 *			Spectre and L1TF. On occasion, vendors have violated
263 *			their contract and removed bits. However, we don't try
264 *			to detect that because that puts us in a situation that
 *			we really can't deal with. As such, the only things we
 *			rescan today are security related features. See
267 *			cpuid_pass_ucode().
268 *
269 * All of the passes (except pass 0) are run on all CPUs. However, for the most
270 * part we only care about what the boot CPU says about this information and use
271 * the other CPUs as a rough guide to sanity check that we have the same feature
272 * set.
273 *
 * We do not support running multiple logical CPUs with different, let alone
 * disjoint, feature sets.
276 *
277 * ------------------
278 * Processor Topology
279 * ------------------
280 *
281 * One of the important things that we need to do is to understand the topology
282 * of the underlying processor. When we say topology in this case, we're trying
283 * to understand the relationship between the logical CPUs that the operating
284 * system sees and the underlying physical layout. Different logical CPUs may
285 * share different resources which can have important consequences for the
286 * performance of the system. For example, they may share caches, execution
287 * units, and more.
288 *
289 * The topology of the processor changes from generation to generation and
290 * vendor to vendor.  Along with that, different vendors use different
291 * terminology, and the operating system itself uses occasionally overlapping
292 * terminology. It's important to understand what this topology looks like so
293 * one can understand the different things that we try to calculate and
294 * determine.
295 *
296 * To get started, let's talk about a little bit of terminology that we've used
297 * so far, is used throughout this file, and is fairly generic across multiple
298 * vendors:
299 *
300 * CPU
301 *	A central processing unit (CPU) refers to a logical and/or virtual
302 *	entity that the operating system can execute instructions on. The
303 *	underlying resources for this CPU may be shared between multiple
304 *	entities; however, to the operating system it is a discrete unit.
305 *
306 * PROCESSOR and PACKAGE
307 *
308 *	Generally, when we use the term 'processor' on its own, we are referring
309 *	to the physical entity that one buys and plugs into a board. However,
310 *	because processor has been overloaded and one might see it used to mean
311 *	multiple different levels, we will instead use the term 'package' for
312 *	the rest of this file. The term package comes from the electrical
313 *	engineering side and refers to the physical entity that encloses the
314 *	electronics inside. Strictly speaking the package can contain more than
315 *	just the CPU, for example, on many processors it may also have what's
 *	called an 'integrated graphics processing unit (GPU)'. Because the
317 *	package can encapsulate multiple units, it is the largest physical unit
318 *	that we refer to.
319 *
320 * SOCKET
321 *
 *	A socket refers to a unit on a system board (generally the motherboard)
323 *	that can receive a package. A single package, or processor, is plugged
324 *	into a single socket. A system may have multiple sockets. Often times,
325 *	the term socket is used interchangeably with package and refers to the
 *	electrical component that has been plugged in, and not the receptacle
 *	itself.
327 *
328 * CORE
329 *
330 *	A core refers to the physical instantiation of a CPU, generally, with a
331 *	full set of hardware resources available to it. A package may contain
332 *	multiple cores inside of it or it may just have a single one. A
333 *	processor with more than one core is often referred to as 'multi-core'.
334 *	In illumos, we will use the feature X86FSET_CMP to refer to a system
335 *	that has 'multi-core' processors.
336 *
337 *	A core may expose a single logical CPU to the operating system, or it
338 *	may expose multiple CPUs, which we call threads, defined below.
339 *
340 *	Some resources may still be shared by cores in the same package. For
341 *	example, many processors will share the level 3 cache between cores.
342 *	Some AMD generations share hardware resources between cores. For more
343 *	information on that see the section 'AMD Topology'.
344 *
345 * THREAD and STRAND
346 *
 *	In this file, generally a thread refers to a hardware resource and not
348 *	the operating system's logical abstraction. A thread is always exposed
349 *	as an independent logical CPU to the operating system. A thread belongs
350 *	to a specific core. A core may have more than one thread. When that is
351 *	the case, the threads that are part of the same core are often referred
352 *	to as 'siblings'.
353 *
354 *	When multiple threads exist, this is generally referred to as
355 *	simultaneous multi-threading (SMT). When Intel introduced this in their
356 *	processors they called it hyper-threading (HT). When multiple threads
357 *	are active in a core, they split the resources of the core. For example,
358 *	two threads may share the same set of hardware execution units.
359 *
360 *	The operating system often uses the term 'strand' to refer to a thread.
361 *	This helps disambiguate it from the software concept.
362 *
363 * CHIP
364 *
365 *	Unfortunately, the term 'chip' is dramatically overloaded. At its most
366 *	base meaning, it is used to refer to a single integrated circuit, which
367 *	may or may not be the only thing in the package. In illumos, when you
368 *	see the term 'chip' it is almost always referring to the same thing as
369 *	the 'package'. However, many vendors may use chip to refer to one of
370 *	many integrated circuits that have been placed in the package. As an
371 *	example, see the subsequent definition.
372 *
373 *	To try and keep things consistent, we will only use chip when referring
374 *	to the entire integrated circuit package, with the exception of the
375 *	definition of multi-chip module (because it is in the name) and use the
376 *	term 'die' when we want the more general, potential sub-component
377 *	definition.
378 *
379 * DIE
380 *
381 *	A die refers to an integrated circuit. Inside of the package there may
382 *	be a single die or multiple dies. This is sometimes called a 'chip' in
383 *	vendor's parlance, but in this file, we use the term die to refer to a
384 *	subcomponent.
385 *
386 * MULTI-CHIP MODULE
387 *
388 *	A multi-chip module (MCM) refers to putting multiple distinct chips that
389 *	are connected together in the same package. When a multi-chip design is
390 *	used, generally each chip is manufactured independently and then joined
391 *	together in the package. For example, on AMD's Zen microarchitecture
392 *	(family 0x17), the package contains several dies (the second meaning of
393 *	chip from above) that are connected together.
394 *
395 * CACHE
396 *
397 *	A cache is a part of the processor that maintains copies of recently
398 *	accessed memory. Caches are split into levels and then into types.
399 *	Commonly there are one to three levels, called level one, two, and
400 *	three. The lower the level, the smaller it is, the closer it is to the
401 *	execution units of the CPU, and the faster it is to access. The layout
402 *	and design of the cache come in many different flavors, consult other
403 *	resources for a discussion of those.
404 *
405 *	Caches are generally split into two types, the instruction and data
406 *	cache. The caches contain what their names suggest, the instruction
407 *	cache has executable program text, while the data cache has all other
408 *	memory that the processor accesses. As of this writing, data is kept
409 *	coherent between all of the caches on x86, so if one modifies program
410 *	text before it is executed, that will be in the data cache, and the
411 *	instruction cache will be synchronized with that change when the
412 *	processor actually executes those instructions. This coherency also
413 *	covers the fact that data could show up in multiple caches.
414 *
415 *	Generally, the lowest level caches are specific to a core. However, the
 *	last level cache is shared between some number of cores. The number of
417 *	CPUs sharing this last level cache is important. This has implications
418 *	for the choices that the scheduler makes, as accessing memory that might
419 *	be in a remote cache after thread migration can be quite expensive.
420 *
421 *	Sometimes, the word cache is abbreviated with a '$', because in US
422 *	English the word cache is pronounced the same as cash. So L1D$ refers to
423 *	the L1 data cache, and L2$ would be the L2 cache. This will not be used
424 *	in the rest of this theory statement for clarity.
425 *
426 * MEMORY CONTROLLER
427 *
428 *	The memory controller is a component that provides access to DRAM. Each
429 *	memory controller can access a set number of DRAM channels. Each channel
430 *	can have a number of DIMMs (sticks of memory) associated with it. A
431 *	given package may have more than one memory controller. The association
432 *	of the memory controller to a group of cores is important as it is
433 *	cheaper to access memory on the controller that you are associated with.
434 *
435 * NUMA
436 *
437 *	NUMA or non-uniform memory access, describes a way that systems are
438 *	built. On x86, any processor core can address all of the memory in the
 *	system. However, when using multiple sockets or possibly within a
440 *	multi-chip module, some of that memory is physically closer and some of
441 *	it is further. Memory that is further away is more expensive to access.
442 *	Consider the following image of multiple sockets with memory:
443 *
444 *	+--------+                                                +--------+
445 *	| DIMM A |         +----------+      +----------+         | DIMM D |
446 *	+--------+-+       |          |      |          |       +-+------+-+
447 *	  | DIMM B |=======| Socket 0 |======| Socket 1 |=======| DIMM E |
448 *	  +--------+-+     |          |      |          |     +-+------+-+
449 *	    | DIMM C |     +----------+      +----------+     | DIMM F |
450 *	    +--------+                                        +--------+
451 *
452 *	In this example, Socket 0 is closer to DIMMs A-C while Socket 1 is
453 *	closer to DIMMs D-F. This means that it is cheaper for socket 0 to
454 *	access DIMMs A-C and more expensive to access D-F as it has to go
455 *	through Socket 1 to get there. The inverse is true for Socket 1. DIMMs
456 *	D-F are cheaper than A-C. While the socket form is the most common, when
457 *	using multi-chip modules, this can also sometimes occur. For another
458 *	example of this that's more involved, see the AMD topology section.
459 *
460 *
461 * Intel Topology
462 * --------------
463 *
 * Most Intel processors since Nehalem (as of this writing the current gen
465 * is Skylake / Cannon Lake) follow a fairly similar pattern. The CPU portion of
466 * the package is a single monolithic die. MCMs currently aren't used. Most
467 * parts have three levels of caches, with the L3 cache being shared between
468 * all of the cores on the package. The L1/L2 cache is generally specific to
469 * an individual core. The following image shows at a simplified level what
 * this looks like. The memory controller is commonly part of something called
 * the 'Uncore', functionality that used to live in separate physical chips that
 * were not a part of the package, but is now part of the same chip.
473 *
474 *  +-----------------------------------------------------------------------+
475 *  | Package                                                               |
476 *  |  +-------------------+  +-------------------+  +-------------------+  |
477 *  |  | Core              |  | Core              |  | Core              |  |
478 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
479 *  |  |  | Thread | | L | |  |  | Thread | | L | |  |  | Thread | | L | |  |
480 *  |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |  +--------+ | 1 | |  |
481 *  |  |  +--------+ |   | |  |  +--------+ |   | |  |  +--------+ |   | |  |
482 *  |  |  | Thread | |   | |  |  | Thread | |   | |  |  | Thread | |   | |  |
483 *  |  |  +--------+ +---+ |  |  +--------+ +---+ |  |  +--------+ +---+ |  |
484 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
485 *  |  |  | L2 Cache     | |  |  | L2 Cache     | |  |  | L2 Cache     | |  |
486 *  |  |  +--------------+ |  |  +--------------+ |  |  +--------------+ |  |
487 *  |  +-------------------+  +-------------------+  +-------------------+  |
488 *  | +-------------------------------------------------------------------+ |
489 *  | |                         Shared L3 Cache                           | |
490 *  | +-------------------------------------------------------------------+ |
491 *  | +-------------------------------------------------------------------+ |
492 *  | |                        Memory Controller                          | |
493 *  | +-------------------------------------------------------------------+ |
494 *  +-----------------------------------------------------------------------+
495 *
496 * A side effect of this current architecture is that what we care about from a
 * scheduling and topology perspective is simplified. In general we care about
498 * understanding which logical CPUs are part of the same core and socket.
499 *
500 * To determine the relationship between threads and cores, Intel initially used
501 * the identifier in the advanced programmable interrupt controller (APIC). They
502 * also added cpuid leaf 4 to give additional information about the number of
503 * threads and CPUs in the processor. With the addition of x2apic (which
504 * increased the number of addressable logical CPUs from 8-bits to 32-bits), an
505 * additional cpuid topology leaf 0xB was added.
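 *
 * As an illustrative sketch, each sub-leaf of leaf 0xB (selected via %ecx)
 * describes one level of the topology. Using the struct cpuid_regs type from
 * later in this file and the kernel's __cpuid_insn() wrapper, a simplified
 * version of the enumeration looks roughly like this (the actual topology code
 * is more involved):
 *
 *	struct cpuid_regs cp;
 *	uint_t level;
 *
 *	for (level = 0; ; level++) {
 *		cp.cp_eax = 0xb;
 *		cp.cp_ecx = level;
 *		(void) __cpuid_insn(&cp);
 *		if (BITX(cp.cp_ecx, 15, 8) == 0)
 *			break;
 *	}
 *
 *	For each valid level, %ecx[15:8] is the level type (1 for SMT, 2 for
 *	core), %eax[4:0] is the number of bits to shift the x2APIC ID (found in
 *	%edx) to derive the ID at the next level up, and %ebx[15:0] is the
 *	number of logical CPUs at that level.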
506 *
507 * AMD Topology
508 * ------------
509 *
510 * When discussing AMD topology, we want to break this into three distinct
511 * generations of topology. There's the basic topology that has been used in
512 * family 0xf+ (Opteron, Athlon64), there's the topology that was introduced
513 * with family 0x15 (Bulldozer), and there's the topology that was introduced
514 * with family 0x17 (Zen). AMD also has some additional terminology that's worth
515 * talking about.
516 *
517 * Until the introduction of family 0x17 (Zen), AMD did not implement something
518 * that they considered SMT. Whether or not the AMD processors have SMT
519 * influences many things including scheduling and reliability, availability,
520 * and serviceability (RAS) features.
521 *
522 * NODE
523 *
524 *	AMD uses the term node to refer to a die that contains a number of cores
525 *	and I/O resources. Depending on the processor family and model, more
526 *	than one node can be present in the package. When there is more than one
527 *	node this indicates a multi-chip module. Usually each node has its own
528 *	access to memory and I/O devices. This is important and generally
529 *	different from the corresponding Intel Nehalem-Skylake+ processors. As a
530 *	result, we track this relationship in the operating system.
531 *
532 *	In processors with an L3 cache, the L3 cache is generally shared across
533 *	the entire node, though the way this is carved up varies from generation
534 *	to generation.
535 *
536 * BULLDOZER
537 *
538 *	Starting with the Bulldozer family (0x15) and continuing until the
539 *	introduction of the Zen microarchitecture, AMD introduced the idea of a
540 *	compute unit. In a compute unit, two traditional cores share a number of
541 *	hardware resources. Critically, they share the FPU, L1 instruction
542 *	cache, and the L2 cache. Several compute units were then combined inside
543 *	of a single node.  Because the integer execution units, L1 data cache,
544 *	and some other resources were not shared between the cores, AMD never
545 *	considered this to be SMT.
546 *
547 * ZEN
548 *
 *	The Zen family (0x17) uses a multi-chip module (MCM) design; the modules
 *	are called Zeppelin. These modules are similar to the idea of nodes used
551 *	previously. Each of these nodes has two DRAM channels which all of the
552 *	cores in the node can access uniformly. These nodes are linked together
553 *	in the package, creating a NUMA environment.
554 *
555 *	The Zeppelin die itself contains two different 'core complexes'. Each
556 *	core complex consists of four cores which each have two threads, for a
557 *	total of 8 logical CPUs per complex. Unlike other generations,
558 *	where all the logical CPUs in a given node share the L3 cache, here each
559 *	core complex has its own shared L3 cache.
560 *
561 *	A further thing that we need to consider is that in some configurations,
562 *	particularly with the Threadripper line of processors, not every die
563 *	actually has its memory controllers wired up to actual memory channels.
564 *	This means that some cores have memory attached to them and others
565 *	don't.
566 *
567 *	To put Zen in perspective, consider the following images:
568 *
569 *      +--------------------------------------------------------+
570 *      | Core Complex                                           |
571 *      | +-------------------+    +-------------------+  +---+  |
572 *      | | Core       +----+ |    | Core       +----+ |  |   |  |
573 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  |   |  |
574 *      | | | Thread | +----+ |    | | Thread | +----+ |  |   |  |
575 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  | L |  |
576 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  | 3 |  |
577 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
578 *      | +-------------------+    +-------------------+  | C |  |
579 *      | +-------------------+    +-------------------+  | a |  |
580 *      | | Core       +----+ |    | Core       +----+ |  | c |  |
581 *      | | +--------+ | L2 | |    | +--------+ | L2 | |  | h |  |
582 *      | | | Thread | +----+ |    | | Thread | +----+ |  | e |  |
583 *      | | +--------+-+ +--+ |    | +--------+-+ +--+ |  |   |  |
584 *      | |   | Thread | |L1| |    |   | Thread | |L1| |  |   |  |
585 *      | |   +--------+ +--+ |    |   +--------+ +--+ |  |   |  |
586 *      | +-------------------+    +-------------------+  +---+  |
587 *      |                                                        |
588 *	+--------------------------------------------------------+
589 *
590 *  This first image represents a single Zen core complex that consists of four
591 *  cores.
592 *
593 *
594 *	+--------------------------------------------------------+
595 *	| Zeppelin Die                                           |
596 *	|  +--------------------------------------------------+  |
597 *	|  |         I/O Units (PCIe, SATA, USB, etc.)        |  |
598 *	|  +--------------------------------------------------+  |
599 *      |                           HH                           |
600 *	|          +-----------+    HH    +-----------+          |
601 *	|          |           |    HH    |           |          |
602 *	|          |    Core   |==========|    Core   |          |
603 *	|          |  Complex  |==========|  Complex  |          |
604 *	|          |           |    HH    |           |          |
605 *	|          +-----------+    HH    +-----------+          |
606 *      |                           HH                           |
607 *	|  +--------------------------------------------------+  |
608 *	|  |                Memory Controller                 |  |
609 *	|  +--------------------------------------------------+  |
610 *      |                                                        |
611 *	+--------------------------------------------------------+
612 *
 *  This image represents a single Zeppelin Die. Note how both core complexes
 *  are connected to the same memory controller and I/O units. While each core
615 *  complex has its own L3 cache as seen in the first image, they both have
616 *  uniform access to memory.
617 *
618 *
619 *                      PP                     PP
620 *                      PP                     PP
621 *           +----------PP---------------------PP---------+
622 *           |          PP                     PP         |
623 *           |    +-----------+          +-----------+    |
624 *           |    |           |          |           |    |
625 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
626 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
627 *           |    |           |          |           |    |
628 *           |    +-----------+ooo    ...+-----------+    |
629 *           |          HH      ooo  ...       HH         |
630 *           |          HH        oo..         HH         |
631 *           |          HH        ..oo         HH         |
632 *           |          HH      ...  ooo       HH         |
633 *           |    +-----------+...    ooo+-----------+    |
634 *           |    |           |          |           |    |
635 *       MMMMMMMMM|  Zeppelin |==========|  Zeppelin |MMMMMMMMM
636 *       MMMMMMMMM|    Die    |==========|    Die    |MMMMMMMMM
637 *           |    |           |          |           |    |
638 *           |    +-----------+          +-----------+    |
639 *           |          PP                     PP         |
640 *           +----------PP---------------------PP---------+
641 *                      PP                     PP
642 *                      PP                     PP
643 *
644 *  This image represents a single Zen package. In this example, it has four
645 *  Zeppelin dies, though some configurations only have a single one. In this
646 *  example, each die is directly connected to the next. Also, each die is
647 *  represented as being connected to memory by the 'M' character and connected
648 *  to PCIe devices and other I/O, by the 'P' character. Because each Zeppelin
649 *  die is made up of two core complexes, we have multiple different NUMA
650 *  domains that we care about for these systems.
651 *
652 * CPUID LEAVES
653 *
654 * There are a few different CPUID leaves that we can use to try and understand
655 * the actual state of the world. As part of the introduction of family 0xf, AMD
 * added CPUID leaf 0x80000008. This leaf tells us the number of logical
 * processors that are in the package. Because families before Zen didn't have
 * SMT, this was always the number of cores that were in the package. However,
 * it should always be thought of as the number of logical threads to be
 * consistent between generations. In addition we also get the size of the APIC
 * ID that is used to represent the number of logical processors. This is
 * important for deriving topology information.
663 *
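 * As a sketch (the bit positions follow AMD's documentation of leaf
 * 0x80000008; the variable names are illustrative):
 *
 *	struct cpuid_regs cp;
 *	uint_t ncpus, coreidsz;
 *
 *	cp.cp_eax = 0x80000008;
 *	cp.cp_ecx = 0;
 *	(void) __cpuid_insn(&cp);
 *	ncpus = BITX(cp.cp_ecx, 7, 0) + 1;
 *	coreidsz = BITX(cp.cp_ecx, 15, 12);
 *
 *	Here %ecx[7:0] is one less than the number of logical processors in
 *	the package and %ecx[15:12] is the number of APIC ID bits used to
 *	represent them (when non-zero).
 *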
664 * In the Bulldozer family, AMD added leaf 0x8000001E. The information varies a
665 * bit between Bulldozer and later families, but it is quite useful in
666 * determining the topology information. Because this information has changed
667 * across family generations, it's worth calling out what these mean
668 * explicitly. The registers have the following meanings:
669 *
670 *	%eax	The APIC ID. The entire register is defined to have a 32-bit
671 *		APIC ID, even though on systems without x2apic support, it will
672 *		be limited to 8 bits.
673 *
674 *	%ebx	On Bulldozer-era systems this contains information about the
675 *		number of cores that are in a compute unit (cores that share
676 *		resources). It also contains a per-package compute unit ID that
677 *		identifies which compute unit the logical CPU is a part of.
678 *
679 *		On Zen-era systems this instead contains the number of threads
680 *		per core and the ID of the core that the logical CPU is a part
681 *		of. Note, this ID is unique only to the package, it is not
682 *		globally unique across the entire system.
683 *
684 *	%ecx	This contains the number of nodes that exist in the package. It
685 *		also contains an ID that identifies which node the logical CPU
686 *		is a part of.
687 *
688 * Finally, we also use cpuid leaf 0x8000001D to determine information about the
689 * cache layout to determine which logical CPUs are sharing which caches.
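 *
 * As an illustrative sketch for a Zen-era part (bit positions per AMD's
 * documentation of leaf 0x8000001E; the variable names are hypothetical):
 *
 *	struct cpuid_regs cp;
 *	uint_t coreid, nthreads, nodeid, nnodes;
 *
 *	cp.cp_eax = 0x8000001e;
 *	cp.cp_ecx = 0;
 *	(void) __cpuid_insn(&cp);
 *	coreid = BITX(cp.cp_ebx, 7, 0);
 *	nthreads = BITX(cp.cp_ebx, 15, 8) + 1;
 *	nodeid = BITX(cp.cp_ecx, 7, 0);
 *	nnodes = BITX(cp.cp_ecx, 10, 8) + 1;
 *
 *	The full APIC ID is returned in %eax, the threads per core and
 *	package-local core ID in %ebx, and the node count and node ID in %ecx.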
690 *
691 * illumos Topology
692 * ----------------
693 *
694 * Based on the above we synthesize the information into several different
695 * variables that we store in the 'struct cpuid_info'. We'll go into the details
696 * of what each member is supposed to represent and their uniqueness. In
697 * general, there are two levels of uniqueness that we care about. We care about
698 * an ID that is globally unique. That means that it will be unique across all
699 * entities in the system. For example, the default logical CPU ID is globally
700 * unique. On the other hand, there is some information that we only care about
701 * being unique within the context of a single package / socket. Here are the
702 * variables that we keep track of and their meaning.
703 *
 * Several of the values that represent an identifier, with the exception
 * of cpi_apicid, are allowed to be synthetic.
706 *
707 *
708 * cpi_apicid
709 *
710 *	This is the value of the CPU's APIC id. This should be the full 32-bit
711 *	ID if the CPU is using the x2apic. Otherwise, it should be the 8-bit
712 *	APIC ID. This value is globally unique between all logical CPUs across
713 *	all packages. This is usually required by the APIC.
714 *
715 * cpi_chipid
716 *
717 *	This value indicates the ID of the package that the logical CPU is a
718 *	part of. This value is allowed to be synthetic. It is usually derived by
719 *	taking the CPU's APIC ID and determining how many bits are used to
720 *	represent CPU cores in the package. All logical CPUs that are part of
721 *	the same package must have the same value.
722 *
723 * cpi_coreid
724 *
725 *	This represents the ID of a CPU core. Two logical CPUs should only have
726 *	the same cpi_coreid value if they are part of the same core. These
727 *	values may be synthetic. On systems that support SMT, this value is
728 *	usually derived from the APIC ID, otherwise it is often synthetic and
729 *	just set to the value of the cpu_id in the cpu_t.
730 *
731 * cpi_pkgcoreid
732 *
733 *	This is similar to the cpi_coreid in that logical CPUs that are part of
734 *	the same core should have the same ID. The main difference is that these
735 *	values are only required to be unique to a given socket.
736 *
737 * cpi_clogid
738 *
739 *	This represents the logical ID of a logical CPU. This value should be
740 *	unique within a given socket for each logical CPU. This is allowed to be
 *	synthetic, though it is usually based off of the CPU's APIC ID. The
 *	broader system expects that logical CPUs that are part of the same
 *	core have contiguous numbers. For example, if there were two threads per
 *	core, then the IDs of two sibling threads divided by two should be the
 *	same, with the first sibling's ID even and the second's odd. So IDs 4
 *	and 5 indicate two logical CPUs that are part of the same core, but IDs
 *	5 and 6 represent two logical CPUs that are part of different cores.
748 *
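 *	In other words, with two threads per core, something along these lines
 *	would hold for two sibling threads a and b (illustrative only):
 *
 *		(a->cpi_clogid >> 1) == (b->cpi_clogid >> 1)
 *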
749 *	While it is common for the cpi_coreid and the cpi_clogid to be derived
750 *	from the same source, strictly speaking, they don't have to be and the
751 *	two values should be considered logically independent. One should not
752 *	try to compare a logical CPU's cpi_coreid and cpi_clogid to determine
753 *	some kind of relationship. While this is tempting, we've seen cases on
754 *	AMD family 0xf where the system's cpu id is not related to its APIC ID.
755 *
756 * cpi_ncpu_per_chip
757 *
758 *	This value indicates the total number of logical CPUs that exist in the
759 *	physical package. Critically, this is not the number of logical CPUs
760 *	that exist for just the single core.
761 *
762 *	This value should be the same for all logical CPUs in the same package.
763 *
764 * cpi_ncore_per_chip
765 *
766 *	This value indicates the total number of physical CPU cores that exist
767 *	in the package. The system compares this value with cpi_ncpu_per_chip to
768 *	determine if simultaneous multi-threading (SMT) is enabled. When
769 *	cpi_ncpu_per_chip equals cpi_ncore_per_chip, then there is no SMT and
770 *	the X86FSET_HTT feature is not set. If this value is greater than one,
 *	then we consider the processor to have the feature X86FSET_CMP, to
772 *	indicate that there is support for more than one core.
773 *
774 *	This value should be the same for all logical CPUs in the same package.
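 *
 *	A rough sketch of the relationship described above (the code in this
 *	file has additional vendor and virtualization specific considerations):
 *
 *		if (cpi->cpi_ncore_per_chip > 1)
 *			add_x86_feature(featureset, X86FSET_CMP);
 *		if (cpi->cpi_ncpu_per_chip > cpi->cpi_ncore_per_chip)
 *			add_x86_feature(featureset, X86FSET_HTT);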
775 *
776 * cpi_procnodes_per_pkg
777 *
778 *	This value indicates the number of 'nodes' that exist in the package.
 *	When the processor is actually a multi-chip module, this represents the
780 *	number of such modules that exist in the package. Currently, on Intel
781 *	based systems this member is always set to 1.
782 *
783 *	This value should be the same for all logical CPUs in the same package.
784 *
785 * cpi_procnodeid
786 *
787 *	This value indicates the ID of the node that the logical CPU is a part
788 *	of. All logical CPUs that are in the same node must have the same value
789 *	here. This value must be unique across all of the packages in the
790 *	system.  On Intel based systems, this is currently set to the value in
791 *	cpi_chipid because there is only one node.
792 *
793 * cpi_cores_per_compunit
794 *
795 *	This value indicates the number of cores that are part of a compute
796 *	unit. See the AMD topology section for this. This member only has real
797 *	meaning currently for AMD Bulldozer family processors. For all other
798 *	processors, this should currently be set to 1.
799 *
800 * cpi_compunitid
801 *
802 *	This indicates the compute unit that the logical CPU belongs to. For
803 *	processors without AMD Bulldozer-style compute units this should be set
804 *	to the value of cpi_coreid.
805 *
806 * cpi_ncpu_shr_last_cache
807 *
808 *	This indicates the number of logical CPUs that are sharing the same last
809 *	level cache. This value should be the same for all CPUs that are sharing
810 *	that cache. The last cache refers to the cache that is closest to memory
811 *	and furthest away from the CPU.
812 *
813 * cpi_last_lvl_cacheid
814 *
815 *	This indicates the ID of the last cache that the logical CPU uses. This
816 *	cache is often shared between multiple logical CPUs and is the cache
817 *	that is closest to memory and furthest away from the CPU. This value
818 *	should be the same for a group of logical CPUs only if they actually
819 *	share the same last level cache. IDs should not overlap between
820 *	packages.
821 *
822 * cpi_ncore_bits
823 *
824 *	This indicates the number of bits that are required to represent all of
825 *	the cores in the system. As cores are derived based on their APIC IDs,
826 *	we aren't guaranteed a run of APIC IDs starting from zero. It's OK for
827 *	this value to be larger than the actual number of IDs that are present
828 *	in the system. This is used to size tables by the CMI framework. It is
829 *	only filled in for Intel and AMD CPUs.
830 *
831 * cpi_nthread_bits
832 *
833 *	This indicates the number of bits required to represent all of the IDs
834 *	that cover the logical CPUs that exist on a given core. It's OK for this
835 *	value to be larger than the actual number of IDs that are present in the
836 *	system.  This is used to size tables by the CMI framework. It is
837 *	only filled in for Intel and AMD CPUs.
838 *
839 * -----------
840 * Hypervisors
841 * -----------
842 *
843 * If trying to manage the differences between vendors wasn't bad enough, it can
844 * get worse thanks to our friend hardware virtualization. Hypervisors are given
845 * the ability to interpose on all cpuid instructions and change them to suit
846 * their purposes. In general, this is necessary as the hypervisor wants to be
847 * able to present a more uniform set of features or not necessarily give the
848 * guest operating system kernel knowledge of all features so it can be
849 * more easily migrated between systems.
850 *
851 * When it comes to trying to determine topology information, this can be a
852 * double edged sword. When a hypervisor doesn't actually implement a cpuid
 * leaf, it'll often return all zeros. Because of that, you'll often see various
 * checks scattered about, verifying that fields are non-zero before we assume
 * we can use them.
856 *
857 * When it comes to topology information, the hypervisor is often incentivized
858 * to lie to you about topology. This is because it doesn't always actually
859 * guarantee that topology at all. The topology path we take in the system
860 * depends on how the CPU advertises itself. If it advertises itself as an Intel
 * or AMD CPU, then we basically do our normal path. However, when the
 * hypervisor doesn't use an actual vendor, then we usually end up enumerating
 * multiple single-core CPUs that appear to be on different sockets. The actual
 * behavior depends greatly on what the hypervisor actually exposes to us.
865 *
866 * --------------------
867 * Exposing Information
868 * --------------------
869 *
870 * We expose CPUID information in three different forms in the system.
871 *
872 * The first is through the x86_featureset variable. This is used in conjunction
873 * with the is_x86_feature() function. This is queried by x86-specific functions
874 * to determine which features are or aren't present in the system and to make
875 * decisions based upon them. For example, users of this include everything from
876 * parts of the system dedicated to reliability, availability, and
877 * serviceability (RAS), to making decisions about how to handle security
878 * mitigations, to various x86-specific drivers. General purpose or
879 * architecture independent drivers should never be calling this function.
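 *
 * For example, an x86-specific kernel component might check for a feature
 * along these lines (X86FSET_SMAP is one of the constants behind the names in
 * x86_feature_names[] below; the surrounding logic is hypothetical):
 *
 *	if (is_x86_feature(x86_featureset, X86FSET_SMAP)) {
 *		... enable the SMAP-specific code path ...
 *	}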
880 *
881 * The second means is through the auxiliary vector. The auxiliary vector is a
882 * series of tagged data that the kernel passes down to a user program when it
883 * begins executing. This information is used to indicate to programs what
884 * instruction set extensions are present. For example, information about the
885 * CPU supporting the machine check architecture (MCA) wouldn't be passed down
886 * since user programs cannot make use of it. However, things like the AVX
887 * instruction sets are. Programs use this information to make run-time
888 * decisions about what features they should use. As an example, the run-time
889 * link-editor (rtld) can relocate different functions depending on the hardware
890 * support available.
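 *
 * For example, a userland program can check for an ISA extension roughly as
 * follows, using getisax(2) and the AV_386_* values from <sys/auxv_386.h>
 * (which extension to test for is up to the consumer; AVX here is just an
 * example):
 *
 *	#include <sys/auxv.h>
 *
 *	uint32_t hwcap[2] = { 0, 0 };
 *
 *	(void) getisax(hwcap, 2);
 *	if (hwcap[0] & AV_386_AVX) {
 *		... use an AVX-optimized implementation ...
 *	}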
891 *
892 * The final form is through a series of accessor functions that all have the
893 * form cpuid_get*. This is used by a number of different subsystems in the
894 * kernel to determine more detailed information about what we're running on,
895 * topology information, etc. Some of these subsystems include processor groups
 * (uts/common/os/pg.c), CPU Module Interface (uts/i86pc/os/cmi.c), ACPI,
897 * microcode, and performance monitoring. These functions all ASSERT that the
898 * CPU they're being called on has reached a certain cpuid pass. If the passes
899 * are rearranged, then this needs to be adjusted.
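 *
 * As an illustration of the accessor style (assuming the cpuid_getvendor()
 * and cpuid_getfamily() accessors; the comparison itself is just an example):
 *
 *	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD &&
 *	    cpuid_getfamily(CPU) >= 0x17) {
 *		... Zen or later AMD processor ...
 *	}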
900 */
901
902#include <sys/types.h>
903#include <sys/archsystm.h>
904#include <sys/x86_archext.h>
905#include <sys/kmem.h>
906#include <sys/systm.h>
907#include <sys/cmn_err.h>
908#include <sys/sunddi.h>
909#include <sys/sunndi.h>
910#include <sys/cpuvar.h>
911#include <sys/processor.h>
912#include <sys/sysmacros.h>
913#include <sys/pg.h>
914#include <sys/fp.h>
915#include <sys/controlregs.h>
916#include <sys/bitmap.h>
917#include <sys/auxv_386.h>
918#include <sys/memnode.h>
919#include <sys/pci_cfgspace.h>
920#include <sys/comm_page.h>
921#include <sys/mach_mmu.h>
922#include <sys/ucode.h>
923#include <sys/tsc.h>
924
925#ifdef __xpv
926#include <sys/hypervisor.h>
927#else
928#include <sys/ontrap.h>
929#endif
930
931uint_t x86_vendor = X86_VENDOR_IntelClone;
932uint_t x86_type = X86_TYPE_OTHER;
933uint_t x86_clflush_size = 0;
934
935#if defined(__xpv)
936int x86_use_pcid = 0;
937int x86_use_invpcid = 0;
938#else
939int x86_use_pcid = -1;
940int x86_use_invpcid = -1;
941#endif
942
943uint_t pentiumpro_bug4046376;
944
945uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
946
947static char *x86_feature_names[NUM_X86_FEATURES] = {
948	"lgpg",
949	"tsc",
950	"msr",
951	"mtrr",
952	"pge",
953	"de",
954	"cmov",
955	"mmx",
956	"mca",
957	"pae",
958	"cv8",
959	"pat",
960	"sep",
961	"sse",
962	"sse2",
963	"htt",
964	"asysc",
965	"nx",
966	"sse3",
967	"cx16",
968	"cmp",
969	"tscp",
970	"mwait",
971	"sse4a",
972	"cpuid",
973	"ssse3",
974	"sse4_1",
975	"sse4_2",
976	"1gpg",
977	"clfsh",
978	"64",
979	"aes",
980	"pclmulqdq",
981	"xsave",
982	"avx",
983	"vmx",
984	"svm",
985	"topoext",
986	"f16c",
987	"rdrand",
988	"x2apic",
989	"avx2",
990	"bmi1",
991	"bmi2",
992	"fma",
993	"smep",
994	"smap",
995	"adx",
996	"rdseed",
997	"mpx",
998	"avx512f",
999	"avx512dq",
1000	"avx512pf",
1001	"avx512er",
1002	"avx512cd",
1003	"avx512bw",
1004	"avx512vl",
1005	"avx512fma",
1006	"avx512vbmi",
1007	"avx512_vpopcntdq",
1008	"avx512_4vnniw",
1009	"avx512_4fmaps",
1010	"xsaveopt",
1011	"xsavec",
1012	"xsaves",
1013	"sha",
1014	"umip",
1015	"pku",
1016	"ospke",
1017	"pcid",
1018	"invpcid",
1019	"ibrs",
1020	"ibpb",
1021	"stibp",
1022	"ssbd",
1023	"ssbd_virt",
1024	"rdcl_no",
1025	"ibrs_all",
1026	"rsba",
1027	"ssb_no",
1028	"stibp_all",
1029	"flush_cmd",
1030	"l1d_vmentry_no",
1031	"fsgsbase",
1032	"clflushopt",
1033	"clwb",
1034	"monitorx",
1035	"clzero",
1036	"xop",
1037	"fma4",
1038	"tbm",
1039	"avx512_vnni",
1040	"amd_pcec",
	"md_clear",
1042	"mds_no",
1043	"core_thermal",
1044	"pkg_thermal"
1045};
1046
1047boolean_t
1048is_x86_feature(void *featureset, uint_t feature)
1049{
1050	ASSERT(feature < NUM_X86_FEATURES);
1051	return (BT_TEST((ulong_t *)featureset, feature));
1052}
1053
1054void
1055add_x86_feature(void *featureset, uint_t feature)
1056{
1057	ASSERT(feature < NUM_X86_FEATURES);
1058	BT_SET((ulong_t *)featureset, feature);
1059}
1060
1061void
1062remove_x86_feature(void *featureset, uint_t feature)
1063{
1064	ASSERT(feature < NUM_X86_FEATURES);
1065	BT_CLEAR((ulong_t *)featureset, feature);
1066}
1067
1068boolean_t
1069compare_x86_featureset(void *setA, void *setB)
1070{
1071	/*
1072	 * We assume that the unused bits of the bitmap are always zero.
1073	 */
1074	if (memcmp(setA, setB, BT_SIZEOFMAP(NUM_X86_FEATURES)) == 0) {
1075		return (B_TRUE);
1076	} else {
1077		return (B_FALSE);
1078	}
1079}
1080
1081void
1082print_x86_featureset(void *featureset)
1083{
1084	uint_t i;
1085
1086	for (i = 0; i < NUM_X86_FEATURES; i++) {
1087		if (is_x86_feature(featureset, i)) {
1088			cmn_err(CE_CONT, "?x86_feature: %s\n",
1089			    x86_feature_names[i]);
1090		}
1091	}
1092}
1093
1094/* Note: This is the maximum size for the CPU, not the size of the structure. */
1095static size_t xsave_state_size = 0;
1096uint64_t xsave_bv_all = (XFEATURE_LEGACY_FP | XFEATURE_SSE);
1097boolean_t xsave_force_disable = B_FALSE;
1098extern int disable_smap;
1099
1100/*
 * This is set to the platform type we are running on.
1102 */
1103static int platform_type = -1;
1104
1105#if !defined(__xpv)
1106/*
1107 * Variable to patch if hypervisor platform detection needs to be
1108 * disabled (e.g. platform_type will always be HW_NATIVE if this is 0).
1109 */
1110int enable_platform_detection = 1;
1111#endif
1112
1113/*
1114 * monitor/mwait info.
1115 *
1116 * size_actual and buf_actual are the real address and size allocated to get
 * proper mwait_buf alignment.  buf_actual and size_actual should be passed
 * to kmem_free().  Currently kmem_alloc() and mwait happen to both use
 * processor cache-line alignment, but this is not guaranteed in the future.
1120 */
1121struct mwait_info {
1122	size_t		mon_min;	/* min size to avoid missed wakeups */
1123	size_t		mon_max;	/* size to avoid false wakeups */
1124	size_t		size_actual;	/* size actually allocated */
1125	void		*buf_actual;	/* memory actually allocated */
1126	uint32_t	support;	/* processor support of monitor/mwait */
1127};
1128
1129/*
1130 * xsave/xrestor info.
1131 *
1132 * This structure contains HW feature bits and the size of the xsave save area.
1133 * Note: the kernel declares a fixed size (AVX_XSAVE_SIZE) structure
1134 * (xsave_state) to describe the xsave layout. However, at runtime the
1135 * per-lwp xsave area is dynamically allocated based on xsav_max_size. The
1136 * xsave_state structure simply represents the legacy layout of the beginning
1137 * of the xsave area.
1138 */
1139struct xsave_info {
1140	uint32_t	xsav_hw_features_low;   /* Supported HW features */
1141	uint32_t	xsav_hw_features_high;  /* Supported HW features */
1142	size_t		xsav_max_size;  /* max size save area for HW features */
1143	size_t		ymm_size;	/* AVX: size of ymm save area */
1144	size_t		ymm_offset;	/* AVX: offset for ymm save area */
1145	size_t		bndregs_size;	/* MPX: size of bndregs save area */
1146	size_t		bndregs_offset;	/* MPX: offset for bndregs save area */
1147	size_t		bndcsr_size;	/* MPX: size of bndcsr save area */
1148	size_t		bndcsr_offset;	/* MPX: offset for bndcsr save area */
1149	size_t		opmask_size;	/* AVX512: size of opmask save */
1150	size_t		opmask_offset;	/* AVX512: offset for opmask save */
1151	size_t		zmmlo_size;	/* AVX512: size of zmm 256 save */
1152	size_t		zmmlo_offset;	/* AVX512: offset for zmm 256 save */
1153	size_t		zmmhi_size;	/* AVX512: size of zmm hi reg save */
1154	size_t		zmmhi_offset;	/* AVX512: offset for zmm hi reg save */
1155};
1156
1157
1158/*
1159 * These constants determine how many of the elements of the
1160 * cpuid we cache in the cpuid_info data structure; the
1161 * remaining elements are accessible via the cpuid instruction.
1162 */
1163
1164#define	NMAX_CPI_STD	8		/* eax = 0 .. 7 */
1165#define	NMAX_CPI_EXTD	0x1f		/* eax = 0x80000000 .. 0x8000001e */
1166
1167/*
1168 * See the big theory statement for a more detailed explanation of what some of
1169 * these members mean.
1170 */
1171struct cpuid_info {
1172	uint_t cpi_pass;		/* last pass completed */
1173	/*
1174	 * standard function information
1175	 */
1176	uint_t cpi_maxeax;		/* fn 0: %eax */
1177	char cpi_vendorstr[13];		/* fn 0: %ebx:%ecx:%edx */
1178	uint_t cpi_vendor;		/* enum of cpi_vendorstr */
1179
1180	uint_t cpi_family;		/* fn 1: extended family */
1181	uint_t cpi_model;		/* fn 1: extended model */
1182	uint_t cpi_step;		/* fn 1: stepping */
1183	chipid_t cpi_chipid;		/* fn 1: %ebx:  Intel: chip # */
1184					/*		AMD: package/socket # */
1185	uint_t cpi_brandid;		/* fn 1: %ebx: brand ID */
1186	int cpi_clogid;			/* fn 1: %ebx: thread # */
1187	uint_t cpi_ncpu_per_chip;	/* fn 1: %ebx: logical cpu count */
1188	uint8_t cpi_cacheinfo[16];	/* fn 2: intel-style cache desc */
1189	uint_t cpi_ncache;		/* fn 2: number of elements */
1190	uint_t cpi_ncpu_shr_last_cache;	/* fn 4: %eax: ncpus sharing cache */
1191	id_t cpi_last_lvl_cacheid;	/* fn 4: %eax: derived cache id */
1192	uint_t cpi_cache_leaf_size;	/* Number of cache elements */
1193					/* Intel fn: 4, AMD fn: 8000001d */
	struct cpuid_regs **cpi_cache_leaves;	/* Actual leaves from above */
1195	struct cpuid_regs cpi_std[NMAX_CPI_STD];	/* 0 .. 7 */
1196	/*
1197	 * extended function information
1198	 */
1199	uint_t cpi_xmaxeax;		/* fn 0x80000000: %eax */
1200	char cpi_brandstr[49];		/* fn 0x8000000[234] */
	uint8_t cpi_pabits;		/* fn 0x80000008: %eax */
	uint8_t	cpi_vabits;		/* fn 0x80000008: %eax */
1203	uint8_t cpi_fp_amd_save;	/* AMD: FP error pointer save rqd. */
1204	struct	cpuid_regs cpi_extd[NMAX_CPI_EXTD];	/* 0x800000XX */
1205
1206	id_t cpi_coreid;		/* same coreid => strands share core */
1207	int cpi_pkgcoreid;		/* core number within single package */
1208	uint_t cpi_ncore_per_chip;	/* AMD: fn 0x80000008: %ecx[7-0] */
1209					/* Intel: fn 4: %eax[31-26] */
1210
1211	/*
1212	 * These values represent the number of bits that are required to store
1213	 * information about the number of cores and threads.
1214	 */
1215	uint_t cpi_ncore_bits;
1216	uint_t cpi_nthread_bits;
1217	/*
1218	 * supported feature information
1219	 */
1220	uint32_t cpi_support[6];
1221#define	STD_EDX_FEATURES	0
1222#define	AMD_EDX_FEATURES	1
1223#define	TM_EDX_FEATURES		2
1224#define	STD_ECX_FEATURES	3
1225#define	AMD_ECX_FEATURES	4
1226#define	STD_EBX_FEATURES	5
1227	/*
1228	 * Synthesized information, where known.
1229	 */
1230	uint32_t cpi_chiprev;		/* See X86_CHIPREV_* in x86_archext.h */
1231	const char *cpi_chiprevstr;	/* May be NULL if chiprev unknown */
1232	uint32_t cpi_socket;		/* Chip package/socket type */
1233
1234	struct mwait_info cpi_mwait;	/* fn 5: monitor/mwait info */
1235	uint32_t cpi_apicid;
1236	uint_t cpi_procnodeid;		/* AMD: nodeID on HT, Intel: chipid */
1237	uint_t cpi_procnodes_per_pkg;	/* AMD: # of nodes in the package */
1238					/* Intel: 1 */
1239	uint_t cpi_compunitid;		/* AMD: ComputeUnit ID, Intel: coreid */
1240	uint_t cpi_cores_per_compunit;	/* AMD: # of cores in the ComputeUnit */
1241
1242	struct xsave_info cpi_xsave;	/* fn D: xsave/xrestor info */
1243};
1244
1245
1246static struct cpuid_info cpuid_info0;
1247
1248/*
1249 * These bit fields are defined by the Intel Application Note AP-485
1250 * "Intel Processor Identification and the CPUID Instruction"
1251 */
1252#define	CPI_FAMILY_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 27, 20)
1253#define	CPI_MODEL_XTD(cpi)	BITX((cpi)->cpi_std[1].cp_eax, 19, 16)
1254#define	CPI_TYPE(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 13, 12)
1255#define	CPI_FAMILY(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 11, 8)
1256#define	CPI_STEP(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 3, 0)
1257#define	CPI_MODEL(cpi)		BITX((cpi)->cpi_std[1].cp_eax, 7, 4)
1258
1259#define	CPI_FEATURES_EDX(cpi)		((cpi)->cpi_std[1].cp_edx)
1260#define	CPI_FEATURES_ECX(cpi)		((cpi)->cpi_std[1].cp_ecx)
1261#define	CPI_FEATURES_XTD_EDX(cpi)	((cpi)->cpi_extd[1].cp_edx)
1262#define	CPI_FEATURES_XTD_ECX(cpi)	((cpi)->cpi_extd[1].cp_ecx)
1263#define	CPI_FEATURES_7_0_EBX(cpi)	((cpi)->cpi_std[7].cp_ebx)
1264#define	CPI_FEATURES_7_0_ECX(cpi)	((cpi)->cpi_std[7].cp_ecx)
1265#define	CPI_FEATURES_7_0_EDX(cpi)	((cpi)->cpi_std[7].cp_edx)
1266
1267#define	CPI_BRANDID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 7, 0)
1268#define	CPI_CHUNKS(cpi)		BITX((cpi)->cpi_std[1].cp_ebx, 15, 7)
1269#define	CPI_CPU_COUNT(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 23, 16)
1270#define	CPI_APIC_ID(cpi)	BITX((cpi)->cpi_std[1].cp_ebx, 31, 24)
1271
1272#define	CPI_MAXEAX_MAX		0x100		/* sanity control */
1273#define	CPI_XMAXEAX_MAX		0x80000100
1274#define	CPI_FN4_ECX_MAX		0x20		/* sanity: max fn 4 levels */
1275#define	CPI_FNB_ECX_MAX		0x20		/* sanity: max fn B levels */
1276
1277/*
1278 * Function 4 (Deterministic Cache Parameters) macros
1279 * Defined by Intel Application Note AP-485
1280 */
1281#define	CPI_NUM_CORES(regs)		BITX((regs)->cp_eax, 31, 26)
1282#define	CPI_NTHR_SHR_CACHE(regs)	BITX((regs)->cp_eax, 25, 14)
1283#define	CPI_FULL_ASSOC_CACHE(regs)	BITX((regs)->cp_eax, 9, 9)
1284#define	CPI_SELF_INIT_CACHE(regs)	BITX((regs)->cp_eax, 8, 8)
1285#define	CPI_CACHE_LVL(regs)		BITX((regs)->cp_eax, 7, 5)
1286#define	CPI_CACHE_TYPE(regs)		BITX((regs)->cp_eax, 4, 0)
1287#define	CPI_CPU_LEVEL_TYPE(regs)	BITX((regs)->cp_ecx, 15, 8)
1288
1289#define	CPI_CACHE_WAYS(regs)		BITX((regs)->cp_ebx, 31, 22)
1290#define	CPI_CACHE_PARTS(regs)		BITX((regs)->cp_ebx, 21, 12)
1291#define	CPI_CACHE_COH_LN_SZ(regs)	BITX((regs)->cp_ebx, 11, 0)
1292
1293#define	CPI_CACHE_SETS(regs)		BITX((regs)->cp_ecx, 31, 0)
1294
1295#define	CPI_PREFCH_STRIDE(regs)		BITX((regs)->cp_edx, 9, 0)
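
/*
 * For illustration: leaf 4 reports each of the fields above as one less than
 * its actual value, so the total size in bytes of a given cache can be
 * derived as
 *
 *	(CPI_CACHE_WAYS(regs) + 1) * (CPI_CACHE_PARTS(regs) + 1) *
 *	    (CPI_CACHE_COH_LN_SZ(regs) + 1) * (CPI_CACHE_SETS(regs) + 1)
 *
 * per the formula documented for the deterministic cache parameters leaf.
 */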
1296
1297
1298/*
1299 * A couple of shorthand macros to identify "later" P6-family chips
1300 * like the Pentium M and Core.  First, the "older" P6-based stuff
1301 * (loosely defined as "pre-Pentium-4"):
1302 * P6, PII, Mobile PII, PII Xeon, PIII, Mobile PIII, PIII Xeon
1303 */
1304#define	IS_LEGACY_P6(cpi) (			\
1305	cpi->cpi_family == 6 &&			\
1306		(cpi->cpi_model == 1 ||		\
1307		cpi->cpi_model == 3 ||		\
1308		cpi->cpi_model == 5 ||		\
1309		cpi->cpi_model == 6 ||		\
1310		cpi->cpi_model == 7 ||		\
1311		cpi->cpi_model == 8 ||		\
1312		cpi->cpi_model == 0xA ||	\
1313		cpi->cpi_model == 0xB)		\
1314)
1315
1316/* A "new F6" is everything with family 6 that's not the above */
1317#define	IS_NEW_F6(cpi) ((cpi->cpi_family == 6) && !IS_LEGACY_P6(cpi))
1318
1319/* Extended family/model support */
1320#define	IS_EXTENDED_MODEL_INTEL(cpi) (cpi->cpi_family == 0x6 || \
1321	cpi->cpi_family >= 0xf)
1322
1323/*
1324 * Info for monitor/mwait idle loop.
1325 *
1326 * See cpuid section of "Intel 64 and IA-32 Architectures Software Developer's
1327 * Manual Volume 2A: Instruction Set Reference, A-M" #25366-022US, November
1328 * 2006.
1329 * See MONITOR/MWAIT section of "AMD64 Architecture Programmer's Manual
1330 * Documentation Updates" #33633, Rev 2.05, December 2006.
1331 */
1332#define	MWAIT_SUPPORT		(0x00000001)	/* mwait supported */
#define	MWAIT_EXTENSIONS	(0x00000002)	/* extension supported */
1334#define	MWAIT_ECX_INT_ENABLE	(0x00000004)	/* ecx 1 extension supported */
1335#define	MWAIT_SUPPORTED(cpi)	((cpi)->cpi_std[1].cp_ecx & CPUID_INTC_ECX_MON)
1336#define	MWAIT_INT_ENABLE(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x2)
1337#define	MWAIT_EXTENSION(cpi)	((cpi)->cpi_std[5].cp_ecx & 0x1)
1338#define	MWAIT_SIZE_MIN(cpi)	BITX((cpi)->cpi_std[5].cp_eax, 15, 0)
1339#define	MWAIT_SIZE_MAX(cpi)	BITX((cpi)->cpi_std[5].cp_ebx, 15, 0)
1340/*
1341 * Number of sub-cstates for a given c-state.
1342 */
1343#define	MWAIT_NUM_SUBC_STATES(cpi, c_state)			\
1344	BITX((cpi)->cpi_std[5].cp_edx, c_state + 3, c_state)
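
/*
 * For example, assuming the caller passes the nibble offset of the desired
 * C-state (0, 4, 8, ...), MWAIT_NUM_SUBC_STATES(cpi, 4) extracts bits 7:4 of
 * leaf 5 %edx, i.e. the number of C1 sub-states that the processor
 * enumerates.
 */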
1345
1346/*
1347 * XSAVE leaf 0xD enumeration
1348 */
1349#define	CPUID_LEAFD_2_YMM_OFFSET	576
1350#define	CPUID_LEAFD_2_YMM_SIZE		256
1351
1352/*
1353 * Common extended leaf names to cut down on typos.
1354 */
1355#define	CPUID_LEAF_EXT_0		0x80000000
1356#define	CPUID_LEAF_EXT_8		0x80000008
1357#define	CPUID_LEAF_EXT_1d		0x8000001d
1358#define	CPUID_LEAF_EXT_1e		0x8000001e
1359
1360/*
 * Functions we consume from cpuid_subr.c; don't publish these in a header
1362 * file to try and keep people using the expected cpuid_* interfaces.
1363 */
1364extern uint32_t _cpuid_skt(uint_t, uint_t, uint_t, uint_t);
1365extern const char *_cpuid_sktstr(uint_t, uint_t, uint_t, uint_t);
1366extern uint32_t _cpuid_chiprev(uint_t, uint_t, uint_t, uint_t);
1367extern const char *_cpuid_chiprevstr(uint_t, uint_t, uint_t, uint_t);
1368extern uint_t _cpuid_vendorstr_to_vendorcode(char *);
1369
1370/*
 * Apply various platform-dependent restrictions where the underlying
 * platform means the CPU can be marked as less capable than its cpuid
 * instruction would imply.
1374 */
1375#if defined(__xpv)
1376static void
1377platform_cpuid_mangle(uint_t vendor, uint32_t eax, struct cpuid_regs *cp)
1378{
1379	switch (eax) {
1380	case 1: {
1381		uint32_t mcamask = DOMAIN_IS_INITDOMAIN(xen_info) ?
1382		    0 : CPUID_INTC_EDX_MCA;
1383		cp->cp_edx &=
1384		    ~(mcamask |
1385		    CPUID_INTC_EDX_PSE |
1386		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1387		    CPUID_INTC_EDX_SEP | CPUID_INTC_EDX_MTRR |
1388		    CPUID_INTC_EDX_PGE | CPUID_INTC_EDX_PAT |
1389		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1390		    CPUID_INTC_EDX_PSE36 | CPUID_INTC_EDX_HTT);
1391		break;
1392	}
1393
1394	case 0x80000001:
1395		cp->cp_edx &=
1396		    ~(CPUID_AMD_EDX_PSE |
1397		    CPUID_INTC_EDX_VME | CPUID_INTC_EDX_DE |
1398		    CPUID_AMD_EDX_MTRR | CPUID_AMD_EDX_PGE |
1399		    CPUID_AMD_EDX_PAT | CPUID_AMD_EDX_PSE36 |
1400		    CPUID_AMD_EDX_SYSC | CPUID_INTC_EDX_SEP |
1401		    CPUID_AMD_EDX_TSCP);
1402		cp->cp_ecx &= ~CPUID_AMD_ECX_CMP_LGCY;
1403		break;
1404	default:
1405		break;
1406	}
1407
1408	switch (vendor) {
1409	case X86_VENDOR_Intel:
1410		switch (eax) {
1411		case 4:
1412			/*
1413			 * Zero out the (ncores-per-chip - 1) field
1414			 */
			cp->cp_eax &= 0x03ffffff;
1416			break;
1417		default:
1418			break;
1419		}
1420		break;
1421	case X86_VENDOR_AMD:
1422		switch (eax) {
1423
1424		case 0x80000001:
1425			cp->cp_ecx &= ~CPUID_AMD_ECX_CR8D;
1426			break;
1427
1428		case CPUID_LEAF_EXT_8:
1429			/*
1430			 * Zero out the (ncores-per-chip - 1) field
1431			 */
1432			cp->cp_ecx &= 0xffffff00;
1433			break;
1434		default:
1435			break;
1436		}
1437		break;
1438	default:
1439		break;
1440	}
1441}
1442#else
1443#define	platform_cpuid_mangle(vendor, eax, cp)	/* nothing */
1444#endif
1445
1446/*
1447 *  Some undocumented ways of patching the results of the cpuid
1448 *  instruction to permit running Solaris 10 on future cpus that
1449 *  we don't currently support.  Could be set to non-zero values
1450 *  via settings in eeprom.
1451 */
1452
1453uint32_t cpuid_feature_ecx_include;
1454uint32_t cpuid_feature_ecx_exclude;
1455uint32_t cpuid_feature_edx_include;
1456uint32_t cpuid_feature_edx_exclude;
1457
1458/*
1459 * Allocate space for mcpu_cpi in the machcpu structure for all non-boot CPUs.
1460 */
1461void
1462cpuid_alloc_space(cpu_t *cpu)
1463{
1464	/*
1465	 * By convention, cpu0 is the boot cpu, which is set up
1466	 * before memory allocation is available.  All other cpus get
1467	 * their cpuid_info struct allocated here.
1468	 */
1469	ASSERT(cpu->cpu_id != 0);
1470	ASSERT(cpu->cpu_m.mcpu_cpi == NULL);
1471	cpu->cpu_m.mcpu_cpi =
1472	    kmem_zalloc(sizeof (*cpu->cpu_m.mcpu_cpi), KM_SLEEP);
1473}
1474
1475void
1476cpuid_free_space(cpu_t *cpu)
1477{
1478	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1479	int i;
1480
1481	ASSERT(cpi != NULL);
1482	ASSERT(cpi != &cpuid_info0);
1483
1484	/*
1485	 * Free up any cache leaf related dynamic storage. The first entry was
1486	 * cached from the standard cpuid storage, so we should not free it.
1487	 */
1488	for (i = 1; i < cpi->cpi_cache_leaf_size; i++)
1489		kmem_free(cpi->cpi_cache_leaves[i], sizeof (struct cpuid_regs));
1490	if (cpi->cpi_cache_leaf_size > 0)
1491		kmem_free(cpi->cpi_cache_leaves,
1492		    cpi->cpi_cache_leaf_size * sizeof (struct cpuid_regs *));
1493
1494	kmem_free(cpi, sizeof (*cpi));
1495	cpu->cpu_m.mcpu_cpi = NULL;
1496}
1497
1498#if !defined(__xpv)
1499/*
1500 * Determine the type of the underlying platform. This is used to customize
1501 * initialization of various subsystems (e.g. TSC). determine_platform() must
1502 * only ever be called once to prevent two processors from seeing different
1503 * values of platform_type. Must be called before cpuid_pass1(), the earliest
1504 * consumer to execute (uses _cpuid_chiprev --> synth_amd_info --> get_hwenv).
1505 */
1506void
1507determine_platform(void)
1508{
1509	struct cpuid_regs cp;
1510	uint32_t base;
1511	uint32_t regs[4];
1512	char *hvstr = (char *)regs;
1513
1514	ASSERT(platform_type == -1);
1515
1516	platform_type = HW_NATIVE;
1517
1518	if (!enable_platform_detection)
1519		return;
1520
1521	/*
	 * If the Hypervisor CPUID bit is set, try to determine the hypervisor
	 * vendor signature, and set the platform type accordingly.
1524	 *
1525	 * References:
1526	 * http://lkml.org/lkml/2008/10/1/246
1527	 * http://kb.vmware.com/kb/1009458
1528	 */
1529	cp.cp_eax = 0x1;
1530	(void) __cpuid_insn(&cp);
1531	if ((cp.cp_ecx & CPUID_INTC_ECX_HV) != 0) {
1532		cp.cp_eax = 0x40000000;
1533		(void) __cpuid_insn(&cp);
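		/*
		 * Assemble the NUL-terminated hypervisor signature string
		 * from %ebx, %ecx, and %edx.
		 */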
1534		regs[0] = cp.cp_ebx;
1535		regs[1] = cp.cp_ecx;
1536		regs[2] = cp.cp_edx;
1537		regs[3] = 0;
1538		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0) {
1539			platform_type = HW_XEN_HVM;
1540			return;
1541		}
1542		if (strcmp(hvstr, HVSIG_VMWARE) == 0) {
1543			platform_type = HW_VMWARE;
1544			return;
1545		}
1546		if (strcmp(hvstr, HVSIG_KVM) == 0) {
1547			platform_type = HW_KVM;
1548			return;
1549		}
1550		if (strcmp(hvstr, HVSIG_BHYVE) == 0) {
1551			platform_type = HW_BHYVE;
1552			return;
1553		}
1554		if (strcmp(hvstr, HVSIG_MICROSOFT) == 0)
1555			platform_type = HW_MICROSOFT;
1556	} else {
1557		/*
		 * Check older VMware hardware versions. The VMware hypervisor
		 * is detected by performing an IN operation to the VMware
		 * hypervisor port and checking that the value returned in
		 * %ebx is the VMware hypervisor magic value.
1562		 *
1563		 * References: http://kb.vmware.com/kb/1009458
1564		 */
1565		vmware_port(VMWARE_HVCMD_GETVERSION, regs);
1566		if (regs[1] == VMWARE_HVMAGIC) {
1567			platform_type = HW_VMWARE;
1568			return;
1569		}
1570	}
1571
1572	/*
1573	 * Check Xen hypervisor. In a fully virtualized domain,
1574	 * Xen's pseudo-cpuid function returns a string representing the
1575	 * Xen signature in %ebx, %ecx, and %edx. %eax contains the maximum
1576	 * supported cpuid function. We need at least a (base + 2) leaf value
1577	 * to do what we want to do. Try different base values, since the
1578	 * hypervisor might use a different one depending on whether Hyper-V
1579	 * emulation is switched on by default or not.
1580	 */
1581	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1582		cp.cp_eax = base;
1583		(void) __cpuid_insn(&cp);
1584		regs[0] = cp.cp_ebx;
1585		regs[1] = cp.cp_ecx;
1586		regs[2] = cp.cp_edx;
1587		regs[3] = 0;
1588		if (strcmp(hvstr, HVSIG_XEN_HVM) == 0 &&
1589		    cp.cp_eax >= (base + 2)) {
1590			platform_type &= ~HW_NATIVE;
1591			platform_type |= HW_XEN_HVM;
1592			return;
1593		}
1594	}
1595}
1596
1597int
1598get_hwenv(void)
1599{
1600	ASSERT(platform_type != -1);
1601	return (platform_type);
1602}
1603
1604int
1605is_controldom(void)
1606{
1607	return (0);
1608}
1609
1610#else
1611
1612int
1613get_hwenv(void)
1614{
1615	return (HW_XEN_PV);
1616}
1617
1618int
1619is_controldom(void)
1620{
1621	return (DOMAIN_IS_INITDOMAIN(xen_info));
1622}
1623
1624#endif	/* __xpv */
1625
1626/*
1627 * Make sure that we have gathered all of the CPUID leaves that we might need to
1628 * determine topology. We assume that the standard leaf 1 has already been done
1629 * and that xmaxeax has already been calculated.
1630 */
1631static void
1632cpuid_gather_amd_topology_leaves(cpu_t *cpu)
1633{
1634	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1635
1636	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
1637		struct cpuid_regs *cp;
1638
1639		cp = &cpi->cpi_extd[8];
1640		cp->cp_eax = CPUID_LEAF_EXT_8;
1641		(void) __cpuid_insn(cp);
1642		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, cp);
1643	}
1644
1645	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1646	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1647		struct cpuid_regs *cp;
1648
1649		cp = &cpi->cpi_extd[0x1e];
1650		cp->cp_eax = CPUID_LEAF_EXT_1e;
1651		(void) __cpuid_insn(cp);
1652	}
1653}
1654
1655/*
1656 * Get the APIC ID for this processor. If Leaf B is present and valid, we prefer
1657 * it to everything else. If not, and we're on an AMD system where 8000001e is
 * valid, then we use that. Otherwise, we fall back to the default value for the
1659 * APIC ID in leaf 1.
1660 */
1661static uint32_t
1662cpuid_gather_apicid(struct cpuid_info *cpi)
1663{
1664	/*
	 * Leaf B changes based on the arguments to it. Because we don't cache
1666	 * it, we need to gather it again.
1667	 */
1668	if (cpi->cpi_maxeax >= 0xB) {
1669		struct cpuid_regs regs;
1670		struct cpuid_regs *cp;
1671
1672		cp = &regs;
1673		cp->cp_eax = 0xB;
1674		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1675		(void) __cpuid_insn(cp);
1676
1677		if (cp->cp_ebx != 0) {
1678			return (cp->cp_edx);
1679		}
1680	}
1681
1682	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
1683	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1684	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1685		return (cpi->cpi_extd[0x1e].cp_eax);
1686	}
1687
1688	return (CPI_APIC_ID(cpi));
1689}
1690
1691/*
1692 * For AMD processors, attempt to calculate the number of chips and cores that
1693 * exist. The way that we do this varies based on the generation, because the
1694 * generations themselves have changed dramatically.
1695 *
1696 * If cpuid leaf 0x80000008 exists, that generally tells us the number of cores.
1697 * However, with the advent of family 17h (Zen) it actually tells us the number
1698 * of threads, so we need to look at leaf 0x8000001e if available to determine
1699 * its value. Otherwise, for all prior families, the number of enabled cores is
1700 * the same as threads.
1701 *
1702 * If we do not have leaf 0x80000008, then we assume that this processor does
1703 * not have anything. AMD's older CPUID specification says there's no reason to
1704 * fall back to leaf 1.
1705 *
1706 * In some virtualization cases we will not have leaf 8000001e or it will be
1707 * zero. When that happens we assume the number of threads is one.
1708 */
1709static void
1710cpuid_amd_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
1711{
1712	uint_t nthreads, nthread_per_core;
1713
1714	nthreads = nthread_per_core = 1;
1715
1716	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
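		/*
		 * Leaf 0x80000008 %ecx[7:0] (NC) reports the count minus one;
		 * on family 17h and later this counts threads, not cores.
		 */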
1717		nthreads = BITX(cpi->cpi_extd[8].cp_ecx, 7, 0) + 1;
1718	} else if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
1719		nthreads = CPI_CPU_COUNT(cpi);
1720	}
1721
1722	/*
1723	 * For us to have threads, and know about it, we have to be at least at
1724	 * family 17h and have the cpuid bit that says we have extended
1725	 * topology.
1726	 */
1727	if (cpi->cpi_family >= 0x17 &&
1728	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1729	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1730		nthread_per_core = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
1731	}
1732
1733	*ncpus = nthreads;
1734	*ncores = nthreads / nthread_per_core;
1735}
1736
1737/*
1738 * Seed the initial values for the cores and threads for an Intel based
1739 * processor. These values will be overwritten if we detect that the processor
1740 * supports CPUID leaf 0xb.
1741 */
1742static void
1743cpuid_intel_ncores(struct cpuid_info *cpi, uint_t *ncpus, uint_t *ncores)
1744{
1745	/*
1746	 * Only seed the number of physical cores from the first level leaf 4
	 * information. The number of threads there indicates how many share the
1748	 * L1 cache, which may or may not have anything to do with the number of
1749	 * logical CPUs per core.
1750	 */
1751	if (cpi->cpi_maxeax >= 4) {
1752		*ncores = BITX(cpi->cpi_std[4].cp_eax, 31, 26) + 1;
1753	} else {
1754		*ncores = 1;
1755	}
1756
1757	if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
1758		*ncpus = CPI_CPU_COUNT(cpi);
1759	} else {
1760		*ncpus = *ncores;
1761	}
1762}
1763
1764static boolean_t
1765cpuid_leafB_getids(cpu_t *cpu)
1766{
1767	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1768	struct cpuid_regs regs;
1769	struct cpuid_regs *cp;
1770
1771	if (cpi->cpi_maxeax < 0xB)
1772		return (B_FALSE);
1773
1774	cp = &regs;
1775	cp->cp_eax = 0xB;
1776	cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
1777
1778	(void) __cpuid_insn(cp);
1779
1780	/*
1781	 * Check CPUID.EAX=0BH, ECX=0H:EBX is non-zero, which
1782	 * indicates that the extended topology enumeration leaf is
1783	 * available.
1784	 */
1785	if (cp->cp_ebx != 0) {
1786		uint32_t x2apic_id = 0;
1787		uint_t coreid_shift = 0;
1788		uint_t ncpu_per_core = 1;
1789		uint_t chipid_shift = 0;
1790		uint_t ncpu_per_chip = 1;
1791		uint_t i;
1792		uint_t level;
1793
1794		for (i = 0; i < CPI_FNB_ECX_MAX; i++) {
1795			cp->cp_eax = 0xB;
1796			cp->cp_ecx = i;
1797
1798			(void) __cpuid_insn(cp);
1799			level = CPI_CPU_LEVEL_TYPE(cp);
1800
1801			if (level == 1) {
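				/* Level type 1: SMT (thread) level. */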
1802				x2apic_id = cp->cp_edx;
1803				coreid_shift = BITX(cp->cp_eax, 4, 0);
1804				ncpu_per_core = BITX(cp->cp_ebx, 15, 0);
1805			} else if (level == 2) {
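				/*
				 * Level type 2: core level; %ebx counts
				 * logical CPUs in the package.
				 */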
1806				x2apic_id = cp->cp_edx;
1807				chipid_shift = BITX(cp->cp_eax, 4, 0);
1808				ncpu_per_chip = BITX(cp->cp_ebx, 15, 0);
1809			}
1810		}
1811
1812		/*
1813		 * cpi_apicid is taken care of in cpuid_gather_apicid.
1814		 */
1815		cpi->cpi_ncpu_per_chip = ncpu_per_chip;
1816		cpi->cpi_ncore_per_chip = ncpu_per_chip /
1817		    ncpu_per_core;
1818		cpi->cpi_chipid = x2apic_id >> chipid_shift;
1819		cpi->cpi_clogid = x2apic_id & ((1 << chipid_shift) - 1);
1820		cpi->cpi_coreid = x2apic_id >> coreid_shift;
1821		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
1822		cpi->cpi_procnodeid = cpi->cpi_chipid;
1823		cpi->cpi_compunitid = cpi->cpi_coreid;
1824
1825		if (coreid_shift > 0 && chipid_shift > coreid_shift) {
1826			cpi->cpi_nthread_bits = coreid_shift;
1827			cpi->cpi_ncore_bits = chipid_shift - coreid_shift;
1828		}
1829
1830		return (B_TRUE);
1831	} else {
1832		return (B_FALSE);
1833	}
1834}
1835
1836static void
1837cpuid_intel_getids(cpu_t *cpu, void *feature)
1838{
1839	uint_t i;
1840	uint_t chipid_shift = 0;
1841	uint_t coreid_shift = 0;
1842	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1843
1844	/*
1845	 * There are no compute units or processor nodes currently on Intel.
1846	 * Always set these to one.
1847	 */
1848	cpi->cpi_procnodes_per_pkg = 1;
1849	cpi->cpi_cores_per_compunit = 1;
1850
1851	/*
1852	 * If cpuid Leaf B is present, use that to try and get this information.
1853	 * It will be the most accurate for Intel CPUs.
1854	 */
1855	if (cpuid_leafB_getids(cpu))
1856		return;
1857
1858	/*
1859	 * In this case, we have the leaf 1 and leaf 4 values for ncpu_per_chip
1860	 * and ncore_per_chip. These represent the largest power of two values
1861	 * that we need to cover all of the IDs in the system. Therefore, we use
1862	 * those values to seed the number of bits needed to cover information
1863	 * in the case when leaf B is not available. These values will probably
1864	 * be larger than required, but that's OK.
1865	 */
1866	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip);
1867	cpi->cpi_ncore_bits = ddi_fls(cpi->cpi_ncore_per_chip);
1868
1869	for (i = 1; i < cpi->cpi_ncpu_per_chip; i <<= 1)
1870		chipid_shift++;
1871
1872	cpi->cpi_chipid = cpi->cpi_apicid >> chipid_shift;
1873	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << chipid_shift) - 1);
1874
1875	if (is_x86_feature(feature, X86FSET_CMP)) {
1876		/*
1877		 * Multi-core (and possibly multi-threaded)
1878		 * processors.
1879		 */
1880		uint_t ncpu_per_core;
1881		if (cpi->cpi_ncore_per_chip == 1)
1882			ncpu_per_core = cpi->cpi_ncpu_per_chip;
1883		else if (cpi->cpi_ncore_per_chip > 1)
1884			ncpu_per_core = cpi->cpi_ncpu_per_chip /
1885			    cpi->cpi_ncore_per_chip;
1886		/*
1887		 * 8bit APIC IDs on dual core Pentiums
1888		 * look like this:
1889		 *
1890		 * +-----------------------+------+------+
1891		 * | Physical Package ID   |  MC  |  HT  |
1892		 * +-----------------------+------+------+
1893		 * <------- chipid -------->
1894		 * <------- coreid --------------->
1895		 *			   <--- clogid -->
1896		 *			   <------>
1897		 *			   pkgcoreid
1898		 *
1899		 * Where the number of bits necessary to
1900		 * represent MC and HT fields together equals
1901		 * to the minimum number of bits necessary to
1902		 * store the value of cpi->cpi_ncpu_per_chip.
1903		 * Of those bits, the MC part uses the number
1904		 * of bits necessary to store the value of
1905		 * cpi->cpi_ncore_per_chip.
1906		 */
1907		for (i = 1; i < ncpu_per_core; i <<= 1)
1908			coreid_shift++;
1909		cpi->cpi_coreid = cpi->cpi_apicid >> coreid_shift;
1910		cpi->cpi_pkgcoreid = cpi->cpi_clogid >> coreid_shift;
1911	} else if (is_x86_feature(feature, X86FSET_HTT)) {
1912		/*
1913		 * Single-core multi-threaded processors.
1914		 */
1915		cpi->cpi_coreid = cpi->cpi_chipid;
1916		cpi->cpi_pkgcoreid = 0;
1917	} else {
1918		/*
1919		 * Single-core single-thread processors.
1920		 */
1921		cpi->cpi_coreid = cpu->cpu_id;
1922		cpi->cpi_pkgcoreid = 0;
1923	}
1924	cpi->cpi_procnodeid = cpi->cpi_chipid;
1925	cpi->cpi_compunitid = cpi->cpi_coreid;
1926}
1927
1928/*
1929 * Historically, AMD has had CMP chips with only a single thread per core.
1930 * However, starting in family 17h (Zen), this has changed and they now have
1931 * multiple threads. Our internal core id needs to be a unique value.
1932 *
1933 * To determine the core id of an AMD system, if we're from a family before 17h,
1934 * then we just use the cpu id, as that gives us a good value that will be
1935 * unique for each core. If instead, we're on family 17h or later, then we need
1936 * to do something more complicated. CPUID leaf 0x8000001e can tell us
1937 * how many threads are in the system. Based on that, we'll shift the APIC ID.
1938 * We can't use the normal core id in that leaf as it's only unique within the
1939 * socket, which is perfect for cpi_pkgcoreid, but not us.
1940 */
1941static id_t
1942cpuid_amd_get_coreid(cpu_t *cpu)
1943{
1944	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1945
1946	if (cpi->cpi_family >= 0x17 &&
1947	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
1948	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
1949		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
1950		if (nthreads > 1) {
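			/*
			 * AMD parts to date expose at most two threads per
			 * core, hence the single-bit shift below.
			 */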
1951			VERIFY3U(nthreads, ==, 2);
1952			return (cpi->cpi_apicid >> 1);
1953		}
1954	}
1955
1956	return (cpu->cpu_id);
1957}
1958
1959/*
 * Determining IDs on AMD is a more challenging task. This is notable because
 * of the following two facts:
1962 *
1963 *  1. Before family 0x17 (Zen), there was no support for SMT and there was
1964 *     also no way to get an actual unique core id from the system. As such, we
1965 *     synthesize this case by using cpu->cpu_id.  This scheme does not,
1966 *     however, guarantee that sibling cores of a chip will have sequential
1967 *     coreids starting at a multiple of the number of cores per chip - that is
1968 *     usually the case, but if the ACPI MADT table is presented in a different
1969 *     order then we need to perform a few more gymnastics for the pkgcoreid.
1970 *
 *  2. In families 0x15 and 0x16 (Bulldozer and co.) the cores came in groups
1972 *     called compute units. These compute units share the L1I cache, L2 cache,
1973 *     and the FPU. To deal with this, a new topology leaf was added in
1974 *     0x8000001e. However, parts of this leaf have different meanings
1975 *     once we get to family 0x17.
1976 */
1977
1978static void
1979cpuid_amd_getids(cpu_t *cpu, uchar_t *features)
1980{
1981	int i, first_half, coreidsz;
1982	uint32_t nb_caps_reg;
1983	uint_t node2_1;
1984	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
1985	struct cpuid_regs *cp;
1986
1987	/*
1988	 * Calculate the core id (this comes from hardware in family 0x17 if it
1989	 * hasn't been stripped by virtualization). We always set the compute
1990	 * unit id to the same value. Also, initialize the default number of
1991	 * cores per compute unit and nodes per package. This will be
1992	 * overwritten when we know information about a particular family.
1993	 */
1994	cpi->cpi_coreid = cpuid_amd_get_coreid(cpu);
1995	cpi->cpi_compunitid = cpi->cpi_coreid;
1996	cpi->cpi_cores_per_compunit = 1;
1997	cpi->cpi_procnodes_per_pkg = 1;
1998
1999	/*
2000	 * To construct the logical ID, we need to determine how many APIC IDs
2001	 * are dedicated to the cores and threads. This is provided for us in
2002	 * 0x80000008. However, if it's not present (say due to virtualization),
2003	 * then we assume it's one. This should be present on all 64-bit AMD
2004	 * processors.  It was added in family 0xf (Hammer).
2005	 */
2006	if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2007		coreidsz = BITX((cpi)->cpi_extd[8].cp_ecx, 15, 12);
2008
2009		/*
2010		 * In AMD parlance chip is really a node while illumos
2011		 * uses chip as equivalent to socket/package.
2012		 */
2013		if (coreidsz == 0) {
2014			/* Use legacy method */
2015			for (i = 1; i < cpi->cpi_ncore_per_chip; i <<= 1)
2016				coreidsz++;
2017			if (coreidsz == 0)
2018				coreidsz = 1;
2019		}
2020	} else {
2021		/* Assume single-core part */
2022		coreidsz = 1;
2023	}
2024	cpi->cpi_clogid = cpi->cpi_apicid & ((1 << coreidsz) - 1);
2025
2026	/*
2027	 * The package core ID varies depending on the family. While it may be
2028	 * tempting to use the CPUID_LEAF_EXT_1e %ebx core id, unfortunately,
2029	 * this value is the core id in the given node. For non-virtualized
2030	 * family 17h, we need to take the logical core id and shift off the
2031	 * threads like we do when getting the core id.  Otherwise, we can use
	 * the clogid as is. When family 17h is virtualized, the leaf may not
	 * contain valid data; in that case we won't think we have SMT, and
	 * the cpi_clogid on its own is sufficient.
2036	 */
2037	if (cpi->cpi_family >= 0x17 &&
2038	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2039	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e &&
2040	    cpi->cpi_extd[0x1e].cp_ebx != 0) {
2041		uint_t nthreads = BITX(cpi->cpi_extd[0x1e].cp_ebx, 15, 8) + 1;
2042		if (nthreads > 1) {
2043			VERIFY3U(nthreads, ==, 2);
2044			cpi->cpi_pkgcoreid = cpi->cpi_clogid >> 1;
2045		} else {
2046			cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2047		}
2048	} else {
2049		cpi->cpi_pkgcoreid = cpi->cpi_clogid;
2050	}
2051
2052	/*
2053	 * Obtain the node ID and compute unit IDs. If we're on family 0x15
2054	 * (bulldozer) or newer, then we can derive all of this from leaf
2055	 * CPUID_LEAF_EXT_1e. Otherwise, the method varies by family.
2056	 */
2057	if (is_x86_feature(x86_featureset, X86FSET_TOPOEXT) &&
2058	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1e) {
2059		cp = &cpi->cpi_extd[0x1e];
2060
2061		cpi->cpi_procnodes_per_pkg = BITX(cp->cp_ecx, 10, 8) + 1;
2062		cpi->cpi_procnodeid = BITX(cp->cp_ecx, 7, 0);
2063
2064		/*
2065		 * For Bulldozer-era CPUs, recalculate the compute unit
2066		 * information.
2067		 */
2068		if (cpi->cpi_family >= 0x15 && cpi->cpi_family < 0x17) {
2069			cpi->cpi_cores_per_compunit =
2070			    BITX(cp->cp_ebx, 15, 8) + 1;
2071			cpi->cpi_compunitid = BITX(cp->cp_ebx, 7, 0) +
2072			    (cpi->cpi_ncore_per_chip /
2073			    cpi->cpi_cores_per_compunit) *
2074			    (cpi->cpi_procnodeid /
2075			    cpi->cpi_procnodes_per_pkg);
2076		}
2077	} else if (cpi->cpi_family == 0xf || cpi->cpi_family >= 0x11) {
2078		cpi->cpi_procnodeid = (cpi->cpi_apicid >> coreidsz) & 7;
2079	} else if (cpi->cpi_family == 0x10) {
2080		/*
2081		 * See if we are a multi-node processor.
2082		 * All processors in the system have the same number of nodes
2083		 */
2084		nb_caps_reg =  pci_getl_func(0, 24, 3, 0xe8);
2085		if ((cpi->cpi_model < 8) || BITX(nb_caps_reg, 29, 29) == 0) {
2086			/* Single-node */
2087			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 5,
2088			    coreidsz);
2089		} else {
2090
2091			/*
2092			 * Multi-node revision D (2 nodes per package
2093			 * are supported)
2094			 */
2095			cpi->cpi_procnodes_per_pkg = 2;
2096
2097			first_half = (cpi->cpi_pkgcoreid <=
2098			    (cpi->cpi_ncore_per_chip/2 - 1));
2099
2100			if (cpi->cpi_apicid == cpi->cpi_pkgcoreid) {
2101				/* We are BSP */
2102				cpi->cpi_procnodeid = (first_half ? 0 : 1);
2103			} else {
2104
2105				/* We are AP */
2106				/* NodeId[2:1] bits to use for reading F3xe8 */
2107				node2_1 = BITX(cpi->cpi_apicid, 5, 4) << 1;
2108
2109				nb_caps_reg =
2110				    pci_getl_func(0, 24 + node2_1, 3, 0xe8);
2111
2112				/*
2113				 * Check IntNodeNum bit (31:30, but bit 31 is
2114				 * always 0 on dual-node processors)
2115				 */
2116				if (BITX(nb_caps_reg, 30, 30) == 0)
2117					cpi->cpi_procnodeid = node2_1 +
2118					    !first_half;
2119				else
2120					cpi->cpi_procnodeid = node2_1 +
2121					    first_half;
2122			}
2123		}
2124	} else {
2125		cpi->cpi_procnodeid = 0;
2126	}
2127
2128	cpi->cpi_chipid =
2129	    cpi->cpi_procnodeid / cpi->cpi_procnodes_per_pkg;
2130
2131	cpi->cpi_ncore_bits = coreidsz;
2132	cpi->cpi_nthread_bits = ddi_fls(cpi->cpi_ncpu_per_chip /
2133	    cpi->cpi_ncore_per_chip);
2134}
2135
2136static void
2137spec_uarch_flush_noop(void)
2138{
2139}
2140
2141/*
 * When microcode is present that mitigates MDS, this wrmsr will also flush
 * the MDS-related micro-architectural state that x86_md_clear() would
 * otherwise flush.
2145 */
2146static void
2147spec_uarch_flush_msr(void)
2148{
2149	wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D);
2150}
2151
2152/*
2153 * This function points to a function that will flush certain
2154 * micro-architectural state on the processor. This flush is used to mitigate
2155 * two different classes of Intel CPU vulnerabilities: L1TF and MDS. This
2156 * function can point to one of three functions:
2157 *
 * - A noop, used either because we are vulnerable but do not have microcode
 *   available to help mitigate the issue, or because we are not vulnerable
 *   at all.
2161 *
2162 * - spec_uarch_flush_msr which will issue an L1D flush and if microcode to
2163 *   mitigate MDS is present, also perform the equivalent of the MDS flush;
 *   however, it only flushes the MDS-related micro-architectural state on the
 *   current hyperthread; it does not do anything for the twin.
2166 *
2167 * - x86_md_clear which will flush the MDS related state. This is done when we
2168 *   have a processor that is vulnerable to MDS, but is not vulnerable to L1TF
2169 *   (RDCL_NO is set).
2170 */
2171void (*spec_uarch_flush)(void) = spec_uarch_flush_noop;
2172
2173void (*x86_md_clear)(void) = x86_md_clear_noop;
2174
2175static void
2176cpuid_update_md_clear(cpu_t *cpu, uchar_t *featureset)
2177{
2178	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2179
2180	/*
2181	 * While RDCL_NO indicates that one of the MDS vulnerabilities (MSBDS)
2182	 * has been fixed in hardware, it doesn't cover everything related to
2183	 * MDS. Therefore we can only rely on MDS_NO to determine that we don't
2184	 * need to mitigate this.
2185	 */
2186	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2187	    is_x86_feature(featureset, X86FSET_MDS_NO)) {
2188		x86_md_clear = x86_md_clear_noop;
2189		membar_producer();
2190		return;
2191	}
2192
2193	if (is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2194		x86_md_clear = x86_md_clear_verw;
2195	}
2196
2197	membar_producer();
2198}
2199
2200static void
2201cpuid_update_l1d_flush(cpu_t *cpu, uchar_t *featureset)
2202{
2203	boolean_t need_l1d, need_mds;
2204	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2205
2206	/*
2207	 * If we're not on Intel or we've mitigated both RDCL and MDS in
2208	 * hardware, then there's nothing left for us to do for enabling the
2209	 * flush. We can also go ahead and say that SMT exclusion is
2210	 * unnecessary.
2211	 */
2212	if (cpi->cpi_vendor != X86_VENDOR_Intel ||
2213	    (is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2214	    is_x86_feature(featureset, X86FSET_MDS_NO))) {
2215		extern int smt_exclusion;
2216		smt_exclusion = 0;
2217		spec_uarch_flush = spec_uarch_flush_noop;
2218		membar_producer();
2219		return;
2220	}
2221
2222	/*
	 * The locations where we need to perform an L1D flush are required
	 * for mitigating both L1TF and MDS. When verw support is present in
	 * microcode, the L1D flush will take care of doing that as well.
2226	 * However, if we have a system where RDCL_NO is present, but we don't
2227	 * have MDS_NO, then we need to do a verw (x86_md_clear) and not a full
2228	 * L1D flush.
2229	 */
2230	if (!is_x86_feature(featureset, X86FSET_RDCL_NO) &&
2231	    is_x86_feature(featureset, X86FSET_FLUSH_CMD) &&
2232	    !is_x86_feature(featureset, X86FSET_L1D_VM_NO)) {
2233		need_l1d = B_TRUE;
2234	} else {
2235		need_l1d = B_FALSE;
2236	}
2237
2238	if (!is_x86_feature(featureset, X86FSET_MDS_NO) &&
2239	    is_x86_feature(featureset, X86FSET_MD_CLEAR)) {
2240		need_mds = B_TRUE;
2241	} else {
2242		need_mds = B_FALSE;
2243	}
2244
2245	if (need_l1d) {
2246		spec_uarch_flush = spec_uarch_flush_msr;
2247	} else if (need_mds) {
2248		spec_uarch_flush = x86_md_clear;
2249	} else {
2250		/*
2251		 * We have no hardware mitigations available to us.
2252		 */
2253		spec_uarch_flush = spec_uarch_flush_noop;
2254	}
2255	membar_producer();
2256}
2257
2258static void
2259cpuid_scan_security(cpu_t *cpu, uchar_t *featureset)
2260{
2261	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2262
2263	if (cpi->cpi_vendor == X86_VENDOR_AMD &&
2264	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8) {
2265		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBPB)
2266			add_x86_feature(featureset, X86FSET_IBPB);
2267		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS)
2268			add_x86_feature(featureset, X86FSET_IBRS);
2269		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP)
2270			add_x86_feature(featureset, X86FSET_STIBP);
2271		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_IBRS_ALL)
2272			add_x86_feature(featureset, X86FSET_IBRS_ALL);
2273		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_STIBP_ALL)
2274			add_x86_feature(featureset, X86FSET_STIBP_ALL);
2275		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_PREFER_IBRS)
2276			add_x86_feature(featureset, X86FSET_RSBA);
2277		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSBD)
2278			add_x86_feature(featureset, X86FSET_SSBD);
2279		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_VIRT_SSBD)
2280			add_x86_feature(featureset, X86FSET_SSBD_VIRT);
2281		if (cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_SSB_NO)
2282			add_x86_feature(featureset, X86FSET_SSB_NO);
2283	} else if (cpi->cpi_vendor == X86_VENDOR_Intel &&
2284	    cpi->cpi_maxeax >= 7) {
2285		struct cpuid_regs *ecp;
2286		ecp = &cpi->cpi_std[7];
2287
2288		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_MD_CLEAR) {
2289			add_x86_feature(featureset, X86FSET_MD_CLEAR);
2290		}
2291
2292		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SPEC_CTRL) {
2293			add_x86_feature(featureset, X86FSET_IBRS);
2294			add_x86_feature(featureset, X86FSET_IBPB);
2295		}
2296
2297		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_STIBP) {
2298			add_x86_feature(featureset, X86FSET_STIBP);
2299		}
2300
2301		/*
2302		 * Don't read the arch caps MSR on xpv where we lack the
2303		 * on_trap().
2304		 */
2305#ifndef __xpv
2306		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_ARCH_CAPS) {
2307			on_trap_data_t otd;
2308
2309			/*
2310			 * Be paranoid and assume we'll get a #GP.
2311			 */
2312			if (!on_trap(&otd, OT_DATA_ACCESS)) {
2313				uint64_t reg;
2314
2315				reg = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
2316				if (reg & IA32_ARCH_CAP_RDCL_NO) {
2317					add_x86_feature(featureset,
2318					    X86FSET_RDCL_NO);
2319				}
2320				if (reg & IA32_ARCH_CAP_IBRS_ALL) {
2321					add_x86_feature(featureset,
2322					    X86FSET_IBRS_ALL);
2323				}
2324				if (reg & IA32_ARCH_CAP_RSBA) {
2325					add_x86_feature(featureset,
2326					    X86FSET_RSBA);
2327				}
2328				if (reg & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) {
2329					add_x86_feature(featureset,
2330					    X86FSET_L1D_VM_NO);
2331				}
2332				if (reg & IA32_ARCH_CAP_SSB_NO) {
2333					add_x86_feature(featureset,
2334					    X86FSET_SSB_NO);
2335				}
2336				if (reg & IA32_ARCH_CAP_MDS_NO) {
2337					add_x86_feature(featureset,
2338					    X86FSET_MDS_NO);
2339				}
2340			}
2341			no_trap();
2342		}
2343#endif	/* !__xpv */
2344
2345		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_SSBD)
2346			add_x86_feature(featureset, X86FSET_SSBD);
2347
2348		if (ecp->cp_edx & CPUID_INTC_EDX_7_0_FLUSH_CMD)
2349			add_x86_feature(featureset, X86FSET_FLUSH_CMD);
2350	}
2351
2352	if (cpu->cpu_id != 0)
2353		return;
2354
2355	/*
2356	 * We need to determine what changes are required for mitigating L1TF
2357	 * and MDS. If the CPU suffers from either of them, then SMT exclusion
2358	 * is required.
2359	 *
2360	 * If any of these are present, then we need to flush u-arch state at
2361	 * various points. For MDS, we need to do so whenever we change to a
2362	 * lesser privilege level or we are halting the CPU. For L1TF we need to
2363	 * flush the L1D cache at VM entry. When we have microcode that handles
2364	 * MDS, the L1D flush also clears the other u-arch state that the
2365	 * md_clear does.
2366	 */
2367
2368	/*
2369	 * Update whether or not we need to be taking explicit action against
2370	 * MDS.
2371	 */
2372	cpuid_update_md_clear(cpu, featureset);
2373
2374	/*
2375	 * Determine whether SMT exclusion is required and whether or not we
2376	 * need to perform an l1d flush.
2377	 */
2378	cpuid_update_l1d_flush(cpu, featureset);
2379}
2380
2381/*
2382 * Setup XFeature_Enabled_Mask register. Required by xsave feature.
2383 */
2384void
2385setup_xfem(void)
2386{
2387	uint64_t flags = XFEATURE_LEGACY_FP;
2388
2389	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
2390
2391	if (is_x86_feature(x86_featureset, X86FSET_SSE))
2392		flags |= XFEATURE_SSE;
2393
2394	if (is_x86_feature(x86_featureset, X86FSET_AVX))
2395		flags |= XFEATURE_AVX;
2396
2397	if (is_x86_feature(x86_featureset, X86FSET_AVX512F))
2398		flags |= XFEATURE_AVX512;
2399
2400	set_xcr(XFEATURE_ENABLED_MASK, flags);
2401
2402	xsave_bv_all = flags;
2403}
2404
2405static void
2406cpuid_pass1_topology(cpu_t *cpu, uchar_t *featureset)
2407{
2408	struct cpuid_info *cpi;
2409
2410	cpi = cpu->cpu_m.mcpu_cpi;
2411
2412	if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2413		cpuid_gather_amd_topology_leaves(cpu);
2414	}
2415
2416	cpi->cpi_apicid = cpuid_gather_apicid(cpi);
2417
2418	/*
2419	 * Before we can calculate the IDs that we should assign to this
2420	 * processor, we need to understand how many cores and threads it has.
2421	 */
2422	switch (cpi->cpi_vendor) {
2423	case X86_VENDOR_Intel:
2424		cpuid_intel_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2425		    &cpi->cpi_ncore_per_chip);
2426		break;
2427	case X86_VENDOR_AMD:
2428		cpuid_amd_ncores(cpi, &cpi->cpi_ncpu_per_chip,
2429		    &cpi->cpi_ncore_per_chip);
2430		break;
2431	default:
2432		/*
		 * If we have some other x86-compatible chip, it's not clear
		 * how it would behave. The most common case today is
		 * virtualization, though there are also 64-bit VIA chips.
		 * Assume that all we can get is the basic Leaf 1 HTT
		 * information.
2437		 */
2438		if ((cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_HTT) != 0) {
2439			cpi->cpi_ncore_per_chip = 1;
2440			cpi->cpi_ncpu_per_chip = CPI_CPU_COUNT(cpi);
2441		}
2442		break;
2443	}
2444
2445	/*
2446	 * Based on the calculated number of threads and cores, potentially
	 * assign the HTT and CMP features.
2448	 */
2449	if (cpi->cpi_ncore_per_chip > 1) {
2450		add_x86_feature(featureset, X86FSET_CMP);
2451	}
2452
2453	if (cpi->cpi_ncpu_per_chip > 1 &&
2454	    cpi->cpi_ncpu_per_chip != cpi->cpi_ncore_per_chip) {
2455		add_x86_feature(featureset, X86FSET_HTT);
2456	}
2457
2458	/*
	 * Now that the core and thread counts have been set up, we need to
	 * go through and calculate the rest of the parameters that exist. If
	 * we think the CPU doesn't have either SMT (HTT) or CMP, then we
	 * basically go through and fake up information in some way. The most
	 * likely case for this is virtualization where we have a lot of
	 * partial topology information.
2464	 */
2465	if (!is_x86_feature(featureset, X86FSET_HTT) &&
2466	    !is_x86_feature(featureset, X86FSET_CMP)) {
2467		/*
2468		 * This is a single core, single-threaded processor.
2469		 */
2470		cpi->cpi_procnodes_per_pkg = 1;
2471		cpi->cpi_cores_per_compunit = 1;
2472		cpi->cpi_compunitid = 0;
2473		cpi->cpi_chipid = -1;
2474		cpi->cpi_clogid = 0;
2475		cpi->cpi_coreid = cpu->cpu_id;
2476		cpi->cpi_pkgcoreid = 0;
2477		if (cpi->cpi_vendor == X86_VENDOR_AMD) {
2478			cpi->cpi_procnodeid = BITX(cpi->cpi_apicid, 3, 0);
2479		} else {
2480			cpi->cpi_procnodeid = cpi->cpi_chipid;
2481		}
2482	} else {
2483		switch (cpi->cpi_vendor) {
2484		case X86_VENDOR_Intel:
2485			cpuid_intel_getids(cpu, featureset);
2486			break;
2487		case X86_VENDOR_AMD:
2488			cpuid_amd_getids(cpu, featureset);
2489			break;
2490		default:
2491			/*
2492			 * In this case, it's hard to say what we should do.
2493			 * We're going to model them to the OS as single core
2494			 * threads. We don't have a good identifier for them, so
2495			 * we're just going to use the cpu id all on a single
2496			 * chip.
2497			 *
2498			 * This case has historically been different from the
2499			 * case above where we don't have HTT or CMP. While they
2500			 * could be combined, we've opted to keep it separate to
2501			 * minimize the risk of topology changes in weird cases.
2502			 */
2503			cpi->cpi_procnodes_per_pkg = 1;
2504			cpi->cpi_cores_per_compunit = 1;
2505			cpi->cpi_chipid = 0;
2506			cpi->cpi_coreid = cpu->cpu_id;
2507			cpi->cpi_clogid = cpu->cpu_id;
2508			cpi->cpi_pkgcoreid = cpu->cpu_id;
2509			cpi->cpi_procnodeid = cpi->cpi_chipid;
2510			cpi->cpi_compunitid = cpi->cpi_coreid;
2511			break;
2512		}
2513	}
2514}
2515
2516/*
 * Gather relevant CPU features from leaf 6, which covers thermal information.
 * We always gather leaf 6 if it's supported; however, we only look for
 * features on Intel systems, as AMD does not currently define any of the
 * features we look for below.
2521 */
2522static void
2523cpuid_pass1_thermal(cpu_t *cpu, uchar_t *featureset)
2524{
2525	struct cpuid_regs *cp;
2526	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
2527
2528	if (cpi->cpi_maxeax < 6) {
2529		return;
2530	}
2531
2532	cp = &cpi->cpi_std[6];
2533	cp->cp_eax = 6;
2534	cp->cp_ebx = cp->cp_ecx = cp->cp_edx = 0;
2535	(void) __cpuid_insn(cp);
2536	platform_cpuid_mangle(cpi->cpi_vendor, 6, cp);
2537
2538	if (cpi->cpi_vendor != X86_VENDOR_Intel) {
2539		return;
2540	}
2541
2542	if ((cp->cp_eax & CPUID_INTC_EAX_DTS) != 0) {
2543		add_x86_feature(featureset, X86FSET_CORE_THERMAL);
2544	}
2545
2546	if ((cp->cp_eax & CPUID_INTC_EAX_PTM) != 0) {
2547		add_x86_feature(featureset, X86FSET_PKG_THERMAL);
2548	}
2549}
2550
2551void
2552cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
2553{
2554	uint32_t mask_ecx, mask_edx;
2555	struct cpuid_info *cpi;
2556	struct cpuid_regs *cp;
2557	int xcpuid;
2558#if !defined(__xpv)
2559	extern int idle_cpu_prefer_mwait;
2560#endif
2561
2562	/*
2563	 * Space statically allocated for BSP, ensure pointer is set
2564	 */
2565	if (cpu->cpu_id == 0) {
2566		if (cpu->cpu_m.mcpu_cpi == NULL)
2567			cpu->cpu_m.mcpu_cpi = &cpuid_info0;
2568	}
2569
2570	add_x86_feature(featureset, X86FSET_CPUID);
2571
2572	cpi = cpu->cpu_m.mcpu_cpi;
2573	ASSERT(cpi != NULL);
2574	cp = &cpi->cpi_std[0];
2575	cp->cp_eax = 0;
2576	cpi->cpi_maxeax = __cpuid_insn(cp);
2577	{
2578		uint32_t *iptr = (uint32_t *)cpi->cpi_vendorstr;
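		/*
		 * The 12-byte vendor string is stored across %ebx, %edx and
		 * %ecx, in that register order.
		 */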
2579		*iptr++ = cp->cp_ebx;
2580		*iptr++ = cp->cp_edx;
2581		*iptr++ = cp->cp_ecx;
2582		*(char *)&cpi->cpi_vendorstr[12] = '\0';
2583	}
2584
2585	cpi->cpi_vendor = _cpuid_vendorstr_to_vendorcode(cpi->cpi_vendorstr);
2586	x86_vendor = cpi->cpi_vendor; /* for compatibility */
2587
2588	/*
2589	 * Limit the range in case of weird hardware
2590	 */
2591	if (cpi->cpi_maxeax > CPI_MAXEAX_MAX)
2592		cpi->cpi_maxeax = CPI_MAXEAX_MAX;
2593	if (cpi->cpi_maxeax < 1)
2594		goto pass1_done;
2595
2596	cp = &cpi->cpi_std[1];
2597	cp->cp_eax = 1;
2598	(void) __cpuid_insn(cp);
2599
2600	/*
2601	 * Extract identifying constants for easy access.
2602	 */
2603	cpi->cpi_model = CPI_MODEL(cpi);
2604	cpi->cpi_family = CPI_FAMILY(cpi);
2605
2606	if (cpi->cpi_family == 0xf)
2607		cpi->cpi_family += CPI_FAMILY_XTD(cpi);
2608
2609	/*
2610	 * Beware: AMD uses "extended model" iff base *FAMILY* == 0xf.
2611	 * Intel, and presumably everyone else, uses model == 0xf, as
2612	 * one would expect (max value means possible overflow).  Sigh.
2613	 */
2614
2615	switch (cpi->cpi_vendor) {
2616	case X86_VENDOR_Intel:
2617		if (IS_EXTENDED_MODEL_INTEL(cpi))
2618			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
2619		break;
2620	case X86_VENDOR_AMD:
2621		if (CPI_FAMILY(cpi) == 0xf)
2622			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
2623		break;
2624	default:
2625		if (cpi->cpi_model == 0xf)
2626			cpi->cpi_model += CPI_MODEL_XTD(cpi) << 4;
2627		break;
2628	}
2629
2630	cpi->cpi_step = CPI_STEP(cpi);
2631	cpi->cpi_brandid = CPI_BRANDID(cpi);
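
	/*
	 * For example, a leaf 1 %eax value of 0x00800f12 (an early family 17h
	 * AMD part) decodes as base family 0xf + extended family 0x8 = 0x17,
	 * model (0x0 << 4) + 0x1 = 0x01, stepping 2.
	 */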
2632
2633	/*
2634	 * *default* assumptions:
2635	 * - believe %edx feature word
2636	 * - ignore %ecx feature word
2637	 * - 32-bit virtual and physical addressing
2638	 */
2639	mask_edx = 0xffffffff;
2640	mask_ecx = 0;
2641
2642	cpi->cpi_pabits = cpi->cpi_vabits = 32;
2643
2644	switch (cpi->cpi_vendor) {
2645	case X86_VENDOR_Intel:
2646		if (cpi->cpi_family == 5)
2647			x86_type = X86_TYPE_P5;
2648		else if (IS_LEGACY_P6(cpi)) {
2649			x86_type = X86_TYPE_P6;
2650			pentiumpro_bug4046376 = 1;
2651			/*
2652			 * Clear the SEP bit when it was set erroneously
2653			 */
2654			if (cpi->cpi_model < 3 && cpi->cpi_step < 3)
2655				cp->cp_edx &= ~CPUID_INTC_EDX_SEP;
2656		} else if (IS_NEW_F6(cpi) || cpi->cpi_family == 0xf) {
2657			x86_type = X86_TYPE_P4;
2658			/*
2659			 * We don't currently depend on any of the %ecx
2660			 * features until Prescott, so we'll only check
2661			 * this from P4 onwards.  We might want to revisit
2662			 * that idea later.
2663			 */
2664			mask_ecx = 0xffffffff;
2665		} else if (cpi->cpi_family > 0xf)
2666			mask_ecx = 0xffffffff;
2667		/*
2668		 * We don't support MONITOR/MWAIT if leaf 5 is not available
2669		 * to obtain the monitor linesize.
2670		 */
2671		if (cpi->cpi_maxeax < 5)
2672			mask_ecx &= ~CPUID_INTC_ECX_MON;
2673		break;
2674	case X86_VENDOR_IntelClone:
2675	default:
2676		break;
2677	case X86_VENDOR_AMD:
2678#if defined(OPTERON_ERRATUM_108)
2679		if (cpi->cpi_family == 0xf && cpi->cpi_model == 0xe) {
2680			cp->cp_eax = (0xf0f & cp->cp_eax) | 0xc0;
2681			cpi->cpi_model = 0xc;
2682		} else
2683#endif
2684		if (cpi->cpi_family == 5) {
2685			/*
2686			 * AMD K5 and K6
2687			 *
2688			 * These CPUs have an incomplete implementation
2689			 * of MCA/MCE which we mask away.
2690			 */
2691			mask_edx &= ~(CPUID_INTC_EDX_MCE | CPUID_INTC_EDX_MCA);
2692
2693			/*
2694			 * Model 0 uses the wrong (APIC) bit
2695			 * to indicate PGE.  Fix it here.
2696			 */
2697			if (cpi->cpi_model == 0) {
2698				if (cp->cp_edx & 0x200) {
2699					cp->cp_edx &= ~0x200;
2700					cp->cp_edx |= CPUID_INTC_EDX_PGE;
2701				}
2702			}
2703
2704			/*
2705			 * Early models had problems w/ MMX; disable.
2706			 */
2707			if (cpi->cpi_model < 6)
2708				mask_edx &= ~CPUID_INTC_EDX_MMX;
2709		}
2710
2711		/*
2712		 * For newer families, SSE3 and CX16, at least, are valid;
2713		 * enable all
2714		 */
2715		if (cpi->cpi_family >= 0xf)
2716			mask_ecx = 0xffffffff;
2717		/*
2718		 * We don't support MONITOR/MWAIT if leaf 5 is not available
2719		 * to obtain the monitor linesize.
2720		 */
2721		if (cpi->cpi_maxeax < 5)
2722			mask_ecx &= ~CPUID_INTC_ECX_MON;
2723
2724#if !defined(__xpv)
2725		/*
2726		 * AMD has not historically used MWAIT in the CPU's idle loop.
2727		 * Pre-family-10h Opterons do not have the MWAIT instruction. We
2728		 * know for certain that in at least family 17h, per AMD, mwait
2729		 * is preferred. Families in-between are less certain.
2730		 */
2731		if (cpi->cpi_family < 0x17) {
2732			idle_cpu_prefer_mwait = 0;
2733		}
2734#endif
2735
2736		break;
2737	case X86_VENDOR_TM:
2738		/*
2739		 * workaround the NT workaround in CMS 4.1
2740		 */
2741		if (cpi->cpi_family == 5 && cpi->cpi_model == 4 &&
2742		    (cpi->cpi_step == 2 || cpi->cpi_step == 3))
2743			cp->cp_edx |= CPUID_INTC_EDX_CX8;
2744		break;
2745	case X86_VENDOR_Centaur:
2746		/*
2747		 * workaround the NT workarounds again
2748		 */
2749		if (cpi->cpi_family == 6)
2750			cp->cp_edx |= CPUID_INTC_EDX_CX8;
2751		break;
2752	case X86_VENDOR_Cyrix:
2753		/*
2754		 * We rely heavily on the probing in locore
2755		 * to actually figure out what parts, if any,
2756		 * of the Cyrix cpuid instruction to believe.
2757		 */
2758		switch (x86_type) {
2759		case X86_TYPE_CYRIX_486:
2760			mask_edx = 0;
2761			break;
2762		case X86_TYPE_CYRIX_6x86:
2763			mask_edx = 0;
2764			break;
2765		case X86_TYPE_CYRIX_6x86L:
2766			mask_edx =
2767			    CPUID_INTC_EDX_DE |
2768			    CPUID_INTC_EDX_CX8;
2769			break;
2770		case X86_TYPE_CYRIX_6x86MX:
2771			mask_edx =
2772			    CPUID_INTC_EDX_DE |
2773			    CPUID_INTC_EDX_MSR |
2774			    CPUID_INTC_EDX_CX8 |
2775			    CPUID_INTC_EDX_PGE |
2776			    CPUID_INTC_EDX_CMOV |
2777			    CPUID_INTC_EDX_MMX;
2778			break;
2779		case X86_TYPE_CYRIX_GXm:
2780			mask_edx =
2781			    CPUID_INTC_EDX_MSR |
2782			    CPUID_INTC_EDX_CX8 |
2783			    CPUID_INTC_EDX_CMOV |
2784			    CPUID_INTC_EDX_MMX;
2785			break;
2786		case X86_TYPE_CYRIX_MediaGX:
2787			break;
2788		case X86_TYPE_CYRIX_MII:
2789		case X86_TYPE_VIA_CYRIX_III:
2790			mask_edx =
2791			    CPUID_INTC_EDX_DE |
2792			    CPUID_INTC_EDX_TSC |
2793			    CPUID_INTC_EDX_MSR |
2794			    CPUID_INTC_EDX_CX8 |
2795			    CPUID_INTC_EDX_PGE |
2796			    CPUID_INTC_EDX_CMOV |
2797			    CPUID_INTC_EDX_MMX;
2798			break;
2799		default:
2800			break;
2801		}
2802		break;
2803	}
2804
2805#if defined(__xpv)
2806	/*
2807	 * Do not support MONITOR/MWAIT under a hypervisor
2808	 */
2809	mask_ecx &= ~CPUID_INTC_ECX_MON;
2810	/*
2811	 * Do not support XSAVE under a hypervisor for now
2812	 */
2813	xsave_force_disable = B_TRUE;
2814
2815#endif	/* __xpv */
2816
2817	if (xsave_force_disable) {
2818		mask_ecx &= ~CPUID_INTC_ECX_XSAVE;
2819		mask_ecx &= ~CPUID_INTC_ECX_AVX;
2820		mask_ecx &= ~CPUID_INTC_ECX_F16C;
2821		mask_ecx &= ~CPUID_INTC_ECX_FMA;
2822	}
2823
2824	/*
2825	 * Now we've figured out the masks that determine
2826	 * which bits we choose to believe, apply the masks
2827	 * to the feature words, then map the kernel's view
2828	 * of these feature words into its feature word.
2829	 */
2830	cp->cp_edx &= mask_edx;
2831	cp->cp_ecx &= mask_ecx;
2832
2833	/*
2834	 * apply any platform restrictions (we don't call this
2835	 * immediately after __cpuid_insn here, because we need the
2836	 * workarounds applied above first)
2837	 */
2838	platform_cpuid_mangle(cpi->cpi_vendor, 1, cp);
2839
2840	/*
2841	 * In addition to ecx and edx, Intel and AMD are storing a bunch of
2842	 * instruction set extensions in leaf 7's ebx, ecx, and edx.
2843	 */
2844	if (cpi->cpi_maxeax >= 7) {
2845		struct cpuid_regs *ecp;
2846		ecp = &cpi->cpi_std[7];
2847		ecp->cp_eax = 7;
2848		ecp->cp_ecx = 0;
2849		(void) __cpuid_insn(ecp);
2850
2851		/*
2852		 * If XSAVE has been disabled, just ignore all of the
2853		 * extended-save-area dependent flags here.
2854		 */
2855		if (xsave_force_disable) {
2856			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
2857			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
2858			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
2859			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_MPX;
2860			ecp->cp_ebx &= ~CPUID_INTC_EBX_7_0_ALL_AVX512;
2861			ecp->cp_ecx &= ~CPUID_INTC_ECX_7_0_ALL_AVX512;
2862			ecp->cp_edx &= ~CPUID_INTC_EDX_7_0_ALL_AVX512;
2863		}
2864
2865		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
2866			add_x86_feature(featureset, X86FSET_SMEP);
2867
2868		/*
2869		 * We check disable_smap here in addition to in startup_smap()
2870		 * to ensure CPUs that aren't the boot CPU don't accidentally
2871		 * include it in the feature set and thus generate a mismatched
2872		 * x86 feature set across CPUs.
2873		 */
2874		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMAP &&
2875		    disable_smap == 0)
2876			add_x86_feature(featureset, X86FSET_SMAP);
2877
2878		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_RDSEED)
2879			add_x86_feature(featureset, X86FSET_RDSEED);
2880
2881		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_ADX)
2882			add_x86_feature(featureset, X86FSET_ADX);
2883
2884		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
2885			add_x86_feature(featureset, X86FSET_FSGSBASE);
2886
2887		if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
2888			add_x86_feature(featureset, X86FSET_CLFLUSHOPT);
2889
2890		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
2891			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID)
2892				add_x86_feature(featureset, X86FSET_INVPCID);
2893
2894			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_MPX)
2895				add_x86_feature(featureset, X86FSET_MPX);
2896
2897			if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_CLWB)
2898				add_x86_feature(featureset, X86FSET_CLWB);
2899		}
2900	}
2901
2902	/*
2903	 * fold in overrides from the "eeprom" mechanism
2904	 */
2905	cp->cp_edx |= cpuid_feature_edx_include;
2906	cp->cp_edx &= ~cpuid_feature_edx_exclude;
2907
2908	cp->cp_ecx |= cpuid_feature_ecx_include;
2909	cp->cp_ecx &= ~cpuid_feature_ecx_exclude;
2910
2911	if (cp->cp_edx & CPUID_INTC_EDX_PSE) {
2912		add_x86_feature(featureset, X86FSET_LARGEPAGE);
2913	}
2914	if (cp->cp_edx & CPUID_INTC_EDX_TSC) {
2915		add_x86_feature(featureset, X86FSET_TSC);
2916	}
2917	if (cp->cp_edx & CPUID_INTC_EDX_MSR) {
2918		add_x86_feature(featureset, X86FSET_MSR);
2919	}
2920	if (cp->cp_edx & CPUID_INTC_EDX_MTRR) {
2921		add_x86_feature(featureset, X86FSET_MTRR);
2922	}
2923	if (cp->cp_edx & CPUID_INTC_EDX_PGE) {
2924		add_x86_feature(featureset, X86FSET_PGE);
2925	}
2926	if (cp->cp_edx & CPUID_INTC_EDX_CMOV) {
2927		add_x86_feature(featureset, X86FSET_CMOV);
2928	}
2929	if (cp->cp_edx & CPUID_INTC_EDX_MMX) {
2930		add_x86_feature(featureset, X86FSET_MMX);
2931	}
2932	if ((cp->cp_edx & CPUID_INTC_EDX_MCE) != 0 &&
2933	    (cp->cp_edx & CPUID_INTC_EDX_MCA) != 0) {
2934		add_x86_feature(featureset, X86FSET_MCA);
2935	}
2936	if (cp->cp_edx & CPUID_INTC_EDX_PAE) {
2937		add_x86_feature(featureset, X86FSET_PAE);
2938	}
2939	if (cp->cp_edx & CPUID_INTC_EDX_CX8) {
2940		add_x86_feature(featureset, X86FSET_CX8);
2941	}
2942	if (cp->cp_ecx & CPUID_INTC_ECX_CX16) {
2943		add_x86_feature(featureset, X86FSET_CX16);
2944	}
2945	if (cp->cp_edx & CPUID_INTC_EDX_PAT) {
2946		add_x86_feature(featureset, X86FSET_PAT);
2947	}
2948	if (cp->cp_edx & CPUID_INTC_EDX_SEP) {
2949		add_x86_feature(featureset, X86FSET_SEP);
2950	}
2951	if (cp->cp_edx & CPUID_INTC_EDX_FXSR) {
2952		/*
2953		 * In our implementation, fxsave/fxrstor
2954		 * are prerequisites before we'll even
2955		 * try to do SSE things.
2956		 */
2957		if (cp->cp_edx & CPUID_INTC_EDX_SSE) {
2958			add_x86_feature(featureset, X86FSET_SSE);
2959		}
2960		if (cp->cp_edx & CPUID_INTC_EDX_SSE2) {
2961			add_x86_feature(featureset, X86FSET_SSE2);
2962		}
2963		if (cp->cp_ecx & CPUID_INTC_ECX_SSE3) {
2964			add_x86_feature(featureset, X86FSET_SSE3);
2965		}
2966		if (cp->cp_ecx & CPUID_INTC_ECX_SSSE3) {
2967			add_x86_feature(featureset, X86FSET_SSSE3);
2968		}
2969		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_1) {
2970			add_x86_feature(featureset, X86FSET_SSE4_1);
2971		}
2972		if (cp->cp_ecx & CPUID_INTC_ECX_SSE4_2) {
2973			add_x86_feature(featureset, X86FSET_SSE4_2);
2974		}
2975		if (cp->cp_ecx & CPUID_INTC_ECX_AES) {
2976			add_x86_feature(featureset, X86FSET_AES);
2977		}
2978		if (cp->cp_ecx & CPUID_INTC_ECX_PCLMULQDQ) {
2979			add_x86_feature(featureset, X86FSET_PCLMULQDQ);
2980		}
2981
2982		if (cpi->cpi_std[7].cp_ebx & CPUID_INTC_EBX_7_0_SHA)
2983			add_x86_feature(featureset, X86FSET_SHA);
2984
2985		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_UMIP)
2986			add_x86_feature(featureset, X86FSET_UMIP);
2987		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_PKU)
2988			add_x86_feature(featureset, X86FSET_PKU);
2989		if (cpi->cpi_std[7].cp_ecx & CPUID_INTC_ECX_7_0_OSPKE)
2990			add_x86_feature(featureset, X86FSET_OSPKE);
2991
2992		if (cp->cp_ecx & CPUID_INTC_ECX_XSAVE) {
2993			add_x86_feature(featureset, X86FSET_XSAVE);
2994
2995			/* We only test AVX & AVX512 when there is XSAVE */
2996
2997			if (cp->cp_ecx & CPUID_INTC_ECX_AVX) {
2998				add_x86_feature(featureset,
2999				    X86FSET_AVX);
3000
3001				/*
3002				 * Intel says we can't check these without also
3003				 * checking AVX.
3004				 */
3005				if (cp->cp_ecx & CPUID_INTC_ECX_F16C)
3006					add_x86_feature(featureset,
3007					    X86FSET_F16C);
3008
3009				if (cp->cp_ecx & CPUID_INTC_ECX_FMA)
3010					add_x86_feature(featureset,
3011					    X86FSET_FMA);
3012
3013				if (cpi->cpi_std[7].cp_ebx &
3014				    CPUID_INTC_EBX_7_0_BMI1)
3015					add_x86_feature(featureset,
3016					    X86FSET_BMI1);
3017
3018				if (cpi->cpi_std[7].cp_ebx &
3019				    CPUID_INTC_EBX_7_0_BMI2)
3020					add_x86_feature(featureset,
3021					    X86FSET_BMI2);
3022
3023				if (cpi->cpi_std[7].cp_ebx &
3024				    CPUID_INTC_EBX_7_0_AVX2)
3025					add_x86_feature(featureset,
3026					    X86FSET_AVX2);
3027			}
3028
3029			if (cpi->cpi_vendor == X86_VENDOR_Intel &&
3030			    (cpi->cpi_std[7].cp_ebx &
3031			    CPUID_INTC_EBX_7_0_AVX512F) != 0) {
3032				add_x86_feature(featureset, X86FSET_AVX512F);
3033
3034				if (cpi->cpi_std[7].cp_ebx &
3035				    CPUID_INTC_EBX_7_0_AVX512DQ)
3036					add_x86_feature(featureset,
3037					    X86FSET_AVX512DQ);
3038				if (cpi->cpi_std[7].cp_ebx &
3039				    CPUID_INTC_EBX_7_0_AVX512IFMA)
3040					add_x86_feature(featureset,
3041					    X86FSET_AVX512FMA);
3042				if (cpi->cpi_std[7].cp_ebx &
3043				    CPUID_INTC_EBX_7_0_AVX512PF)
3044					add_x86_feature(featureset,
3045					    X86FSET_AVX512PF);
3046				if (cpi->cpi_std[7].cp_ebx &
3047				    CPUID_INTC_EBX_7_0_AVX512ER)
3048					add_x86_feature(featureset,
3049					    X86FSET_AVX512ER);
3050				if (cpi->cpi_std[7].cp_ebx &
3051				    CPUID_INTC_EBX_7_0_AVX512CD)
3052					add_x86_feature(featureset,
3053					    X86FSET_AVX512CD);
3054				if (cpi->cpi_std[7].cp_ebx &
3055				    CPUID_INTC_EBX_7_0_AVX512BW)
3056					add_x86_feature(featureset,
3057					    X86FSET_AVX512BW);
3058				if (cpi->cpi_std[7].cp_ebx &
3059				    CPUID_INTC_EBX_7_0_AVX512VL)
3060					add_x86_feature(featureset,
3061					    X86FSET_AVX512VL);
3062
3063				if (cpi->cpi_std[7].cp_ecx &
3064				    CPUID_INTC_ECX_7_0_AVX512VBMI)
3065					add_x86_feature(featureset,
3066					    X86FSET_AVX512VBMI);
3067				if (cpi->cpi_std[7].cp_ecx &
3068				    CPUID_INTC_ECX_7_0_AVX512VNNI)
3069					add_x86_feature(featureset,
3070					    X86FSET_AVX512VNNI);
3071				if (cpi->cpi_std[7].cp_ecx &
3072				    CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
3073					add_x86_feature(featureset,
3074					    X86FSET_AVX512VPOPCDQ);
3075
3076				if (cpi->cpi_std[7].cp_edx &
3077				    CPUID_INTC_EDX_7_0_AVX5124NNIW)
3078					add_x86_feature(featureset,
3079					    X86FSET_AVX512NNIW);
3080				if (cpi->cpi_std[7].cp_edx &
3081				    CPUID_INTC_EDX_7_0_AVX5124FMAPS)
3082					add_x86_feature(featureset,
3083					    X86FSET_AVX512FMAPS);
3084			}
3085		}
3086	}
3087
3088	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
3089		if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
3090			add_x86_feature(featureset, X86FSET_PCID);
3091		}
3092	}
3093
3094	if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
3095		add_x86_feature(featureset, X86FSET_X2APIC);
3096	}
3097	if (cp->cp_edx & CPUID_INTC_EDX_DE) {
3098		add_x86_feature(featureset, X86FSET_DE);
3099	}
3100#if !defined(__xpv)
3101	if (cp->cp_ecx & CPUID_INTC_ECX_MON) {
3102
3103		/*
3104		 * We require the CLFLUSH instruction for the erratum workaround
3105		 * needed to use MONITOR/MWAIT.
3106		 */
3107		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3108			cpi->cpi_mwait.support |= MWAIT_SUPPORT;
3109			add_x86_feature(featureset, X86FSET_MWAIT);
3110		} else {
3111			extern int idle_cpu_assert_cflush_monitor;
3112
3113			/*
3114			 * All processors we are aware of which have
3115			 * MONITOR/MWAIT also have CLFLUSH.
3116			 */
3117			if (idle_cpu_assert_cflush_monitor) {
3118				ASSERT((cp->cp_ecx & CPUID_INTC_ECX_MON) &&
3119				    (cp->cp_edx & CPUID_INTC_EDX_CLFSH));
3120			}
3121		}
3122	}
3123#endif	/* __xpv */
3124
3125	if (cp->cp_ecx & CPUID_INTC_ECX_VMX) {
3126		add_x86_feature(featureset, X86FSET_VMX);
3127	}
3128
3129	if (cp->cp_ecx & CPUID_INTC_ECX_RDRAND)
3130		add_x86_feature(featureset, X86FSET_RDRAND);
3131
3132	/*
3133	 * We only need to capture this once; the rest of the CPUs would
3134	 * follow suit, so we record it for the boot CPU.
3135	 */
3136	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
3137		add_x86_feature(featureset, X86FSET_CLFSH);
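		/*
		 * CPUID leaf 1 %ebx bits 15:8 report the CLFLUSH line size
		 * in units of 8 bytes; convert it to bytes here.
		 */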
3138		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
3139	}
3140	if (is_x86_feature(featureset, X86FSET_PAE))
3141		cpi->cpi_pabits = 36;
3142
3143	if (cpi->cpi_maxeax >= 0xD && !xsave_force_disable) {
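		/*
		 * Leaf 0xD, sub-leaf 1 enumerates the XSAVE instruction
		 * extensions (XSAVEOPT, XSAVEC, XSAVES) in %eax.
		 */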
3144		struct cpuid_regs r, *ecp;
3145
3146		ecp = &r;
3147		ecp->cp_eax = 0xD;
3148		ecp->cp_ecx = 1;
3149		ecp->cp_edx = ecp->cp_ebx = 0;
3150		(void) __cpuid_insn(ecp);
3151
3152		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEOPT)
3153			add_x86_feature(featureset, X86FSET_XSAVEOPT);
3154		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVEC)
3155			add_x86_feature(featureset, X86FSET_XSAVEC);
3156		if (ecp->cp_eax & CPUID_INTC_EAX_D_1_XSAVES)
3157			add_x86_feature(featureset, X86FSET_XSAVES);
3158	}
3159
3160	/*
3161	 * Work on the "extended" feature information, doing
3162	 * some basic initialization for cpuid_pass2()
3163	 */
3164	xcpuid = 0;
3165	switch (cpi->cpi_vendor) {
3166	case X86_VENDOR_Intel:
3167		/*
3168		 * On KVM we know we will have proper support for extended
3169		 * cpuid.
3170		 */
3171		if (IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf ||
3172		    (get_hwenv() == HW_KVM && cpi->cpi_family == 6 &&
3173		    (cpi->cpi_model == 6 || cpi->cpi_model == 2)))
3174			xcpuid++;
3175		break;
3176	case X86_VENDOR_AMD:
3177		if (cpi->cpi_family > 5 ||
3178		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
3179			xcpuid++;
3180		break;
3181	case X86_VENDOR_Cyrix:
3182		/*
3183		 * Only these Cyrix CPUs are -known- to support
3184		 * extended cpuid operations.
3185		 */
3186		if (x86_type == X86_TYPE_VIA_CYRIX_III ||
3187		    x86_type == X86_TYPE_CYRIX_GXm)
3188			xcpuid++;
3189		break;
3190	case X86_VENDOR_Centaur:
3191	case X86_VENDOR_TM:
3192	default:
3193		xcpuid++;
3194		break;
3195	}
3196
3197	if (xcpuid) {
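		/*
		 * Leaf 0x80000000 reports the highest supported extended
		 * leaf in %eax, which __cpuid_insn() hands back to us.
		 */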
3198		cp = &cpi->cpi_extd[0];
3199		cp->cp_eax = CPUID_LEAF_EXT_0;
3200		cpi->cpi_xmaxeax = __cpuid_insn(cp);
3201	}
3202
3203	if (cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) {
3204
3205		if (cpi->cpi_xmaxeax > CPI_XMAXEAX_MAX)
3206			cpi->cpi_xmaxeax = CPI_XMAXEAX_MAX;
3207
3208		switch (cpi->cpi_vendor) {
3209		case X86_VENDOR_Intel:
3210		case X86_VENDOR_AMD:
3211			if (cpi->cpi_xmaxeax < 0x80000001)
3212				break;
3213			cp = &cpi->cpi_extd[1];
3214			cp->cp_eax = 0x80000001;
3215			(void) __cpuid_insn(cp);
3216
3217			if (cpi->cpi_vendor == X86_VENDOR_AMD &&
3218			    cpi->cpi_family == 5 &&
3219			    cpi->cpi_model == 6 &&
3220			    cpi->cpi_step == 6) {
3221				/*
3222				 * K6 model 6 uses bit 10 to indicate SYSC.
3223				 * Later models use bit 11. Fix it here.
3224				 */
3225				if (cp->cp_edx & 0x400) {
3226					cp->cp_edx &= ~0x400;
3227					cp->cp_edx |= CPUID_AMD_EDX_SYSC;
3228				}
3229			}
3230
3231			platform_cpuid_mangle(cpi->cpi_vendor, 0x80000001, cp);
3232
3233			/*
3234			 * Compute the additions to the kernel's feature word.
3235			 */
3236			if (cp->cp_edx & CPUID_AMD_EDX_NX) {
3237				add_x86_feature(featureset, X86FSET_NX);
3238			}
3239
3240			/*
3241			 * Regardless of whether or not we boot 64-bit,
3242			 * we should have a way to identify whether
3243			 * the CPU is capable of running 64-bit.
3244			 */
3245			if (cp->cp_edx & CPUID_AMD_EDX_LM) {
3246				add_x86_feature(featureset, X86FSET_64);
3247			}
3248
3249			/* 1 GB large page - enable only for 64 bit kernel */
3250			if (cp->cp_edx & CPUID_AMD_EDX_1GPG) {
3251				add_x86_feature(featureset, X86FSET_1GPG);
3252			}
3253
3254			if ((cpi->cpi_vendor == X86_VENDOR_AMD) &&
3255			    (cpi->cpi_std[1].cp_edx & CPUID_INTC_EDX_FXSR) &&
3256			    (cp->cp_ecx & CPUID_AMD_ECX_SSE4A)) {
3257				add_x86_feature(featureset, X86FSET_SSE4A);
3258			}
3259
3260			/*
3261			 * It's really tricky to support syscall/sysret in
3262			 * the i386 kernel; we rely on sysenter/sysexit
3263			 * instead.  In the amd64 kernel, things are -way-
3264			 * better.
3265			 */
3266			if (cp->cp_edx & CPUID_AMD_EDX_SYSC) {
3267				add_x86_feature(featureset, X86FSET_ASYSC);
3268			}
3269
3270			/*
3271			 * While we're thinking about system calls, note
3272			 * that AMD processors don't support sysenter
3273			 * in long mode at all, so don't try to program them.
3274			 */
3275			if (x86_vendor == X86_VENDOR_AMD) {
3276				remove_x86_feature(featureset, X86FSET_SEP);
3277			}
3278
3279			if (cp->cp_edx & CPUID_AMD_EDX_TSCP) {
3280				add_x86_feature(featureset, X86FSET_TSCP);
3281			}
3282
3283			if (cp->cp_ecx & CPUID_AMD_ECX_SVM) {
3284				add_x86_feature(featureset, X86FSET_SVM);
3285			}
3286
3287			if (cp->cp_ecx & CPUID_AMD_ECX_TOPOEXT) {
3288				add_x86_feature(featureset, X86FSET_TOPOEXT);
3289			}
3290
3291			if (cp->cp_ecx & CPUID_AMD_ECX_PCEC) {
3292				add_x86_feature(featureset, X86FSET_AMD_PCEC);
3293			}
3294
3295			if (cp->cp_ecx & CPUID_AMD_ECX_XOP) {
3296				add_x86_feature(featureset, X86FSET_XOP);
3297			}
3298
3299			if (cp->cp_ecx & CPUID_AMD_ECX_FMA4) {
3300				add_x86_feature(featureset, X86FSET_FMA4);
3301			}
3302
3303			if (cp->cp_ecx & CPUID_AMD_ECX_TBM) {
3304				add_x86_feature(featureset, X86FSET_TBM);
3305			}
3306
3307			if (cp->cp_ecx & CPUID_AMD_ECX_MONITORX) {
3308				add_x86_feature(featureset, X86FSET_MONITORX);
3309			}
3310			break;
3311		default:
3312			break;
3313		}
3314
3315		/*
3316		 * Get CPUID data about processor cores and hyperthreads.
3317		 */
3318		switch (cpi->cpi_vendor) {
3319		case X86_VENDOR_Intel:
3320			if (cpi->cpi_maxeax >= 4) {
3321				cp = &cpi->cpi_std[4];
3322				cp->cp_eax = 4;
3323				cp->cp_ecx = 0;
3324				(void) __cpuid_insn(cp);
3325				platform_cpuid_mangle(cpi->cpi_vendor, 4, cp);
3326			}
3327			/*FALLTHROUGH*/
3328		case X86_VENDOR_AMD:
3329			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8)
3330				break;
3331			cp = &cpi->cpi_extd[8];
3332			cp->cp_eax = CPUID_LEAF_EXT_8;
3333			(void) __cpuid_insn(cp);
3334			platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8,
3335			    cp);
3336
3337			/*
3338			 * AMD uses ebx for some extended functions.
3339			 */
3340			if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3341				/*
3342				 * While we're here, check for the AMD "Error
3343				 * Pointer Zero/Restore" feature. This can be
3344				 * used to set up the FP save handlers
3345				 * appropriately.
3346				 */
3347				if (cp->cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3348					cpi->cpi_fp_amd_save = 0;
3349				} else {
3350					cpi->cpi_fp_amd_save = 1;
3351				}
3352
3353				if (cp->cp_ebx & CPUID_AMD_EBX_CLZERO) {
3354					add_x86_feature(featureset,
3355					    X86FSET_CLZERO);
3356				}
3357			}
3358
3359			/*
3360			 * Virtual and physical address limits from
3361			 * cpuid override previously guessed values.
3362			 */
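			/*
			 * Leaf 0x80000008 %eax reports physical address bits
			 * in bits 7:0 and virtual address bits in bits 15:8.
			 */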
3363			cpi->cpi_pabits = BITX(cp->cp_eax, 7, 0);
3364			cpi->cpi_vabits = BITX(cp->cp_eax, 15, 8);
3365			break;
3366		default:
3367			break;
3368		}
3369
3370		/*
3371		 * Get CPUID data about TSC Invariance in Deep C-State.
3372		 */
3373		switch (cpi->cpi_vendor) {
3374		case X86_VENDOR_Intel:
3375		case X86_VENDOR_AMD:
3376			if (cpi->cpi_maxeax >= 7) {
3377				cp = &cpi->cpi_extd[7];
3378				cp->cp_eax = 0x80000007;
3379				cp->cp_ecx = 0;
3380				(void) __cpuid_insn(cp);
3381			}
3382			break;
3383		default:
3384			break;
3385		}
3386	}
3387
3388	cpuid_pass1_topology(cpu, featureset);
3389	cpuid_pass1_thermal(cpu, featureset);
3390
3391	/*
3392	 * Synthesize chip "revision" and socket type
3393	 */
3394	cpi->cpi_chiprev = _cpuid_chiprev(cpi->cpi_vendor, cpi->cpi_family,
3395	    cpi->cpi_model, cpi->cpi_step);
3396	cpi->cpi_chiprevstr = _cpuid_chiprevstr(cpi->cpi_vendor,
3397	    cpi->cpi_family, cpi->cpi_model, cpi->cpi_step);
3398	cpi->cpi_socket = _cpuid_skt(cpi->cpi_vendor, cpi->cpi_family,
3399	    cpi->cpi_model, cpi->cpi_step);
3400
3401	if (cpi->cpi_vendor == X86_VENDOR_AMD) {
3402		if (cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_8 &&
3403		    cpi->cpi_extd[8].cp_ebx & CPUID_AMD_EBX_ERR_PTR_ZERO) {
3404			/* Special handling for AMD FP not necessary. */
3405			cpi->cpi_fp_amd_save = 0;
3406		} else {
3407			cpi->cpi_fp_amd_save = 1;
3408		}
3409	}
3410
3411	/*
3412	 * Check the processor leaves that are used for security features.
3413	 */
3414	cpuid_scan_security(cpu, featureset);
3415
3416pass1_done:
3417	cpi->cpi_pass = 1;
3418}
3419
3420/*
3421 * Make copies of the cpuid table entries we depend on, in
3422 * part for ease of parsing now, in part so that we have only
3423 * one place to correct any of it, in part for ease of
3424 * later export to userland, and in part so we can look at
3425 * this stuff in a crash dump.
3426 */
3427
3428/*ARGSUSED*/
3429void
3430cpuid_pass2(cpu_t *cpu)
3431{
3432	uint_t n, nmax;
3433	int i;
3434	struct cpuid_regs *cp;
3435	uint8_t *dp;
3436	uint32_t *iptr;
3437	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
3438
3439	ASSERT(cpi->cpi_pass == 1);
3440
3441	if (cpi->cpi_maxeax < 1)
3442		goto pass2_done;
3443
3444	if ((nmax = cpi->cpi_maxeax + 1) > NMAX_CPI_STD)
3445		nmax = NMAX_CPI_STD;
3446	/*
3447	 * (We already handled n == 0 and n == 1 in pass 1)
3448	 */
3449	for (n = 2, cp = &cpi->cpi_std[2]; n < nmax; n++, cp++) {
3450		/*
3451		 * leaves 6 and 7 were handled in pass 1
3452		 */
3453		if (n == 6 || n == 7)
3454			continue;
3455
3456		cp->cp_eax = n;
3457
3458		/*
3459		 * CPUID function 4 expects %ecx to be initialized
3460		 * with an index which indicates which cache to return
3461		 * information about. The OS is expected to call function 4
3462		 * with %ecx set to 0, 1, 2, ... until it returns with
3463		 * EAX[4:0] set to 0, which indicates there are no more
3464		 * caches.
3465		 *
3466		 * Here, populate cpi_std[4] with the information returned by
3467		 * function 4 when %ecx == 0, and do the rest in cpuid_pass3()
3468		 * when dynamic memory allocation becomes available.
3469		 *
3470		 * Note: we need to explicitly initialize %ecx here, since
3471		 * function 4 may have been previously invoked.
3472		 */
3473		if (n == 4)
3474			cp->cp_ecx = 0;
3475
3476		(void) __cpuid_insn(cp);
3477		platform_cpuid_mangle(cpi->cpi_vendor, n, cp);
3478		switch (n) {
3479		case 2:
3480			/*
3481			 * "the lower 8 bits of the %eax register
3482			 * contain a value that identifies the number
3483			 * of times the cpuid [instruction] has to be
3484			 * executed to obtain a complete image of the
3485			 * processor's caching systems."
3486			 *
3487			 * How *do* they make this stuff up?
3488			 */
3489			cpi->cpi_ncache = sizeof (*cp) *
3490			    BITX(cp->cp_eax, 7, 0);
3491			if (cpi->cpi_ncache == 0)
3492				break;
3493			cpi->cpi_ncache--;	/* skip count byte */
3494
3495			/*
3496			 * Well, for now, rather than attempt to implement
3497			 * this slightly dubious algorithm, we just look
3498			 * at the first 15 descriptor bytes.
3499			 */
3500			if (cpi->cpi_ncache > (sizeof (*cp) - 1))
3501				cpi->cpi_ncache = sizeof (*cp) - 1;
3502
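			/*
			 * A register whose bit 31 is clear holds valid
			 * one-byte cache descriptors; registers with bit 31
			 * set are reserved and skipped.
			 */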
3503			dp = cpi->cpi_cacheinfo;
3504			if (BITX(cp->cp_eax, 31, 31) == 0) {
3505				uint8_t *p = (void *)&cp->cp_eax;
3506				for (i = 1; i < 4; i++)
3507					if (p[i] != 0)
3508						*dp++ = p[i];
3509			}
3510			if (BITX(cp->cp_ebx, 31, 31) == 0) {
3511				uint8_t *p = (void *)&cp->cp_ebx;
3512				for (i = 0; i < 4; i++)
3513					if (p[i] != 0)
3514						*dp++ = p[i];
3515			}
3516			if (BITX(cp->cp_ecx, 31, 31) == 0) {
3517				uint8_t *p = (void *)&cp->cp_ecx;
3518				for (i = 0; i < 4; i++)
3519					if (p[i] != 0)
3520						*dp++ = p[i];
3521			}
3522			if (BITX(cp->cp_edx, 31, 31) == 0) {
3523				uint8_t *p = (void *)&cp->cp_edx;
3524				for (i = 0; i < 4; i++)
3525					if (p[i] != 0)
3526						*dp++ = p[i];
3527			}
3528			break;
3529
3530		case 3:	/* Processor serial number, if PSN supported */
3531			break;
3532
3533		case 4:	/* Deterministic cache parameters */
3534			break;
3535
3536		case 5:	/* Monitor/Mwait parameters */
3537		{
3538			size_t mwait_size;
3539
3540			/*
3541			 * check cpi_mwait.support which was set in cpuid_pass1
3542			 */
3543			if (!(cpi->cpi_mwait.support & MWAIT_SUPPORT))
3544				break;
3545
3546			/*
3547			 * Protect ourselves from an insane mwait line size.
3548			 * Workaround for incomplete hardware emulator(s).
3549			 */
3550			mwait_size = (size_t)MWAIT_SIZE_MAX(cpi);
3551			if (mwait_size < sizeof (uint32_t) ||
3552			    !ISP2(mwait_size)) {
3553#if DEBUG
3554				cmn_err(CE_NOTE, "Cannot handle cpu %d mwait "
3555				    "size %ld", cpu->cpu_id, (long)mwait_size);
3556#endif
3557				break;
3558			}
3559
3560			cpi->cpi_mwait.mon_min = (size_t)MWAIT_SIZE_MIN(cpi);
3561			cpi->cpi_mwait.mon_max = mwait_size;
3562			if (MWAIT_EXTENSION(cpi)) {
3563				cpi->cpi_mwait.support |= MWAIT_EXTENSIONS;
3564				if (MWAIT_INT_ENABLE(cpi))
3565					cpi->cpi_mwait.support |=
3566					    MWAIT_ECX_INT_ENABLE;
3567			}
3568			break;
3569		}
3570		default:
3571			break;
3572		}
3573	}
3574
3575	/*
3576	 * XSAVE enumeration
3577	 */
3578	if (cpi->cpi_maxeax >= 0xD) {
3579		struct cpuid_regs regs;
3580		boolean_t cpuid_d_valid = B_TRUE;
3581
3582		cp = &regs;
3583		cp->cp_eax = 0xD;
3584		cp->cp_edx = cp->cp_ebx = cp->cp_ecx = 0;
3585
3586		(void) __cpuid_insn(cp);
3587
3588		/*
3589		 * Sanity checks for debug
3590		 */
3591		if ((cp->cp_eax & XFEATURE_LEGACY_FP) == 0 ||
3592		    (cp->cp_eax & XFEATURE_SSE) == 0) {
3593			cpuid_d_valid = B_FALSE;
3594		}
3595
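		/*
		 * Sub-leaf 0 of leaf 0xD: %eax and %edx enumerate the
		 * supported XSAVE state components; %ecx reports the save
		 * area size needed for all supported components.
		 */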
3596		cpi->cpi_xsave.xsav_hw_features_low = cp->cp_eax;
3597		cpi->cpi_xsave.xsav_hw_features_high = cp->cp_edx;
3598		cpi->cpi_xsave.xsav_max_size = cp->cp_ecx;
3599
3600		/*
3601		 * If the hw supports AVX, get the size and offset in the save
3602		 * area for the ymm state.
3603		 */
3604		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX) {
3605			cp->cp_eax = 0xD;
3606			cp->cp_ecx = 2;
3607			cp->cp_edx = cp->cp_ebx = 0;
3608
3609			(void) __cpuid_insn(cp);
3610
3611			if (cp->cp_ebx != CPUID_LEAFD_2_YMM_OFFSET ||
3612			    cp->cp_eax != CPUID_LEAFD_2_YMM_SIZE) {
3613				cpuid_d_valid = B_FALSE;
3614			}
3615
3616			cpi->cpi_xsave.ymm_size = cp->cp_eax;
3617			cpi->cpi_xsave.ymm_offset = cp->cp_ebx;
3618		}
3619
3620		/*
3621		 * If the hw supports MPX, get the size and offset in the
3622		 * save area for BNDREGS and BNDCSR.
3623		 */
3624		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_MPX) {
3625			cp->cp_eax = 0xD;
3626			cp->cp_ecx = 3;
3627			cp->cp_edx = cp->cp_ebx = 0;
3628
3629			(void) __cpuid_insn(cp);
3630
3631			cpi->cpi_xsave.bndregs_size = cp->cp_eax;
3632			cpi->cpi_xsave.bndregs_offset = cp->cp_ebx;
3633
3634			cp->cp_eax = 0xD;
3635			cp->cp_ecx = 4;
3636			cp->cp_edx = cp->cp_ebx = 0;
3637
3638			(void) __cpuid_insn(cp);
3639
3640			cpi->cpi_xsave.bndcsr_size = cp->cp_eax;
3641			cpi->cpi_xsave.bndcsr_offset = cp->cp_ebx;
3642		}
3643
3644		/*
3645		 * If the hw supports AVX512, get the size and offset in the
3646		 * save area for the opmask registers and zmm state.
3647		 */
3648		if (cpi->cpi_xsave.xsav_hw_features_low & XFEATURE_AVX512) {
3649			cp->cp_eax = 0xD;
3650			cp->cp_ecx = 5;
3651			cp->cp_edx = cp->cp_ebx = 0;
3652
3653			(void) __cpuid_insn(cp);
3654
3655			cpi->cpi_xsave.opmask_size = cp->cp_eax;
3656			cpi->cpi_xsave.opmask_offset = cp->cp_ebx;
3657
3658			cp->cp_eax = 0xD;
3659			cp->cp_ecx = 6;
3660			cp->cp_edx = cp->cp_ebx = 0;
3661
3662			(void) __cpuid_insn(cp);
3663
3664			cpi->cpi_xsave.zmmlo_size = cp->cp_eax;
3665			cpi->cpi_xsave.zmmlo_offset = cp->cp_ebx;
3666
3667			cp->cp_eax = 0xD;
3668			cp->cp_ecx = 7;
3669			cp->cp_edx = cp->cp_ebx = 0;
3670
3671			(void) __cpuid_insn(cp);
3672
3673			cpi->cpi_xsave.zmmhi_size = cp->cp_eax;
3674			cpi->cpi_xsave.zmmhi_offset = cp->cp_ebx;
3675		}
3676
3677		if (is_x86_feature(x86_featureset, X86FSET_XSAVE) == 0) {
3678			xsave_state_size = 0;
3679		} else if (cpuid_d_valid) {
3680			xsave_state_size = cpi->cpi_xsave.xsav_max_size;
3681		} else {
3682			/* Broken CPUID 0xD, probably in HVM */
3683			cmn_err(CE_WARN, "cpu%d: CPUID.0xD returns invalid "
3684			    "value: hw_low = %d, hw_high = %d, xsave_size = %d"
3685			    ", ymm_size = %d, ymm_offset = %d\n",
3686			    cpu->cpu_id, cpi->cpi_xsave.xsav_hw_features_low,
3687			    cpi->cpi_xsave.xsav_hw_features_high,
3688			    (int)cpi->cpi_xsave.xsav_max_size,
3689			    (int)cpi->cpi_xsave.ymm_size,
3690			    (int)cpi->cpi_xsave.ymm_offset);
3691
3692			if (xsave_state_size != 0) {
3693				/*
3694				 * This must be a non-boot CPU. We cannot
3695				 * continue, because boot cpu has already
3696				 * enabled XSAVE.
3697				 */
3698				ASSERT(cpu->cpu_id != 0);
3699				cmn_err(CE_PANIC, "cpu%d: we have already "
3700				    "enabled XSAVE on boot cpu, cannot "
3701				    "continue.", cpu->cpu_id);
3702			} else {
3703				/*
3704				 * If we reached here on the boot CPU, it's also
3705				 * almost certain that we'll reach here on the
3706				 * non-boot CPUs. When we're here on the boot CPU
3707				 * we should disable the feature; on a non-boot
3708				 * CPU we need to confirm that we already have.
3709				 */
3710				if (cpu->cpu_id == 0) {
3711					remove_x86_feature(x86_featureset,
3712					    X86FSET_XSAVE);
3713					remove_x86_feature(x86_featureset,
3714					    X86FSET_AVX);
3715					remove_x86_feature(x86_featureset,
3716					    X86FSET_F16C);
3717					remove_x86_feature(x86_featureset,
3718					    X86FSET_BMI1);
3719					remove_x86_feature(x86_featureset,
3720					    X86FSET_BMI2);
3721					remove_x86_feature(x86_featureset,
3722					    X86FSET_FMA);
3723					remove_x86_feature(x86_featureset,
3724					    X86FSET_AVX2);
3725					remove_x86_feature(x86_featureset,
3726					    X86FSET_MPX);
3727					remove_x86_feature(x86_featureset,
3728					    X86FSET_AVX512F);
3729					remove_x86_feature(x86_featureset,
3730					    X86FSET_AVX512DQ);
3731					remove_x86_feature(x86_featureset,
3732					    X86FSET_AVX512PF);
3733					remove_x86_feature(x86_featureset,
3734					    X86FSET_AVX512ER);
3735					remove_x86_feature(x86_featureset,
3736					    X86FSET_AVX512CD);
3737					remove_x86_feature(x86_featureset,
3738					    X86FSET_AVX512BW);
3739					remove_x86_feature(x86_featureset,
3740					    X86FSET_AVX512VL);
3741					remove_x86_feature(x86_featureset,
3742					    X86FSET_AVX512FMA);
3743					remove_x86_feature(x86_featureset,
3744					    X86FSET_AVX512VBMI);
3745					remove_x86_feature(x86_featureset,
3746					    X86FSET_AVX512VNNI);
3747					remove_x86_feature(x86_featureset,
3748					    X86FSET_AVX512VPOPCDQ);
3749					remove_x86_feature(x86_featureset,
3750					    X86FSET_AVX512NNIW);
3751					remove_x86_feature(x86_featureset,
3752					    X86FSET_AVX512FMAPS);
3753
3754					CPI_FEATURES_ECX(cpi) &=
3755					    ~CPUID_INTC_ECX_XSAVE;
3756					CPI_FEATURES_ECX(cpi) &=
3757					    ~CPUID_INTC_ECX_AVX;
3758					CPI_FEATURES_ECX(cpi) &=
3759					    ~CPUID_INTC_ECX_F16C;
3760					CPI_FEATURES_ECX(cpi) &=
3761					    ~CPUID_INTC_ECX_FMA;
3762					CPI_FEATURES_7_0_EBX(cpi) &=
3763					    ~CPUID_INTC_EBX_7_0_BMI1;
3764					CPI_FEATURES_7_0_EBX(cpi) &=
3765					    ~CPUID_INTC_EBX_7_0_BMI2;
3766					CPI_FEATURES_7_0_EBX(cpi) &=
3767					    ~CPUID_INTC_EBX_7_0_AVX2;
3768					CPI_FEATURES_7_0_EBX(cpi) &=
3769					    ~CPUID_INTC_EBX_7_0_MPX;
3770					CPI_FEATURES_7_0_EBX(cpi) &=
3771					    ~CPUID_INTC_EBX_7_0_ALL_AVX512;
3772
3773					CPI_FEATURES_7_0_ECX(cpi) &=
3774					    ~CPUID_INTC_ECX_7_0_ALL_AVX512;
3775
3776					CPI_FEATURES_7_0_EDX(cpi) &=
3777					    ~CPUID_INTC_EDX_7_0_ALL_AVX512;
3778
3779					xsave_force_disable = B_TRUE;
3780				} else {
3781					VERIFY(is_x86_feature(x86_featureset,
3782					    X86FSET_XSAVE) == B_FALSE);
3783				}
3784			}
3785		}
3786	}
3787
3788
3789	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0)
3790		goto pass2_done;
3791
3792	if ((nmax = cpi->cpi_xmaxeax - CPUID_LEAF_EXT_0 + 1) > NMAX_CPI_EXTD)
3793		nmax = NMAX_CPI_EXTD;
3794	/*
3795	 * Copy the extended properties, fixing them as we go.
3796	 * (We already handled n == 0 and n == 1 in pass 1)
3797	 */
3798	iptr = (void *)cpi->cpi_brandstr;
3799	for (n = 2, cp = &cpi->cpi_extd[2]; n < nmax; cp++, n++) {
3800		cp->cp_eax = CPUID_LEAF_EXT_0 + n;
3801		(void) __cpuid_insn(cp);
3802		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_0 + n,
3803		    cp);
3804		switch (n) {
3805		case 2:
3806		case 3:
3807		case 4:
3808			/*
3809			 * Extract the brand string
3810			 */
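			/*
			 * Leaves 0x80000002-0x80000004 each return 16 bytes
			 * of the 48-byte brand string in %eax..%edx.
			 */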
3811			*iptr++ = cp->cp_eax;
3812			*iptr++ = cp->cp_ebx;
3813			*iptr++ = cp->cp_ecx;
3814			*iptr++ = cp->cp_edx;
3815			break;
3816		case 5:
3817			switch (cpi->cpi_vendor) {
3818			case X86_VENDOR_AMD:
3819				/*
3820				 * The Athlon and Duron were the first
3821				 * parts to report the sizes of the
3822				 * TLB for large pages. Before then,
3823				 * we don't trust the data.
3824				 */
3825				if (cpi->cpi_family < 6 ||
3826				    (cpi->cpi_family == 6 &&
3827				    cpi->cpi_model < 1))
3828					cp->cp_eax = 0;
3829				break;
3830			default:
3831				break;
3832			}
3833			break;
3834		case 6:
3835			switch (cpi->cpi_vendor) {
3836			case X86_VENDOR_AMD:
3837				/*
3838				 * The Athlon and Duron were the first
3839				 * AMD parts with L2 TLB's.
3840				 * Before then, don't trust the data.
3841				 */
3842				if (cpi->cpi_family < 6 ||
3843				    cpi->cpi_family == 6 &&
3844				    cpi->cpi_model < 1)
3845					cp->cp_eax = cp->cp_ebx = 0;
3846				/*
3847				 * AMD Duron rev A0 reports L2
3848				 * cache size incorrectly as 1K
3849				 * when it is really 64K
3850				 */
3851				if (cpi->cpi_family == 6 &&
3852				    cpi->cpi_model == 3 &&
3853				    cpi->cpi_step == 0) {
3854					cp->cp_ecx &= 0xffff;
3855					cp->cp_ecx |= 0x400000;
3856				}
3857				break;
3858			case X86_VENDOR_Cyrix:	/* VIA C3 */
3859				/*
3860				 * VIA C3 processors are a bit messed
3861				 * up w.r.t. encoding cache sizes in %ecx
3862				 */
3863				if (cpi->cpi_family != 6)
3864					break;
3865				/*
3866				 * model 7 and 8 were incorrectly encoded
3867				 *
3868				 * xxx is model 8 really broken?
3869				 */
3870				if (cpi->cpi_model == 7 ||
3871				    cpi->cpi_model == 8)
3872					cp->cp_ecx =
3873					    BITX(cp->cp_ecx, 31, 24) << 16 |
3874					    BITX(cp->cp_ecx, 23, 16) << 12 |
3875					    BITX(cp->cp_ecx, 15, 8) << 8 |
3876					    BITX(cp->cp_ecx, 7, 0);
3877				/*
3878				 * model 9 stepping 1 has wrong associativity
3879				 */
3880				if (cpi->cpi_model == 9 && cpi->cpi_step == 1)
3881					cp->cp_ecx |= 8 << 12;
3882				break;
3883			case X86_VENDOR_Intel:
3884				/*
3885				 * Extended L2 Cache features function.
3886				 * First appeared on Prescott.
3887				 */
3888			default:
3889				break;
3890			}
3891			break;
3892		default:
3893			break;
3894		}
3895	}
3896
3897pass2_done:
3898	cpi->cpi_pass = 2;
3899}
3900
3901static const char *
3902intel_cpubrand(const struct cpuid_info *cpi)
3903{
3904	int i;
3905
3906	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
3907	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
3908		return ("i486");
3909
3910	switch (cpi->cpi_family) {
3911	case 5:
3912		return ("Intel Pentium(r)");
3913	case 6:
3914		switch (cpi->cpi_model) {
3915			uint_t celeron, xeon;
3916			const struct cpuid_regs *cp;
3917		case 0:
3918		case 1:
3919		case 2:
3920			return ("Intel Pentium(r) Pro");
3921		case 3:
3922		case 4:
3923			return ("Intel Pentium(r) II");
3924		case 6:
3925			return ("Intel Celeron(r)");
3926		case 5:
3927		case 7:
3928			celeron = xeon = 0;
3929			cp = &cpi->cpi_std[2];	/* cache info */
3930
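			/*
			 * Leaf 2 cache descriptor 0x40 (no L2/L3 cache)
			 * marks a Celeron; 0x44 and 0x45 (1MB and 2MB L2)
			 * mark Xeon-class parts.
			 */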
3931			for (i = 1; i < 4; i++) {
3932				uint_t tmp;
3933
3934				tmp = (cp->cp_eax >> (8 * i)) & 0xff;
3935				if (tmp == 0x40)
3936					celeron++;
3937				if (tmp >= 0x44 && tmp <= 0x45)
3938					xeon++;
3939			}
3940
3941			for (i = 0; i < 2; i++) {
3942				uint_t tmp;
3943
3944				tmp = (cp->cp_ebx >> (8 * i)) & 0xff;
3945				if (tmp == 0x40)
3946					celeron++;
3947				else if (tmp >= 0x44 && tmp <= 0x45)
3948					xeon++;
3949			}
3950
3951			for (i = 0; i < 4; i++) {
3952				uint_t tmp;
3953
3954				tmp = (cp->cp_ecx >> (8 * i)) & 0xff;
3955				if (tmp == 0x40)
3956					celeron++;
3957				else if (tmp >= 0x44 && tmp <= 0x45)
3958					xeon++;
3959			}
3960
3961			for (i = 0; i < 4; i++) {
3962				uint_t tmp;
3963
3964				tmp = (cp->cp_edx >> (8 * i)) & 0xff;
3965				if (tmp == 0x40)
3966					celeron++;
3967				else if (tmp >= 0x44 && tmp <= 0x45)
3968					xeon++;
3969			}
3970
3971			if (celeron)
3972				return ("Intel Celeron(r)");
3973			if (xeon)
3974				return (cpi->cpi_model == 5 ?
3975				    "Intel Pentium(r) II Xeon(tm)" :
3976				    "Intel Pentium(r) III Xeon(tm)");
3977			return (cpi->cpi_model == 5 ?
3978			    "Intel Pentium(r) II or Pentium(r) II Xeon(tm)" :
3979			    "Intel Pentium(r) III or Pentium(r) III Xeon(tm)");
3980		default:
3981			break;
3982		}
3983	default:
3984		break;
3985	}
3986
3987	/* BrandID is present if the field is nonzero */
3988	if (cpi->cpi_brandid != 0) {
3989		static const struct {
3990			uint_t bt_bid;
3991			const char *bt_str;
3992		} brand_tbl[] = {
3993			{ 0x1,	"Intel(r) Celeron(r)" },
3994			{ 0x2,	"Intel(r) Pentium(r) III" },
3995			{ 0x3,	"Intel(r) Pentium(r) III Xeon(tm)" },
3996			{ 0x4,	"Intel(r) Pentium(r) III" },
3997			{ 0x6,	"Mobile Intel(r) Pentium(r) III" },
3998			{ 0x7,	"Mobile Intel(r) Celeron(r)" },
3999			{ 0x8,	"Intel(r) Pentium(r) 4" },
4000			{ 0x9,	"Intel(r) Pentium(r) 4" },
4001			{ 0xa,	"Intel(r) Celeron(r)" },
4002			{ 0xb,	"Intel(r) Xeon(tm)" },
4003			{ 0xc,	"Intel(r) Xeon(tm) MP" },
4004			{ 0xe,	"Mobile Intel(r) Pentium(r) 4" },
4005			{ 0xf,	"Mobile Intel(r) Celeron(r)" },
4006			{ 0x11, "Mobile Genuine Intel(r)" },
4007			{ 0x12, "Intel(r) Celeron(r) M" },
4008			{ 0x13, "Mobile Intel(r) Celeron(r)" },
4009			{ 0x14, "Intel(r) Celeron(r)" },
4010			{ 0x15, "Mobile Genuine Intel(r)" },
4011			{ 0x16,	"Intel(r) Pentium(r) M" },
4012			{ 0x17, "Mobile Intel(r) Celeron(r)" }
4013		};
4014		uint_t btblmax = sizeof (brand_tbl) / sizeof (brand_tbl[0]);
4015		uint_t sgn;
4016
4017		sgn = (cpi->cpi_family << 8) |
4018		    (cpi->cpi_model << 4) | cpi->cpi_step;
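		/* e.g. sgn 0x6b1 encodes family 6, model 0xb, stepping 1 */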
4019
4020		for (i = 0; i < btblmax; i++)
4021			if (brand_tbl[i].bt_bid == cpi->cpi_brandid)
4022				break;
4023		if (i < btblmax) {
4024			if (sgn == 0x6b1 && cpi->cpi_brandid == 3)
4025				return ("Intel(r) Celeron(r)");
4026			if (sgn < 0xf13 && cpi->cpi_brandid == 0xb)
4027				return ("Intel(r) Xeon(tm) MP");
4028			if (sgn < 0xf13 && cpi->cpi_brandid == 0xe)
4029				return ("Intel(r) Xeon(tm)");
4030			return (brand_tbl[i].bt_str);
4031		}
4032	}
4033
4034	return (NULL);
4035}
4036
4037static const char *
4038amd_cpubrand(const struct cpuid_info *cpi)
4039{
4040	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4041	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5)
4042		return ("i486 compatible");
4043
4044	switch (cpi->cpi_family) {
4045	case 5:
4046		switch (cpi->cpi_model) {
4047		case 0:
4048		case 1:
4049		case 2:
4050		case 3:
4051		case 4:
4052		case 5:
4053			return ("AMD-K5(r)");
4054		case 6:
4055		case 7:
4056			return ("AMD-K6(r)");
4057		case 8:
4058			return ("AMD-K6(r)-2");
4059		case 9:
4060			return ("AMD-K6(r)-III");
4061		default:
4062			return ("AMD (family 5)");
4063		}
4064	case 6:
4065		switch (cpi->cpi_model) {
4066		case 1:
4067			return ("AMD-K7(tm)");
4068		case 0:
4069		case 2:
4070		case 4:
4071			return ("AMD Athlon(tm)");
4072		case 3:
4073		case 7:
4074			return ("AMD Duron(tm)");
4075		case 6:
4076		case 8:
4077		case 10:
4078			/*
4079			 * Use the L2 cache size to distinguish
4080			 */
4081			return ((cpi->cpi_extd[6].cp_ecx >> 16) >= 256 ?
4082			    "AMD Athlon(tm)" : "AMD Duron(tm)");
4083		default:
4084			return ("AMD (family 6)");
4085		}
4086	default:
4087		break;
4088	}
4089
4090	if (cpi->cpi_family == 0xf && cpi->cpi_model == 5 &&
4091	    cpi->cpi_brandid != 0) {
4092		switch (BITX(cpi->cpi_brandid, 7, 5)) {
4093		case 3:
4094			return ("AMD Opteron(tm) UP 1xx");
4095		case 4:
4096			return ("AMD Opteron(tm) DP 2xx");
4097		case 5:
4098			return ("AMD Opteron(tm) MP 8xx");
4099		default:
4100			return ("AMD Opteron(tm)");
4101		}
4102	}
4103
4104	return (NULL);
4105}
4106
4107static const char *
4108cyrix_cpubrand(struct cpuid_info *cpi, uint_t type)
4109{
4110	if (!is_x86_feature(x86_featureset, X86FSET_CPUID) ||
4111	    cpi->cpi_maxeax < 1 || cpi->cpi_family < 5 ||
4112	    type == X86_TYPE_CYRIX_486)
4113		return ("i486 compatible");
4114
4115	switch (type) {
4116	case X86_TYPE_CYRIX_6x86:
4117		return ("Cyrix 6x86");
4118	case X86_TYPE_CYRIX_6x86L:
4119		return ("Cyrix 6x86L");
4120	case X86_TYPE_CYRIX_6x86MX:
4121		return ("Cyrix 6x86MX");
4122	case X86_TYPE_CYRIX_GXm:
4123		return ("Cyrix GXm");
4124	case X86_TYPE_CYRIX_MediaGX:
4125		return ("Cyrix MediaGX");
4126	case X86_TYPE_CYRIX_MII:
4127		return ("Cyrix M2");
4128	case X86_TYPE_VIA_CYRIX_III:
4129		return ("VIA Cyrix M3");
4130	default:
4131		/*
4132		 * Have another wild guess ..
4133		 */
4134		if (cpi->cpi_family == 4 && cpi->cpi_model == 9)
4135			return ("Cyrix 5x86");
4136		else if (cpi->cpi_family == 5) {
4137			switch (cpi->cpi_model) {
4138			case 2:
4139				return ("Cyrix 6x86");	/* Cyrix M1 */
4140			case 4:
4141				return ("Cyrix MediaGX");
4142			default:
4143				break;
4144			}
4145		} else if (cpi->cpi_family == 6) {
4146			switch (cpi->cpi_model) {
4147			case 0:
4148				return ("Cyrix 6x86MX"); /* Cyrix M2? */
4149			case 5:
4150			case 6:
4151			case 7:
4152			case 8:
4153			case 9:
4154				return ("VIA C3");
4155			default:
4156				break;
4157			}
4158		}
4159		break;
4160	}
4161	return (NULL);
4162}
4163
4164/*
4165 * This only gets called when the CPU's extended-feature brand string
4166 * leaves (0x80000002, 0x80000003, 0x80000004) aren't available, or
4167 * contain null bytes for some reason.
4168 */
4169static void
4170fabricate_brandstr(struct cpuid_info *cpi)
4171{
4172	const char *brand = NULL;
4173
4174	switch (cpi->cpi_vendor) {
4175	case X86_VENDOR_Intel:
4176		brand = intel_cpubrand(cpi);
4177		break;
4178	case X86_VENDOR_AMD:
4179		brand = amd_cpubrand(cpi);
4180		break;
4181	case X86_VENDOR_Cyrix:
4182		brand = cyrix_cpubrand(cpi, x86_type);
4183		break;
4184	case X86_VENDOR_NexGen:
4185		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4186			brand = "NexGen Nx586";
4187		break;
4188	case X86_VENDOR_Centaur:
4189		if (cpi->cpi_family == 5)
4190			switch (cpi->cpi_model) {
4191			case 4:
4192				brand = "Centaur C6";
4193				break;
4194			case 8:
4195				brand = "Centaur C2";
4196				break;
4197			case 9:
4198				brand = "Centaur C3";
4199				break;
4200			default:
4201				break;
4202			}
4203		break;
4204	case X86_VENDOR_Rise:
4205		if (cpi->cpi_family == 5 &&
4206		    (cpi->cpi_model == 0 || cpi->cpi_model == 2))
4207			brand = "Rise mP6";
4208		break;
4209	case X86_VENDOR_SiS:
4210		if (cpi->cpi_family == 5 && cpi->cpi_model == 0)
4211			brand = "SiS 55x";
4212		break;
4213	case X86_VENDOR_TM:
4214		if (cpi->cpi_family == 5 && cpi->cpi_model == 4)
4215			brand = "Transmeta Crusoe TM3x00 or TM5x00";
4216		break;
4217	case X86_VENDOR_NSC:
4218	case X86_VENDOR_UMC:
4219	default:
4220		break;
4221	}
4222	if (brand) {
4223		(void) strcpy((char *)cpi->cpi_brandstr, brand);
4224		return;
4225	}
4226
4227	/*
4228	 * If all else fails ...
4229	 */
4230	(void) snprintf(cpi->cpi_brandstr, sizeof (cpi->cpi_brandstr),
4231	    "%s %d.%d.%d", cpi->cpi_vendorstr, cpi->cpi_family,
4232	    cpi->cpi_model, cpi->cpi_step);
4233}
4234
4235/*
4236 * This routine is called just after kernel memory allocation
4237 * becomes available on cpu0, and as part of mp_startup() on
4238 * the other cpus.
4239 *
4240 * Fixup the brand string, and collect any information from cpuid
4241 * that requires dynamically allocated storage to represent.
4242 */
4243/*ARGSUSED*/
4244void
4245cpuid_pass3(cpu_t *cpu)
4246{
4247	int	i, max, shft, level, size;
4248	struct cpuid_regs regs;
4249	struct cpuid_regs *cp;
4250	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4251
4252	ASSERT(cpi->cpi_pass == 2);
4253
4254	/*
4255	 * Deterministic cache parameters
4256	 *
4257	 * Intel uses leaf 0x4 for this, while AMD uses leaf 0x8000001d. The
4258	 * values that are present are currently defined to be the same. This
4259	 * means we can use the same logic to parse it as long as we use the
4260	 * appropriate leaf to get the data. If you're updating this, make sure
4261	 * you're careful about which vendor supports which aspect.
4262	 *
4263	 * Take this opportunity to detect the number of threads sharing the
4264	 * last level cache, and construct a corresponding cache id. The
4265	 * respective cpuid_info members are initialized to the default case of
4266	 * "no last level cache sharing".
4267	 */
4268	cpi->cpi_ncpu_shr_last_cache = 1;
4269	cpi->cpi_last_lvl_cacheid = cpu->cpu_id;
4270
4271	if ((cpi->cpi_maxeax >= 4 && cpi->cpi_vendor == X86_VENDOR_Intel) ||
4272	    (cpi->cpi_vendor == X86_VENDOR_AMD &&
4273	    cpi->cpi_xmaxeax >= CPUID_LEAF_EXT_1d &&
4274	    is_x86_feature(x86_featureset, X86FSET_TOPOEXT))) {
4275		uint32_t leaf;
4276
4277		if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4278			leaf = 4;
4279		} else {
4280			leaf = CPUID_LEAF_EXT_1d;
4281		}
4282
4283		/*
4284		 * Find the # of elements (size) returned by the leaf and along
4285		 * the way detect last level cache sharing details.
4286		 */
4287		bzero(&regs, sizeof (regs));
4288		cp = &regs;
4289		for (i = 0, max = 0; i < CPI_FN4_ECX_MAX; i++) {
4290			cp->cp_eax = leaf;
4291			cp->cp_ecx = i;
4292
4293			(void) __cpuid_insn(cp);
4294
4295			if (CPI_CACHE_TYPE(cp) == 0)
4296				break;
4297			level = CPI_CACHE_LVL(cp);
4298			if (level > max) {
4299				max = level;
4300				cpi->cpi_ncpu_shr_last_cache =
4301				    CPI_NTHR_SHR_CACHE(cp) + 1;
4302			}
4303		}
4304		cpi->cpi_cache_leaf_size = size = i;
4305
4306		/*
4307		 * Allocate the cpi_cache_leaves array. The first element
4308		 * references the regs for the corresponding leaf with %ecx set
4309		 * to 0. This was gathered in cpuid_pass2().
4310		 */
4311		if (size > 0) {
4312			cpi->cpi_cache_leaves =
4313			    kmem_alloc(size * sizeof (cp), KM_SLEEP);
4314			if (cpi->cpi_vendor == X86_VENDOR_Intel) {
4315				cpi->cpi_cache_leaves[0] = &cpi->cpi_std[4];
4316			} else {
4317				cpi->cpi_cache_leaves[0] = &cpi->cpi_extd[0x1d];
4318			}
4319
4320			/*
4321			 * Allocate storage to hold the additional regs
4322			 * for the leaf, %ecx == 1 .. cpi_cache_leaf_size.
4323			 *
4324			 * The regs for the leaf, %ecx == 0 has already
4325			 * been allocated as indicated above.
4326			 */
4327			for (i = 1; i < size; i++) {
4328				cp = cpi->cpi_cache_leaves[i] =
4329				    kmem_zalloc(sizeof (regs), KM_SLEEP);
4330				cp->cp_eax = leaf;
4331				cp->cp_ecx = i;
4332
4333				(void) __cpuid_insn(cp);
4334			}
4335		}
4336		/*
4337		 * Determine the number of bits needed to represent
4338		 * the number of CPUs sharing the last level cache.
4339		 *
4340		 * Shift off that number of bits from the APIC id to
4341		 * derive the cache id.
4342		 */
4343		shft = 0;
4344		for (i = 1; i < cpi->cpi_ncpu_shr_last_cache; i <<= 1)
4345			shft++;
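		/* shft is now ceil(log2(cpi_ncpu_shr_last_cache)) */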
4346		cpi->cpi_last_lvl_cacheid = cpi->cpi_apicid >> shft;
4347	}
4348
4349	/*
4350	 * Now fixup the brand string
4351	 */
4352	if ((cpi->cpi_xmaxeax & CPUID_LEAF_EXT_0) == 0) {
4353		fabricate_brandstr(cpi);
4354	} else {
4355
4356		/*
4357		 * If we successfully extracted a brand string from the cpuid
4358		 * instruction, clean it up by removing leading spaces and
4359		 * similar junk.
4360		 */
4361		if (cpi->cpi_brandstr[0]) {
4362			size_t maxlen = sizeof (cpi->cpi_brandstr);
4363			char *src, *dst;
4364
4365			dst = src = (char *)cpi->cpi_brandstr;
4366			src[maxlen - 1] = '\0';
4367			/*
4368			 * strip leading spaces
4369			 */
4370			while (*src == ' ')
4371				src++;
4372			/*
4373			 * Remove any "Genuine" or "Authentic" prefixes
4374			 */
4375			if (strncmp(src, "Genuine ", 8) == 0)
4376				src += 8;
4377			if (strncmp(src, "Authentic ", 10) == 0)
4378				src += 10;
4379
4380			/*
4381			 * Now do an in-place copy.
4382			 * Map (R) to (r) and (TM) to (tm).
4383			 * The era of teletypes is long gone, and there's
4384			 * -really- no need to shout.
4385			 */
4386			while (*src != '\0') {
4387				if (src[0] == '(') {
4388					if (strncmp(src + 1, "R)", 2) == 0) {
4389						(void) strncpy(dst, "(r)", 3);
4390						src += 3;
4391						dst += 3;
4392						continue;
4393					}
4394					if (strncmp(src + 1, "TM)", 3) == 0) {
4395						(void) strncpy(dst, "(tm)", 4);
4396						src += 4;
4397						dst += 4;
4398						continue;
4399					}
4400				}
4401				*dst++ = *src++;
4402			}
4403			*dst = '\0';
4404
4405			/*
4406			 * Finally, remove any trailing spaces
4407			 */
4408			while (--dst > cpi->cpi_brandstr)
4409				if (*dst == ' ')
4410					*dst = '\0';
4411				else
4412					break;
4413		} else
4414			fabricate_brandstr(cpi);
4415	}
4416	cpi->cpi_pass = 3;
4417}
4418
4419/*
4420 * This routine is called out of bind_hwcap() much later in the life
4421 * of the kernel (post_startup()).  The job of this routine is to resolve
4422 * the hardware feature support and kernel support for those features into
4423 * what we're actually going to tell applications via the aux vector.
4424 */
4425void
4426cpuid_pass4(cpu_t *cpu, uint_t *hwcap_out)
4427{
4428	struct cpuid_info *cpi;
4429	uint_t hwcap_flags = 0, hwcap_flags_2 = 0;
4430
4431	if (cpu == NULL)
4432		cpu = CPU;
4433	cpi = cpu->cpu_m.mcpu_cpi;
4434
4435	ASSERT(cpi->cpi_pass == 3);
4436
4437	if (cpi->cpi_maxeax >= 1) {
4438		uint32_t *edx = &cpi->cpi_support[STD_EDX_FEATURES];
4439		uint32_t *ecx = &cpi->cpi_support[STD_ECX_FEATURES];
4440		uint32_t *ebx = &cpi->cpi_support[STD_EBX_FEATURES];
4441
4442		*edx = CPI_FEATURES_EDX(cpi);
4443		*ecx = CPI_FEATURES_ECX(cpi);
4444		*ebx = CPI_FEATURES_7_0_EBX(cpi);
4445
4446		/*
4447		 * [these require explicit kernel support]
4448		 */
4449		if (!is_x86_feature(x86_featureset, X86FSET_SEP))
4450			*edx &= ~CPUID_INTC_EDX_SEP;
4451
4452		if (!is_x86_feature(x86_featureset, X86FSET_SSE))
4453			*edx &= ~(CPUID_INTC_EDX_FXSR|CPUID_INTC_EDX_SSE);
4454		if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
4455			*edx &= ~CPUID_INTC_EDX_SSE2;
4456
4457		if (!is_x86_feature(x86_featureset, X86FSET_HTT))
4458			*edx &= ~CPUID_INTC_EDX_HTT;
4459
4460		if (!is_x86_feature(x86_featureset, X86FSET_SSE3))
4461			*ecx &= ~CPUID_INTC_ECX_SSE3;
4462
4463		if (!is_x86_feature(x86_featureset, X86FSET_SSSE3))
4464			*ecx &= ~CPUID_INTC_ECX_SSSE3;
4465		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_1))
4466			*ecx &= ~CPUID_INTC_ECX_SSE4_1;
4467		if (!is_x86_feature(x86_featureset, X86FSET_SSE4_2))
4468			*ecx &= ~CPUID_INTC_ECX_SSE4_2;
4469		if (!is_x86_feature(x86_featureset, X86FSET_AES))
4470			*ecx &= ~CPUID_INTC_ECX_AES;
4471		if (!is_x86_feature(x86_featureset, X86FSET_PCLMULQDQ))
4472			*ecx &= ~CPUID_INTC_ECX_PCLMULQDQ;
4473		if (!is_x86_feature(x86_featureset, X86FSET_XSAVE))
4474			*ecx &= ~(CPUID_INTC_ECX_XSAVE |
4475			    CPUID_INTC_ECX_OSXSAVE);
4476		if (!is_x86_feature(x86_featureset, X86FSET_AVX))
4477			*ecx &= ~CPUID_INTC_ECX_AVX;
4478		if (!is_x86_feature(x86_featureset, X86FSET_F16C))
4479			*ecx &= ~CPUID_INTC_ECX_F16C;
4480		if (!is_x86_feature(x86_featureset, X86FSET_FMA))
4481			*ecx &= ~CPUID_INTC_ECX_FMA;
4482		if (!is_x86_feature(x86_featureset, X86FSET_BMI1))
4483			*ebx &= ~CPUID_INTC_EBX_7_0_BMI1;
4484		if (!is_x86_feature(x86_featureset, X86FSET_BMI2))
4485			*ebx &= ~CPUID_INTC_EBX_7_0_BMI2;
4486		if (!is_x86_feature(x86_featureset, X86FSET_AVX2))
4487			*ebx &= ~CPUID_INTC_EBX_7_0_AVX2;
4488		if (!is_x86_feature(x86_featureset, X86FSET_RDSEED))
4489			*ebx &= ~CPUID_INTC_EBX_7_0_RDSEED;
4490		if (!is_x86_feature(x86_featureset, X86FSET_ADX))
4491			*ebx &= ~CPUID_INTC_EBX_7_0_ADX;
4492
4493		/*
4494		 * [no explicit support required beyond x87 fp context]
4495		 */
4496		if (!fpu_exists)
4497			*edx &= ~(CPUID_INTC_EDX_FPU | CPUID_INTC_EDX_MMX);
4498
4499		/*
4500		 * Now map the supported feature vector to things that we
4501		 * think userland will care about.
4502		 */
4503		if (*edx & CPUID_INTC_EDX_SEP)
4504			hwcap_flags |= AV_386_SEP;
4505		if (*edx & CPUID_INTC_EDX_SSE)
4506			hwcap_flags |= AV_386_FXSR | AV_386_SSE;
4507		if (*edx & CPUID_INTC_EDX_SSE2)
4508			hwcap_flags |= AV_386_SSE2;
4509		if (*ecx & CPUID_INTC_ECX_SSE3)
4510			hwcap_flags |= AV_386_SSE3;
4511		if (*ecx & CPUID_INTC_ECX_SSSE3)
4512			hwcap_flags |= AV_386_SSSE3;
4513		if (*ecx & CPUID_INTC_ECX_SSE4_1)
4514			hwcap_flags |= AV_386_SSE4_1;
4515		if (*ecx & CPUID_INTC_ECX_SSE4_2)
4516			hwcap_flags |= AV_386_SSE4_2;
4517		if (*ecx & CPUID_INTC_ECX_MOVBE)
4518			hwcap_flags |= AV_386_MOVBE;
4519		if (*ecx & CPUID_INTC_ECX_AES)
4520			hwcap_flags |= AV_386_AES;
4521		if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
4522			hwcap_flags |= AV_386_PCLMULQDQ;
4523		if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
4524		    (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
4525			hwcap_flags |= AV_386_XSAVE;
4526
4527			if (*ecx & CPUID_INTC_ECX_AVX) {
4528				uint32_t *ecx_7 = &CPI_FEATURES_7_0_ECX(cpi);
4529				uint32_t *edx_7 = &CPI_FEATURES_7_0_EDX(cpi);
4530
4531				hwcap_flags |= AV_386_AVX;
4532				if (*ecx & CPUID_INTC_ECX_F16C)
4533					hwcap_flags_2 |= AV_386_2_F16C;
4534				if (*ecx & CPUID_INTC_ECX_FMA)
4535					hwcap_flags_2 |= AV_386_2_FMA;
4536
4537				if (*ebx & CPUID_INTC_EBX_7_0_BMI1)
4538					hwcap_flags_2 |= AV_386_2_BMI1;
4539				if (*ebx & CPUID_INTC_EBX_7_0_BMI2)
4540					hwcap_flags_2 |= AV_386_2_BMI2;
4541				if (*ebx & CPUID_INTC_EBX_7_0_AVX2)
4542					hwcap_flags_2 |= AV_386_2_AVX2;
4543				if (*ebx & CPUID_INTC_EBX_7_0_AVX512F)
4544					hwcap_flags_2 |= AV_386_2_AVX512F;
4545				if (*ebx & CPUID_INTC_EBX_7_0_AVX512DQ)
4546					hwcap_flags_2 |= AV_386_2_AVX512DQ;
4547				if (*ebx & CPUID_INTC_EBX_7_0_AVX512IFMA)
4548					hwcap_flags_2 |= AV_386_2_AVX512IFMA;
4549				if (*ebx & CPUID_INTC_EBX_7_0_AVX512PF)
4550					hwcap_flags_2 |= AV_386_2_AVX512PF;
4551				if (*ebx & CPUID_INTC_EBX_7_0_AVX512ER)
4552					hwcap_flags_2 |= AV_386_2_AVX512ER;
4553				if (*ebx & CPUID_INTC_EBX_7_0_AVX512CD)
4554					hwcap_flags_2 |= AV_386_2_AVX512CD;
4555				if (*ebx & CPUID_INTC_EBX_7_0_AVX512BW)
4556					hwcap_flags_2 |= AV_386_2_AVX512BW;
4557				if (*ebx & CPUID_INTC_EBX_7_0_AVX512VL)
4558					hwcap_flags_2 |= AV_386_2_AVX512VL;
4559
4560				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VBMI)
4561					hwcap_flags_2 |= AV_386_2_AVX512VBMI;
4562				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VNNI)
4563					hwcap_flags_2 |= AV_386_2_AVX512_VNNI;
4564				if (*ecx_7 & CPUID_INTC_ECX_7_0_AVX512VPOPCDQ)
4565					hwcap_flags_2 |= AV_386_2_AVX512VPOPCDQ;
4566
4567				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124NNIW)
4568					hwcap_flags_2 |= AV_386_2_AVX512_4NNIW;
4569				if (*edx_7 & CPUID_INTC_EDX_7_0_AVX5124FMAPS)
4570					hwcap_flags_2 |= AV_386_2_AVX512_4FMAPS;
4571			}
4572		}
4573		if (*ecx & CPUID_INTC_ECX_VMX)
4574			hwcap_flags |= AV_386_VMX;
4575		if (*ecx & CPUID_INTC_ECX_POPCNT)
4576			hwcap_flags |= AV_386_POPCNT;
4577		if (*edx & CPUID_INTC_EDX_FPU)
4578			hwcap_flags |= AV_386_FPU;
4579		if (*edx & CPUID_INTC_EDX_MMX)
4580			hwcap_flags |= AV_386_MMX;
4581
4582		if (*edx & CPUID_INTC_EDX_TSC)
4583			hwcap_flags |= AV_386_TSC;
4584		if (*edx & CPUID_INTC_EDX_CX8)
4585			hwcap_flags |= AV_386_CX8;
4586		if (*edx & CPUID_INTC_EDX_CMOV)
4587			hwcap_flags |= AV_386_CMOV;
4588		if (*ecx & CPUID_INTC_ECX_CX16)
4589			hwcap_flags |= AV_386_CX16;
4590
4591		if (*ecx & CPUID_INTC_ECX_RDRAND)
4592			hwcap_flags_2 |= AV_386_2_RDRAND;
4593		if (*ebx & CPUID_INTC_EBX_7_0_ADX)
4594			hwcap_flags_2 |= AV_386_2_ADX;
4595		if (*ebx & CPUID_INTC_EBX_7_0_RDSEED)
4596			hwcap_flags_2 |= AV_386_2_RDSEED;
4597		if (*ebx & CPUID_INTC_EBX_7_0_SHA)
4598			hwcap_flags_2 |= AV_386_2_SHA;
4599		if (*ebx & CPUID_INTC_EBX_7_0_FSGSBASE)
4600			hwcap_flags_2 |= AV_386_2_FSGSBASE;
4601		if (*ebx & CPUID_INTC_EBX_7_0_CLWB)
4602			hwcap_flags_2 |= AV_386_2_CLWB;
4603		if (*ebx & CPUID_INTC_EBX_7_0_CLFLUSHOPT)
4604			hwcap_flags_2 |= AV_386_2_CLFLUSHOPT;
4605
4606	}
4607	/*
4608	 * Check a few miscellaneous features.
4609	 */
4610	if (is_x86_feature(x86_featureset, X86FSET_CLZERO))
4611		hwcap_flags_2 |= AV_386_2_CLZERO;
4612
4613	if (cpi->cpi_xmaxeax < 0x80000001)
4614		goto pass4_done;
4615
4616	switch (cpi->cpi_vendor) {
4617		struct cpuid_regs cp;
4618		uint32_t *edx, *ecx;
4619
4620	case X86_VENDOR_Intel:
4621		/*
4622		 * Seems like Intel duplicated what was necessary
4623		 * here to make the initial crop of 64-bit OSes work.
4624		 * Hopefully, those are the only "extended" bits
4625		 * they'll add.
4626		 */
4627		/*FALLTHROUGH*/
4628
4629	case X86_VENDOR_AMD:
4630		edx = &cpi->cpi_support[AMD_EDX_FEATURES];
4631		ecx = &cpi->cpi_support[AMD_ECX_FEATURES];
4632
4633		*edx = CPI_FEATURES_XTD_EDX(cpi);
4634		*ecx = CPI_FEATURES_XTD_ECX(cpi);
4635
4636		/*
4637		 * [these features require explicit kernel support]
4638		 */
4639		switch (cpi->cpi_vendor) {
4640		case X86_VENDOR_Intel:
4641			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
4642				*edx &= ~CPUID_AMD_EDX_TSCP;
4643			break;
4644
4645		case X86_VENDOR_AMD:
4646			if (!is_x86_feature(x86_featureset, X86FSET_TSCP))
4647				*edx &= ~CPUID_AMD_EDX_TSCP;
4648			if (!is_x86_feature(x86_featureset, X86FSET_SSE4A))
4649				*ecx &= ~CPUID_AMD_ECX_SSE4A;
4650			break;
4651
4652		default:
4653			break;
4654		}
4655
4656		/*
4657		 * [no explicit support required beyond
4658		 * x87 fp context and exception handlers]
4659		 */
4660		if (!fpu_exists)
4661			*edx &= ~(CPUID_AMD_EDX_MMXamd |
4662			    CPUID_AMD_EDX_3DNow | CPUID_AMD_EDX_3DNowx);
4663
4664		if (!is_x86_feature(x86_featureset, X86FSET_NX))
4665			*edx &= ~CPUID_AMD_EDX_NX;
4666#if !defined(__amd64)
4667		*edx &= ~CPUID_AMD_EDX_LM;
4668#endif
4669		/*
4670		 * Now map the supported feature vector to
4671		 * things that we think userland will care about.
4672		 */
4673#if defined(__amd64)
4674		if (*edx & CPUID_AMD_EDX_SYSC)
4675			hwcap_flags |= AV_386_AMD_SYSC;
4676#endif
4677		if (*edx & CPUID_AMD_EDX_MMXamd)
4678			hwcap_flags |= AV_386_AMD_MMX;
4679		if (*edx & CPUID_AMD_EDX_3DNow)
4680			hwcap_flags |= AV_386_AMD_3DNow;
4681		if (*edx & CPUID_AMD_EDX_3DNowx)
4682			hwcap_flags |= AV_386_AMD_3DNowx;
4683		if (*ecx & CPUID_AMD_ECX_SVM)
4684			hwcap_flags |= AV_386_AMD_SVM;
4685
4686		switch (cpi->cpi_vendor) {
4687		case X86_VENDOR_AMD:
4688			if (*edx & CPUID_AMD_EDX_TSCP)
4689				hwcap_flags |= AV_386_TSCP;
4690			if (*ecx & CPUID_AMD_ECX_AHF64)
4691				hwcap_flags |= AV_386_AHF;
4692			if (*ecx & CPUID_AMD_ECX_SSE4A)
4693				hwcap_flags |= AV_386_AMD_SSE4A;
4694			if (*ecx & CPUID_AMD_ECX_LZCNT)
4695				hwcap_flags |= AV_386_AMD_LZCNT;
4696			if (*ecx & CPUID_AMD_ECX_MONITORX)
4697				hwcap_flags_2 |= AV_386_2_MONITORX;
4698			break;
4699
4700		case X86_VENDOR_Intel:
4701			if (*edx & CPUID_AMD_EDX_TSCP)
4702				hwcap_flags |= AV_386_TSCP;
4703			if (*ecx & CPUID_AMD_ECX_LZCNT)
4704				hwcap_flags |= AV_386_AMD_LZCNT;
4705			/*
4706			 * Aarrgh.
4707			 * Intel uses a different bit in the same word.
4708			 */
4709			if (*ecx & CPUID_INTC_ECX_AHF64)
4710				hwcap_flags |= AV_386_AHF;
4711			break;
4712
4713		default:
4714			break;
4715		}
4716		break;
4717
4718	case X86_VENDOR_TM:
4719		cp.cp_eax = 0x80860001;
4720		(void) __cpuid_insn(&cp);
4721		cpi->cpi_support[TM_EDX_FEATURES] = cp.cp_edx;
4722		break;
4723
4724	default:
4725		break;
4726	}
4727
4728pass4_done:
4729	cpi->cpi_pass = 4;
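	/*
	 * Hand the accumulated AV_386_* and AV_386_2_* flags back to the
	 * caller (bind_hwcap()) for export via the aux vector.
	 */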
4730	if (hwcap_out != NULL) {
4731		hwcap_out[0] = hwcap_flags;
4732		hwcap_out[1] = hwcap_flags_2;
4733	}
4734}
4735
4736
4737/*
4738 * Simulate the cpuid instruction using the data we previously
4739 * captured about this CPU.  We try our best to return the truth
4740 * about the hardware, independently of kernel support.
4741 */
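/*
 * For example (hypothetical caller, shown only as a sketch): a consumer
 * that wants the first cached brand-string leaf for the current CPU
 * could do:
 *
 *	struct cpuid_regs cr;
 *
 *	bzero(&cr, sizeof (cr));
 *	cr.cp_eax = 0x80000002;
 *	(void) cpuid_insn(NULL, &cr);
 */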
4742uint32_t
4743cpuid_insn(cpu_t *cpu, struct cpuid_regs *cp)
4744{
4745	struct cpuid_info *cpi;
4746	struct cpuid_regs *xcp;
4747
4748	if (cpu == NULL)
4749		cpu = CPU;
4750	cpi = cpu->cpu_m.mcpu_cpi;
4751
4752	ASSERT(cpuid_checkpass(cpu, 3));
4753
4754	/*
4755	 * CPUID data is cached in two separate places: cpi_std for standard
4756	 * CPUID leaves, and cpi_extd for extended CPUID leaves.
4757	 */
4758	if (cp->cp_eax <= cpi->cpi_maxeax && cp->cp_eax < NMAX_CPI_STD) {
4759		xcp = &cpi->cpi_std[cp->cp_eax];
4760	} else if (cp->cp_eax >= CPUID_LEAF_EXT_0 &&
4761	    cp->cp_eax <= cpi->cpi_xmaxeax &&
4762	    cp->cp_eax < CPUID_LEAF_EXT_0 + NMAX_CPI_EXTD) {
4763		xcp = &cpi->cpi_extd[cp->cp_eax - CPUID_LEAF_EXT_0];
4764	} else {
4765		/*
4766		 * The caller is asking for data from an input parameter which
4767		 * the kernel has not cached.  In this case we go fetch from
4768		 * the hardware and return the data directly to the user.
4769		 */
4770		return (__cpuid_insn(cp));
4771	}
4772
4773	cp->cp_eax = xcp->cp_eax;
4774	cp->cp_ebx = xcp->cp_ebx;
4775	cp->cp_ecx = xcp->cp_ecx;
4776	cp->cp_edx = xcp->cp_edx;
4777	return (cp->cp_eax);
4778}
4779
4780int
4781cpuid_checkpass(cpu_t *cpu, int pass)
4782{
4783	return (cpu != NULL && cpu->cpu_m.mcpu_cpi != NULL &&
4784	    cpu->cpu_m.mcpu_cpi->cpi_pass >= pass);
4785}
4786
4787int
4788cpuid_getbrandstr(cpu_t *cpu, char *s, size_t n)
4789{
4790	ASSERT(cpuid_checkpass(cpu, 3));
4791
4792	return (snprintf(s, n, "%s", cpu->cpu_m.mcpu_cpi->cpi_brandstr));
4793}
4794
4795int
4796cpuid_is_cmt(cpu_t *cpu)
4797{
4798	if (cpu == NULL)
4799		cpu = CPU;
4800
4801	ASSERT(cpuid_checkpass(cpu, 1));
4802
4803	return (cpu->cpu_m.mcpu_cpi->cpi_chipid >= 0);
4804}
4805
4806/*
4807 * AMD and Intel both implement the 64-bit variant of the syscall
4808 * instruction (syscallq), so if there's -any- support for syscall,
4809 * cpuid currently says "yes, we support this".
4810 *
4811 * However, Intel decided to -not- implement the 32-bit variant of the
4812 * syscall instruction, so we provide a predicate to allow our caller
4813 * to test that subtlety here.
4814 *
4815 * XXPV	Currently, 32-bit syscall instructions don't work via the hypervisor,
4816 *	even in the case where the hardware would in fact support it.
4817 */
4818/*ARGSUSED*/
4819int
4820cpuid_syscall32_insn(cpu_t *cpu)
4821{
4822	ASSERT(cpuid_checkpass((cpu == NULL ? CPU : cpu), 1));
4823
4824#if !defined(__xpv)
4825	if (cpu == NULL)
4826		cpu = CPU;
4827
4828	/*CSTYLED*/
4829	{
4830		struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4831
4832		if (cpi->cpi_vendor == X86_VENDOR_AMD &&
4833		    cpi->cpi_xmaxeax >= 0x80000001 &&
4834		    (CPI_FEATURES_XTD_EDX(cpi) & CPUID_AMD_EDX_SYSC))
4835			return (1);
4836	}
4837#endif
4838	return (0);
4839}
4840
4841int
4842cpuid_getidstr(cpu_t *cpu, char *s, size_t n)
4843{
4844	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
4845
4846	static const char fmt[] =
4847	    "x86 (%s %X family %d model %d step %d clock %d MHz)";
4848	static const char fmt_ht[] =
4849	    "x86 (chipid 0x%x %s %X family %d model %d step %d clock %d MHz)";
4850
4851	ASSERT(cpuid_checkpass(cpu, 1));
4852
4853	if (cpuid_is_cmt(cpu))
4854		return (snprintf(s, n, fmt_ht, cpi->cpi_chipid,
4855		    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
4856		    cpi->cpi_family, cpi->cpi_model,
4857		    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
4858	return (snprintf(s, n, fmt,
4859	    cpi->cpi_vendorstr, cpi->cpi_std[1].cp_eax,
4860	    cpi->cpi_family, cpi->cpi_model,
4861	    cpi->cpi_step, cpu->cpu_type_info.pi_clock));
4862}
4863
4864const char *
4865cpuid_getvendorstr(cpu_t *cpu)
4866{
4867	ASSERT(cpuid_checkpass(cpu, 1));
4868	return ((const char *)cpu->cpu_m.mcpu_cpi->cpi_vendorstr);
4869}
4870
4871uint_t
4872cpuid_getvendor(cpu_t *cpu)
4873{
4874	ASSERT(cpuid_checkpass(cpu, 1));
4875	return (cpu->cpu_m.mcpu_cpi->cpi_vendor);
4876}
4877
4878uint_t
4879cpuid_getfamily(cpu_t *cpu)
4880{
4881	ASSERT(cpuid_checkpass(cpu, 1));
4882	return (cpu->cpu_m.mcpu_cpi->cpi_family);
4883}
4884
4885uint_t
4886cpuid_getmodel(cpu_t *cpu)
4887{
4888	ASSERT(cpuid_checkpass(cpu, 1));
4889	return (cpu->cpu_m.mcpu_cpi->cpi_model);
4890}
4891
4892uint_t
4893cpuid_get_ncpu_per_chip(cpu_t *cpu)
4894{
4895	ASSERT(cpuid_checkpass(cpu, 1));
4896	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_per_chip);
4897}
4898
4899uint_t
4900cpuid_get_ncore_per_chip(cpu_t *cpu)
4901{
4902	ASSERT(cpuid_checkpass(cpu, 1));
4903	return (cpu->cpu_m.mcpu_cpi->cpi_ncore_per_chip);
4904}
4905
4906uint_t
4907cpuid_get_ncpu_sharing_last_cache(cpu_t *cpu)
4908{
4909	ASSERT(cpuid_checkpass(cpu, 2));
4910	return (cpu->cpu_m.mcpu_cpi->cpi_ncpu_shr_last_cache);
4911}
4912
4913id_t
4914cpuid_get_last_lvl_cacheid(cpu_t *cpu)
4915{
4916	ASSERT(cpuid_checkpass(cpu, 2));
4917	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
4918}
4919
4920uint_t
4921cpuid_getstep(cpu_t *cpu)
4922{
4923	ASSERT(cpuid_checkpass(cpu, 1));
4924	return (cpu->cpu_m.mcpu_cpi->cpi_step);
4925}
4926
4927uint_t
4928cpuid_getsig(struct cpu *cpu)
4929{
4930	ASSERT(cpuid_checkpass(cpu, 1));
4931	return (cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_eax);
4932}
4933
4934uint32_t
4935cpuid_getchiprev(struct cpu *cpu)
4936{
4937	ASSERT(cpuid_checkpass(cpu, 1));
4938	return (cpu->cpu_m.mcpu_cpi->cpi_chiprev);
4939}
4940
4941const char *
4942cpuid_getchiprevstr(struct cpu *cpu)
4943{
4944	ASSERT(cpuid_checkpass(cpu, 1));
4945	return (cpu->cpu_m.mcpu_cpi->cpi_chiprevstr);
4946}
4947
4948uint32_t
4949cpuid_getsockettype(struct cpu *cpu)
4950{
4951	ASSERT(cpuid_checkpass(cpu, 1));
4952	return (cpu->cpu_m.mcpu_cpi->cpi_socket);
4953}
4954
4955const char *
4956cpuid_getsocketstr(cpu_t *cpu)
4957{
4958	static const char *socketstr = NULL;
4959	struct cpuid_info *cpi;
4960
4961	ASSERT(cpuid_checkpass(cpu, 1));
4962	cpi = cpu->cpu_m.mcpu_cpi;
4963
4964	/* Assume that socket types are the same across the system */
4965	if (socketstr == NULL)
4966		socketstr = _cpuid_sktstr(cpi->cpi_vendor, cpi->cpi_family,
		    cpi->cpi_model, cpi->cpi_step);

	return (socketstr);
4971}
4972
4973int
4974cpuid_get_chipid(cpu_t *cpu)
4975{
4976	ASSERT(cpuid_checkpass(cpu, 1));
4977
4978	if (cpuid_is_cmt(cpu))
4979		return (cpu->cpu_m.mcpu_cpi->cpi_chipid);
4980	return (cpu->cpu_id);
4981}
4982
4983id_t
4984cpuid_get_coreid(cpu_t *cpu)
4985{
4986	ASSERT(cpuid_checkpass(cpu, 1));
4987	return (cpu->cpu_m.mcpu_cpi->cpi_coreid);
4988}
4989
4990int
4991cpuid_get_pkgcoreid(cpu_t *cpu)
4992{
4993	ASSERT(cpuid_checkpass(cpu, 1));
4994	return (cpu->cpu_m.mcpu_cpi->cpi_pkgcoreid);
4995}
4996
4997int
4998cpuid_get_clogid(cpu_t *cpu)
4999{
5000	ASSERT(cpuid_checkpass(cpu, 1));
5001	return (cpu->cpu_m.mcpu_cpi->cpi_clogid);
5002}
5003
5004int
5005cpuid_get_cacheid(cpu_t *cpu)
5006{
5007	ASSERT(cpuid_checkpass(cpu, 1));
5008	return (cpu->cpu_m.mcpu_cpi->cpi_last_lvl_cacheid);
5009}
5010
5011uint_t
5012cpuid_get_procnodeid(cpu_t *cpu)
5013{
5014	ASSERT(cpuid_checkpass(cpu, 1));
5015	return (cpu->cpu_m.mcpu_cpi->cpi_procnodeid);
5016}
5017
5018uint_t
5019cpuid_get_procnodes_per_pkg(cpu_t *cpu)
5020{
5021	ASSERT(cpuid_checkpass(cpu, 1));
5022	return (cpu->cpu_m.mcpu_cpi->cpi_procnodes_per_pkg);
5023}
5024
5025uint_t
5026cpuid_get_compunitid(cpu_t *cpu)
5027{
5028	ASSERT(cpuid_checkpass(cpu, 1));
5029	return (cpu->cpu_m.mcpu_cpi->cpi_compunitid);
5030}
5031
5032uint_t
5033cpuid_get_cores_per_compunit(cpu_t *cpu)
5034{
5035	ASSERT(cpuid_checkpass(cpu, 1));
5036	return (cpu->cpu_m.mcpu_cpi->cpi_cores_per_compunit);
5037}
5038
5039/*ARGSUSED*/
5040int
5041cpuid_have_cr8access(cpu_t *cpu)
5042{
5043#if defined(__amd64)
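	/* %cr8 is architecturally always available in 64-bit mode. */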
5044	return (1);
5045#else
5046	struct cpuid_info *cpi;
5047
5048	ASSERT(cpu != NULL);
5049	cpi = cpu->cpu_m.mcpu_cpi;
5050	if (cpi->cpi_vendor == X86_VENDOR_AMD && cpi->cpi_maxeax >= 1 &&
5051	    (CPI_FEATURES_XTD_ECX(cpi) & CPUID_AMD_ECX_CR8D) != 0)
5052		return (1);
5053	return (0);
5054#endif
5055}
5056
5057uint32_t
5058cpuid_get_apicid(cpu_t *cpu)
5059{
5060	ASSERT(cpuid_checkpass(cpu, 1));
5061	if (cpu->cpu_m.mcpu_cpi->cpi_maxeax < 1) {
5062		return (UINT32_MAX);
5063	} else {
5064		return (cpu->cpu_m.mcpu_cpi->cpi_apicid);
5065	}
5066}
5067
5068void
5069cpuid_get_addrsize(cpu_t *cpu, uint_t *pabits, uint_t *vabits)
5070{
5071	struct cpuid_info *cpi;
5072
5073	if (cpu == NULL)
5074		cpu = CPU;
5075	cpi = cpu->cpu_m.mcpu_cpi;
5076
5077	ASSERT(cpuid_checkpass(cpu, 1));
5078
5079	if (pabits)
5080		*pabits = cpi->cpi_pabits;
5081	if (vabits)
5082		*vabits = cpi->cpi_vabits;
5083}
5084
5085size_t
5086cpuid_get_xsave_size()
5087{
5088	return (MAX(cpuid_info0.cpi_xsave.xsav_max_size,
5089	    sizeof (struct xsave_state)));
5090}
5091
5092/*
5093 * Return true if the CPUs on this system require 'pointer clearing' for the
5094 * floating point error pointer exception handling. In the past, this has been
5095 * true for all AMD K7 & K8 CPUs, although newer AMD CPUs have been changed to
5096 * behave the same as Intel. This is checked via the CPUID_AMD_EBX_ERR_PTR_ZERO
5097 * feature bit and is reflected in the cpi_fp_amd_save member.
5098 */
5099boolean_t
5100cpuid_need_fp_excp_handling()
5101{
5102	return (cpuid_info0.cpi_vendor == X86_VENDOR_AMD &&
5103	    cpuid_info0.cpi_fp_amd_save != 0);
5104}
5105
5106/*
5107 * Returns the number of data TLB entries for a corresponding
5108 * pagesize.  If it can't be computed, or isn't known, the
5109 * routine returns zero.  If you ask about an architecturally
5110 * impossible pagesize, the routine will panic (so that the
5111 * hat implementor knows that things are inconsistent.)
5112 */
5113uint_t
5114cpuid_get_dtlb_nent(cpu_t *cpu, size_t pagesize)
5115{
5116	struct cpuid_info *cpi;
5117	uint_t dtlb_nent = 0;
5118
5119	if (cpu == NULL)
5120		cpu = CPU;
5121	cpi = cpu->cpu_m.mcpu_cpi;
5122
5123	ASSERT(cpuid_checkpass(cpu, 1));
5124
5125	/*
5126	 * Check the L2 TLB info
5127	 */
5128	if (cpi->cpi_xmaxeax >= 0x80000006) {
5129		struct cpuid_regs *cp = &cpi->cpi_extd[6];
5130
5131		switch (pagesize) {
5132
5133		case 4 * 1024:
5134			/*
5135			 * All zero in the top 16 bits of the register
5136			 * indicates a unified TLB. Size is in low 16 bits.
5137			 */
5138			if ((cp->cp_ebx & 0xffff0000) == 0)
5139				dtlb_nent = cp->cp_ebx & 0x0000ffff;
5140			else
5141				dtlb_nent = BITX(cp->cp_ebx, 27, 16);
5142			break;
5143
5144		case 2 * 1024 * 1024:
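			/*
			 * Same unified vs. split encoding as the 4K case
			 * above, but the large-page data is in %eax.
			 */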
5145			if ((cp->cp_eax & 0xffff0000) == 0)
5146				dtlb_nent = cp->cp_eax & 0x0000ffff;
5147			else
5148				dtlb_nent = BITX(cp->cp_eax, 27, 16);
5149			break;
5150
5151		default:
5152			panic("unknown L2 pagesize");
5153			/*NOTREACHED*/
5154		}
5155	}
5156
5157	if (dtlb_nent != 0)
5158		return (dtlb_nent);
5159
5160	/*
5161	 * No L2 TLB support for this size, try L1.
5162	 */
5163	if (cpi->cpi_xmaxeax >= 0x80000005) {
5164		struct cpuid_regs *cp = &cpi->cpi_extd[5];
5165
5166		switch (pagesize) {
5167		case 4 * 1024:
5168			dtlb_nent = BITX(cp->cp_ebx, 23, 16);
5169			break;
5170		case 2 * 1024 * 1024:
5171			dtlb_nent = BITX(cp->cp_eax, 23, 16);
5172			break;
5173		default:
5174			panic("unknown L1 d-TLB pagesize");
5175			/*NOTREACHED*/
5176		}
5177	}
5178
5179	return (dtlb_nent);
5180}
5181
5182/*
5183 * Return 0 if the erratum is not present or not applicable, positive
5184 * if it is, and negative if the status of the erratum is unknown.
5185 *
5186 * See "Revision Guide for AMD Athlon(tm) 64 and AMD Opteron(tm)
5187 * Processors" #25759, Rev 3.57, August 2005
5188 */
5189int
5190cpuid_opteron_erratum(cpu_t *cpu, uint_t erratum)
5191{
5192	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
5193	uint_t eax;
5194
5195	/*
5196	 * Bail out if this CPU isn't an AMD CPU, or if it's
5197	 * a legacy (32-bit) AMD CPU.
5198	 */
5199	if (cpi->cpi_vendor != X86_VENDOR_AMD ||
5200	    cpi->cpi_family == 4 || cpi->cpi_family == 5 ||
5201	    cpi->cpi_family == 6) {
5202		return (0);
5203	}
5204
5205	eax = cpi->cpi_std[1].cp_eax;
5206
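/*
 * The macros below match the raw leaf 1 signature in %eax (family, model and
 * stepping, including the extended fields) against the specific silicon
 * revisions named in the revision guide cited above; e.g. SH_B0(eax) is true
 * only for parts at the SH-B0 revision.
 */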
5207#define	SH_B0(eax)	(eax == 0xf40 || eax == 0xf50)
5208#define	SH_B3(eax)	(eax == 0xf51)
5209#define	B(eax)		(SH_B0(eax) || SH_B3(eax))
5210
5211#define	SH_C0(eax)	(eax == 0xf48 || eax == 0xf58)
5212
5213#define	SH_CG(eax)	(eax == 0xf4a || eax == 0xf5a || eax == 0xf7a)
5214#define	DH_CG(eax)	(eax == 0xfc0 || eax == 0xfe0 || eax == 0xff0)
5215#define	CH_CG(eax)	(eax == 0xf82 || eax == 0xfb2)
5216#define	CG(eax)		(SH_CG(eax) || DH_CG(eax) || CH_CG(eax))
5217
5218#define	SH_D0(eax)	(eax == 0x10f40 || eax == 0x10f50 || eax == 0x10f70)
5219#define	DH_D0(eax)	(eax == 0x10fc0 || eax == 0x10ff0)
5220#define	CH_D0(eax)	(eax == 0x10f80 || eax == 0x10fb0)
5221#define	D0(eax)		(SH_D0(eax) || DH_D0(eax) || CH_D0(eax))
5222
5223#define	SH_E0(eax)	(eax == 0x20f50 || eax == 0x20f40 || eax == 0x20f70)
5224#define	JH_E1(eax)	(eax == 0x20f10)	/* JH8_E0 had 0x20f30 */
5225#define	DH_E3(eax)	(eax == 0x20fc0 || eax == 0x20ff0)
5226#define	SH_E4(eax)	(eax == 0x20f51 || eax == 0x20f71)
5227#define	BH_E4(eax)	(eax == 0x20fb1)
5228#define	SH_E5(eax)	(eax == 0x20f42)
5229#define	DH_E6(eax)	(eax == 0x20ff2 || eax == 0x20fc2)
5230#define	JH_E6(eax)	(eax == 0x20f12 || eax == 0x20f32)
5231#define	EX(eax)		(SH_E0(eax) || JH_E1(eax) || DH_E3(eax) || \
5232			    SH_E4(eax) || BH_E4(eax) || SH_E5(eax) || \
5233			    DH_E6(eax) || JH_E6(eax))
5234
5235#define	DR_AX(eax)	(eax == 0x100f00 || eax == 0x100f01 || eax == 0x100f02)
5236#define	DR_B0(eax)	(eax == 0x100f20)
5237#define	DR_B1(eax)	(eax == 0x100f21)
5238#define	DR_BA(eax)	(eax == 0x100f2a)
5239#define	DR_B2(eax)	(eax == 0x100f22)
5240#define	DR_B3(eax)	(eax == 0x100f23)
5241#define	RB_C0(eax)	(eax == 0x100f40)
5242
5243	switch (erratum) {
5244	case 1:
5245		return (cpi->cpi_family < 0x10);
5246	case 51:	/* what does the asterisk mean? */
5247		return (B(eax) || SH_C0(eax) || CG(eax));
5248	case 52:
5249		return (B(eax));
5250	case 57:
5251		return (cpi->cpi_family <= 0x11);
5252	case 58:
5253		return (B(eax));
5254	case 60:
5255		return (cpi->cpi_family <= 0x11);
5256	case 61:
5257	case 62:
5258	case 63:
5259	case 64:
5260	case 65:
5261	case 66:
5262	case 68:
5263	case 69:
5264	case 70:
5265	case 71:
5266		return (B(eax));
5267	case 72:
5268		return (SH_B0(eax));
5269	case 74:
5270		return (B(eax));
5271	case 75:
5272		return (cpi->cpi_family < 0x10);
5273	case 76:
5274		return (B(eax));
5275	case 77:
5276		return (cpi->cpi_family <= 0x11);
5277	case 78:
5278		return (B(eax) || SH_C0(eax));
5279	case 79:
5280		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5281	case 80:
5282	case 81:
5283	case 82:
5284		return (B(eax));
5285	case 83:
5286		return (B(eax) || SH_C0(eax) || CG(eax));
5287	case 85:
5288		return (cpi->cpi_family < 0x10);
5289	case 86:
5290		return (SH_C0(eax) || CG(eax));
5291	case 88:
5292#if !defined(__amd64)
5293		return (0);
5294#else
5295		return (B(eax) || SH_C0(eax));
5296#endif
5297	case 89:
5298		return (cpi->cpi_family < 0x10);
5299	case 90:
5300		return (B(eax) || SH_C0(eax) || CG(eax));
5301	case 91:
5302	case 92:
5303		return (B(eax) || SH_C0(eax));
5304	case 93:
5305		return (SH_C0(eax));
5306	case 94:
5307		return (B(eax) || SH_C0(eax) || CG(eax));
5308	case 95:
5309#if !defined(__amd64)
5310		return (0);
5311#else
5312		return (B(eax) || SH_C0(eax));
5313#endif
5314	case 96:
5315		return (B(eax) || SH_C0(eax) || CG(eax));
5316	case 97:
5317	case 98:
5318		return (SH_C0(eax) || CG(eax));
5319	case 99:
5320		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5321	case 100:
5322		return (B(eax) || SH_C0(eax));
5323	case 101:
5324	case 103:
5325		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5326	case 104:
5327		return (SH_C0(eax) || CG(eax) || D0(eax));
5328	case 105:
5329	case 106:
5330	case 107:
5331		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5332	case 108:
5333		return (DH_CG(eax));
5334	case 109:
5335		return (SH_C0(eax) || CG(eax) || D0(eax));
5336	case 110:
5337		return (D0(eax) || EX(eax));
5338	case 111:
5339		return (CG(eax));
5340	case 112:
5341		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5342	case 113:
5343		return (eax == 0x20fc0);
5344	case 114:
5345		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5346	case 115:
5347		return (SH_E0(eax) || JH_E1(eax));
5348	case 116:
5349		return (SH_E0(eax) || JH_E1(eax) || DH_E3(eax));
5350	case 117:
5351		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax));
5352	case 118:
5353		return (SH_E0(eax) || JH_E1(eax) || SH_E4(eax) || BH_E4(eax) ||
5354		    JH_E6(eax));
5355	case 121:
5356		return (B(eax) || SH_C0(eax) || CG(eax) || D0(eax) || EX(eax));
5357	case 122:
5358		return (cpi->cpi_family < 0x10 || cpi->cpi_family == 0x11);
5359	case 123:
5360		return (JH_E1(eax) || BH_E4(eax) || JH_E6(eax));
5361	case 131:
5362		return (cpi->cpi_family < 0x10);
5363	case 6336786:
5364
5365		/*
5366		 * Test for AdvPowerMgmtInfo.TscPStateInvariant
5367		 * if this is a K8 family or newer processor. We're testing for
5368		 * this 'erratum' to determine whether or not we have a constant
5369		 * TSC.
5370		 *
5371		 * Our current fix for this is to disable the C1-Clock ramping.
5372		 * However, this doesn't work on newer processor families nor
5373		 * does it work when virtualized as those devices don't exist.
5374		 */
5375		if (cpi->cpi_family >= 0x12 || get_hwenv() != HW_NATIVE) {
5376			return (0);
5377		}
5378
5379		if (CPI_FAMILY(cpi) == 0xf) {
5380			struct cpuid_regs regs;
5381			regs.cp_eax = 0x80000007;
5382			(void) __cpuid_insn(&regs);
5383			return (!(regs.cp_edx & 0x100));
5384		}
5385		return (0);
5386	case 6323525:
5387		/*
5388		 * This erratum (K8 #147) is not present on family 10 and newer.
5389		 */
5390		if (cpi->cpi_family >= 0x10) {
5391			return (0);
5392		}
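		/*
		 * Reassemble the extended and base family/model fields from
		 * the raw leaf 1 signature into a single comparable value and
		 * report the erratum only for parts older than revision
		 * 0xf40.
		 */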
5393		return (((((eax >> 12) & 0xff00) + (eax & 0xf00)) |
5394		    (((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0))) < 0xf40);
5395
5396	case 6671130:
5397		/*
		 * Check for processors (pre-Shanghai) that do not provide
		 * optimal management of 1GB PTEs in their TLB.
5400		 */
5401		return (cpi->cpi_family == 0x10 && cpi->cpi_model < 4);
5402
5403	case 298:
5404		return (DR_AX(eax) || DR_B0(eax) || DR_B1(eax) || DR_BA(eax) ||
5405		    DR_B2(eax) || RB_C0(eax));
5406
5407	case 721:
5408#if defined(__amd64)
5409		return (cpi->cpi_family == 0x10 || cpi->cpi_family == 0x12);
5410#else
5411		return (0);
5412#endif
5413
5414	default:
5415		return (-1);
5416
5417	}
5418}
5419
5420/*
5421 * Determine if specified erratum is present via OSVW (OS Visible Workaround).
5422 * Return 1 if erratum is present, 0 if not present and -1 if indeterminate.
5423 */
5424int
5425osvw_opteron_erratum(cpu_t *cpu, uint_t erratum)
5426{
5427	struct cpuid_info	*cpi;
5428	uint_t			osvwid;
5429	static int		osvwfeature = -1;
	uint64_t		osvwlength;

	cpi = cpu->cpu_m.mcpu_cpi;
5434
5435	/* confirm OSVW supported */
5436	if (osvwfeature == -1) {
5437		osvwfeature = cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW;
5438	} else {
5439		/* assert that osvw feature setting is consistent on all cpus */
5440		ASSERT(osvwfeature ==
5441		    (cpi->cpi_extd[1].cp_ecx & CPUID_AMD_ECX_OSVW));
5442	}
5443	if (!osvwfeature)
5444		return (-1);
5445
5446	osvwlength = rdmsr(MSR_AMD_OSVW_ID_LEN) & OSVW_ID_LEN_MASK;
5447
5448	switch (erratum) {
5449	case 298:	/* osvwid is 0 */
5450		osvwid = 0;
5451		if (osvwlength <= (uint64_t)osvwid) {
5452			/* osvwid 0 is unknown */
5453			return (-1);
5454		}
5455
5456		/*
5457		 * Check the OSVW STATUS MSR to determine the state
5458		 * of the erratum where:
5459		 *   0 - fixed by HW
5460		 *   1 - BIOS has applied the workaround when BIOS
5461		 *   workaround is available. (Or for other errata,
5462		 *   OS workaround is required.)
5463		 * For a value of 1, caller will confirm that the
5464		 * erratum 298 workaround has indeed been applied by BIOS.
5465		 *
5466		 * A 1 may be set in cpus that have a HW fix
5467		 * in a mixed cpu system. Regarding erratum 298:
5468		 *   In a multiprocessor platform, the workaround above
5469		 *   should be applied to all processors regardless of
5470		 *   silicon revision when an affected processor is
5471		 *   present.
5472		 */
5473
5474		return (rdmsr(MSR_AMD_OSVW_STATUS +
5475		    (osvwid / OSVW_ID_CNT_PER_MSR)) &
5476		    (1ULL << (osvwid % OSVW_ID_CNT_PER_MSR)));
5477
5478	default:
5479		return (-1);
5480	}
5481}
5482
5483static const char assoc_str[] = "associativity";
5484static const char line_str[] = "line-size";
5485static const char size_str[] = "size";
5486
5487static void
5488add_cache_prop(dev_info_t *devi, const char *label, const char *type,
5489    uint32_t val)
5490{
5491	char buf[128];
5492
5493	/*
5494	 * ndi_prop_update_int() is used because it is desirable for
5495	 * DDI_PROP_HW_DEF and DDI_PROP_DONTSLEEP to be set.
5496	 */
5497	if (snprintf(buf, sizeof (buf), "%s-%s", label, type) < sizeof (buf))
5498		(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, buf, val);
5499}
5500
5501/*
5502 * Intel-style cache/tlb description
5503 *
5504 * Standard cpuid level 2 gives a randomly ordered
5505 * selection of tags that index into a table that describes
5506 * cache and tlb properties.
5507 */
5508
5509static const char l1_icache_str[] = "l1-icache";
5510static const char l1_dcache_str[] = "l1-dcache";
5511static const char l2_cache_str[] = "l2-cache";
5512static const char l3_cache_str[] = "l3-cache";
5513static const char itlb4k_str[] = "itlb-4K";
5514static const char dtlb4k_str[] = "dtlb-4K";
5515static const char itlb2M_str[] = "itlb-2M";
5516static const char itlb4M_str[] = "itlb-4M";
5517static const char dtlb4M_str[] = "dtlb-4M";
5518static const char dtlb24_str[] = "dtlb0-2M-4M";
5519static const char itlb424_str[] = "itlb-4K-2M-4M";
5520static const char itlb24_str[] = "itlb-2M-4M";
5521static const char dtlb44_str[] = "dtlb-4K-4M";
5522static const char sl1_dcache_str[] = "sectored-l1-dcache";
5523static const char sl2_cache_str[] = "sectored-l2-cache";
5524static const char itrace_str[] = "itrace-cache";
5525static const char sl3_cache_str[] = "sectored-l3-cache";
5526static const char sh_l2_tlb4k_str[] = "shared-l2-tlb-4k";
5527
5528static const struct cachetab {
5529	uint8_t		ct_code;
5530	uint8_t		ct_assoc;
5531	uint16_t	ct_line_size;
5532	size_t		ct_size;
5533	const char	*ct_label;
5534} intel_ctab[] = {
5535	/*
5536	 * maintain descending order!
5537	 *
5538	 * Codes ignored - Reason
5539	 * ----------------------
5540	 * 40H - intel_cpuid_4_cache_info() disambiguates l2/l3 cache
5541	 * f0H/f1H - Currently we do not interpret prefetch size by design
5542	 */
5543	{ 0xe4, 16, 64, 8*1024*1024, l3_cache_str},
5544	{ 0xe3, 16, 64, 4*1024*1024, l3_cache_str},
5545	{ 0xe2, 16, 64, 2*1024*1024, l3_cache_str},
5546	{ 0xde, 12, 64, 6*1024*1024, l3_cache_str},
5547	{ 0xdd, 12, 64, 3*1024*1024, l3_cache_str},
5548	{ 0xdc, 12, 64, ((1*1024*1024)+(512*1024)), l3_cache_str},
5549	{ 0xd8, 8, 64, 4*1024*1024, l3_cache_str},
5550	{ 0xd7, 8, 64, 2*1024*1024, l3_cache_str},
5551	{ 0xd6, 8, 64, 1*1024*1024, l3_cache_str},
5552	{ 0xd2, 4, 64, 2*1024*1024, l3_cache_str},
5553	{ 0xd1, 4, 64, 1*1024*1024, l3_cache_str},
5554	{ 0xd0, 4, 64, 512*1024, l3_cache_str},
5555	{ 0xca, 4, 0, 512, sh_l2_tlb4k_str},
5556	{ 0xc0, 4, 0, 8, dtlb44_str },
5557	{ 0xba, 4, 0, 64, dtlb4k_str },
5558	{ 0xb4, 4, 0, 256, dtlb4k_str },
5559	{ 0xb3, 4, 0, 128, dtlb4k_str },
5560	{ 0xb2, 4, 0, 64, itlb4k_str },
5561	{ 0xb0, 4, 0, 128, itlb4k_str },
5562	{ 0x87, 8, 64, 1024*1024, l2_cache_str},
5563	{ 0x86, 4, 64, 512*1024, l2_cache_str},
5564	{ 0x85, 8, 32, 2*1024*1024, l2_cache_str},
5565	{ 0x84, 8, 32, 1024*1024, l2_cache_str},
5566	{ 0x83, 8, 32, 512*1024, l2_cache_str},
5567	{ 0x82, 8, 32, 256*1024, l2_cache_str},
5568	{ 0x80, 8, 64, 512*1024, l2_cache_str},
5569	{ 0x7f, 2, 64, 512*1024, l2_cache_str},
5570	{ 0x7d, 8, 64, 2*1024*1024, sl2_cache_str},
5571	{ 0x7c, 8, 64, 1024*1024, sl2_cache_str},
5572	{ 0x7b, 8, 64, 512*1024, sl2_cache_str},
5573	{ 0x7a, 8, 64, 256*1024, sl2_cache_str},
5574	{ 0x79, 8, 64, 128*1024, sl2_cache_str},
5575	{ 0x78, 8, 64, 1024*1024, l2_cache_str},
5576	{ 0x73, 8, 0, 64*1024, itrace_str},
5577	{ 0x72, 8, 0, 32*1024, itrace_str},
5578	{ 0x71, 8, 0, 16*1024, itrace_str},
5579	{ 0x70, 8, 0, 12*1024, itrace_str},
5580	{ 0x68, 4, 64, 32*1024, sl1_dcache_str},
5581	{ 0x67, 4, 64, 16*1024, sl1_dcache_str},
5582	{ 0x66, 4, 64, 8*1024, sl1_dcache_str},
5583	{ 0x60, 8, 64, 16*1024, sl1_dcache_str},
5584	{ 0x5d, 0, 0, 256, dtlb44_str},
5585	{ 0x5c, 0, 0, 128, dtlb44_str},
5586	{ 0x5b, 0, 0, 64, dtlb44_str},
5587	{ 0x5a, 4, 0, 32, dtlb24_str},
5588	{ 0x59, 0, 0, 16, dtlb4k_str},
5589	{ 0x57, 4, 0, 16, dtlb4k_str},
5590	{ 0x56, 4, 0, 16, dtlb4M_str},
5591	{ 0x55, 0, 0, 7, itlb24_str},
5592	{ 0x52, 0, 0, 256, itlb424_str},
5593	{ 0x51, 0, 0, 128, itlb424_str},
5594	{ 0x50, 0, 0, 64, itlb424_str},
5595	{ 0x4f, 0, 0, 32, itlb4k_str},
5596	{ 0x4e, 24, 64, 6*1024*1024, l2_cache_str},
5597	{ 0x4d, 16, 64, 16*1024*1024, l3_cache_str},
5598	{ 0x4c, 12, 64, 12*1024*1024, l3_cache_str},
5599	{ 0x4b, 16, 64, 8*1024*1024, l3_cache_str},
5600	{ 0x4a, 12, 64, 6*1024*1024, l3_cache_str},
5601	{ 0x49, 16, 64, 4*1024*1024, l3_cache_str},
5602	{ 0x48, 12, 64, 3*1024*1024, l2_cache_str},
5603	{ 0x47, 8, 64, 8*1024*1024, l3_cache_str},
5604	{ 0x46, 4, 64, 4*1024*1024, l3_cache_str},
5605	{ 0x45, 4, 32, 2*1024*1024, l2_cache_str},
5606	{ 0x44, 4, 32, 1024*1024, l2_cache_str},
5607	{ 0x43, 4, 32, 512*1024, l2_cache_str},
5608	{ 0x42, 4, 32, 256*1024, l2_cache_str},
5609	{ 0x41, 4, 32, 128*1024, l2_cache_str},
5610	{ 0x3e, 4, 64, 512*1024, sl2_cache_str},
5611	{ 0x3d, 6, 64, 384*1024, sl2_cache_str},
5612	{ 0x3c, 4, 64, 256*1024, sl2_cache_str},
5613	{ 0x3b, 2, 64, 128*1024, sl2_cache_str},
5614	{ 0x3a, 6, 64, 192*1024, sl2_cache_str},
5615	{ 0x39, 4, 64, 128*1024, sl2_cache_str},
5616	{ 0x30, 8, 64, 32*1024, l1_icache_str},
5617	{ 0x2c, 8, 64, 32*1024, l1_dcache_str},
5618	{ 0x29, 8, 64, 4096*1024, sl3_cache_str},
5619	{ 0x25, 8, 64, 2048*1024, sl3_cache_str},
5620	{ 0x23, 8, 64, 1024*1024, sl3_cache_str},
5621	{ 0x22, 4, 64, 512*1024, sl3_cache_str},
5622	{ 0x0e, 6, 64, 24*1024, l1_dcache_str},
5623	{ 0x0d, 4, 32, 16*1024, l1_dcache_str},
5624	{ 0x0c, 4, 32, 16*1024, l1_dcache_str},
5625	{ 0x0b, 4, 0, 4, itlb4M_str},
5626	{ 0x0a, 2, 32, 8*1024, l1_dcache_str},
5627	{ 0x08, 4, 32, 16*1024, l1_icache_str},
5628	{ 0x06, 4, 32, 8*1024, l1_icache_str},
5629	{ 0x05, 4, 0, 32, dtlb4M_str},
5630	{ 0x04, 4, 0, 8, dtlb4M_str},
5631	{ 0x03, 4, 0, 64, dtlb4k_str},
5632	{ 0x02, 4, 0, 2, itlb4M_str},
5633	{ 0x01, 4, 0, 32, itlb4k_str},
5634	{ 0 }
5635};
5636
5637static const struct cachetab cyrix_ctab[] = {
5638	{ 0x70, 4, 0, 32, "tlb-4K" },
5639	{ 0x80, 4, 16, 16*1024, "l1-cache" },
5640	{ 0 }
5641};
5642
5643/*
5644 * Search a cache table for a matching entry
5645 */
5646static const struct cachetab *
5647find_cacheent(const struct cachetab *ct, uint_t code)
5648{
5649	if (code != 0) {
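		/*
		 * The table is kept in descending ct_code order, so stop at
		 * the first entry that is <= the requested code and then
		 * insist on an exact match.
		 */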
5650		for (; ct->ct_code != 0; ct++)
5651			if (ct->ct_code <= code)
5652				break;
5653		if (ct->ct_code == code)
5654			return (ct);
5655	}
5656	return (NULL);
5657}
5658
5659/*
5660 * Populate cachetab entry with L2 or L3 cache-information using
5661 * cpuid function 4. This function is called from intel_walk_cacheinfo()
5662 * when descriptor 0x49 is encountered. It returns 0 if no such cache
5663 * information is found.
5664 */
5665static int
5666intel_cpuid_4_cache_info(struct cachetab *ct, struct cpuid_info *cpi)
5667{
5668	uint32_t level, i;
5669	int ret = 0;
5670
5671	for (i = 0; i < cpi->cpi_cache_leaf_size; i++) {
5672		level = CPI_CACHE_LVL(cpi->cpi_cache_leaves[i]);
5673
5674		if (level == 2 || level == 3) {
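			/*
			 * When both an L2 and an L3 leaf are reported, the
			 * later leaf overwrites the earlier one, so the
			 * caller ends up with the highest-level cache found.
			 */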
5675			ct->ct_assoc =
5676			    CPI_CACHE_WAYS(cpi->cpi_cache_leaves[i]) + 1;
5677			ct->ct_line_size =
5678			    CPI_CACHE_COH_LN_SZ(cpi->cpi_cache_leaves[i]) + 1;
5679			ct->ct_size = ct->ct_assoc *
5680			    (CPI_CACHE_PARTS(cpi->cpi_cache_leaves[i]) + 1) *
5681			    ct->ct_line_size *
5682			    (cpi->cpi_cache_leaves[i]->cp_ecx + 1);
5683
5684			if (level == 2) {
5685				ct->ct_label = l2_cache_str;
5686			} else if (level == 3) {
5687				ct->ct_label = l3_cache_str;
5688			}
5689			ret = 1;
5690		}
5691	}
5692
5693	return (ret);
5694}
5695
5696/*
5697 * Walk the cacheinfo descriptor, applying 'func' to every valid element
5698 * The walk is terminated if the walker returns non-zero.
5699 */
5700static void
5701intel_walk_cacheinfo(struct cpuid_info *cpi,
5702    void *arg, int (*func)(void *, const struct cachetab *))
5703{
5704	const struct cachetab *ct;
5705	struct cachetab des_49_ct, des_b1_ct;
5706	uint8_t *dp;
5707	int i;
5708
5709	if ((dp = cpi->cpi_cacheinfo) == NULL)
5710		return;
5711	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
5712		/*
5713		 * For overloaded descriptor 0x49 we use cpuid function 4
5714		 * if supported by the current processor, to create
5715		 * cache information.
5716		 * For overloaded descriptor 0xb1 we use X86_PAE flag
5717		 * to disambiguate the cache information.
5718		 */
5719		if (*dp == 0x49 && cpi->cpi_maxeax >= 0x4 &&
5720		    intel_cpuid_4_cache_info(&des_49_ct, cpi) == 1) {
			ct = &des_49_ct;
5722		} else if (*dp == 0xb1) {
5723			des_b1_ct.ct_code = 0xb1;
5724			des_b1_ct.ct_assoc = 4;
5725			des_b1_ct.ct_line_size = 0;
5726			if (is_x86_feature(x86_featureset, X86FSET_PAE)) {
5727				des_b1_ct.ct_size = 8;
5728				des_b1_ct.ct_label = itlb2M_str;
5729			} else {
5730				des_b1_ct.ct_size = 4;
5731				des_b1_ct.ct_label = itlb4M_str;
5732			}
5733			ct = &des_b1_ct;
5734		} else {
5735			if ((ct = find_cacheent(intel_ctab, *dp)) == NULL) {
5736				continue;
5737			}
5738		}
5739
5740		if (func(arg, ct) != 0) {
5741			break;
5742		}
5743	}
5744}
5745
5746/*
5747 * (Like the Intel one, except for Cyrix CPUs)
5748 */
5749static void
5750cyrix_walk_cacheinfo(struct cpuid_info *cpi,
5751    void *arg, int (*func)(void *, const struct cachetab *))
5752{
5753	const struct cachetab *ct;
5754	uint8_t *dp;
5755	int i;
5756
5757	if ((dp = cpi->cpi_cacheinfo) == NULL)
5758		return;
5759	for (i = 0; i < cpi->cpi_ncache; i++, dp++) {
5760		/*
5761		 * Search Cyrix-specific descriptor table first ..
5762		 */
5763		if ((ct = find_cacheent(cyrix_ctab, *dp)) != NULL) {
5764			if (func(arg, ct) != 0)
5765				break;
5766			continue;
5767		}
5768		/*
5769		 * .. else fall back to the Intel one
5770		 */
5771		if ((ct = find_cacheent(intel_ctab, *dp)) != NULL) {
5772			if (func(arg, ct) != 0)
5773				break;
5774			continue;
5775		}
5776	}
5777}
5778
5779/*
5780 * A cacheinfo walker that adds associativity, line-size, and size properties
5781 * to the devinfo node it is passed as an argument.
5782 */
5783static int
5784add_cacheent_props(void *arg, const struct cachetab *ct)
5785{
5786	dev_info_t *devi = arg;
5787
5788	add_cache_prop(devi, ct->ct_label, assoc_str, ct->ct_assoc);
5789	if (ct->ct_line_size != 0)
5790		add_cache_prop(devi, ct->ct_label, line_str,
5791		    ct->ct_line_size);
5792	add_cache_prop(devi, ct->ct_label, size_str, ct->ct_size);
5793	return (0);
5794}
5795
5796
5797static const char fully_assoc[] = "fully-associative?";
5798
5799/*
5800 * AMD style cache/tlb description
5801 *
5802 * Extended functions 5 and 6 directly describe properties of
5803 * tlbs and various cache levels.
5804 */
5805static void
5806add_amd_assoc(dev_info_t *devi, const char *label, uint_t assoc)
5807{
5808	switch (assoc) {
5809	case 0:	/* reserved; ignore */
5810		break;
5811	default:
5812		add_cache_prop(devi, label, assoc_str, assoc);
5813		break;
5814	case 0xff:
5815		add_cache_prop(devi, label, fully_assoc, 1);
5816		break;
5817	}
5818}
5819
5820static void
5821add_amd_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
5822{
5823	if (size == 0)
5824		return;
5825	add_cache_prop(devi, label, size_str, size);
5826	add_amd_assoc(devi, label, assoc);
5827}
5828
5829static void
5830add_amd_cache(dev_info_t *devi, const char *label,
5831    uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
5832{
5833	if (size == 0 || line_size == 0)
5834		return;
5835	add_amd_assoc(devi, label, assoc);
5836	/*
5837	 * Most AMD parts have a sectored cache. Multiple cache lines are
5838	 * associated with each tag. A sector consists of all cache lines
5839	 * associated with a tag. For example, the AMD K6-III has a sector
5840	 * size of 2 cache lines per tag.
5841	 */
5842	if (lines_per_tag != 0)
5843		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
5844	add_cache_prop(devi, label, line_str, line_size);
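	/* Cache sizes in the extended cpuid leaves are reported in Kbytes. */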
5845	add_cache_prop(devi, label, size_str, size * 1024);
5846}
5847
5848static void
5849add_amd_l2_assoc(dev_info_t *devi, const char *label, uint_t assoc)
5850{
5851	switch (assoc) {
5852	case 0:	/* off */
5853		break;
5854	case 1:
5855	case 2:
5856	case 4:
5857		add_cache_prop(devi, label, assoc_str, assoc);
5858		break;
5859	case 6:
5860		add_cache_prop(devi, label, assoc_str, 8);
5861		break;
5862	case 8:
5863		add_cache_prop(devi, label, assoc_str, 16);
5864		break;
5865	case 0xf:
5866		add_cache_prop(devi, label, fully_assoc, 1);
5867		break;
5868	default: /* reserved; ignore */
5869		break;
5870	}
5871}
5872
5873static void
5874add_amd_l2_tlb(dev_info_t *devi, const char *label, uint_t assoc, uint_t size)
5875{
5876	if (size == 0 || assoc == 0)
5877		return;
5878	add_amd_l2_assoc(devi, label, assoc);
5879	add_cache_prop(devi, label, size_str, size);
5880}
5881
5882static void
5883add_amd_l2_cache(dev_info_t *devi, const char *label,
5884    uint_t size, uint_t assoc, uint_t lines_per_tag, uint_t line_size)
5885{
5886	if (size == 0 || assoc == 0 || line_size == 0)
5887		return;
5888	add_amd_l2_assoc(devi, label, assoc);
5889	if (lines_per_tag != 0)
5890		add_cache_prop(devi, label, "lines-per-tag", lines_per_tag);
5891	add_cache_prop(devi, label, line_str, line_size);
5892	add_cache_prop(devi, label, size_str, size * 1024);
5893}
5894
5895static void
5896amd_cache_info(struct cpuid_info *cpi, dev_info_t *devi)
5897{
5898	struct cpuid_regs *cp;
5899
5900	if (cpi->cpi_xmaxeax < 0x80000005)
5901		return;
5902	cp = &cpi->cpi_extd[5];
5903
5904	/*
5905	 * 4M/2M L1 TLB configuration
5906	 *
5907	 * We report the size for 2M pages because AMD uses two
5908	 * TLB entries for one 4M page.
5909	 */
5910	add_amd_tlb(devi, "dtlb-2M",
5911	    BITX(cp->cp_eax, 31, 24), BITX(cp->cp_eax, 23, 16));
5912	add_amd_tlb(devi, "itlb-2M",
5913	    BITX(cp->cp_eax, 15, 8), BITX(cp->cp_eax, 7, 0));
5914
5915	/*
5916	 * 4K L1 TLB configuration
5917	 */
5918
5919	switch (cpi->cpi_vendor) {
5920		uint_t nentries;
5921	case X86_VENDOR_TM:
5922		if (cpi->cpi_family >= 5) {
5923			/*
			 * Crusoe processors have 256 TLB entries, but the
			 * cpuid data format constrains them to reporting
			 * only 255 of them.
5927			 */
5928			if ((nentries = BITX(cp->cp_ebx, 23, 16)) == 255)
5929				nentries = 256;
5930			/*
5931			 * Crusoe processors also have a unified TLB
5932			 */
5933			add_amd_tlb(devi, "tlb-4K", BITX(cp->cp_ebx, 31, 24),
5934			    nentries);
5935			break;
5936		}
5937		/*FALLTHROUGH*/
5938	default:
5939		add_amd_tlb(devi, itlb4k_str,
5940		    BITX(cp->cp_ebx, 31, 24), BITX(cp->cp_ebx, 23, 16));
5941		add_amd_tlb(devi, dtlb4k_str,
5942		    BITX(cp->cp_ebx, 15, 8), BITX(cp->cp_ebx, 7, 0));
5943		break;
5944	}
5945
5946	/*
5947	 * data L1 cache configuration
5948	 */
5949
5950	add_amd_cache(devi, l1_dcache_str,
5951	    BITX(cp->cp_ecx, 31, 24), BITX(cp->cp_ecx, 23, 16),
5952	    BITX(cp->cp_ecx, 15, 8), BITX(cp->cp_ecx, 7, 0));
5953
5954	/*
5955	 * code L1 cache configuration
5956	 */
5957
5958	add_amd_cache(devi, l1_icache_str,
5959	    BITX(cp->cp_edx, 31, 24), BITX(cp->cp_edx, 23, 16),
5960	    BITX(cp->cp_edx, 15, 8), BITX(cp->cp_edx, 7, 0));
5961
5962	if (cpi->cpi_xmaxeax < 0x80000006)
5963		return;
5964	cp = &cpi->cpi_extd[6];
5965
5966	/* Check for a unified L2 TLB for large pages */
5967
5968	if (BITX(cp->cp_eax, 31, 16) == 0)
5969		add_amd_l2_tlb(devi, "l2-tlb-2M",
5970		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
5971	else {
5972		add_amd_l2_tlb(devi, "l2-dtlb-2M",
5973		    BITX(cp->cp_eax, 31, 28), BITX(cp->cp_eax, 27, 16));
5974		add_amd_l2_tlb(devi, "l2-itlb-2M",
5975		    BITX(cp->cp_eax, 15, 12), BITX(cp->cp_eax, 11, 0));
5976	}
5977
5978	/* Check for a unified L2 TLB for 4K pages */
5979
	if (BITX(cp->cp_ebx, 31, 16) == 0) {
		add_amd_l2_tlb(devi, "l2-tlb-4K",
		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
	} else {
		add_amd_l2_tlb(devi, "l2-dtlb-4K",
		    BITX(cp->cp_ebx, 31, 28), BITX(cp->cp_ebx, 27, 16));
		add_amd_l2_tlb(devi, "l2-itlb-4K",
		    BITX(cp->cp_ebx, 15, 12), BITX(cp->cp_ebx, 11, 0));
	}
5989
5990	add_amd_l2_cache(devi, l2_cache_str,
5991	    BITX(cp->cp_ecx, 31, 16), BITX(cp->cp_ecx, 15, 12),
5992	    BITX(cp->cp_ecx, 11, 8), BITX(cp->cp_ecx, 7, 0));
5993}
5994
5995/*
 * There are two basic ways that the x86 world describes its cache
5997 * and tlb architecture - Intel's way and AMD's way.
5998 *
5999 * Return which flavor of cache architecture we should use
6000 */
6001static int
6002x86_which_cacheinfo(struct cpuid_info *cpi)
6003{
6004	switch (cpi->cpi_vendor) {
6005	case X86_VENDOR_Intel:
6006		if (cpi->cpi_maxeax >= 2)
6007			return (X86_VENDOR_Intel);
6008		break;
6009	case X86_VENDOR_AMD:
6010		/*
6011		 * The K5 model 1 was the first part from AMD that reported
6012		 * cache sizes via extended cpuid functions.
6013		 */
6014		if (cpi->cpi_family > 5 ||
6015		    (cpi->cpi_family == 5 && cpi->cpi_model >= 1))
6016			return (X86_VENDOR_AMD);
6017		break;
6018	case X86_VENDOR_TM:
6019		if (cpi->cpi_family >= 5)
6020			return (X86_VENDOR_AMD);
6021		/*FALLTHROUGH*/
6022	default:
6023		/*
6024		 * If they have extended CPU data for 0x80000005
6025		 * then we assume they have AMD-format cache
6026		 * information.
6027		 *
6028		 * If not, and the vendor happens to be Cyrix,
		 * then try our Cyrix-specific handler.
6030		 *
6031		 * If we're not Cyrix, then assume we're using Intel's
6032		 * table-driven format instead.
6033		 */
6034		if (cpi->cpi_xmaxeax >= 0x80000005)
6035			return (X86_VENDOR_AMD);
6036		else if (cpi->cpi_vendor == X86_VENDOR_Cyrix)
6037			return (X86_VENDOR_Cyrix);
6038		else if (cpi->cpi_maxeax >= 2)
6039			return (X86_VENDOR_Intel);
6040		break;
6041	}
6042	return (-1);
6043}
6044
6045void
6046cpuid_set_cpu_properties(void *dip, processorid_t cpu_id,
6047    struct cpuid_info *cpi)
6048{
6049	dev_info_t *cpu_devi;
6050	int create;
6051
6052	cpu_devi = (dev_info_t *)dip;
6053
6054	/* device_type */
6055	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6056	    "device_type", "cpu");
6057
6058	/* reg */
6059	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6060	    "reg", cpu_id);
6061
6062	/* cpu-mhz, and clock-frequency */
6063	if (cpu_freq > 0) {
6064		long long mul;
6065
6066		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6067		    "cpu-mhz", cpu_freq);
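		/*
		 * clock-frequency is an int property expressed in Hz, so it
		 * can only be published when the value fits in an int.
		 */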
6068		if ((mul = cpu_freq * 1000000LL) <= INT_MAX)
6069			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6070			    "clock-frequency", (int)mul);
6071	}
6072
6073	if (!is_x86_feature(x86_featureset, X86FSET_CPUID)) {
6074		return;
6075	}
6076
6077	/* vendor-id */
6078	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6079	    "vendor-id", cpi->cpi_vendorstr);
6080
6081	if (cpi->cpi_maxeax == 0) {
6082		return;
6083	}
6084
6085	/*
6086	 * family, model, and step
6087	 */
6088	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6089	    "family", CPI_FAMILY(cpi));
6090	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6091	    "cpu-model", CPI_MODEL(cpi));
6092	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6093	    "stepping-id", CPI_STEP(cpi));
6094
6095	/* type */
6096	switch (cpi->cpi_vendor) {
6097	case X86_VENDOR_Intel:
6098		create = 1;
6099		break;
6100	default:
6101		create = 0;
6102		break;
6103	}
6104	if (create)
6105		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6106		    "type", CPI_TYPE(cpi));
6107
6108	/* ext-family */
6109	switch (cpi->cpi_vendor) {
6110	case X86_VENDOR_Intel:
6111	case X86_VENDOR_AMD:
6112		create = cpi->cpi_family >= 0xf;
6113		break;
6114	default:
6115		create = 0;
6116		break;
6117	}
6118	if (create)
6119		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6120		    "ext-family", CPI_FAMILY_XTD(cpi));
6121
6122	/* ext-model */
6123	switch (cpi->cpi_vendor) {
6124	case X86_VENDOR_Intel:
6125		create = IS_EXTENDED_MODEL_INTEL(cpi);
6126		break;
6127	case X86_VENDOR_AMD:
6128		create = CPI_FAMILY(cpi) == 0xf;
6129		break;
6130	default:
6131		create = 0;
6132		break;
6133	}
6134	if (create)
6135		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6136		    "ext-model", CPI_MODEL_XTD(cpi));
6137
6138	/* generation */
6139	switch (cpi->cpi_vendor) {
6140	case X86_VENDOR_AMD:
6141		/*
6142		 * AMD K5 model 1 was the first part to support this
6143		 */
6144		create = cpi->cpi_xmaxeax >= 0x80000001;
6145		break;
6146	default:
6147		create = 0;
6148		break;
6149	}
6150	if (create)
6151		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6152		    "generation", BITX((cpi)->cpi_extd[1].cp_eax, 11, 8));
6153
6154	/* brand-id */
6155	switch (cpi->cpi_vendor) {
6156	case X86_VENDOR_Intel:
6157		/*
		 * Brand ID first appeared on Pentium III Xeon model 8 and
		 * Celeron model 8 processors, and on Opteron.
6160		 */
6161		create = cpi->cpi_family > 6 ||
6162		    (cpi->cpi_family == 6 && cpi->cpi_model >= 8);
6163		break;
6164	case X86_VENDOR_AMD:
6165		create = cpi->cpi_family >= 0xf;
6166		break;
6167	default:
6168		create = 0;
6169		break;
6170	}
6171	if (create && cpi->cpi_brandid != 0) {
6172		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6173		    "brand-id", cpi->cpi_brandid);
6174	}
6175
6176	/* chunks, and apic-id */
6177	switch (cpi->cpi_vendor) {
6178		/*
6179		 * first available on Pentium IV and Opteron (K8)
6180		 */
6181	case X86_VENDOR_Intel:
6182		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6183		break;
6184	case X86_VENDOR_AMD:
6185		create = cpi->cpi_family >= 0xf;
6186		break;
6187	default:
6188		create = 0;
6189		break;
6190	}
6191	if (create) {
6192		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6193		    "chunks", CPI_CHUNKS(cpi));
6194		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6195		    "apic-id", cpi->cpi_apicid);
6196		if (cpi->cpi_chipid >= 0) {
6197			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6198			    "chip#", cpi->cpi_chipid);
6199			(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6200			    "clog#", cpi->cpi_clogid);
6201		}
6202	}
6203
6204	/* cpuid-features */
6205	(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
	    "cpuid-features", CPI_FEATURES_EDX(cpi));

	/* cpuid-features-ecx */
6210	switch (cpi->cpi_vendor) {
6211	case X86_VENDOR_Intel:
6212		create = IS_NEW_F6(cpi) || cpi->cpi_family >= 0xf;
6213		break;
6214	case X86_VENDOR_AMD:
6215		create = cpi->cpi_family >= 0xf;
6216		break;
6217	default:
6218		create = 0;
6219		break;
6220	}
6221	if (create)
6222		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6223		    "cpuid-features-ecx", CPI_FEATURES_ECX(cpi));
6224
6225	/* ext-cpuid-features */
6226	switch (cpi->cpi_vendor) {
6227	case X86_VENDOR_Intel:
6228	case X86_VENDOR_AMD:
6229	case X86_VENDOR_Cyrix:
6230	case X86_VENDOR_TM:
6231	case X86_VENDOR_Centaur:
6232		create = cpi->cpi_xmaxeax >= 0x80000001;
6233		break;
6234	default:
6235		create = 0;
6236		break;
6237	}
6238	if (create) {
6239		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6240		    "ext-cpuid-features", CPI_FEATURES_XTD_EDX(cpi));
6241		(void) ndi_prop_update_int(DDI_DEV_T_NONE, cpu_devi,
6242		    "ext-cpuid-features-ecx", CPI_FEATURES_XTD_ECX(cpi));
6243	}
6244
6245	/*
6246	 * Brand String first appeared in Intel Pentium IV, AMD K5
	 * model 1, and Cyrix GXm.  On earlier models we try to
	 * simulate something similar .. so this string should always
	 * say -something- about the processor, however lame.
6250	 */
6251	(void) ndi_prop_update_string(DDI_DEV_T_NONE, cpu_devi,
6252	    "brand-string", cpi->cpi_brandstr);
6253
6254	/*
6255	 * Finally, cache and tlb information
6256	 */
6257	switch (x86_which_cacheinfo(cpi)) {
6258	case X86_VENDOR_Intel:
6259		intel_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6260		break;
6261	case X86_VENDOR_Cyrix:
6262		cyrix_walk_cacheinfo(cpi, cpu_devi, add_cacheent_props);
6263		break;
6264	case X86_VENDOR_AMD:
6265		amd_cache_info(cpi, cpu_devi);
6266		break;
6267	default:
6268		break;
6269	}
6270}
6271
6272struct l2info {
6273	int *l2i_csz;
6274	int *l2i_lsz;
6275	int *l2i_assoc;
6276	int l2i_ret;
6277};
6278
6279/*
6280 * A cacheinfo walker that fetches the size, line-size and associativity
6281 * of the L2 cache
6282 */
6283static int
6284intel_l2cinfo(void *arg, const struct cachetab *ct)
6285{
6286	struct l2info *l2i = arg;
6287	int *ip;
6288
6289	if (ct->ct_label != l2_cache_str &&
6290	    ct->ct_label != sl2_cache_str)
6291		return (0);	/* not an L2 -- keep walking */
6292
6293	if ((ip = l2i->l2i_csz) != NULL)
6294		*ip = ct->ct_size;
6295	if ((ip = l2i->l2i_lsz) != NULL)
6296		*ip = ct->ct_line_size;
6297	if ((ip = l2i->l2i_assoc) != NULL)
6298		*ip = ct->ct_assoc;
6299	l2i->l2i_ret = ct->ct_size;
6300	return (1);		/* was an L2 -- terminate walk */
6301}
6302
6303/*
6304 * AMD L2/L3 Cache and TLB Associativity Field Definition:
6305 *
6306 *	Unlike the associativity for the L1 cache and tlb where the 8 bit
6307 *	value is the associativity, the associativity for the L2 cache and
6308 *	tlb is encoded in the following table. The 4 bit L2 value serves as
6309 *	an index into the amd_afd[] array to determine the associativity.
6310 *	-1 is undefined. 0 is fully associative.
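 *	For example, an encoded field value of 6 indexes amd_afd[6] == 8,
 *	i.e. the cache or tlb is 8-way set associative.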
6311 */
6312
6313static int amd_afd[] =
6314	{-1, 1, 2, -1, 4, -1, 8, -1, 16, -1, 32, 48, 64, 96, 128, 0};
6315
6316static void
6317amd_l2cacheinfo(struct cpuid_info *cpi, struct l2info *l2i)
6318{
6319	struct cpuid_regs *cp;
6320	uint_t size, assoc;
6321	int i;
6322	int *ip;
6323
6324	if (cpi->cpi_xmaxeax < 0x80000006)
6325		return;
6326	cp = &cpi->cpi_extd[6];
6327
6328	if ((i = BITX(cp->cp_ecx, 15, 12)) != 0 &&
6329	    (size = BITX(cp->cp_ecx, 31, 16)) != 0) {
6330		uint_t cachesz = size * 1024;
6331		assoc = amd_afd[i];
6332
6333		ASSERT(assoc != -1);
6334
6335		if ((ip = l2i->l2i_csz) != NULL)
6336			*ip = cachesz;
6337		if ((ip = l2i->l2i_lsz) != NULL)
6338			*ip = BITX(cp->cp_ecx, 7, 0);
6339		if ((ip = l2i->l2i_assoc) != NULL)
6340			*ip = assoc;
6341		l2i->l2i_ret = cachesz;
6342	}
6343}
6344
6345int
6346getl2cacheinfo(cpu_t *cpu, int *csz, int *lsz, int *assoc)
6347{
6348	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6349	struct l2info __l2info, *l2i = &__l2info;
6350
6351	l2i->l2i_csz = csz;
6352	l2i->l2i_lsz = lsz;
6353	l2i->l2i_assoc = assoc;
6354	l2i->l2i_ret = -1;
6355
6356	switch (x86_which_cacheinfo(cpi)) {
6357	case X86_VENDOR_Intel:
6358		intel_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6359		break;
6360	case X86_VENDOR_Cyrix:
6361		cyrix_walk_cacheinfo(cpi, l2i, intel_l2cinfo);
6362		break;
6363	case X86_VENDOR_AMD:
6364		amd_l2cacheinfo(cpi, l2i);
6365		break;
6366	default:
6367		break;
6368	}
6369	return (l2i->l2i_ret);
6370}
6371
6372#if !defined(__xpv)
6373
6374uint32_t *
6375cpuid_mwait_alloc(cpu_t *cpu)
6376{
6377	uint32_t	*ret;
6378	size_t		mwait_size;
6379
6380	ASSERT(cpuid_checkpass(CPU, 2));
6381
6382	mwait_size = CPU->cpu_m.mcpu_cpi->cpi_mwait.mon_max;
6383	if (mwait_size == 0)
6384		return (NULL);
6385
6386	/*
6387	 * kmem_alloc() returns cache line size aligned data for mwait_size
6388	 * allocations.  mwait_size is currently cache line sized.  Neither
	 * of these implementation details is guaranteed to be true in the
	 * future.
	 *
	 * First try allocating mwait_size as kmem_alloc() currently returns
	 * correctly aligned memory.  If kmem_alloc() does not return
	 * mwait_size aligned memory, allocate twice mwait_size and round up.
6395	 *
6396	 * Set cpi_mwait.buf_actual and cpi_mwait.size_actual in case we
6397	 * decide to free this memory.
6398	 */
6399	ret = kmem_zalloc(mwait_size, KM_SLEEP);
6400	if (ret == (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size)) {
6401		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6402		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size;
6403		*ret = MWAIT_RUNNING;
6404		return (ret);
6405	} else {
6406		kmem_free(ret, mwait_size);
6407		ret = kmem_zalloc(mwait_size * 2, KM_SLEEP);
6408		cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = ret;
6409		cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = mwait_size * 2;
6410		ret = (uint32_t *)P2ROUNDUP((uintptr_t)ret, mwait_size);
6411		*ret = MWAIT_RUNNING;
6412		return (ret);
6413	}
6414}
6415
6416void
6417cpuid_mwait_free(cpu_t *cpu)
6418{
6419	if (cpu->cpu_m.mcpu_cpi == NULL) {
6420		return;
6421	}
6422
6423	if (cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual != NULL &&
6424	    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual > 0) {
6425		kmem_free(cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual,
6426		    cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual);
6427	}
6428
6429	cpu->cpu_m.mcpu_cpi->cpi_mwait.buf_actual = NULL;
6430	cpu->cpu_m.mcpu_cpi->cpi_mwait.size_actual = 0;
6431}
6432
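/*
 * Patch the body of tsc_read() with the variant that matches this machine's
 * TSC capabilities: no usable TSC, rdtsc fenced with mfence or lfence, or
 * rdtscp.
 */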
6433void
6434patch_tsc_read(int flag)
6435{
6436	size_t cnt;
6437
6438	switch (flag) {
6439	case TSC_NONE:
6440		cnt = &_no_rdtsc_end - &_no_rdtsc_start;
6441		(void) memcpy((void *)tsc_read, (void *)&_no_rdtsc_start, cnt);
6442		break;
6443	case TSC_RDTSC_MFENCE:
6444		cnt = &_tsc_mfence_end - &_tsc_mfence_start;
6445		(void) memcpy((void *)tsc_read,
6446		    (void *)&_tsc_mfence_start, cnt);
6447		break;
6448	case TSC_RDTSC_LFENCE:
6449		cnt = &_tsc_lfence_end - &_tsc_lfence_start;
6450		(void) memcpy((void *)tsc_read,
6451		    (void *)&_tsc_lfence_start, cnt);
6452		break;
6453	case TSC_TSCP:
6454		cnt = &_tscp_end - &_tscp_start;
6455		(void) memcpy((void *)tsc_read, (void *)&_tscp_start, cnt);
6456		break;
6457	default:
6458		/* Bail for unexpected TSC types. (TSC_NONE covers 0) */
		cmn_err(CE_PANIC, "Unrecognized TSC type: %d", flag);
6460		break;
6461	}
6462	tsc_type = flag;
6463}
6464
6465int
6466cpuid_deep_cstates_supported(void)
6467{
6468	struct cpuid_info *cpi;
6469	struct cpuid_regs regs;
6470
6471	ASSERT(cpuid_checkpass(CPU, 1));
6472
6473	cpi = CPU->cpu_m.mcpu_cpi;
6474
6475	if (!is_x86_feature(x86_featureset, X86FSET_CPUID))
6476		return (0);
6477
6478	switch (cpi->cpi_vendor) {
6479	case X86_VENDOR_Intel:
6480		if (cpi->cpi_xmaxeax < 0x80000007)
6481			return (0);
6482
6483		/*
		 * Does the TSC run at a constant rate in all ACPI C-states?
6485		 */
6486		regs.cp_eax = 0x80000007;
6487		(void) __cpuid_insn(&regs);
6488		return (regs.cp_edx & CPUID_TSC_CSTATE_INVARIANCE);
6489
6490	default:
6491		return (0);
6492	}
6493}
6494
6495#endif	/* !__xpv */
6496
6497void
6498post_startup_cpu_fixups(void)
6499{
6500#ifndef __xpv
6501	/*
6502	 * Some AMD processors support C1E state. Entering this state will
6503	 * cause the local APIC timer to stop, which we can't deal with at
6504	 * this time.
6505	 */
6506	if (cpuid_getvendor(CPU) == X86_VENDOR_AMD) {
6507		on_trap_data_t otd;
6508		uint64_t reg;
6509
6510		if (!on_trap(&otd, OT_DATA_ACCESS)) {
6511			reg = rdmsr(MSR_AMD_INT_PENDING_CMP_HALT);
6512			/* Disable C1E state if it is enabled by BIOS */
6513			if ((reg >> AMD_ACTONCMPHALT_SHIFT) &
6514			    AMD_ACTONCMPHALT_MASK) {
6515				reg &= ~(AMD_ACTONCMPHALT_MASK <<
6516				    AMD_ACTONCMPHALT_SHIFT);
6517				wrmsr(MSR_AMD_INT_PENDING_CMP_HALT, reg);
6518			}
6519		}
6520		no_trap();
6521	}
6522#endif	/* !__xpv */
6523}
6524
6525void
6526enable_pcid(void)
6527{
6528	if (x86_use_pcid == -1)
6529		x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
6530
6531	if (x86_use_invpcid == -1) {
6532		x86_use_invpcid = is_x86_feature(x86_featureset,
6533		    X86FSET_INVPCID);
6534	}
6535
6536	if (!x86_use_pcid)
6537		return;
6538
6539	/*
	 * Intel says that on setting PCIDE, the processor immediately starts
	 * using the PCID bits; better make sure there's nothing there.
6542	 */
6543	ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
6544
6545	setcr4(getcr4() | CR4_PCIDE);
6546}
6547
6548/*
6549 * Setup necessary registers to enable XSAVE feature on this processor.
6550 * This function needs to be called early enough, so that no xsave/xrstor
6551 * ops will execute on the processor before the MSRs are properly set up.
6552 *
6553 * Current implementation has the following assumption:
6554 * - cpuid_pass1() is done, so that X86 features are known.
6555 * - fpu_probe() is done, so that fp_save_mech is chosen.
6556 */
6557void
6558xsave_setup_msr(cpu_t *cpu)
6559{
6560	ASSERT(fp_save_mech == FP_XSAVE);
6561	ASSERT(is_x86_feature(x86_featureset, X86FSET_XSAVE));
6562
6563	/* Enable OSXSAVE in CR4. */
6564	setcr4(getcr4() | CR4_OSXSAVE);
6565	/*
6566	 * Update SW copy of ECX, so that /dev/cpu/self/cpuid will report
6567	 * correct value.
6568	 */
6569	cpu->cpu_m.mcpu_cpi->cpi_std[1].cp_ecx |= CPUID_INTC_ECX_OSXSAVE;
6570	setup_xfem();
6571}
6572
6573/*
6574 * Starting with the Westmere processor the local
6575 * APIC timer will continue running in all C-states,
6576 * including the deepest C-states.
6577 */
6578int
6579cpuid_arat_supported(void)
6580{
6581	struct cpuid_info *cpi;
6582	struct cpuid_regs regs;
6583
6584	ASSERT(cpuid_checkpass(CPU, 1));
6585	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
6586
6587	cpi = CPU->cpu_m.mcpu_cpi;
6588
6589	switch (cpi->cpi_vendor) {
6590	case X86_VENDOR_Intel:
6591		/*
6592		 * Always-running Local APIC Timer is
6593		 * indicated by CPUID.6.EAX[2].
6594		 */
6595		if (cpi->cpi_maxeax >= 6) {
6596			regs.cp_eax = 6;
6597			(void) cpuid_insn(NULL, &regs);
6598			return (regs.cp_eax & CPUID_INTC_EAX_ARAT);
6599		} else {
6600			return (0);
6601		}
6602	default:
6603		return (0);
6604	}
6605}
6606
6607/*
6608 * Check support for Intel ENERGY_PERF_BIAS feature
6609 */
6610int
6611cpuid_iepb_supported(struct cpu *cp)
6612{
6613	struct cpuid_info *cpi = cp->cpu_m.mcpu_cpi;
6614	struct cpuid_regs regs;
6615
6616	ASSERT(cpuid_checkpass(cp, 1));
6617
6618	if (!(is_x86_feature(x86_featureset, X86FSET_CPUID)) ||
6619	    !(is_x86_feature(x86_featureset, X86FSET_MSR))) {
6620		return (0);
6621	}
6622
6623	/*
	 * The Intel ENERGY_PERF_BIAS MSR is indicated by
	 * capability bit CPUID.6.ECX[3].
6626	 */
6627	if ((cpi->cpi_vendor != X86_VENDOR_Intel) || (cpi->cpi_maxeax < 6))
6628		return (0);
6629
6630	regs.cp_eax = 0x6;
6631	(void) cpuid_insn(NULL, &regs);
6632	return (regs.cp_ecx & CPUID_INTC_ECX_PERFBIAS);
6633}
6634
6635/*
6636 * Check support for TSC deadline timer
6637 *
6638 * TSC deadline timer provides a superior software programming
6639 * model over local APIC timer that eliminates "time drifts".
6640 * Instead of specifying a relative time, software specifies an
6641 * absolute time as the target at which the processor should
6642 * generate a timer event.
6643 */
6644int
6645cpuid_deadline_tsc_supported(void)
6646{
6647	struct cpuid_info *cpi = CPU->cpu_m.mcpu_cpi;
6648	struct cpuid_regs regs;
6649
6650	ASSERT(cpuid_checkpass(CPU, 1));
6651	ASSERT(is_x86_feature(x86_featureset, X86FSET_CPUID));
6652
6653	switch (cpi->cpi_vendor) {
6654	case X86_VENDOR_Intel:
6655		if (cpi->cpi_maxeax >= 1) {
6656			regs.cp_eax = 1;
6657			(void) cpuid_insn(NULL, &regs);
6658			return (regs.cp_ecx & CPUID_DEADLINE_TSC);
6659		} else {
6660			return (0);
6661		}
6662	default:
6663		return (0);
6664	}
6665}
6666
6667#if defined(__amd64) && !defined(__xpv)
6668/*
 * Patch in versions of bcopy for high performance Intel Nehalem (Nhm)
 * processors and later...
6671 */
6672void
6673patch_memops(uint_t vendor)
6674{
6675	size_t cnt, i;
6676	caddr_t to, from;
6677
6678	if ((vendor == X86_VENDOR_Intel) &&
6679	    is_x86_feature(x86_featureset, X86FSET_SSE4_2)) {
6680		cnt = &bcopy_patch_end - &bcopy_patch_start;
6681		to = &bcopy_ck_size;
6682		from = &bcopy_patch_start;
6683		for (i = 0; i < cnt; i++) {
6684			*to++ = *from++;
6685		}
6686	}
6687}
6688#endif  /* __amd64 && !__xpv */
6689
6690/*
6691 * We're being asked to tell the system how many bits are required to represent
6692 * the various thread and strand IDs. While it's tempting to derive this based
6693 * on the values in cpi_ncore_per_chip and cpi_ncpu_per_chip, that isn't quite
6694 * correct. Instead, this needs to be based on the number of bits that the APIC
6695 * allows for these different configurations. We only update these to a larger
6696 * value if we find one.
6697 */
6698void
6699cpuid_get_ext_topo(cpu_t *cpu, uint_t *core_nbits, uint_t *strand_nbits)
6700{
6701	struct cpuid_info *cpi;
6702
6703	VERIFY(cpuid_checkpass(CPU, 1));
6704	cpi = cpu->cpu_m.mcpu_cpi;
6705
6706	if (cpi->cpi_ncore_bits > *core_nbits) {
6707		*core_nbits = cpi->cpi_ncore_bits;
6708	}
6709
6710	if (cpi->cpi_nthread_bits > *strand_nbits) {
6711		*strand_nbits = cpi->cpi_nthread_bits;
6712	}
6713}
6714
6715void
6716cpuid_pass_ucode(cpu_t *cpu, uchar_t *fset)
6717{
6718	struct cpuid_info *cpi = cpu->cpu_m.mcpu_cpi;
6719	struct cpuid_regs cp;
6720
6721	/*
6722	 * Reread the CPUID portions that we need for various security
6723	 * information.
6724	 */
6725	if (cpi->cpi_vendor == X86_VENDOR_Intel) {
6726		/*
6727		 * Check if we now have leaf 7 available to us.
6728		 */
6729		if (cpi->cpi_maxeax < 7) {
6730			bzero(&cp, sizeof (cp));
6731			cp.cp_eax = 0;
6732			cpi->cpi_maxeax = __cpuid_insn(&cp);
6733			if (cpi->cpi_maxeax < 7)
6734				return;
6735		}
6736
6737		bzero(&cp, sizeof (cp));
6738		cp.cp_eax = 7;
6739		cp.cp_ecx = 0;
6740		(void) __cpuid_insn(&cp);
6741		cpi->cpi_std[7] = cp;
6742	} else if (cpi->cpi_vendor == X86_VENDOR_AMD) {
6743		/* No xcpuid support */
6744		if (cpi->cpi_family < 5 ||
6745		    (cpi->cpi_family == 5 && cpi->cpi_model < 1))
6746			return;
6747
6748		if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
6749			bzero(&cp, sizeof (cp));
6750			cp.cp_eax = CPUID_LEAF_EXT_0;
6751			cpi->cpi_xmaxeax = __cpuid_insn(&cp);
6752			if (cpi->cpi_xmaxeax < CPUID_LEAF_EXT_8) {
6753				return;
6754			}
6755		}
6756
6757		bzero(&cp, sizeof (cp));
6758		cp.cp_eax = CPUID_LEAF_EXT_8;
6759		(void) __cpuid_insn(&cp);
6760		platform_cpuid_mangle(cpi->cpi_vendor, CPUID_LEAF_EXT_8, &cp);
6761		cpi->cpi_extd[8] = cp;
6762	} else {
6763		/*
6764		 * Nothing to do here. Return an empty set which has already
6765		 * been zeroed for us.
6766		 */
6767		return;
6768	}
6769	cpuid_scan_security(cpu, fset);
6770}
6771
6772/* ARGSUSED */
6773static int
6774cpuid_post_ucodeadm_xc(xc_arg_t arg0, xc_arg_t arg1, xc_arg_t arg2)
6775{
6776	uchar_t *fset;
6777
6778	fset = (uchar_t *)(arg0 + sizeof (x86_featureset) * CPU->cpu_id);
6779	cpuid_pass_ucode(CPU, fset);
6780
6781	return (0);
6782}
6783
6784/*
6785 * After a microcode update where the version has changed, then we need to
6786 * rescan CPUID. To do this we check every CPU to make sure that they have the
6787 * same microcode. Then we perform a cross call to all such CPUs. It's the
6788 * caller's job to make sure that no one else can end up doing an update while
6789 * this is going on.
6790 *
6791 * We assume that the system is microcode capable if we're called.
6792 */
6793void
6794cpuid_post_ucodeadm(void)
6795{
6796	uint32_t rev;
6797	int i;
6798	struct cpu *cpu;
6799	cpuset_t cpuset;
6800	void *argdata;
6801	uchar_t *f0;
6802
6803	argdata = kmem_zalloc(sizeof (x86_featureset) * NCPU, KM_SLEEP);
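	/*
	 * argdata holds one x86_featureset image per possible CPU id; the
	 * cross call below has each CPU fill in its own slot.
	 */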
6804
6805	mutex_enter(&cpu_lock);
6806	cpu = cpu_get(0);
6807	rev = cpu->cpu_m.mcpu_ucode_info->cui_rev;
6808	CPUSET_ONLY(cpuset, 0);
6809	for (i = 1; i < max_ncpus; i++) {
6810		if ((cpu = cpu_get(i)) == NULL)
6811			continue;
6812
6813		if (cpu->cpu_m.mcpu_ucode_info->cui_rev != rev) {
6814			panic("post microcode update CPU %d has differing "
6815			    "microcode revision (%u) from CPU 0 (%u)",
6816			    i, cpu->cpu_m.mcpu_ucode_info->cui_rev, rev);
6817		}
6818		CPUSET_ADD(cpuset, i);
6819	}
6820
6821	kpreempt_disable();
6822	xc_sync((xc_arg_t)argdata, 0, 0, CPUSET2BV(cpuset),
6823	    cpuid_post_ucodeadm_xc);
6824	kpreempt_enable();
6825
6826	/*
6827	 * OK, now look at each CPU and see if their feature sets are equal.
6828	 */
6829	f0 = argdata;
6830	for (i = 1; i < max_ncpus; i++) {
6831		uchar_t *fset;
6832		if (!CPU_IN_SET(cpuset, i))
6833			continue;
6834
6835		fset = (uchar_t *)((uintptr_t)argdata +
6836		    sizeof (x86_featureset) * i);
6837
6838		if (!compare_x86_featureset(f0, fset)) {
6839			panic("Post microcode update CPU %d has "
6840			    "differing security feature (%p) set from CPU 0 "
6841			    "(%p), not appending to feature set", i,
6842			    (void *)fset, (void *)f0);
6843		}
6844	}
6845
6846	mutex_exit(&cpu_lock);
6847
6848	for (i = 0; i < NUM_X86_FEATURES; i++) {
6849		cmn_err(CE_CONT, "?post-ucode x86_feature: %s\n",
6850		    x86_feature_names[i]);
6851		if (is_x86_feature(f0, i)) {
6852			add_x86_feature(x86_featureset, i);
6853		}
6854	}
6855	kmem_free(argdata, sizeof (x86_featureset) * NCPU);
6856}
6857