/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Generic Intel Integrated Memory Controller (IMC) Driver
 *
 * This driver talks to the CPU's IMC to understand the detailed topology of
 * the processor and to determine how to map from physical addresses to the
 * corresponding DIMM. This driver supports the following generations of Intel
 * chips:
 *
 *  - Sandy Bridge
 *  - Ivy Bridge
 *  - Haswell
 *  - Broadwell
 *  - Skylake / Cascade Lake
 *
 * Memory Decoding
 * ---------------
 *
 * For more detailed summaries of the memory decoding process, please refer to
 * the Intel External Design Specifications for the corresponding processor.
 * What follows is a rough overview of how the memory decoding system works.
 *
 * First, we'd like to define the following concepts:
 *
 * SYSTEM ADDRESS
 *
 *	This is a physical address that the operating system normally uses. This
 *	address may refer to DRAM, it may refer to memory mapped PCI
 *	configuration space or device registers, or it may refer to other parts
 *	of the system's memory map, such as the extended advanced programmable
 *	interrupt controller (xAPIC), etc.
 *
 * DIMM
 *
 *	Dual-inline memory module. This refers to a physical stick of volatile
 *	memory that is inserted into a slot on the motherboard.
 *
 * RANK
 *
 *	A potential sub-division of a DIMM. A DIMM's memory capacity is divided
 *	into a number of equal-sized ranks. For example, an 8 GiB DIMM may have
 *	one 8 GiB rank, two 4 GiB ranks, or four 2 GiB ranks.
 *
 * RANK ADDRESS
 *
 *	An address that exists in the context of a given rank on a DIMM. All
 *	ranks have overlapping addresses, so the address 0x400 exists on all
 *	ranks on a given DIMM.
 *
 * CHANNEL
 *
 *	Multiple DIMMs may be combined into a single channel. The channel
 *	represents the combined memory of all the DIMMs. A given channel only
 *	ever exists on a socket and is bound to a single memory controller.
 *
 * CHANNEL ADDRESS
 *
 *	This is an address that exists logically on a channel. Each address on a
 *	channel maps to a corresponding DIMM that exists on that channel. The
 *	address space on one channel is independent from that on another. This
 *	means that address 0x1000 can exist on each memory channel in the
 *	system.
 *
 * INTERLEAVE
 *
 *	There are several different cases where interleaving occurs on the
 *	system. For example, addresses may be interleaved across sockets,
 *	memory channels, or DIMM ranks. When addresses are interleaved, then
 *	some number of bits in an address are used to select which target to go
 *	to (usually through a look up table). The effect of interleaving is that
 *	addresses that are next to one another may not all go to the same
 *	device. The following image shows a non-interleaving case.
 *
 *	0x0fff +-----+             +-----+ 0x7ff
 *	       |     |\___________/|     |
 *	       |     |  __________ | (b) |
 *	       |     | /          \|     |
 *	0x0800 |=====|=            +-----+ 0x000       +-----+ 0x7ff
 *	       |     | \______________________________/|     |
 *	       |     | _______________________________ | (a) |
 *	       |     |/                               \|     |
 *	0x0000 +-----+                                 +-----+ 0x000
 *
 *	In this example of non-interleaving, addresses 0x0000 to 0x07ff go to
 *	device (a), while addresses 0x0800 to 0x0fff go to device (b). Each
 *	device covers a single contiguous half of the range.
 *
 *	If instead we were to look at this with interleaving, then rather than
 *	splitting the range in half, we might say that if the address has bit 8
 *	set (0x100), it goes to (b), otherwise it goes to (a). This means that
 *	addresses 0x000 to 0x0ff would go to (a), 0x100 to 0x1ff to (b), 0x200
 *	to 0x2ff back to (a) again, 0x300 to 0x3ff back to (b), and so on. This
 *	would instead look something more like:
 *
 *
 *      0x0fff +-----+       A: 0x7ff +---------+   B: 0x7ff +---------+
 *             | (b) |                | e00-eff |            | f00-fff |
 *      0x0f00 |-----|          0x700 +---------+      0x700 +---------+
 *             | (a) |                | c00-cff |            | d00-dff |
 *      0x0e00 ~~~~~~~          0x600 +---------+      0x600 +---------+
 *               ***                  | a00-aff |            | b00-bff |
 *      0x0400 ~~~~~~~          0x500 +---------+      0x500 +---------+
 *             | (b) |                | 800-8ff |            | 900-9ff |
 *      0x0300 |-----|          0x400 +---------+      0x400 +---------+
 *             | (a) |                | 600-6ff |            | 700-7ff |
 *      0x0200 |-----|          0x300 +---------+      0x300 +---------+
 *             | (b) |                | 400-4ff |            | 500-5ff |
 *      0x0100 |-----|          0x200 +---------+      0x200 +---------+
 *             | (a) |                | 200-2ff |            | 300-3ff |
 *      0x0000 +-----+          0x100 +---------+      0x100 +---------+
 *                                    | 000-0ff |            | 100-1ff |
 *                              0x000 +---------+      0x000 +---------+
 *
 *	In this example we've performed two-way interleaving. The number of ways
 *	that something can interleave varies based on what we're interleaving
 *	between.
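 *
 *	As a concrete sketch (hypothetical pseudo-code, not driver logic),
 *	decoding the two-way interleave above selects a target with bit 8 and
 *	then removes that bit to form the device-relative address:
 *
 *	    target = (pa >> 8) & 1;
 *	    daddr = ((pa >> 9) << 8) | (pa & 0xff);
 *
 *	For pa = 0x300, target is 1, device (b), and daddr is 0x100, matching
 *	the picture above.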
 *
 * MEMORY CONTROLLER
 *
 *	A given processor die (see uts/i86pc/os/cpuid.c) contains a number of
 *	memory controllers, usually one or two. Each memory controller supports
 *	a given number of DIMMs, which are divided across multiple channels.
 *
 * TARGET ADDRESS DECODER
 *
 *	The target address decoder (TAD) is responsible for taking a system
 *	address and transforming it into a channel address based on the rules
 *	that are present. Each memory controller has a corresponding TAD. The
 *	TAD is often contained in a device called a 'Home Agent'.
 *
 * SYSTEM ADDRESS DECODER
 *
 *	The system address decoder (SAD) is responsible for taking a system
 *	address and directing it to the right place, whether this be memory or
 *	otherwise. There is a single SAD per socket (see uts/i86pc/os/cpuid.c)
 *	that is currently shared between all the cores.
 *
 * NODE IDENTIFIER
 *
 *	The node identifier is used to uniquely identify an element in the
 *	various routing topologies on the die (see uts/i86pc/os/cpuid.c for the
 *	definition of 'die'). One can roughly think about this as a unique
 *	identifier for the socket itself. In general, the primary node ID for a
 *	socket should map to the socket APIC ID.
 *
 * Finding Devices
 * ---------------
 *
 * There is a bit of a chicken-and-egg problem on Intel systems and in the
 * device driver interface. The information that we need in the system is
 * spread out amongst a large number of different PCI devices that the
 * processor exposes. The number of such devices can vary based on the
 * processor generation and the specific SKU of the processor. To deal with
 * this, we break the driver into two different components: a stub driver and
 * the full driver.
 *
 * The stub driver has aliases for all known PCI devices that we might attach to
 * in a given generation on the system. This driver is called 'imcstub'. When a
 * stub attaches, it just registers itself with the main driver, upon which it
 * has a module dependency.
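 *
 * In rough pseudo-code (hypothetical names, not the actual interfaces), a
 * stub's attach(9E) entry point amounts to:
 *
 *	imcstub_attach(dip) {
 *		read the vendor/device ID and map config space for dip;
 *		hand the resulting stub to the main imc driver;
 *	}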
 *
 * The main driver, 'imc', is a pseudo-device driver. When it first attaches, it
 * kicks off a scan of the device tree which takes place in a task queue. Once
 * there, it determines the number of devices that it expects to exist by
 * walking the tree and comparing it against the generation-specific table.
 *
 * If all devices are found, we'll go ahead and read through all the devices and
 * build a map of all the information we need to understand the topology of the
 * system and to be able to decode addresses. We do this here, because we can be
 * asked to perform decoding in dangerous contexts (after taking an MCE, panic,
 * etc) where we don't want to have to rely on the broader kernel functioning at
 * this point in time.
 *
 * Once our topology is built, we'll create minor nodes which are used by the
 * fault management architecture to query for information and register our
 * decoding functionality with the kernel.
 *
 * PCI Numbering
 * -------------
 *
 * For each device that we care about, Intel defines the device and function
 * numbers at which we can expect to find the PCI configuration space registers
 * that we care about. However, the PCI bus itself is not well defined. Devices
 * that are on the same socket use the same set of bus numbers; however, some
 * sockets have multiple bus numbers that they'll use to represent different
 * classes. These bus numbers are programmed by system firmware as part of
 * powering on the system. This means that we need the ability to map these
 * disparate ranges together ourselves.
 *
 * There is a device called a utility box (UBOX), which exists per socket and
 * records the bus numbers in use by that socket. We use this to determine
 * which devices correspond to which sockets.
 *
 * Mapping Sockets
 * ---------------
 *
 * Another wrinkle is that the way that the OS sees the numbering of the CPUs is
 * generally based on the APIC ID (see uts/i86pc/os/cpuid.c for more
 * information). However, to map to the corresponding socket, we need to look at
 * the socket's node ID. The order of PCI buses in the system is not required to
 * have any relation to the socket ID. Therefore, we have to have yet another
 * indirection table in the imc_t.
 *
 * Exposing Data
 * -------------
 *
 * We expose topology data to FMA using the OS-private memory controller
 * interfaces. By creating minor nodes of type 'ddi_mem_ctrl', there are a
 * number of specific interfaces that we can then implement. The ioctl API asks
 * us for a snapshot of data, which basically has us go through and send an
 * nvlist_t to userland. This nvlist_t is constructed as part of the scan
 * process. This nvlist uses the version 1 format, which more explicitly encodes
 * the topology in a series of nested nvlists.
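 *
 * Sketching the version 1 nesting (the exact keys are the MCINTEL_NVLIST_V1_*
 * constants used in imc_nvl_create() and friends below):
 *
 *	version, number of memory controllers
 *	    -> per-MC nvlist (ECC, channel mode, page policy)
 *		-> per-channel nvlist
 *		    -> per-DIMM nvlist (presence, size, geometry, ranks)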
 *
 * In addition, the tool /usr/lib/fm/fmd/mcdecode can be used to query the
 * decoder and ask it to perform decoding.
 *
 * Decoding Addresses
 * ------------------
 *
 * The decoding logic can be found in common/imc/imc_decode.c. This file is
 * shared between the kernel and userland to allow for easier testing and
 * additional flexibility in operation. The decoding process happens in a few
 * different phases.
 *
 * The first phase is to determine which memory controller on which socket is
 * responsible for this data. To determine this, we use the system address
 * decoder and walk the rules, looking for the correct target. There are various
 * manipulations to the address that exist which are used to determine which
 * index we use. The way that we interpret the output of the rule varies
 * somewhat based on the generation. Sandy Bridge just has a node ID which
 * points us to the socket with its single IMC. On Ivy Bridge through Broadwell,
 * the memory controller to use is also encoded in part of the node ID. Finally,
 * on Skylake, the SAD tells us which socket to look at. The socket in question
 * then has a routing table which tells us which channel on which memory
 * controller local to that socket to use.
 *
 * Once we have the target memory controller, we walk the list of target address
 * decoder rules. These rules can help tell us which channel we care about
 * (which is required on Sandy Bridge through Broadwell) and then describe some
 * amount of the interleaving rules which are used to turn the system address
 * into a channel address.
 *
 * Once we know the channel and the channel address, we walk the rank interleave
 * rules, which determine which DIMM, and which rank on that DIMM, the channel
 * address is on. These rules also contain the logic that we need to transform
 * a channel address into an address on that specific rank. Once we have that,
 * the initial decoding is done.
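 *
 * As a rough sketch (hypothetical pseudo-code, eliding the per-generation
 * details described above), the overall flow looks like:
 *
 *	sad_rule = sad_lookup(system_address);
 *	(socket, mc) = sad_target(sad_rule, system_address);
 *	tad_rule = tad_lookup(mc, system_address);
 *	(channel, chan_address) = tad_interleave(tad_rule, system_address);
 *	rir_rule = rir_lookup(channel, chan_address);
 *	(dimm, rank, rank_address) = rir_target(rir_rule, chan_address);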
 *
 * The logic in imc_decode.c is abstracted away from the broader kernel CMI
 * logic. This is deliberate: it not only makes the logic easier to unit test,
 * but also allows us to express higher-fidelity errors internally that are
 * later translated into a much smaller subset. This logic is exercised in the
 * 'imc_test' program which is built in 'test/os-tests/tests/imc'.
 *
 * Limitations
 * -----------
 *
 * Currently, this driver has the following limitations:
 *
 *  o It doesn't decode the row and column addresses.
 *  o It doesn't encode from a DIMM address to a system address.
 *  o It doesn't properly support lockstep and mirroring modes on Sandy Bridge -
 *    Broadwell platforms.
 *  o It doesn't support virtual lockstep and adaptive mirroring on Purley
 *    platforms.
 *  o It doesn't properly handle Intel Optane (3D XPoint) NVDIMMs.
 *  o It doesn't know how to decode three way channel interleaving.
 *
 * None of these are intrinsic problems with the driver; addressing them is
 * mostly a matter of having proper documentation and testing.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/types.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/open.h>
#include <sys/cred.h>
#include <sys/pci.h>
#include <sys/sysmacros.h>
#include <sys/avl.h>
#include <sys/stat.h>
#include <sys/policy.h>

#include <sys/cpu_module.h>
#include <sys/mc.h>
#include <sys/mc_intel.h>

#include "imc.h"

/*
 * These tables contain generational data that varies between processor
 * generations, such as the maximum number of sockets and memory controllers,
 * and the offsets of the various registers.
 */

static const imc_gen_data_t imc_gen_data_snb = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0x80,
	.igd_tohm_low_offset = 0x84,
	.igd_sad_dram_offset = 0x80,
	.igd_sad_ndram_rules = 10,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};

static const imc_gen_data_t imc_gen_data_ivb = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0x80,
	.igd_tohm_low_offset = 0x84,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 20,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};

static const imc_gen_data_t imc_gen_data_has_brd = {
	.igd_max_sockets = 4,
	.igd_max_imcs = 2,
	.igd_max_channels = 4,
	.igd_max_dimms = 3,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1,
	    IMC_REG_MC_MTR2 },
	.igd_mcmtr_offset = 0x7c,
	.igd_tolm_offset = 0xd0,
	.igd_tohm_low_offset = 0xd4,
	.igd_tohm_hi_offset = 0xd8,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 20,
	.igd_sad_nodeid_offset = 0x40,
	.igd_tad_nrules = 12,
	.igd_tad_rule_offset = 0x40,
	.igd_tad_chan_offset = 0x90,
	.igd_tad_sysdef = 0x80,
	.igd_tad_sysdef2 = 0x84,
	.igd_mc_mirror = 0xac,
	.igd_rir_nways = 5,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 8,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xd0,
};

static const imc_gen_data_t imc_gen_data_skx = {
	.igd_max_sockets = 8,
	.igd_max_imcs = 2,
	.igd_max_channels = 3,
	.igd_max_dimms = 2,
	.igd_max_ranks = IMC_MTR_DDR_RANKS_MAX_HAS_SKX,
	.igd_mtr_offsets = { IMC_REG_MC_MTR0, IMC_REG_MC_MTR1 },
	.igd_mcmtr_offset = 0x87c,
	.igd_topo_offset = 0x88,
	.igd_tolm_offset = 0xd0,
	.igd_tohm_low_offset = 0xd4,
	.igd_tohm_hi_offset = 0xd8,
	.igd_sad_dram_offset = 0x60,
	.igd_sad_ndram_rules = 24,
	.igd_sad_nodeid_offset = 0xc0,
	.igd_tad_nrules = 8,
	.igd_tad_rule_offset = 0x850,
	.igd_tad_chan_offset = 0x90,
	.igd_rir_nways = 4,
	.igd_rir_way_offset = 0x108,
	.igd_rir_nileaves = 4,
	.igd_rir_ileave_offset = 0x120,
	.igd_ubox_cpubusno_offset = 0xcc,
};

/*
 * This table contains all of the devices that we're looking for from a stub
 * perspective. These are organized by generation. Different generations behave
 * in slightly different ways. For example, Sandy Bridge through Broadwell use
 * unique PCI IDs for each PCI device/function combination that appears,
 * whereas Skylake based systems use the same PCI ID and instead the different
 * device/function values indicate that the IDs are used for different purposes.
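 *
 * For example, in the table below, Broadwell's IMC 0 Channel 0 has its own
 * device ID (0x6faa at device 19, function 2), while Skylake uses ID 0x2040
 * at both 10/0 and 12/0 to cover the two memory controllers.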
 */
/* BEGIN CSTYLED */
static const imc_stub_table_t imc_stub_table[] = {
	/* Sandy Bridge */
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN0, 0x3ca8, 15, 0, "IMC 0 Main 0" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_MAIN1, 0x3c71, 15, 1, "IMC 0 Main 1" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL0, 0x3caa, 15, 2, "IMC 0 Channel 0 Info" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL1, 0x3cab, 15, 3, "IMC 0 Channel 1 Info" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL2, 0x3cac, 15, 4, "IMC 0 Channel 2 Info" },
	{ IMC_GEN_SANDY, IMC_TYPE_MC0_CHANNEL3, 0x3cad, 15, 5, "IMC 0 Channel 3 Info" },
	{ IMC_GEN_SANDY, IMC_TYPE_SAD_DRAM, 0x3cf4, 12, 6, "SAD DRAM Rules" },
	{ IMC_GEN_SANDY, IMC_TYPE_SAD_MMIO, 0x3cf5, 13, 6, "SAD MMIO Rules" },
	{ IMC_GEN_SANDY, IMC_TYPE_SAD_MISC, 0x3cf6, 12, 7, "SAD Memory Map" },
	{ IMC_GEN_SANDY, IMC_TYPE_UBOX, 0x3ce0, 11, 0, "UBox" },
	{ IMC_GEN_SANDY, IMC_TYPE_UBOX_CPUBUSNO, 0x3ce3, 11, 3, "UBox Scratch" },
	{ IMC_GEN_SANDY, IMC_TYPE_HA0, 0x3ca0, 14, 0, "Home Agent" },
	/* Ivy Bridge */
	{ IMC_GEN_IVY, IMC_TYPE_MC0_MAIN0, 0x0ea8, 15, 0, "IMC 0 Main 0" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_MAIN1, 0x0e71, 15, 1, "IMC 0 Main 1" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL0, 0x0eaa, 15, 2, "IMC 0 Channel 0 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL1, 0x0eab, 15, 3, "IMC 0 Channel 1 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL2, 0x0eac, 15, 4, "IMC 0 Channel 2 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC0_CHANNEL3, 0x0ead, 15, 5, "IMC 0 Channel 3 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_MAIN0, 0x0e68, 29, 0, "IMC 1 Main 0" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_MAIN1, 0x0e79, 29, 1, "IMC 1 Main 1" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL0, 0x0e6a, 29, 2, "IMC 1 Channel 0 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL1, 0x0e6b, 29, 3, "IMC 1 Channel 1 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL2, 0x0e6c, 29, 4, "IMC 1 Channel 2 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_MC1_CHANNEL3, 0x0e6d, 29, 5, "IMC 1 Channel 3 Info" },
	{ IMC_GEN_IVY, IMC_TYPE_SAD_DRAM, 0x0ec8, 22, 0, "SAD DRAM Rules" },
	{ IMC_GEN_IVY, IMC_TYPE_SAD_MMIO, 0x0ec9, 22, 1, "SAD MMIO Rules" },
	{ IMC_GEN_IVY, IMC_TYPE_SAD_MISC, 0x0eca, 22, 2, "SAD Memory Map" },
	{ IMC_GEN_IVY, IMC_TYPE_UBOX, 0x0e1e, 11, 0, "UBox" },
	{ IMC_GEN_IVY, IMC_TYPE_UBOX_CPUBUSNO, 0x0e1f, 11, 3, "UBox Scratch" },
	{ IMC_GEN_IVY, IMC_TYPE_HA0, 0x0ea0, 14, 0, "Home Agent 0" },
	{ IMC_GEN_IVY, IMC_TYPE_HA1, 0x0e60, 28, 0, "Home Agent 1" },
	/* Haswell */
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN0, 0x2fa8, 19, 0, "IMC 0 Main 0" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_MAIN1, 0x2f71, 19, 1, "IMC 0 Main 1" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL0, 0x2faa, 19, 2, "IMC 0 Channel 0 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL1, 0x2fab, 19, 3, "IMC 0 Channel 1 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL2, 0x2fac, 19, 4, "IMC 0 Channel 2 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC0_CHANNEL3, 0x2fad, 19, 5, "IMC 0 Channel 3 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN0, 0x2f68, 22, 0, "IMC 1 Main 0" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_MAIN1, 0x2f79, 22, 1, "IMC 1 Main 1" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL0, 0x2f6a, 22, 2, "IMC 1 Channel 0 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL1, 0x2f6b, 22, 3, "IMC 1 Channel 1 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL2, 0x2f6c, 22, 4, "IMC 1 Channel 2 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_MC1_CHANNEL3, 0x2f6d, 22, 5, "IMC 1 Channel 3 Info" },
	{ IMC_GEN_HASWELL, IMC_TYPE_SAD_DRAM, 0x2ffc, 15, 4, "SAD DRAM Rules" },
	{ IMC_GEN_HASWELL, IMC_TYPE_SAD_MMIO, 0x2ffd, 15, 5, "SAD MMIO Rules" },
	{ IMC_GEN_HASWELL, IMC_TYPE_VTD_MISC, 0x2f28, 5, 0, "Misc. Virtualization" },
	{ IMC_GEN_HASWELL, IMC_TYPE_UBOX, 0x2f1e, 16, 5, "UBox" },
	{ IMC_GEN_HASWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x2f1f, 16, 7, "UBox Scratch" },
	{ IMC_GEN_HASWELL, IMC_TYPE_HA0, 0x2fa0, 18, 0, "Home Agent 0" },
	{ IMC_GEN_HASWELL, IMC_TYPE_HA1, 0x2f60, 18, 4, "Home Agent 1" },
	/* Broadwell Devices */
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN0, 0x6fa8, 19, 0, "IMC 0 Main 0" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_MAIN1, 0x6f71, 19, 1, "IMC 0 Main 1" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL0, 0x6faa, 19, 2, "IMC 0 Channel 0 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL1, 0x6fab, 19, 3, "IMC 0 Channel 1 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL2, 0x6fac, 19, 4, "IMC 0 Channel 2 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC0_CHANNEL3, 0x6fad, 19, 5, "IMC 0 Channel 3 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN0, 0x6f68, 22, 0, "IMC 1 Main 0" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_MAIN1, 0x6f79, 22, 1, "IMC 1 Main 1" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL0, 0x6f6a, 22, 2, "IMC 1 Channel 0 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL1, 0x6f6b, 22, 3, "IMC 1 Channel 1 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL2, 0x6f6c, 22, 4, "IMC 1 Channel 2 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_MC1_CHANNEL3, 0x6f6d, 22, 5, "IMC 1 Channel 3 Info" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_SAD_DRAM, 0x6ffc, 15, 4, "SAD DRAM Rules" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_SAD_MMIO, 0x6ffd, 15, 5, "SAD MMIO Rules" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_VTD_MISC, 0x6f28, 5, 0, "Misc. Virtualization" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_UBOX, 0x6f1e, 16, 5, "UBox" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_UBOX_CPUBUSNO, 0x6f1f, 16, 7, "UBox Scratch" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_HA0, 0x6fa0, 18, 0, "Home Agent 0" },
	{ IMC_GEN_BROADWELL, IMC_TYPE_HA1, 0x6f60, 18, 4, "Home Agent 1" },
	/* Skylake and Cascade Lake Devices */
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_M2M, 0x2066, 8, 0, "IMC 0 M2M" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_M2M, 0x2066, 9, 0, "IMC 1 M2M" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_MAIN0, 0x2040, 10, 0, "IMC 0 Main / Channel 0" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_MAIN0, 0x2040, 12, 0, "IMC 1 Main / Channel 0" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL1, 0x2044, 10, 4, "IMC 0 Channel 1" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC0_CHANNEL2, 0x2048, 11, 0, "IMC 0 Channel 2" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL1, 0x2044, 12, 4, "IMC 1 Channel 1" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_MC1_CHANNEL2, 0x2048, 13, 0, "IMC 1 Channel 2" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_DRAM, 0x2054, 29, 0, "SAD DRAM Rules" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MMIO, 0x2055, 29, 1, "SAD MMIO Rules" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_VTD_MISC, 0x2024, 5, 0, "Misc. Virtualization" },

	/*
	 * There is one SAD MC Route type device per core! Because of this, a
	 * wide array of devices and functions are allocated. For now, we list
	 * them all out.
	 */
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 0, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 1, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 2, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 3, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 4, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 5, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 6, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 14, 7, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 0, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 1, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 2, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 3, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 4, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 5, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 6, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 15, 7, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 0, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 1, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 2, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 3, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 4, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 5, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 6, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 16, 7, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 0, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 1, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 2, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 3, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 4, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 5, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 6, "Per-Core SAD" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_SAD_MCROUTE, 0x208e, 17, 7, "Per-Core SAD" },

	{ IMC_GEN_SKYLAKE, IMC_TYPE_UBOX, 0x2014, 8, 0, "UBox" },
	{ IMC_GEN_SKYLAKE, IMC_TYPE_UBOX_CPUBUSNO, 0x2016, 8, 2, "DECS" },
};
/* END CSTYLED */

#define	IMC_PCI_VENDOR_INTC	0x8086

/*
 * Our IMC data is global and statically set up during a combination of
 * _init(9E) and attach(9E). While we have a module dependency between the PCI
 * stub driver, imcstub, and this pseudo-driver, imc, the dependencies don't
 * guarantee that the imc driver has finished attaching. As such we make sure
 * that it can operate without it being attached in any way.
 */
static imc_t *imc_data = NULL;

/*
 * By default we should not allow the stubs to detach as we don't have a good
 * way of forcing them to attach again. This is provided in case someone does
 * want to allow the driver to unload.
 */
int imc_allow_detach = 0;
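
/*
 * For example, an administrator could permit detach by adding the following
 * line to /etc/system and rebooting (illustrative only):
 *
 *	set imc:imc_allow_detach = 1
 */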

static void
imc_set_gen_data(imc_t *imc)
{
	switch (imc->imc_gen) {
	case IMC_GEN_SANDY:
		imc->imc_gen_data = &imc_gen_data_snb;
		break;
	case IMC_GEN_IVY:
		imc->imc_gen_data = &imc_gen_data_ivb;
		break;
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
		imc->imc_gen_data = &imc_gen_data_has_brd;
		break;
	case IMC_GEN_SKYLAKE:
		imc->imc_gen_data = &imc_gen_data_skx;
		break;
	default:
		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
		    "set to unknown generation: %u", imc->imc_gen);
	}
}

/*
 * If our device (dev_info_t) does not have a non-zero unit address, then
 * devfsadmd will not pay attention to us at all. Therefore we need to set the
 * unit address below, before we create minor nodes.
 *
 * The rest of the system expects us to have one minor node per socket. The
 * minor node ID should be the ID of the socket.
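 *
 * With the unit address set to "1" below, the nodes end up with paths along
 * the lines of /devices/pseudo/imc@1:mc-imc-0 (an illustrative example, not a
 * guaranteed path).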
 */
static boolean_t
imc_create_minors(imc_t *imc)
{
	uint_t i;

	ddi_set_name_addr(imc->imc_dip, "1");
	for (i = 0; i < imc->imc_nsockets; i++) {
		char buf[MAXNAMELEN];

		if (snprintf(buf, sizeof (buf), "mc-imc-%u", i) >=
		    sizeof (buf)) {
			goto fail;
		}

		if (ddi_create_minor_node(imc->imc_dip, buf, S_IFCHR, i,
		    "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
			dev_err(imc->imc_dip, CE_WARN, "failed to create "
			    "minor node %u: %s", i, buf);
			goto fail;
		}
	}
	return (B_TRUE);

fail:
	ddi_remove_minor_node(imc->imc_dip, NULL);
	return (B_FALSE);
}

/*
 * Check the current MC route value for this SAD. On Skylake systems there is
 * one per core. Every core should agree. If they don't, we will not trust the
 * SAD MCROUTE values and this will cause system address decoding to fail on
 * Skylake.
 */
static void
imc_mcroute_check(imc_t *imc, imc_sad_t *sad, imc_stub_t *stub)
{
	uint32_t val;

	val = pci_config_get32(stub->istub_cfgspace,
	    IMC_REG_SKX_SAD_MC_ROUTE_TABLE);
	if (val == PCI_EINVAL32) {
		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
		return;
	}

	if ((sad->isad_flags & IMC_SAD_MCROUTE_VALID) == 0 && val != 0) {
		sad->isad_flags |= IMC_SAD_MCROUTE_VALID;
		sad->isad_mcroute.ismc_raw_mcroute = val;
		return;
	}

	/*
	 * Occasionally we see MC ROUTE table entries with a value of zero.
	 * We should ignore those for now.
	 */
	if (val != sad->isad_mcroute.ismc_raw_mcroute && val != 0) {
		dev_err(imc->imc_dip, CE_WARN, "SAD MC_ROUTE_TABLE mismatch "
		    "with socket. SAD has val 0x%x, system has 0x%x",
		    val, sad->isad_mcroute.ismc_raw_mcroute);
		sad->isad_valid |= IMC_SAD_V_BAD_MCROUTE;
	}
}

/*
 * On Skylake, many of the devices that we care about are on separate PCI
 * buses. These can be mapped together by the DECS register. However, we need
 * to know how to map different buses together so that we can more usefully
 * associate information. The set of buses is all present in the DECS register,
 * so we'll effectively assign sockets to buses. This also comes up on
 * pre-Skylake systems.
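 *
 * The IMC_UBOX_CPUBUSNO_n() macros (see imc.h) extract the per-bus fields from
 * the 32-bit register; on Skylake each bus number is assumed to occupy its own
 * byte, roughly (busno >> (8 * n)) & 0xff, though imc.h and the EDS are
 * authoritative on the exact layout.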
 */
static boolean_t
imc_map_buses(imc_t *imc)
{
	imc_stub_t *stub;
	uint_t nsock;

	/*
	 * Find the UBOX_DECS registers so we can establish socket mappings. On
	 * Skylake, there are three different sets of buses that we need to
	 * cover all of our devices, while there are only two before that.
	 */
	for (nsock = 0, stub = avl_first(&imc->imc_stubs); stub != NULL;
	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
		uint32_t busno;

		if (stub->istub_table->imcs_type != IMC_TYPE_UBOX_CPUBUSNO) {
			continue;
		}

		busno = pci_config_get32(stub->istub_cfgspace,
		    imc->imc_gen_data->igd_ubox_cpubusno_offset);
		if (busno == PCI_EINVAL32) {
			dev_err(imc->imc_dip, CE_WARN, "failed to read "
			    "UBOX_DECS CPUBUSNO0: invalid PCI read");
			return (B_FALSE);
		}

		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
			imc->imc_sockets[nsock].isock_nbus = 3;
			imc->imc_sockets[nsock].isock_bus[0] =
			    IMC_UBOX_CPUBUSNO_0(busno);
			imc->imc_sockets[nsock].isock_bus[1] =
			    IMC_UBOX_CPUBUSNO_1(busno);
			imc->imc_sockets[nsock].isock_bus[2] =
			    IMC_UBOX_CPUBUSNO_2(busno);
		} else {
			imc->imc_sockets[nsock].isock_bus[0] =
			    IMC_UBOX_CPUBUSNO_0(busno);
			imc->imc_sockets[nsock].isock_bus[1] =
			    IMC_UBOX_CPUBUSNO_1(busno);
			imc->imc_sockets[nsock].isock_nbus = 2;
		}
		nsock++;
	}
	imc->imc_nsockets = nsock;

	return (B_TRUE);
}

/*
 * For a given stub that we've found, map it to its corresponding socket based
 * on the PCI bus that it has.
 */
static imc_socket_t *
imc_map_find_socket(imc_t *imc, imc_stub_t *stub)
{
	uint_t i;

	for (i = 0; i < imc->imc_nsockets; i++) {
		uint_t bus;

		for (bus = 0; bus < imc->imc_sockets[i].isock_nbus; bus++) {
			if (imc->imc_sockets[i].isock_bus[bus] ==
			    stub->istub_bus) {
				return (&imc->imc_sockets[i]);
			}
		}
	}

	return (NULL);
}

static boolean_t
imc_map_stubs(imc_t *imc)
{
	imc_stub_t *stub;

	if (!imc_map_buses(imc)) {
		return (B_FALSE);
	}

	for (stub = avl_first(&imc->imc_stubs); stub != NULL;
	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
		imc_socket_t *sock = imc_map_find_socket(imc, stub);

		if (sock == NULL) {
			dev_err(imc->imc_dip, CE_WARN, "found stub type %u "
			    "PCI%x,%x with bdf %u/%u/%u that does not match a "
			    "known PCI bus for any of %u sockets",
			    stub->istub_table->imcs_type, stub->istub_vid,
			    stub->istub_did, stub->istub_bus, stub->istub_dev,
			    stub->istub_func, imc->imc_nsockets);
			continue;
		}

		/*
		 * We don't have to worry about duplicates here, as we check
		 * to make sure that we have unique bdfs.
		 */
		switch (stub->istub_table->imcs_type) {
		case IMC_TYPE_MC0_M2M:
			sock->isock_imcs[0].icn_m2m = stub;
			break;
		case IMC_TYPE_MC1_M2M:
			sock->isock_imcs[1].icn_m2m = stub;
			break;
		case IMC_TYPE_MC0_MAIN0:
			sock->isock_nimc++;
			sock->isock_imcs[0].icn_main0 = stub;

			/*
			 * On Skylake, the MAIN0 does double duty as channel
			 * zero and as the TAD.
			 */
			if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
				sock->isock_imcs[0].icn_nchannels++;
				sock->isock_imcs[0].icn_channels[0].ich_desc =
				    stub;
				sock->isock_tad[0].itad_stub = stub;
				sock->isock_ntad++;
			}
			break;
		case IMC_TYPE_MC0_MAIN1:
			sock->isock_imcs[0].icn_main1 = stub;
			break;
		case IMC_TYPE_MC1_MAIN0:
			sock->isock_nimc++;
			sock->isock_imcs[1].icn_main0 = stub;

			/*
			 * On Skylake, the MAIN0 does double duty as channel
			 * zero and as the TAD.
			 */
			if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
				sock->isock_imcs[1].icn_nchannels++;
				sock->isock_imcs[1].icn_channels[0].ich_desc =
				    stub;
				sock->isock_tad[1].itad_stub = stub;
				sock->isock_ntad++;
			}
			break;
		case IMC_TYPE_MC1_MAIN1:
			sock->isock_imcs[1].icn_main1 = stub;
			break;
		case IMC_TYPE_MC0_CHANNEL0:
			sock->isock_imcs[0].icn_nchannels++;
			sock->isock_imcs[0].icn_channels[0].ich_desc = stub;
			break;
		case IMC_TYPE_MC0_CHANNEL1:
			sock->isock_imcs[0].icn_nchannels++;
			sock->isock_imcs[0].icn_channels[1].ich_desc = stub;
			break;
		case IMC_TYPE_MC0_CHANNEL2:
			sock->isock_imcs[0].icn_nchannels++;
			sock->isock_imcs[0].icn_channels[2].ich_desc = stub;
			break;
		case IMC_TYPE_MC0_CHANNEL3:
			sock->isock_imcs[0].icn_nchannels++;
			sock->isock_imcs[0].icn_channels[3].ich_desc = stub;
			break;
		case IMC_TYPE_MC1_CHANNEL0:
			sock->isock_imcs[1].icn_nchannels++;
			sock->isock_imcs[1].icn_channels[0].ich_desc = stub;
			break;
		case IMC_TYPE_MC1_CHANNEL1:
			sock->isock_imcs[1].icn_nchannels++;
			sock->isock_imcs[1].icn_channels[1].ich_desc = stub;
			break;
		case IMC_TYPE_MC1_CHANNEL2:
			sock->isock_imcs[1].icn_nchannels++;
			sock->isock_imcs[1].icn_channels[2].ich_desc = stub;
			break;
		case IMC_TYPE_MC1_CHANNEL3:
			sock->isock_imcs[1].icn_nchannels++;
			sock->isock_imcs[1].icn_channels[3].ich_desc = stub;
			break;
		case IMC_TYPE_SAD_DRAM:
			sock->isock_sad.isad_dram = stub;
			break;
		case IMC_TYPE_SAD_MMIO:
			sock->isock_sad.isad_mmio = stub;
			break;
		case IMC_TYPE_SAD_MISC:
			sock->isock_sad.isad_tolh = stub;
			break;
		case IMC_TYPE_VTD_MISC:
			/*
			 * Some systems have multiple VT-D Misc. entry points
			 * in the system. In this case, only use the first one
			 * we find.
			 */
			if (imc->imc_gvtd_misc == NULL) {
				imc->imc_gvtd_misc = stub;
			}
			break;
		case IMC_TYPE_SAD_MCROUTE:
			ASSERT3U(imc->imc_gen, >=, IMC_GEN_SKYLAKE);
			imc_mcroute_check(imc, &sock->isock_sad, stub);
			break;
		case IMC_TYPE_UBOX:
			sock->isock_ubox = stub;
			break;
		case IMC_TYPE_HA0:
			sock->isock_ntad++;
			sock->isock_tad[0].itad_stub = stub;
			break;
		case IMC_TYPE_HA1:
			sock->isock_ntad++;
			sock->isock_tad[1].itad_stub = stub;
			break;
		case IMC_TYPE_UBOX_CPUBUSNO:
			sock->isock_cpubusno = stub;
			break;
		default:
			/*
			 * Attempt to still attach if we can.
			 */
			dev_err(imc->imc_dip, CE_WARN, "Encountered unknown "
			    "IMC type (%u) on PCI %x,%x",
			    stub->istub_table->imcs_type,
			    stub->istub_vid, stub->istub_did);
			break;
		}
	}

	return (B_TRUE);
}

/*
 * Go through and fix up various aspects of the stub mappings on systems. The
 * following is a list of what we need to fix up:
 *
 *  1. On Haswell and newer systems, there is only one global VT-d device. We
 *     need to go back and map that to all of the per-socket imc_sad_t entries.
 */
static void
imc_fixup_stubs(imc_t *imc)
{
	if (imc->imc_gen >= IMC_GEN_HASWELL) {
		uint_t i;

		for (i = 0; i < imc->imc_nsockets; i++) {
			ASSERT3P(imc->imc_sockets[i].isock_sad.isad_tolh,
			    ==, NULL);
			imc->imc_sockets[i].isock_sad.isad_tolh =
			    imc->imc_gvtd_misc;
		}
	}
}

/*
 * In the wild we've hit a few odd cases where firmware does not expose all of
 * the devices that we might expect. In particular, we've seen and validated
 * the following case:
 *
 *  o We don't find all of the channel devices that we expect, e.g. we have the
 *    stubs for channels 1-3, but not 0. That has been seen on an Intel S2600CW
 *    with an E5-2630v3.
 */
static boolean_t
imc_validate_stubs(imc_t *imc)
{
	for (uint_t sock = 0; sock < imc->imc_nsockets; sock++) {
		imc_socket_t *socket = &imc->imc_sockets[sock];

		for (uint_t mc = 0; mc < socket->isock_nimc; mc++) {
			imc_mc_t *mcp = &socket->isock_imcs[mc];

			for (uint_t chan = 0; chan < mcp->icn_nchannels;
			    chan++) {
				if (mcp->icn_channels[chan].ich_desc == NULL) {
					dev_err(imc->imc_dip, CE_WARN,
					    "!missing device for socket %u/"
					    "imc %u/channel %u", sock, mc,
					    chan);
					return (B_FALSE);
				}
			}
		}
	}

	return (B_TRUE);
}

/*
 * Attempt to map all of the discovered sockets to the corresponding APIC based
 * socket. We do these mappings by getting the node id of the socket and
 * adjusting it to make sure that no home agent is present in it. We use the
 * UBOX to avoid any home agent related bits that are present in other
 * registers.
 */
static void
imc_map_sockets(imc_t *imc)
{
	uint_t i;

	for (i = 0; i < imc->imc_nsockets; i++) {
		uint32_t nodeid;
		ddi_acc_handle_t h;

		h = imc->imc_sockets[i].isock_ubox->istub_cfgspace;
		nodeid = pci_config_get32(h,
		    imc->imc_gen_data->igd_sad_nodeid_offset);
		if (nodeid == PCI_EINVAL32) {
			imc->imc_sockets[i].isock_valid |=
			    IMC_SOCKET_V_BAD_NODEID;
			continue;
		}

		imc->imc_sockets[i].isock_nodeid = IMC_NODEID_UBOX_MASK(nodeid);
		imc->imc_spointers[nodeid] = &imc->imc_sockets[i];
	}
}

/*
 * Decode the MTR, accounting for variances between processor generations.
 */
static void
imc_decode_mtr(imc_t *imc, imc_mc_t *icn, imc_dimm_t *dimm, uint32_t mtr)
{
	uint8_t disable;

	/*
	 * Check present first, before worrying about anything else.
	 */
	if (imc->imc_gen < IMC_GEN_SKYLAKE &&
	    IMC_MTR_PRESENT_SNB_BRD(mtr) == 0) {
		dimm->idimm_present = B_FALSE;
		return;
	} else if (imc->imc_gen >= IMC_GEN_SKYLAKE &&
	    IMC_MTR_PRESENT_SKYLAKE(mtr) == 0) {
		dimm->idimm_present = B_FALSE;
		return;
	}

	dimm->idimm_present = B_TRUE;
	dimm->idimm_ncolumns = IMC_MTR_CA_WIDTH(mtr) + IMC_MTR_CA_BASE;
	if (dimm->idimm_ncolumns < IMC_MTR_CA_MIN ||
	    dimm->idimm_ncolumns > IMC_MTR_CA_MAX) {
		dimm->idimm_valid |= IMC_DIMM_V_BAD_COLUMNS;
	}

	dimm->idimm_nrows = IMC_MTR_RA_WIDTH(mtr) + IMC_MTR_RA_BASE;
	if (dimm->idimm_nrows < IMC_MTR_RA_MIN ||
	    dimm->idimm_nrows > IMC_MTR_RA_MAX) {
		dimm->idimm_valid |= IMC_DIMM_V_BAD_ROWS;
	}

	/*
	 * Determine density. This information is not present on Sandy Bridge.
	 */
	switch (imc->imc_gen) {
	case IMC_GEN_IVY:
		dimm->idimm_density = 1U << IMC_MTR_DENSITY_IVY_BRD(mtr);
		break;
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
		switch (IMC_MTR_DENSITY_IVY_BRD(mtr)) {
		case 0:
		default:
			dimm->idimm_density = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
			break;
		case 1:
			dimm->idimm_density = 2;
			break;
		case 2:
			dimm->idimm_density = 4;
			break;
		case 3:
			dimm->idimm_density = 8;
			break;
		}
		break;
	case IMC_GEN_SKYLAKE:
		switch (IMC_MTR_DENSITY_SKX(mtr)) {
		case 0:
		default:
			dimm->idimm_density = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_DENSITY;
			break;
		case 1:
			dimm->idimm_density = 2;
			break;
		case 2:
			dimm->idimm_density = 4;
			break;
		case 3:
			dimm->idimm_density = 8;
			break;
		case 4:
			dimm->idimm_density = 16;
			break;
		case 5:
			dimm->idimm_density = 12;
			break;
		}
		break;
	case IMC_GEN_UNKNOWN:
	case IMC_GEN_SANDY:
		dimm->idimm_density = 0;
		break;
	}

	/*
	 * The values of width are the same on IVY->SKX, but the bits are
	 * different. This doesn't exist on SNB.
	 */
	if (imc->imc_gen > IMC_GEN_SANDY) {
		uint8_t width;

		if (imc->imc_gen >= IMC_GEN_BROADWELL) {
			width = IMC_MTR_WIDTH_BRD_SKX(mtr);
		} else {
			width = IMC_MTR_WIDTH_IVB_HAS(mtr);
		}
		switch (width) {
		case 0:
			dimm->idimm_width = 4;
			break;
		case 1:
			dimm->idimm_width = 8;
			break;
		case 2:
			dimm->idimm_width = 16;
			break;
		default:
			dimm->idimm_width = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_WIDTH;
			break;
		}
	} else {
		dimm->idimm_width = 0;
	}

	dimm->idimm_nranks = 1 << IMC_MTR_DDR_RANKS(mtr);
	switch (imc->imc_gen) {
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
	case IMC_GEN_SKYLAKE:
		if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX_HAS_SKX) {
			dimm->idimm_nranks = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
		}
		break;
	default:
		if (dimm->idimm_nranks > IMC_MTR_DDR_RANKS_MAX) {
			dimm->idimm_nranks = 0;
			dimm->idimm_valid |= IMC_DIMM_V_BAD_RANKS;
		}
	}

	disable = IMC_MTR_RANK_DISABLE(mtr);
	dimm->idimm_ranks_disabled[0] = (disable & 0x1) != 0;
	dimm->idimm_ranks_disabled[1] = (disable & 0x2) != 0;
	dimm->idimm_ranks_disabled[2] = (disable & 0x4) != 0;
	dimm->idimm_ranks_disabled[3] = (disable & 0x8) != 0;

	/*
	 * Only Haswell and later have this information.
	 */
	if (imc->imc_gen >= IMC_GEN_HASWELL) {
		dimm->idimm_hdrl = IMC_MTR_HDRL_HAS_SKX(mtr) != 0;
		dimm->idimm_hdrl_parity = IMC_MTR_HDRL_PARITY_HAS_SKX(mtr) != 0;
		dimm->idimm_3dsranks = IMC_MTR_3DSRANKS_HAS_SKX(mtr);
		if (dimm->idimm_3dsranks != 0) {
			dimm->idimm_3dsranks = 1 << dimm->idimm_3dsranks;
		}
	}

	if (icn->icn_dimm_type == IMC_DIMM_DDR4) {
		dimm->idimm_nbanks = 16;
	} else {
		dimm->idimm_nbanks = 8;
	}

	/*
	 * To calculate the DIMM size we first take the number of row and
	 * column bits, which gives us the number of addresses per bank. A
	 * rank has nbanks of those, and the DIMM has nranks of those. Each
	 * such address holds 8 bytes (a 64-bit data word).
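	 *
	 * For example, a DDR4 DIMM (16 banks) with 10 column bits, 15 row
	 * bits, and 2 ranks works out to 16 * 2 * 8 * 2^(10 + 15) bytes,
	 * i.e. 8 GiB.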
	 */
	dimm->idimm_size = dimm->idimm_nbanks * dimm->idimm_nranks * 8 *
	    (1ULL << (dimm->idimm_ncolumns + dimm->idimm_nrows));
}

static void
imc_fill_dimms(imc_t *imc, imc_mc_t *icn, imc_channel_t *chan)
{
	uint_t i;

	/*
	 * There's one register for each DIMM that might be present; we always
	 * read it to determine information about the DIMMs.
	 */
	chan->ich_ndimms = imc->imc_gen_data->igd_max_dimms;
	for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
		uint32_t mtr;
		imc_dimm_t *dimm = &chan->ich_dimms[i];

		bzero(dimm, sizeof (imc_dimm_t));
		mtr = pci_config_get32(chan->ich_desc->istub_cfgspace,
		    imc->imc_gen_data->igd_mtr_offsets[i]);
		dimm->idimm_mtr = mtr;
		/*
		 * We don't really expect to get a bad PCIe read. However, if we
		 * do, treat that for the moment as though the DIMM is bad.
		 */
		if (mtr == PCI_EINVAL32) {
			dimm->idimm_valid |= IMC_DIMM_V_BAD_PCI_READ;
			continue;
		}

		imc_decode_mtr(imc, icn, dimm, mtr);
	}
}

static boolean_t
imc_fill_controller(imc_t *imc, imc_mc_t *icn)
{
	uint32_t mcmtr;

	mcmtr = pci_config_get32(icn->icn_main0->istub_cfgspace,
	    imc->imc_gen_data->igd_mcmtr_offset);
	if (mcmtr == PCI_EINVAL32) {
		icn->icn_invalid = B_TRUE;
		return (B_FALSE);
	}

	icn->icn_closed = IMC_MCMTR_CLOSED_PAGE(mcmtr) != 0;
	if (imc->imc_gen < IMC_GEN_SKYLAKE) {
		icn->icn_lockstep = IMC_MCMTR_LOCKSTEP(mcmtr) != 0;
	} else {
		icn->icn_lockstep = B_FALSE;
	}

	icn->icn_ecc = IMC_MCMTR_ECC_ENABLED(mcmtr) != 0;

	/*
	 * SNB and IVB only support DDR3. Haswell and Broadwell may support
	 * DDR4, depending on the SKU. Skylake only supports DDR4.
	 */
	switch (imc->imc_gen) {
	case IMC_GEN_SANDY:
	case IMC_GEN_IVY:
		icn->icn_dimm_type = IMC_DIMM_DDR3;
		break;
	case IMC_GEN_HASWELL:
	case IMC_GEN_BROADWELL:
		if (IMC_MCMTR_DDR4_HAS_BRD(mcmtr)) {
			icn->icn_dimm_type = IMC_DIMM_DDR4;
		} else {
			icn->icn_dimm_type = IMC_DIMM_DDR3;
		}
		break;
	default:
		/*
		 * Skylake and on are all DDR4.
		 */
		icn->icn_dimm_type = IMC_DIMM_DDR4;
		break;
	}

	if (imc->imc_gen >= IMC_GEN_SKYLAKE && icn->icn_m2m != NULL) {
		icn->icn_topo = pci_config_get32(icn->icn_m2m->istub_cfgspace,
		    imc->imc_gen_data->igd_topo_offset);
	}

	return (B_TRUE);
}

/*
 * Walk the IMC data and fill in the information on DIMMs and the memory
 * controller configurations.
 */
static void
imc_fill_data(imc_t *imc)
{
	uint_t csock, cmc, cchan;

	for (csock = 0; csock < imc->imc_nsockets; csock++) {
		imc_socket_t *sock = &imc->imc_sockets[csock];

		for (cmc = 0; cmc < sock->isock_nimc; cmc++) {
			imc_mc_t *icn = &sock->isock_imcs[cmc];

			if (!imc_fill_controller(imc, icn))
				continue;

			for (cchan = 0; cchan < icn->icn_nchannels; cchan++) {
				imc_fill_dimms(imc, icn,
				    &icn->icn_channels[cchan]);
			}
		}
	}
}

static nvlist_t *
imc_nvl_create_dimm(imc_t *imc, imc_dimm_t *dimm)
{
	nvlist_t *nvl;

	nvl = fnvlist_alloc();
	fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_PRESENT,
	    dimm->idimm_present);
	if (!dimm->idimm_present) {
		return (nvl);
	}

	fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_SIZE, dimm->idimm_size);
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NCOLS,
	    dimm->idimm_ncolumns);
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_NROWS,
	    dimm->idimm_nrows);

	if (imc->imc_gen > IMC_GEN_SANDY) {
		fnvlist_add_uint64(nvl, MCINTEL_NVLIST_V1_DIMM_DENSITY,
		    dimm->idimm_density * (1ULL << 30));
		fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_WIDTH,
		    dimm->idimm_width);
	}
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_RANKS,
	    dimm->idimm_nranks);
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_BANKS,
	    dimm->idimm_nbanks);
	fnvlist_add_boolean_array(nvl, MCINTEL_NVLIST_V1_DIMM_RDIS,
	    dimm->idimm_ranks_disabled, IMC_MAX_RANK_DISABLE);

	if (imc->imc_gen >= IMC_GEN_HASWELL) {
		fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRL,
		    dimm->idimm_hdrl);
		fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_DIMM_HDRLP,
		    dimm->idimm_hdrl_parity);
		if (dimm->idimm_3dsranks > 0) {
			fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_DIMM_3DRANK,
			    dimm->idimm_3dsranks);
		}
	}

	return (nvl);
}

static nvlist_t *
imc_nvl_create_channel(imc_t *imc, imc_channel_t *chan)
{
	nvlist_t *nvl;
	nvlist_t *dimms[IMC_MAX_DIMMPERCHAN];
	uint_t i;

	nvl = fnvlist_alloc();
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_CHAN_NDPC,
	    imc->imc_gen_data->igd_max_dimms);
	for (i = 0; i < imc->imc_gen_data->igd_max_dimms; i++) {
		dimms[i] = imc_nvl_create_dimm(imc, &chan->ich_dimms[i]);
	}

	fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_CHAN_DIMMS,
	    dimms, i);

	for (; i > 0; i--) {
		nvlist_free(dimms[i-1]);
	}

	return (nvl);
}

static nvlist_t *
imc_nvl_create_mc(imc_t *imc, imc_mc_t *icn)
{
	nvlist_t *nvl;
	nvlist_t *channels[IMC_MAX_CHANPERMC];
	uint_t i;

	nvl = fnvlist_alloc();
	fnvlist_add_uint32(nvl, MCINTEL_NVLIST_V1_MC_NCHAN, icn->icn_nchannels);
	fnvlist_add_boolean_value(nvl, MCINTEL_NVLIST_V1_MC_ECC,
	    icn->icn_ecc);
	if (icn->icn_lockstep) {
		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
		    MCINTEL_NVLIST_V1_MC_CHAN_MODE_LOCK);
	} else {
		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_CHAN_MODE,
		    MCINTEL_NVLIST_V1_MC_CHAN_MODE_INDEP);
	}

	if (icn->icn_closed) {
		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
		    MCINTEL_NVLIST_V1_MC_POLICY_CLOSED);
	} else {
		fnvlist_add_string(nvl, MCINTEL_NVLIST_V1_MC_POLICY,
		    MCINTEL_NVLIST_V1_MC_POLICY_OPEN);
	}

	for (i = 0; i < icn->icn_nchannels; i++) {
		channels[i] = imc_nvl_create_channel(imc,
		    &icn->icn_channels[i]);
	}
	fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MC_CHANNELS,
	    channels, icn->icn_nchannels);
	for (i = 0; i < icn->icn_nchannels; i++) {
		nvlist_free(channels[i]);
	}

	return (nvl);
}

static void
imc_nvl_pack(imc_socket_t *sock, boolean_t sleep)
{
	char *buf = NULL;
	size_t len = 0;
	int kmflag;

	if (sock->isock_nvl == NULL)
		return;

	if (sock->isock_buf != NULL)
		return;

	if (sleep) {
		kmflag = KM_SLEEP;
	} else {
		kmflag = KM_NOSLEEP | KM_NORMALPRI;
	}

	if (nvlist_pack(sock->isock_nvl, &buf, &len, NV_ENCODE_XDR,
	    kmflag) != 0) {
		return;
	}

	sock->isock_buf = buf;
	sock->isock_buflen = len;
	sock->isock_gen++;
}

static void
imc_decoder_pack(imc_t *imc)
{
	char *buf = NULL;
	size_t len = 0;

	if (imc->imc_decoder_buf != NULL)
		return;

	if (imc->imc_decoder_dump == NULL) {
		imc->imc_decoder_dump = imc_dump_decoder(imc);
	}

	if (nvlist_pack(imc->imc_decoder_dump, &buf, &len, NV_ENCODE_XDR,
	    KM_NOSLEEP | KM_NORMALPRI) != 0) {
		return;
	}

	imc->imc_decoder_buf = buf;
	imc->imc_decoder_len = len;
}

static void
imc_nvl_create(imc_t *imc)
{
	uint_t csock;

	for (csock = 0; csock < imc->imc_nsockets; csock++) {
		uint_t i;
		nvlist_t *nvl;
		nvlist_t *mcs[IMC_MAX_IMCPERSOCK];
		imc_socket_t *sock = &imc->imc_sockets[csock];

		nvl = fnvlist_alloc();
		fnvlist_add_uint8(nvl, MCINTEL_NVLIST_VERSTR,
		    MCINTEL_NVLIST_VERS1);
		fnvlist_add_uint8(nvl, MCINTEL_NVLIST_V1_NMC,
		    sock->isock_nimc);

		for (i = 0; i < sock->isock_nimc; i++) {
			mcs[i] = imc_nvl_create_mc(imc, &sock->isock_imcs[i]);
		}

		fnvlist_add_nvlist_array(nvl, MCINTEL_NVLIST_V1_MCS,
		    mcs, sock->isock_nimc);

		for (i = 0; i < sock->isock_nimc; i++) {
			nvlist_free(mcs[i]);
		}

		sock->isock_nvl = nvl;
		imc_nvl_pack(sock, B_TRUE);
	}
}

/*
 * Determine the top of low and high memory. These determine whether
 * transaction addresses target main memory or not. Unfortunately, the way that
 * these are stored and fetched changes with different generations.
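 *
 * Conceptually, once both values are read, an address targets main memory if
 * it is below TOLM, or if it is at or above 4 GiB and below TOHM (a sketch of
 * the usual semantics; the shared decoder in imc_decode.c is authoritative).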
1479 */
1480static void
1481imc_sad_read_tohm(imc_t *imc, imc_sad_t *sad)
1482{
1483	uint32_t tolm, tohm_low, tohm_hi;
1484
1485	tolm = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1486	    imc->imc_gen_data->igd_tolm_offset);
1487	tohm_low = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1488	    imc->imc_gen_data->igd_tohm_low_offset);
1489	if (imc->imc_gen_data->igd_tohm_hi_offset != 0) {
1490		tohm_hi = pci_config_get32(sad->isad_tolh->istub_cfgspace,
1491		    imc->imc_gen_data->igd_tohm_hi_offset);
1492	} else {
1493		tohm_hi = 0;
1494	}
1495
1496	if (tolm == PCI_EINVAL32 || tohm_low == PCI_EINVAL32 ||
1497	    tohm_hi == PCI_EINVAL32) {
1498		sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1499		return;
1500	}
1501
1502	switch (imc->imc_gen) {
1503	case IMC_GEN_SANDY:
1504	case IMC_GEN_IVY:
1505		sad->isad_tolm = ((uint64_t)tolm & IMC_TOLM_SNB_IVY_MASK) <<
1506		    IMC_TOLM_SNB_IVY_SHIFT;
1507		sad->isad_tohm = ((uint64_t)tohm_low & IMC_TOHM_SNB_IVY_MASK) <<
1508		    IMC_TOLM_SNB_IVY_SHIFT;
1509		break;
1510	case IMC_GEN_HASWELL:
1511	case IMC_GEN_BROADWELL:
1512	case IMC_GEN_SKYLAKE:
1513		sad->isad_tolm = (uint64_t)tolm & IMC_TOLM_HAS_SKX_MASK;
1514		sad->isad_tohm = ((uint64_t)tohm_low &
1515		    IMC_TOHM_LOW_HAS_SKX_MASK) | ((uint64_t)tohm_hi << 32);
1516
1517		/*
1518		 * Adjust the values to turn them into an exclusive range.
1519		 */
1520		sad->isad_tolm += IMC_TOLM_HAS_SKY_EXCL;
1521		sad->isad_tohm += IMC_TOHM_HAS_SKY_EXCL;
1522		break;
1523	default:
1524		dev_err(imc->imc_dip, CE_PANIC, "imc driver programmer error: "
1525		    "set to unknown generation: %u", imc->imc_gen);
1526		return;
1527	}
1528}
1529
1530static void
1531imc_sad_fill_rule(imc_t *imc, imc_sad_t *sad, imc_sad_rule_t *rule,
1532    uint32_t raw)
1533{
	uint_t attr;
	uint64_t limit;

	bzero(rule, sizeof (imc_sad_rule_t));

1538	rule->isr_raw_dram = raw;
1539	rule->isr_enable = IMC_SAD_DRAM_RULE_ENABLE(raw) != 0;
1540	if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1541		switch (IMC_SAD_DRAM_INTERLEAVE_SNB_BRD(raw)) {
1542		case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6:
1543			rule->isr_imode = IMC_SAD_IMODE_8t6;
1544			break;
1545		case IMC_SAD_DRAM_INTERLEAVE_SNB_BRD_8t6XOR:
1546			rule->isr_imode = IMC_SAD_IMODE_8t6XOR;
1547			break;
1548		}
1549	} else {
1550		switch (IMC_SAD_DRAM_INTERLEAVE_SKX(raw)) {
1551		case IMC_SAD_DRAM_INTERLEAVE_SKX_8t6:
1552			rule->isr_imode = IMC_SAD_IMODE_8t6;
1553			break;
1554		case IMC_SAD_DRAM_INTERLEAVE_SKX_10t8:
1555			rule->isr_imode = IMC_SAD_IMODE_10t8;
1556			break;
1557		case IMC_SAD_DRAM_INTERLEAVE_SKX_14t12:
1558			rule->isr_imode = IMC_SAD_IMODE_14t12;
1559			break;
1560		case IMC_SAD_DRAM_INTERLEAVE_SKX_32t30:
1561			rule->isr_imode = IMC_SAD_IMODE_32t30;
1562			break;
1563		}
1564	}
1565
1566	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1567		attr = IMC_SAD_DRAM_ATTR_SKX(raw);
1568	} else {
1569		attr = IMC_SAD_DRAM_ATTR_SNB_BRD(raw);
1570	}
1571
1572	switch (attr) {
1573	case IMC_SAD_DRAM_ATTR_DRAM:
1574		rule->isr_type = IMC_SAD_TYPE_DRAM;
1575		break;
1576	case IMC_SAD_DRAM_ATTR_MMCFG:
1577		rule->isr_type = IMC_SAD_TYPE_MMCFG;
1578		break;
1579	case IMC_SAD_DRAM_ATTR_NXM:
1580		if (imc->imc_gen < IMC_GEN_SKYLAKE) {
1581			sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1582		}
1583		rule->isr_type = IMC_SAD_TYPE_NXM;
1584		break;
1585	default:
1586		sad->isad_valid |= IMC_SAD_V_BAD_DRAM_ATTR;
1587		break;
1588	}
1589
1590	/*
	 * Fetch the limit, which represents bits 45:26, and then adjust it so
	 * that it is exclusive.
1593	 */
1594	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1595		limit = IMC_SAD_DRAM_LIMIT_SKX(raw);
1596	} else {
1597		limit = IMC_SAD_DRAM_LIMIT_SNB_BRD(raw);
1598	}
1599	rule->isr_limit = (limit << IMC_SAD_DRAM_LIMIT_SHIFT) +
1600	    IMC_SAD_DRAM_LIMIT_EXCLUSIVE;
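
	/*
	 * As a worked example: assuming IMC_SAD_DRAM_LIMIT_SHIFT is 26 (the
	 * field covers bits 45:26) and IMC_SAD_DRAM_LIMIT_EXCLUSIVE is one
	 * 64 MiB granule, a raw limit field of 0x1 decodes to
	 * (0x1 << 26) + (1 << 26), i.e. the rule covers system addresses
	 * strictly below 128 MiB.
	 */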
1601
1602	/*
1603	 * The rest of this does not apply to Sandy Bridge.
1604	 */
1605	if (imc->imc_gen == IMC_GEN_SANDY)
1606		return;
1607
1608	if (imc->imc_gen >= IMC_GEN_IVY && imc->imc_gen < IMC_GEN_SKYLAKE) {
1609		rule->isr_a7mode = IMC_SAD_DRAM_A7_IVB_BRD(raw) != 0;
1610		return;
1611	}
1612
1613	switch (IMC_SAD_DRAM_MOD23_SKX(raw)) {
1614	case IMC_SAD_DRAM_MOD23_MOD3:
1615		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD3;
1616		break;
1617	case IMC_SAD_DRAM_MOD23_MOD2_C01:
1618		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_01;
1619		break;
1620	case IMC_SAD_DRAM_MOD23_MOD2_C12:
1621		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_12;
1622		break;
1623	case IMC_SAD_DRAM_MOD23_MOD2_C02:
1624		rule->isr_mod_type = IMC_SAD_MOD_TYPE_MOD2_02;
1625		break;
1626	}
1627
1628	rule->isr_need_mod3 = IMC_SAD_DRAM_MOD3_SKX(raw) != 0;
	switch (IMC_SAD_DRAM_MOD3_MODE_SKX(raw)) {
1630	case IMC_SAD_DRAM_MOD3_MODE_45t6:
1631		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t6;
1632		break;
1633	case IMC_SAD_DRAM_MOD3_MODE_45t8:
1634		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t8;
1635		break;
1636	case IMC_SAD_DRAM_MOD3_MODE_45t12:
1637		rule->isr_mod_mode = IMC_SAD_MOD_MODE_45t12;
1638		break;
1639	default:
1640		sad->isad_valid |= IMC_SAD_V_BAD_MOD3;
1641		break;
1642	}
1643}
1644
1645static void
1646imc_sad_fill_rule_interleave(imc_t *imc, imc_sad_rule_t *rule, uint32_t raw)
1647{
1648	uint_t i;
1649	uint32_t mlen, mbase, skipbits, skipafter;
1650
1651	rule->isr_raw_interleave = raw;
1652
1653	/*
1654	 * Right now all architectures always have the maximum number of SAD
1655	 * interleave targets.
1656	 */
1657	rule->isr_ntargets = IMC_MAX_SAD_INTERLEAVE;
1658
	/*
	 * Sandy Bridge has a gap in the interleave list because it uses a
	 * smaller field length for each target.
	 */
1663	if (imc->imc_gen > IMC_GEN_SANDY) {
1664		mlen = IMC_SAD_ILEAVE_IVB_SKX_LEN;
1665		mbase = IMC_SAD_ILEAVE_IVB_SKX_MASK;
1666		skipbits = skipafter = 0;
1667	} else {
1668		mlen = IMC_SAD_ILEAVE_SNB_LEN;
1669		mbase = IMC_SAD_ILEAVE_SNB_MASK;
1670		skipbits = 2;
1671		skipafter = 4;
1672	}
1673
1674	for (i = 0; i < rule->isr_ntargets; i++) {
1675		uint32_t mask, shift;
1676
1677		shift = i * mlen;
1678		if (i >= skipafter)
1679			shift += skipbits;
1680		mask = mbase << shift;
1681		rule->isr_targets[i] = (raw & mask) >> shift;
1682	}
1683}
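
/*
 * The following sketch restates the Sandy Bridge extraction above with
 * hypothetical field widths (2-bit targets with a 2-bit gap after the fourth
 * entry); the authoritative widths come from the IMC_SAD_ILEAVE_SNB_* macros.
 * It is compiled out by default and exists purely as documentation.
 */
#ifdef	IMC_DECODER_EXAMPLE
static uint32_t
imc_example_snb_ileave_target(uint32_t raw, uint_t entry)
{
	uint32_t shift = entry * 2;	/* assumed 2-bit target fields */

	if (entry >= 4)			/* assumed 2-bit gap after entry 3 */
		shift += 2;
	return ((raw >> shift) & 0x3);
}
#endif	/* IMC_DECODER_EXAMPLE */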
1684
1685static void
1686imc_sad_read_dram_rules(imc_t *imc, imc_sad_t *sad)
1687{
1688	uint_t i;
1689	off_t off;
1690
1691	sad->isad_nrules = imc->imc_gen_data->igd_sad_ndram_rules;
1692	for (i = 0, off = imc->imc_gen_data->igd_sad_dram_offset;
1693	    i < sad->isad_nrules; i++, off += sizeof (uint64_t)) {
1694		uint32_t dram, interleave;
1695		imc_sad_rule_t *rule = &sad->isad_rules[i];
1696
1697		dram = pci_config_get32(sad->isad_dram->istub_cfgspace, off);
1698		interleave = pci_config_get32(sad->isad_dram->istub_cfgspace,
1699		    off + 4);
1700
1701		if (dram == PCI_EINVAL32 || interleave == PCI_EINVAL32) {
1702			sad->isad_valid |= IMC_SAD_V_BAD_PCI_READ;
1703			return;
1704		}
1705
1706		imc_sad_fill_rule(imc, sad, rule, dram);
1707		imc_sad_fill_rule_interleave(imc, rule, interleave);
1708	}
1709}
1710
1711static void
1712imc_sad_decode_mcroute(imc_t *imc, imc_sad_t *sad)
1713{
1714	uint_t i;
1715	imc_sad_mcroute_table_t *mc = &sad->isad_mcroute;
1716
1717	if (imc->imc_gen < IMC_GEN_SKYLAKE)
1718		return;
1719	if (sad->isad_valid != 0)
1720		return;
1721
1722	mc->ismc_nroutes = IMC_MAX_SAD_MCROUTES;
1723	for (i = 0; i < IMC_MAX_SAD_MCROUTES; i++) {
1724		uint_t chanoff, ringoff;
1725
1726		ringoff = i * IMC_MC_ROUTE_RING_BITS;
1727		chanoff = i * IMC_MC_ROUTE_CHAN_BITS + IMC_MC_ROUTE_CHAN_OFFSET;
1728
1729		mc->ismc_mcroutes[i].ismce_imc = (mc->ismc_raw_mcroute >>
1730		    ringoff) & IMC_MC_ROUTE_RING_MASK;
1731		mc->ismc_mcroutes[i].ismce_pchannel = (mc->ismc_raw_mcroute >>
1732		    chanoff) & IMC_MC_ROUTE_CHAN_MASK;
1733	}
1734}
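
/*
 * To illustrate the unpacking above: assuming (hypothetically) that
 * IMC_MC_ROUTE_RING_BITS and IMC_MC_ROUTE_CHAN_BITS are both 3, route entry 2
 * takes its memory controller from the 3-bit field at bit 6 of the raw route
 * register, and its physical channel from the 3-bit field at bit
 * 6 + IMC_MC_ROUTE_CHAN_OFFSET.
 */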
1735
1736/*
1737 * Initialize the SAD. To do this we have to do a few different things:
1738 *
1739 * 1. Determine where the top of low and high memory is.
 * 2. Read and decode all of the rules for the SAD.
 * 3. On systems with a route table, decode the raw routes.
 *
 * At this point in time, we treat TOLM and TOHM as per-socket constructs, even
 * though they are really global; this just makes life a bit simpler.
1745 */
1746static void
1747imc_decoder_init_sad(imc_t *imc)
1748{
1749	uint_t i;
1750
1751	for (i = 0; i < imc->imc_nsockets; i++) {
1752		imc_sad_read_tohm(imc, &imc->imc_sockets[i].isock_sad);
1753		imc_sad_read_dram_rules(imc, &imc->imc_sockets[i].isock_sad);
1754		imc_sad_decode_mcroute(imc, &imc->imc_sockets[i].isock_sad);
1755	}
1756}
1757
1758static void
1759imc_tad_fill_rule(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *prev,
1760    imc_tad_rule_t *rule, uint32_t val)
1761{
1762	uint64_t limit;
1763
1764	limit = IMC_TAD_LIMIT(val);
1765	rule->itr_limit = (limit << IMC_TAD_LIMIT_SHIFT) +
1766	    IMC_TAD_LIMIT_EXCLUSIVE;
1767	rule->itr_raw = val;
1768
1769	switch (IMC_TAD_SOCK_WAY(val)) {
1770	case IMC_TAD_SOCK_WAY_1:
1771		rule->itr_sock_way = 1;
1772		break;
1773	case IMC_TAD_SOCK_WAY_2:
1774		rule->itr_sock_way = 2;
1775		break;
1776	case IMC_TAD_SOCK_WAY_4:
1777		rule->itr_sock_way = 4;
1778		break;
1779	case IMC_TAD_SOCK_WAY_8:
1780		rule->itr_sock_way = 8;
1781		break;
1782	}
1783
1784	rule->itr_chan_way = IMC_TAD_CHAN_WAY(val) + 1;
1785	rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1786	rule->itr_chan_gran = IMC_TAD_GRAN_64B;
1787
1788	/*
1789	 * Starting with Skylake the targets that are used are no longer part of
1790	 * the TAD. Those come from the IMC route table.
1791	 */
1792	if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1793		rule->itr_ntargets = 0;
1794		return;
1795	}
1796
1797	rule->itr_ntargets = IMC_TAD_SNB_BRD_NTARGETS;
1798	rule->itr_targets[0] = IMC_TAD_TARG0(val);
1799	rule->itr_targets[1] = IMC_TAD_TARG1(val);
1800	rule->itr_targets[2] = IMC_TAD_TARG2(val);
1801	rule->itr_targets[3] = IMC_TAD_TARG3(val);
1802
1803	if (prev == NULL) {
1804		rule->itr_base = 0;
1805	} else {
1806		rule->itr_base = prev->itr_limit + 1;
1807	}
1808}
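
/*
 * Note the asymmetry in the wayness encodings above: the socket wayness is a
 * discrete 1/2/4/8 selection, while the channel wayness is simply the raw
 * field plus one. For example, a rule whose IMC_TAD_CHAN_WAY field reads 2
 * decodes to itr_chan_way == 3, a 3-way channel interleave.
 */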
1809
1810static void
1811imc_tad_fill_skx(imc_t *imc, imc_tad_t *tad, imc_tad_rule_t *rule,
1812    uint32_t val)
1813{
1814	uint64_t base;
1815
1816	rule->itr_raw_gran = val;
1817	base = IMC_TAD_BASE_BASE(val);
1818	rule->itr_base = base << IMC_TAD_BASE_SHIFT;
1819
	switch (IMC_TAD_BASE_CHAN_GRAN(val)) {
	case IMC_TAD_BASE_CHAN_GRAN_64B:
		rule->itr_chan_gran = IMC_TAD_GRAN_64B;
		break;
	case IMC_TAD_BASE_CHAN_GRAN_256B:
		rule->itr_chan_gran = IMC_TAD_GRAN_256B;
		break;
	case IMC_TAD_BASE_CHAN_GRAN_4KB:
		rule->itr_chan_gran = IMC_TAD_GRAN_4KB;
		break;
	default:
		tad->itad_valid |= IMC_TAD_V_BAD_CHAN_GRAN;
		return;
	}
1834
1835	switch (IMC_TAD_BASE_SOCK_GRAN(val)) {
1836	case IMC_TAD_BASE_SOCK_GRAN_64B:
1837		rule->itr_sock_gran = IMC_TAD_GRAN_64B;
1838		break;
1839	case IMC_TAD_BASE_SOCK_GRAN_256B:
1840		rule->itr_sock_gran = IMC_TAD_GRAN_256B;
1841		break;
1842	case IMC_TAD_BASE_SOCK_GRAN_4KB:
1843		rule->itr_sock_gran = IMC_TAD_GRAN_4KB;
1844		break;
1845	case IMC_TAD_BASE_SOCK_GRAN_1GB:
1846		rule->itr_sock_gran = IMC_TAD_GRAN_1GB;
1847		break;
1848	}
1849}
1850
1851/*
/*
 * When mirroring is enabled, at least on Sandy Bridge through Broadwell, the
 * documentation suggests that the channel wayness already takes this into
 * account and therefore should be reflected accurately here.
 */
1856static void
1857imc_tad_read_rules(imc_t *imc, imc_tad_t *tad)
1858{
1859	uint_t i;
1860	off_t baseoff;
1861	imc_tad_rule_t *prev;
1862
1863	tad->itad_nrules = imc->imc_gen_data->igd_tad_nrules;
1864	for (i = 0, baseoff = imc->imc_gen_data->igd_tad_rule_offset,
1865	    prev = NULL; i < tad->itad_nrules;
1866	    i++, baseoff += sizeof (uint32_t)) {
1867		uint32_t val;
1868		off_t off;
1869		imc_tad_rule_t *rule = &tad->itad_rules[i];
1870
1871		/*
		 * On Skylake, the TAD rules are split between two registers.
		 * The latter set mimics what exists on pre-Skylake.
1874		 */
1875		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1876			off = baseoff + IMC_SKX_WAYNESS_OFFSET;
1877		} else {
1878			off = baseoff;
1879		}
1880
1881		val = pci_config_get32(tad->itad_stub->istub_cfgspace, off);
1882		if (val == PCI_EINVAL32) {
1883			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1884			return;
1885		}
1886
1887		imc_tad_fill_rule(imc, tad, prev, rule, val);
1888		prev = rule;
1889		if (imc->imc_gen < IMC_GEN_SKYLAKE)
1890			continue;
1891
1892		val = pci_config_get32(tad->itad_stub->istub_cfgspace, baseoff);
1893		if (val == PCI_EINVAL32) {
1894			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1895			return;
1896		}
1897
1898		imc_tad_fill_skx(imc, tad, rule, val);
1899	}
1900}
1901
1902/*
1903 * Check for features which change how decoding works.
1904 */
1905static void
1906imc_tad_read_features(imc_t *imc, imc_tad_t *tad, imc_mc_t *mc)
1907{
1908	uint32_t val;
1909
1910	/*
1911	 * Determine whether or not lockstep mode or mirroring are enabled.
1912	 * These change the behavior of how we're supposed to interpret channel
1913	 * wayness. Lockstep is available in the TAD's features. Mirroring is
	 * available on the IMC's features. Neither is present on Skylake+,
	 * where mirroring is a property of the SAD rule and there is no
	 * lockstep.
1917	 */
1918	switch (imc->imc_gen) {
1919	case IMC_GEN_SANDY:
1920	case IMC_GEN_IVY:
1921	case IMC_GEN_HASWELL:
1922	case IMC_GEN_BROADWELL:
1923		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1924		    imc->imc_gen_data->igd_tad_sysdef);
1925		if (val == PCI_EINVAL32) {
1926			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1927			return;
1928		}
1929		if (IMC_TAD_SYSDEF_LOCKSTEP(val)) {
1930			tad->itad_flags |= IMC_TAD_FLAG_LOCKSTEP;
1931		}
1932
1933		val = pci_config_get32(mc->icn_main1->istub_cfgspace,
1934		    imc->imc_gen_data->igd_mc_mirror);
1935		if (val == PCI_EINVAL32) {
1936			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1937			return;
1938		}
1939		if (IMC_MC_MIRROR_SNB_BRD(val)) {
1940			tad->itad_flags |= IMC_TAD_FLAG_MIRROR;
1941		}
1942		break;
1943	default:
1944		break;
1945	}
1946
1947	/*
1948	 * Now, go through and look at values that'll change how we do the
	 * channel index and address calculation. These are only present
1950	 * between Ivy Bridge and Broadwell. They don't exist on Sandy Bridge
1951	 * and they don't exist on Skylake+.
1952	 */
1953	switch (imc->imc_gen) {
1954	case IMC_GEN_IVY:
1955	case IMC_GEN_HASWELL:
1956	case IMC_GEN_BROADWELL:
1957		val = pci_config_get32(tad->itad_stub->istub_cfgspace,
1958		    imc->imc_gen_data->igd_tad_sysdef2);
1959		if (val == PCI_EINVAL32) {
1960			tad->itad_valid |= IMC_TAD_V_BAD_PCI_READ;
1961			return;
1962		}
		if (IMC_TAD_SYSDEF2_SHIFTUP(val)) {
			tad->itad_flags |= IMC_TAD_FLAG_CHANSHIFT;
		}
		if (IMC_TAD_SYSDEF2_CHANHASH(val)) {
			tad->itad_flags |= IMC_TAD_FLAG_CHANHASH;
		}
1969		break;
1970	default:
1971		break;
1972	}
1973}
1974
1975/*
1976 * Read the IMC channel interleave records
1977 */
1978static void
1979imc_tad_read_interleave(imc_t *imc, imc_channel_t *chan)
1980{
1981	uint_t i;
1982	off_t off;
1983
1984	chan->ich_ntad_offsets = imc->imc_gen_data->igd_tad_nrules;
1985	for (i = 0, off = imc->imc_gen_data->igd_tad_chan_offset;
1986	    i < chan->ich_ntad_offsets; i++, off += sizeof (uint32_t)) {
1987		uint32_t val;
1988		uint64_t offset;
1989
1990		val = pci_config_get32(chan->ich_desc->istub_cfgspace,
1991		    off);
1992		if (val == PCI_EINVAL32) {
1993			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
1994			return;
1995		}
1996
1997		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
1998			offset = IMC_TADCHAN_OFFSET_SKX(val);
1999		} else {
2000			offset = IMC_TADCHAN_OFFSET_SNB_BRD(val);
2001		}
2002
2003		chan->ich_tad_offsets[i] = offset << IMC_TADCHAN_OFFSET_SHIFT;
2004		chan->ich_tad_offsets_raw[i] = val;
2005	}
2006}
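
/*
 * As an example of the scaling above: assuming IMC_TADCHAN_OFFSET_SHIFT is
 * 26, a raw offset field of 0x2 is recorded in ich_tad_offsets[] as
 * 0x2 << 26, or 128 MiB, which the decode path later applies when it
 * translates a system address into a channel address.
 */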
2007
2008static void
2009imc_decoder_init_tad(imc_t *imc)
2010{
2011	uint_t i;
2012
2013	for (i = 0; i < imc->imc_nsockets; i++) {
2014		uint_t j;
2015
2016		for (j = 0; j < imc->imc_sockets[i].isock_ntad; j++) {
2017			imc_tad_read_features(imc,
2018			    &imc->imc_sockets[i].isock_tad[j],
2019			    &imc->imc_sockets[i].isock_imcs[j]);
2020			imc_tad_read_rules(imc,
2021			    &imc->imc_sockets[i].isock_tad[j]);
2022		}
2023	}
2024
2025	for (i = 0; i < imc->imc_nsockets; i++) {
2026		uint_t j;
2027		imc_socket_t *sock = &imc->imc_sockets[i];
2028
2029		for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2030			uint_t k;
2031			imc_mc_t *mc = &sock->isock_imcs[j];
2032
2033			for (k = 0; k < mc->icn_nchannels; k++) {
2034				imc_channel_t *chan = &mc->icn_channels[k];
2035				imc_tad_read_interleave(imc, chan);
2036			}
2037		}
2038	}
2039}
2040
2041static void
2042imc_rir_read_ileave_offsets(imc_t *imc, imc_channel_t *chan,
2043    imc_rank_ileave_t *rank, uint_t rirno, boolean_t contig)
2044{
2045	uint_t i;
2046	off_t off, incr;
2047
2048	/*
2049	 * Rank interleave offset registers come in two forms. Either they are
2050	 * contiguous for a given wayness, meaning that all of the entries for
2051	 * wayness zero are contiguous, or they are sparse, meaning that there
2052	 * is a bank for entry zero for all wayness, then entry one for all
2053	 * wayness, etc.
2054	 */
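	/*
	 * To make that concrete: with (hypothetically) four interleave
	 * entries per rule and 4-byte registers, the entries for rirno == 1
	 * live at base + 16, + 20, + 24, and + 28 in the contiguous layout,
	 * but at base + 4, + 20, + 36, and + 52 in the sparse layout.
	 */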
2055	if (contig) {
2056		off = imc->imc_gen_data->igd_rir_ileave_offset +
2057		    (rirno * imc->imc_gen_data->igd_rir_nileaves *
2058		    sizeof (uint32_t));
2059		incr = sizeof (uint32_t);
2060	} else {
2061		off = imc->imc_gen_data->igd_rir_ileave_offset +
2062		    (rirno * sizeof (uint32_t));
2063		incr = imc->imc_gen_data->igd_rir_nileaves * sizeof (uint32_t);
2064	}
2065	for (i = 0; i < rank->irle_nentries; i++, off += incr) {
2066		uint32_t val;
2067		uint64_t offset;
2068		imc_rank_ileave_entry_t *ent = &rank->irle_entries[i];
2069
2070		val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
2071		if (val == PCI_EINVAL32) {
2072			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
2073			return;
2074		}
2075
2076		switch (imc->imc_gen) {
2077		case IMC_GEN_BROADWELL:
2078			ent->irle_target = IMC_RIR_OFFSET_TARGET_BRD(val);
2079			break;
2080		default:
2081			ent->irle_target = IMC_RIR_OFFSET_TARGET(val);
2082			break;
2083		}
2084		if (imc->imc_gen >= IMC_GEN_HASWELL) {
2085			offset = IMC_RIR_OFFSET_OFFSET_HAS_SKX(val);
2086		} else {
2087			offset = IMC_RIR_OFFSET_OFFSET_SNB_IVB(val);
2088		}
2089		ent->irle_offset = offset << IMC_RIR_OFFSET_SHIFT;
2090	}
2091}
2092
2093static void
2094imc_rir_read_wayness(imc_t *imc, imc_channel_t *chan)
2095{
2096	uint_t i;
2097	off_t off;
2098
2099	chan->ich_nrankileaves = imc->imc_gen_data->igd_rir_nways;
2100	for (i = 0, off = imc->imc_gen_data->igd_rir_way_offset;
2101	    i < chan->ich_nrankileaves; i++, off += sizeof (uint32_t)) {
2102		uint32_t val;
2103		uint64_t lim;
2104		imc_rank_ileave_t *ent = &chan->ich_rankileaves[i];
2105
2106		val = pci_config_get32(chan->ich_desc->istub_cfgspace, off);
2107		if (val == PCI_EINVAL32) {
2108			chan->ich_valid |= IMC_CHANNEL_V_BAD_PCI_READ;
2109			return;
2110		}
2111
2112		ent->irle_raw = val;
2113		ent->irle_enabled = IMC_RIR_WAYNESS_ENABLED(val) != 0;
2114		ent->irle_nways = 1 << IMC_RIR_WAYNESS_WAY(val);
2115		ent->irle_nwaysbits = IMC_RIR_WAYNESS_WAY(val);
2116		if (imc->imc_gen >= IMC_GEN_HASWELL) {
2117			lim = IMC_RIR_LIMIT_HAS_SKX(val);
2118		} else {
2119			lim = IMC_RIR_LIMIT_SNB_IVB(val);
2120		}
2121
2122		ent->irle_limit = (lim << IMC_RIR_LIMIT_SHIFT) +
2123		    IMC_RIR_LIMIT_EXCLUSIVE;
2124
2125		ent->irle_nentries = imc->imc_gen_data->igd_rir_nileaves;
2126		if (imc->imc_gen >= IMC_GEN_SKYLAKE) {
2127			imc_rir_read_ileave_offsets(imc, chan, ent, i, B_FALSE);
2128		} else {
2129			imc_rir_read_ileave_offsets(imc, chan, ent, i, B_TRUE);
2130		}
2131	}
2132}
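
/*
 * For example, a rank interleave rule whose wayness field reads 2 above
 * decodes to irle_nways == 4 and irle_nwaysbits == 2, i.e. rank addresses
 * are spread across four targets using two address bits.
 */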
2133
2134static void
2135imc_decoder_init_rir(imc_t *imc)
2136{
2137	uint_t i;
2138
2139	for (i = 0; i < imc->imc_nsockets; i++) {
2140		uint_t j;
2141		imc_socket_t *sock = &imc->imc_sockets[i];
2142
2143		for (j = 0; j < imc->imc_sockets[i].isock_nimc; j++) {
2144			uint_t k;
2145			imc_mc_t *mc = &sock->isock_imcs[j];
2146
2147			for (k = 0; k < mc->icn_nchannels; k++) {
2148				imc_channel_t *chan = &mc->icn_channels[k];
2149				imc_rir_read_wayness(imc, chan);
2150			}
2151		}
2152	}
2153}
2154
2155static cmi_errno_t
2156imc_mc_patounum(void *arg, uint64_t pa, uint8_t valid_hi, uint8_t valid_lo,
2157    uint32_t synd, int syndtype, mc_unum_t *unump)
2158{
2159	imc_t *imc = arg;
2160	uint_t i;
2161	imc_decode_state_t dec;
2162
2163	bzero(&dec, sizeof (dec));
2164	if (!imc_decode_pa(imc, pa, &dec)) {
2165		switch (dec.ids_fail) {
2166		case IMC_DECODE_F_LEGACY_RANGE:
2167		case IMC_DECODE_F_OUTSIDE_DRAM:
2168			return (CMIERR_MC_NOTDIMMADDR);
2169		default:
2170			return (CMIERR_MC_BADSTATE);
2171		}
2172	}
2173
2174	unump->unum_board = 0;
2175	/*
2176	 * The chip id needs to be in the order that the OS expects it, which
2177	 * may not be our order.
2178	 */
2179	for (i = 0; i < imc->imc_nsockets; i++) {
2180		if (imc->imc_spointers[i] == dec.ids_socket)
2181			break;
2182	}
2183	if (i == imc->imc_nsockets) {
2184		return (CMIERR_MC_BADSTATE);
2185	}
2186	unump->unum_chip = i;
2187	unump->unum_mc = dec.ids_tadid;
2188	unump->unum_chan = dec.ids_channelid;
2189	unump->unum_cs = dec.ids_dimmid;
2190	unump->unum_rank = dec.ids_rankid;
2191	unump->unum_offset = dec.ids_rankaddr;
2192	for (i = 0; i < MC_UNUM_NDIMM; i++) {
2193		unump->unum_dimms[i] = MC_INVALNUM;
2194	}
2195
2196	return (CMI_SUCCESS);
2197}
2198
2199static cmi_errno_t
2200imc_mc_unumtopa(void *arg, mc_unum_t *unum, nvlist_t *nvl, uint64_t *pa)
2201{
2202	return (CMIERR_UNKNOWN);
2203}
2204
2205static const cmi_mc_ops_t imc_mc_ops = {
2206	.cmi_mc_patounum = imc_mc_patounum,
2207	.cmi_mc_unumtopa = imc_mc_unumtopa
2208};
2209
2210/*
2211 * This is where we really finish attaching and become open for business. This
2212 * occurs once we have all of the expected stubs attached. Here's where all of
2213 * the real fun begins.
2214 */
2215static void
2216imc_attach_complete(void *arg)
2217{
2218	imc_t *imc = arg;
2219	cmi_errno_t err;
2220
2221	imc_set_gen_data(imc);
2222
2223	/*
2224	 * On SKX and newer, we can fail to map PCI buses at this point due to
2225	 * bad PCIe reads.
2226	 */
2227	if (!imc_map_stubs(imc)) {
2228		goto done;
2229	}
2230
2231	if (!imc_validate_stubs(imc)) {
2232		imc->imc_flags |= IMC_F_VALIDATE_FAILED;
2233		goto done;
2234	}
2235
2236	imc_fixup_stubs(imc);
2237	imc_map_sockets(imc);
2238
2239	if (!imc_create_minors(imc)) {
2240		goto done;
2241	}
2242
2243	imc_fill_data(imc);
2244	imc_nvl_create(imc);
2245
2246	/*
2247	 * Gather additional information that we need so that we can properly
2248	 * initialize the memory decoder and encoder.
2249	 */
2250	imc_decoder_init_sad(imc);
2251	imc_decoder_init_tad(imc);
2252	imc_decoder_init_rir(imc);
2253
	/*
	 * Register our decoder functions with a global CMI handle. This may
	 * fail; if so, complain loudly, but stay attached so that the rest of
	 * the data we provide remains useful.
	 */
2259	if ((err = cmi_mc_register_global(&imc_mc_ops, imc)) != CMI_SUCCESS) {
2260		imc->imc_flags |= IMC_F_MCREG_FAILED;
2261		dev_err(imc->imc_dip, CE_WARN, "failed to register memory "
2262		    "decoding operations: 0x%x", err);
2263	}
2264
2265done:
2266	mutex_enter(&imc->imc_lock);
	imc->imc_flags &= ~IMC_F_ATTACH_DISPATCHED;
2268	imc->imc_flags |= IMC_F_ATTACH_COMPLETE;
2269	mutex_exit(&imc->imc_lock);
2270}
2271
2272static int
2273imc_stub_comparator(const void *l, const void *r)
2274{
2275	const imc_stub_t *sl = l, *sr = r;
2276	if (sl->istub_bus > sr->istub_bus)
2277		return (1);
2278	if (sl->istub_bus < sr->istub_bus)
2279		return (-1);
2280	if (sl->istub_dev > sr->istub_dev)
2281		return (1);
2282	if (sl->istub_dev < sr->istub_dev)
2283		return (-1);
2284	if (sl->istub_func > sr->istub_func)
2285		return (1);
2286	if (sl->istub_func < sr->istub_func)
2287		return (-1);
2288	return (0);
2289}
2290
2291static int
2292imc_stub_scan_cb(dev_info_t *dip, void *arg)
2293{
2294	int vid, did;
2295	const imc_stub_table_t *table;
2296	imc_t *imc = arg;
2297	int *regs;
2298	uint_t i, nregs;
2299
2300	if (dip == ddi_root_node()) {
2301		return (DDI_WALK_CONTINUE);
2302	}
2303
2304	/*
2305	 * Get the dev info name. PCI devices will always be children of PCI
2306	 * devices today on x86. If we reach something that has a device name
	 * that's not PCI, then we can prune its children.
2308	 */
2309	if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2310		return (DDI_WALK_PRUNECHILD);
2311	}
2312
2313	/*
2314	 * Get the device and vendor ID and see if this is something the imc
2315	 * knows about or cares about.
2316	 */
2317	vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2318	    "vendor-id", PCI_EINVAL16);
2319	did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2320	    "device-id", PCI_EINVAL16);
2321	if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2322		return (DDI_WALK_CONTINUE);
2323	}
2324
2325	if (vid != IMC_PCI_VENDOR_INTC) {
2326		return (DDI_WALK_PRUNECHILD);
2327	}
2328
2329	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2330	    "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2331		return (DDI_WALK_CONTINUE);
2332	}
2333
2334	if (nregs == 0) {
2335		ddi_prop_free(regs);
2336		return (DDI_WALK_CONTINUE);
	}

	table = NULL;
2341	for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2342		if (imc_stub_table[i].imcs_devid == did &&
2343		    imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2344		    imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2345			table = &imc_stub_table[i];
2346			break;
2347		}
2348	}
2349	ddi_prop_free(regs);
2350
2351	/*
2352	 * Not a match, not interesting.
2353	 */
2354	if (table == NULL) {
2355		return (DDI_WALK_CONTINUE);
2356	}
2357
2358	mutex_enter(&imc->imc_lock);
2359	imc->imc_nscanned++;
2360	mutex_exit(&imc->imc_lock);
2361
2362	return (DDI_WALK_CONTINUE);
2363}
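
/*
 * A note on the "reg" handling above: regs[0] is the first entry of the PCI
 * "reg" property, whose high word encodes the device's bus, device, and
 * function. The PCI_REG_DEV_G and PCI_REG_FUNC_G macros extract the latter
 * two, which is what lets us match stubs against fixed device/function
 * numbers in imc_stub_table.
 */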
2364
2365/*
 * From here, walk the device tree and see how many of the devices that we
 * know about are present.
2367 */
2368static void
2369imc_stub_scan(void *arg)
2370{
2371	imc_t *imc = arg;
2372	boolean_t dispatch = B_FALSE;
2373
2374	/*
2375	 * Zero out the scan results in case we've been detached and reattached.
2376	 */
2377	mutex_enter(&imc->imc_lock);
2378	imc->imc_nscanned = 0;
2379	mutex_exit(&imc->imc_lock);
2380
2381	ddi_walk_devs(ddi_root_node(), imc_stub_scan_cb, imc);
2382
2383	mutex_enter(&imc->imc_lock);
2384	imc->imc_flags |= IMC_F_SCAN_COMPLETE;
2385	imc->imc_flags &= ~IMC_F_SCAN_DISPATCHED;
2386
2387	/*
2388	 * If the scan found no nodes, then that means that we're on a hardware
2389	 * platform that we don't support. Therefore, there's no reason to do
2390	 * anything here.
2391	 */
2392	if (imc->imc_nscanned == 0) {
2393		imc->imc_flags |= IMC_F_UNSUP_PLATFORM;
2394		mutex_exit(&imc->imc_lock);
2395		return;
2396	}
2397
2398	if (avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2399		imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2400		dispatch = B_TRUE;
2401	}
2402
2403	mutex_exit(&imc->imc_lock);
2404
2405	if (dispatch) {
2406		(void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2407		    imc, DDI_SLEEP);
2408	}
2409}
2410
2411/*
2412 * By default, refuse to allow stubs to detach.
2413 */
2414int
2415imc_detach_stub(dev_info_t *dip, ddi_detach_cmd_t cmd)
2416{
2417	imc_stub_t *stub;
2418	imc_t *imc = imc_data;
2419
2420	mutex_enter(&imc->imc_lock);
2421
2422	/*
2423	 * By default, we do not allow stubs to detach. However, if the driver
2424	 * has attached to devices on a platform it doesn't recognize or
 * support, or if the override flag has been set, then allow detach to
2426	 * proceed.
2427	 */
2428	if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) == 0 &&
2429	    imc_allow_detach == 0) {
2430		mutex_exit(&imc->imc_lock);
2431		return (DDI_FAILURE);
2432	}
2433
2434	for (stub = avl_first(&imc->imc_stubs); stub != NULL;
2435	    stub = AVL_NEXT(&imc->imc_stubs, stub)) {
2436		if (stub->istub_dip == dip) {
2437			break;
2438		}
2439	}
2440
2441	/*
2442	 * A device was attached to us that we somehow don't know about. Allow
2443	 * this to proceed.
2444	 */
2445	if (stub == NULL) {
2446		mutex_exit(&imc->imc_lock);
2447		return (DDI_SUCCESS);
2448	}
2449
2450	pci_config_teardown(&stub->istub_cfgspace);
2451	avl_remove(&imc->imc_stubs, stub);
2452	kmem_free(stub, sizeof (imc_stub_t));
2453	mutex_exit(&imc->imc_lock);
2454
2455	return (DDI_SUCCESS);
2456}
2457
2458int
2459imc_attach_stub(dev_info_t *dip, ddi_attach_cmd_t cmd)
2460{
2461	imc_stub_t *stub, *lookup;
2462	int did, vid, *regs;
2463	uint_t i, nregs;
2464	const imc_stub_table_t *table;
2465	avl_index_t idx;
2466	boolean_t dispatch = B_FALSE;
2467	imc_t *imc = imc_data;
2468
2469	if (cmd != DDI_ATTACH) {
2470		return (DDI_FAILURE);
2471	}
2472
2473	/*
2474	 * We've been asked to attach a stub. First, determine if this is even a
2475	 * PCI device that we should care about. Then, append it to our global
2476	 * list and kick off the configuration task. Note that we do this
2477	 * configuration task in a taskq so that we don't interfere with the
2478	 * normal attach / detach path processing.
2479	 */
2480	if (strncmp("pci", ddi_get_name(dip), 3) != 0) {
2481		return (DDI_FAILURE);
2482	}
2483
2484	/*
2485	 * Get the device and vendor ID and see if this is something the imc
2486	 * knows about or cares about.
2487	 */
2488	vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2489	    "vendor-id", PCI_EINVAL16);
2490	did = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2491	    "device-id", PCI_EINVAL16);
2492	if (vid == PCI_EINVAL16 || did == PCI_EINVAL16) {
2493		return (DDI_FAILURE);
2494	}
2495
2496	/*
	 * Only accept Intel (INTC) parts in the imc driver.
2498	 */
2499	if (vid != IMC_PCI_VENDOR_INTC) {
2500		return (DDI_FAILURE);
2501	}
2502
2503	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2504	    "reg", &regs, &nregs) != DDI_PROP_SUCCESS) {
2505		return (DDI_FAILURE);
2506	}
2507
2508	if (nregs == 0) {
2509		ddi_prop_free(regs);
2510		return (DDI_FAILURE);
2511	}
2512
2513	/*
2514	 * Determine if this matches a known device.
2515	 */
2516	table = NULL;
2517	for (i = 0; i < ARRAY_SIZE(imc_stub_table); i++) {
2518		if (imc_stub_table[i].imcs_devid == did &&
2519		    imc_stub_table[i].imcs_pcidev == PCI_REG_DEV_G(regs[0]) &&
2520		    imc_stub_table[i].imcs_pcifunc == PCI_REG_FUNC_G(regs[0])) {
2521			table = &imc_stub_table[i];
2522			break;
2523		}
2524	}
2525
2526	if (i == ARRAY_SIZE(imc_stub_table)) {
2527		ddi_prop_free(regs);
2528		return (DDI_FAILURE);
2529	}
2530
2531	/*
2532	 * We've found something. Make sure the generation matches our current
2533	 * one. If it does, construct the entry and append it to the list.
2534	 */
2535	mutex_enter(&imc->imc_lock);
2536	if (imc->imc_gen != IMC_GEN_UNKNOWN && imc->imc_gen !=
2537	    table->imcs_gen) {
2538		mutex_exit(&imc->imc_lock);
2539		ddi_prop_free(regs);
		dev_err(dip, CE_WARN, "Encountered IMC stub device (%u/%u) "
		    "that has a different hardware generation (%u) from the "
		    "current generation (%u)", vid, did, table->imcs_gen,
		    imc->imc_gen);
2543		return (DDI_FAILURE);
2544	} else {
2545		imc->imc_gen = table->imcs_gen;
2546	}
2547	mutex_exit(&imc->imc_lock);
2548
2549	stub = kmem_zalloc(sizeof (imc_stub_t), KM_SLEEP);
2550	stub->istub_dip = dip;
2551	stub->istub_vid = vid;
2552	stub->istub_did = did;
2553	stub->istub_bus = PCI_REG_BUS_G(regs[0]);
2554	stub->istub_dev = PCI_REG_DEV_G(regs[0]);
2555	stub->istub_func = PCI_REG_FUNC_G(regs[0]);
2556	ddi_prop_free(regs);
2557	stub->istub_table = table;
2558
2559	if (pci_config_setup(dip, &stub->istub_cfgspace) != DDI_SUCCESS) {
		kmem_free(stub, sizeof (imc_stub_t));
2561		dev_err(dip, CE_WARN, "Failed to set up PCI config space "
2562		    "for IMC stub device %s (%u/%u)", ddi_node_name(dip),
2563		    vid, did);
2564		return (DDI_FAILURE);
2565	}
2566
2567	mutex_enter(&imc->imc_lock);
2568	if ((lookup = avl_find(&imc->imc_stubs, stub, &idx)) != NULL) {
2569		dev_err(dip, CE_WARN, "IMC stub %s (%u/%u) has duplicate "
2570		    "bdf %u/%u/%u with %s (%u/%u), not attaching",
2571		    ddi_node_name(imc->imc_dip), vid, did,
2572		    stub->istub_bus, stub->istub_dev, stub->istub_func,
2573		    ddi_node_name(lookup->istub_dip), lookup->istub_vid,
2574		    lookup->istub_did);
2575		mutex_exit(&imc->imc_lock);
2576		pci_config_teardown(&stub->istub_cfgspace);
		kmem_free(stub, sizeof (imc_stub_t));
2578
2579		return (DDI_FAILURE);
2580	}
2581	avl_insert(&imc->imc_stubs, stub, idx);
2582
2583	if ((imc->imc_flags & IMC_F_ALL_FLAGS) == IMC_F_SCAN_COMPLETE &&
2584	    avl_numnodes(&imc->imc_stubs) == imc->imc_nscanned) {
2585		imc->imc_flags |= IMC_F_ATTACH_DISPATCHED;
2586		dispatch = B_TRUE;
2587	}
2588	mutex_exit(&imc->imc_lock);
2589
2590	if (dispatch) {
2591		(void) ddi_taskq_dispatch(imc->imc_taskq, imc_attach_complete,
2592		    imc, DDI_SLEEP);
2593	}
2594
2595	return (DDI_SUCCESS);
2596}
2597
2598static int
2599imc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
2600{
2601	imc_t *imc = imc_data;
2602
2603	if ((flag & (FEXCL | FNDELAY)) != 0)
2604		return (EINVAL);
2605
2606	if (otyp != OTYP_CHR)
2607		return (EINVAL);
2608
2609	mutex_enter(&imc->imc_lock);
2610
2611	if ((imc->imc_flags & IMC_F_UNSUP_PLATFORM) != 0) {
2612		mutex_exit(&imc->imc_lock);
2613		return (ENOTSUP);
2614	}
2615
2616	/*
2617	 * It's possible that someone has come in during the window between when
2618	 * we've created the minor node and when we've finished doing work.
2619	 */
2620	if ((imc->imc_flags & IMC_F_ATTACH_COMPLETE) == 0) {
2621		mutex_exit(&imc->imc_lock);
2622		return (EAGAIN);
2623	}
2624
2625	/*
2626	 * It's not clear how someone would get a minor that we didn't create.
2627	 * But be paranoid and make sure.
2628	 */
2629	if (getminor(*devp) >= imc->imc_nsockets) {
2630		mutex_exit(&imc->imc_lock);
2631		return (EINVAL);
2632	}
2633
2634	/*
2635	 * Make sure this socket entry has been filled in.
2636	 */
2637	if (imc->imc_spointers[getminor(*devp)] == NULL) {
2638		mutex_exit(&imc->imc_lock);
2639		return (EINVAL);
2640	}
2641
2642	mutex_exit(&imc->imc_lock);
2643
2644	return (0);
2645}
2646
2647static void
2648imc_ioctl_decode(imc_t *imc, mc_encode_ioc_t *encode)
2649{
2650	imc_decode_state_t dec;
2651	uint_t i;
2652
2653	bzero(&dec, sizeof (dec));
2654	if (!imc_decode_pa(imc, encode->mcei_pa, &dec)) {
2655		encode->mcei_err = (uint32_t)dec.ids_fail;
2656		encode->mcei_errdata = dec.ids_fail_data;
2657		return;
2658	}
2659
2660	encode->mcei_errdata = 0;
2661	encode->mcei_err = 0;
2662	encode->mcei_board = 0;
2663	for (i = 0; i < imc->imc_nsockets; i++) {
2664		if (imc->imc_spointers[i] == dec.ids_socket)
2665			break;
2666	}
2667	encode->mcei_chip = i;
2668	encode->mcei_mc = dec.ids_tadid;
2669	encode->mcei_chan = dec.ids_channelid;
2670	encode->mcei_dimm = dec.ids_dimmid;
2671	encode->mcei_rank_addr = dec.ids_rankaddr;
2672	encode->mcei_rank = dec.ids_rankid;
2673	encode->mcei_row = UINT32_MAX;
2674	encode->mcei_column = UINT32_MAX;
2675	encode->mcei_pad = 0;
2676}
2677
2678static int
2679imc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2680    int *rvalp)
2681{
2682	int ret;
2683	minor_t m;
2684	mc_snapshot_info_t info;
2685	mc_encode_ioc_t encode;
2686	imc_t *imc = imc_data;
2687	imc_socket_t *sock;
2688
2689	mutex_enter(&imc->imc_lock);
2690	m = getminor(dev);
2691	if (m >= imc->imc_nsockets) {
2692		ret = EINVAL;
2693		goto done;
2694	}
2695	sock = imc->imc_spointers[m];
2696	if (sock == NULL) {
2697		ret = EINVAL;
2698		goto done;
2699	}
2700
2701	/*
2702	 * Note, other memory controller drivers don't check mode for reading
2703	 * data nor do they care who can read it from a credential perspective.
2704	 * As such we don't either at this time.
2705	 */
2706	switch (cmd) {
2707	case MC_IOC_SNAPSHOT_INFO:
2708		imc_nvl_pack(sock, B_FALSE);
2709		if (sock->isock_buf == NULL) {
2710			ret = EIO;
2711			break;
2712		}
2713
2714		info.mcs_size = sock->isock_buflen;
2715		info.mcs_gen = sock->isock_gen;
2716
2717		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2718			ret = EFAULT;
2719			break;
2720		}
2721
2722		ret = 0;
2723		break;
2724	case MC_IOC_SNAPSHOT:
2725		imc_nvl_pack(sock, B_FALSE);
2726		if (sock->isock_buf == NULL) {
2727			ret = EIO;
2728			break;
2729		}
2730
2731		if (ddi_copyout(sock->isock_buf, (void *)arg,
2732		    sock->isock_buflen, mode) != 0) {
2733			ret = EFAULT;
2734			break;
2735		}
2736
2737		ret = 0;
2738		break;
2739	case MC_IOC_DECODE_SNAPSHOT_INFO:
2740		imc_decoder_pack(imc);
2741		if (imc->imc_decoder_buf == NULL) {
2742			ret = EIO;
2743			break;
2744		}
2745
2746		info.mcs_size = imc->imc_decoder_len;
2747		info.mcs_gen = imc->imc_spointers[0]->isock_gen;
2748
2749		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode) != 0) {
2750			ret = EFAULT;
2751			break;
2752		}
2753
2754		ret = 0;
2755		break;
2756	case MC_IOC_DECODE_SNAPSHOT:
2757		imc_decoder_pack(imc);
2758		if (imc->imc_decoder_buf == NULL) {
2759			ret = EIO;
2760			break;
2761		}
2762
2763		if (ddi_copyout(imc->imc_decoder_buf, (void *)arg,
2764		    imc->imc_decoder_len, mode) != 0) {
2765			ret = EFAULT;
2766			break;
2767		}
2768
2769		ret = 0;
2770		break;
2771	case MC_IOC_DECODE_PA:
2772		if (crgetzoneid(credp) != GLOBAL_ZONEID ||
2773		    drv_priv(credp) != 0) {
2774			ret = EPERM;
2775			break;
2776		}
2777
		if (ddi_copyin((void *)arg, &encode, sizeof (encode),
		    mode & FKIOCTL) != 0) {
			ret = EFAULT;
			break;
		}
2783
2784		imc_ioctl_decode(imc, &encode);
2785		ret = 0;
2786
		if (ddi_copyout(&encode, (void *)arg, sizeof (encode),
		    mode & FKIOCTL) != 0) {
			ret = EFAULT;
			break;
		}
2792		break;
2793	default:
2794		ret = EINVAL;
2795		goto done;
2796	}
2797
2798done:
2799	mutex_exit(&imc->imc_lock);
2800	return (ret);
2801}
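
/*
 * A minimal sketch of how a privileged, global-zone consumer might drive
 * MC_IOC_DECODE_PA from userland. The device path is illustrative; the
 * snapshot ioctls follow the same pattern, using the *_SNAPSHOT_INFO ioctl
 * to size a buffer before fetching it:
 *
 *	int fd = open("/dev/mc/mc0", O_RDONLY);
 *	mc_encode_ioc_t enc = { 0 };
 *
 *	enc.mcei_pa = pa;
 *	if (fd >= 0 && ioctl(fd, MC_IOC_DECODE_PA, &enc) == 0 &&
 *	    enc.mcei_err == 0) {
 *		(void) printf("%u/%u/%u/%u\n", enc.mcei_chip, enc.mcei_mc,
 *		    enc.mcei_chan, enc.mcei_dimm);
 *	}
 */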
2802
2803static int
2804imc_close(dev_t dev, int flag, int otyp, cred_t *credp)
2805{
2806	return (0);
2807}
2808
2809static int
2810imc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2811{
2812	if (cmd != DDI_ATTACH) {
2813		return (DDI_FAILURE);
2814	}
2815
2816	if (imc_data == NULL || imc_data->imc_dip != NULL) {
2817		return (DDI_FAILURE);
2818	}
2819
2820	mutex_enter(&imc_data->imc_lock);
2821	if ((imc_data->imc_taskq = ddi_taskq_create(dip, "imc", 1,
2822	    TASKQ_DEFAULTPRI, 0)) == NULL) {
2823		mutex_exit(&imc_data->imc_lock);
2824		return (DDI_FAILURE);
2825	}
2826
2827	imc_data->imc_dip = dip;
2828	imc_data->imc_flags |= IMC_F_SCAN_DISPATCHED;
2829	mutex_exit(&imc_data->imc_lock);
2830
2831	(void) ddi_taskq_dispatch(imc_data->imc_taskq, imc_stub_scan, imc_data,
2832	    DDI_SLEEP);
2833
2834	return (DDI_SUCCESS);
2835}
2836
2837/*
2838 * We only export a single instance.
2839 */
2840static int
2841imc_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
2842{
2843	/*
2844	 * getinfo(9E) shouldn't be called if we're not attached. But be
2845	 * paranoid.
2846	 */
2847	if (imc_data == NULL || imc_data->imc_dip == NULL) {
2848		return (DDI_FAILURE);
2849	}
2850
2851	switch (infocmd) {
2852	case DDI_INFO_DEVT2DEVINFO:
2853		*resultp = imc_data->imc_dip;
2854		break;
2855	case DDI_INFO_DEVT2INSTANCE:
2856		*resultp = (void *)0;
2857		break;
2858	default:
2859		return (DDI_FAILURE);
2860	}
2861
2862	return (DDI_SUCCESS);
2863}
2864
2865static int
2866imc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2867{
2868	if (cmd != DDI_DETACH) {
2869		return (DDI_FAILURE);
2870	}
2871
	if (imc_data == NULL || imc_data->imc_dip == NULL) {
2873		return (DDI_FAILURE);
2874	}
2875
2876	mutex_enter(&imc_data->imc_lock);
2877
2878	/*
2879	 * While a scan or attach is outstanding, don't allow us to detach.
2880	 */
2881	if ((imc_data->imc_flags &
2882	    (IMC_F_SCAN_DISPATCHED | IMC_F_ATTACH_DISPATCHED)) != 0) {
2883		mutex_exit(&imc_data->imc_lock);
2884		return (DDI_FAILURE);
2885	}
2886
2887	/*
2888	 * Because the stub driver depends on the imc driver, we shouldn't be
2889	 * able to have any entries in this list when we detach. However, we
2890	 * check just to make sure.
2891	 */
2892	if (!avl_is_empty(&imc_data->imc_stubs)) {
2893		mutex_exit(&imc_data->imc_lock);
2894		return (DDI_FAILURE);
2895	}
2896
2897	nvlist_free(imc_data->imc_decoder_dump);
2898	imc_data->imc_decoder_dump = NULL;
2899	if (imc_data->imc_decoder_buf != NULL) {
2900		kmem_free(imc_data->imc_decoder_buf, imc_data->imc_decoder_len);
2901		imc_data->imc_decoder_buf = NULL;
2902		imc_data->imc_decoder_len = 0;
2903	}
2904
2905	ddi_remove_minor_node(imc_data->imc_dip, NULL);
2906	imc_data->imc_dip = NULL;
2907	mutex_exit(&imc_data->imc_lock);
2908
2909	ddi_taskq_wait(imc_data->imc_taskq);
2910	ddi_taskq_destroy(imc_data->imc_taskq);
2911	imc_data->imc_taskq = NULL;
2912
2913	return (DDI_SUCCESS);
2914}
2915
2916static void
2917imc_free(void)
2918{
2919	if (imc_data == NULL) {
2920		return;
2921	}
2922
2923	VERIFY(avl_is_empty(&imc_data->imc_stubs));
2924	avl_destroy(&imc_data->imc_stubs);
2925	mutex_destroy(&imc_data->imc_lock);
2926	kmem_free(imc_data, sizeof (imc_t));
2927	imc_data = NULL;
2928}
2929
2930static void
2931imc_alloc(void)
2932{
2933	imc_data = kmem_zalloc(sizeof (imc_t), KM_SLEEP);
2934
2935	mutex_init(&imc_data->imc_lock, NULL, MUTEX_DRIVER, NULL);
2936	avl_create(&imc_data->imc_stubs, imc_stub_comparator,
2937	    sizeof (imc_stub_t), offsetof(imc_stub_t, istub_link));
2938}
2939
2940static struct cb_ops imc_cb_ops = {
2941	.cb_open = imc_open,
2942	.cb_close = imc_close,
2943	.cb_strategy = nodev,
2944	.cb_print = nodev,
2945	.cb_dump = nodev,
2946	.cb_read = nodev,
2947	.cb_write = nodev,
2948	.cb_ioctl = imc_ioctl,
2949	.cb_devmap = nodev,
2950	.cb_mmap = nodev,
2951	.cb_segmap = nodev,
2952	.cb_chpoll = nochpoll,
2953	.cb_prop_op = ddi_prop_op,
2954	.cb_flag = D_MP,
2955	.cb_rev = CB_REV,
2956	.cb_aread = nodev,
2957	.cb_awrite = nodev
2958};
2959
2960static struct dev_ops imc_dev_ops = {
2961	.devo_rev = DEVO_REV,
2962	.devo_refcnt = 0,
2963	.devo_getinfo = imc_getinfo,
2964	.devo_identify = nulldev,
2965	.devo_probe = nulldev,
2966	.devo_attach = imc_attach,
2967	.devo_detach = imc_detach,
2968	.devo_reset = nodev,
2969	.devo_cb_ops = &imc_cb_ops,
2970	.devo_quiesce = ddi_quiesce_not_needed
2971};
2972
2973static struct modldrv imc_modldrv = {
2974	.drv_modops = &mod_driverops,
2975	.drv_linkinfo = "Intel Integrated Memory Controller Driver",
2976	.drv_dev_ops = &imc_dev_ops
2977};
2978
2979static struct modlinkage imc_modlinkage = {
2980	.ml_rev = MODREV_1,
2981	.ml_linkage = { &imc_modldrv, NULL }
2982};
2983
2984int
2985_init(void)
2986{
2987	int ret;
2988
2989	if ((ret = mod_install(&imc_modlinkage)) == 0) {
2990		imc_alloc();
2991	}
2992
2993	return (ret);
2994}
2995
2996int
2997_info(struct modinfo *modinfop)
2998{
2999	return (mod_info(&imc_modlinkage, modinfop));
3000}
3001
3002int
3003_fini(void)
3004{
3005	int ret;
3006
3007	if ((ret = mod_remove(&imc_modlinkage)) == 0) {
3008		imc_free();
3009	}
3010	return (ret);
3011}
3012