/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019, Joyent, Inc.
 */

/*
 * AMD Family 17 Northbridge and Data Fabric Driver
 *
 * This driver attaches to the AMD Family 17h northbridge and data fabric bus.
 * Each Zeppelin die ('processor node' in cpuid.c parlance) has its own
 * northbridge and access to the data fabric bus. The northbridge and data
 * fabric both provide access to various features such as:
 *
 *  - The System Management Network (SMN)
 *  - Data Fabric via Fabric Indirect Config Access (FICAA)
 *
 * These are required to access things such as temperature sensors or memory
 * controller configuration registers.
 *
 * In AMD Family 17h systems, the 'northbridge' is an ASIC that is part of the
 * package that contains many I/O capabilities related to things like PCI
 * express, etc. The 'data fabric' is the means by which different components
 * both inside the socket and multiple sockets are connected together. Both the
 * northbridge and the data fabric have dedicated PCI devices which the
 * operating system can use to interact with them.
 *
 * ------------------------
 * Mapping Devices Together
 * ------------------------
 *
 * The operating system needs to expose things like temperature sensors and DRAM
 * configuration registers in terms that are meaningful to the system such as
 * logical CPUs, cores, etc. This driver attaches to the PCI IDs that represent
 * the northbridge and data fabric; however, there are multiple PCI devices (one
 * per die) that exist. This driver maps all three of these (dies, data fabric
 * devices, and northbridges) together; however, it requires some acrobatics.
 * Unfortunately, there's no direct way to map a northbridge to its
 * corresponding die. However, we can map a CPU die to a data fabric PCI device
 * and a data fabric PCI device to a corresponding northbridge PCI device.
 *
 * In current Zen based products, there is a direct mapping between processor
 * nodes and data fabric PCI devices. All of the devices are on PCI bus 0 and
 * start from device 0x18. Device 0x18 maps to processor node 0, 0x19 to
 * processor node 1, etc. This means that to map a logical CPU to a data fabric
 * device, we take its processor node id, add it to 0x18, and find the PCI
 * device on bus 0 with that device number. As each data fabric device is
 * attached based on its PCI ID, we add it to the global list, amd_nbdf_dfs,
 * that is in the amdf17nbdf_t structure.
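 *
 * As a quick illustrative sketch (mirroring what amdf17nbdf_attach_df() does
 * below, not additional code), the mapping for a given processor node id
 * works out to:
 *
 *	bus  = AMDF17_DF_BUSNO				(always 0)
 *	dev  = AMDF17_DF_FIRST_DEVICE + procnodeid	(0x18, 0x19, ...)
 *	func = 0
 *
 * and the inverse used at attach time is simply
 * procnodeid = dev - AMDF17_DF_FIRST_DEVICE.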
 *
 * The northbridge PCI device has a defined device and function, but the PCI bus
 * that it's on can vary. Each die has its own series of PCI buses that are
 * assigned to it and the northbridge PCI device is on the first of the
 * die-specific PCI buses for each die. This also means that the northbridge
 * will not show up on PCI bus 0, which is the PCI bus that all of the data
 * fabric devices are on. While conventionally the northbridge with the lowest
 * PCI bus value would correspond to processor node zero, hardware does not
 * guarantee that at all. Because we don't want to be at the mercy of firmware,
 * we don't rely on this ordering, even though we have yet to find a system
 * that deviates from this scheme.
 *
 * One of the registers in the data fabric device's function 0
 * (AMDF17_DF_CFG_ADDR_CTL) happens to have the first PCI bus that is
 * associated with the processor node. This means that we can map a data fabric
 * device to a northbridge by finding the northbridge whose PCI bus matches the
 * value in the corresponding data fabric's AMDF17_DF_CFG_ADDR_CTL.
 *
 * This means that we can map a northbridge to a data fabric device and a data
 * fabric device to a die. Because these are 1:1 mappings, there is a transitive
 * relationship and therefore we know which northbridge is associated with which
 * processor die. This is summarized in the following image:
 *
 *  +-------+      +----------------------------+         +--------------+
 *  | Die 0 | ---> | Data Fabric PCI BDF 0/18/0 |-------> | Northbridge  |
 *  +-------+      | AMDF17_DF_CFG_ADDR: bus 10 |         | PCI  10/0/0  |
 *     ...         +----------------------------+         +--------------+
 *  +-------+      +------------------------------+       +--------------+
 *  | Die n | ---> | Data Fabric PCI BDF 0/18+n/0 |-----> | Northbridge  |
 *  +-------+      | AMDF17_DF_CFG_ADDR: bus 133  |       | PCI 133/0/0  |
 *                 +------------------------------+       +--------------+
 *
 * Note, the PCI buses used by the northbridges here are arbitrary. They do not
 * reflect the actual values assigned by hardware; however, the
 * bus/device/function (BDF) of the data fabric accurately models hardware. All
 * of the BDF values are in hex.
 *
 * -------------------------------
 * Attach and Detach Complications
 * -------------------------------
 *
 * Because we need to map different PCI devices together, this means that we
 * have multiple dev_info_t structures that we need to manage. Each of these is
 * independently attached and detached. While this is easily managed for attach,
 * it is not for detach.
 *
 * Once a device has been detached it will only come back if we have an active
 * minor node that will be accessed. While we have minor nodes associated with
 * the northbridges, we don't with the data fabric devices. This means that if
 * they are detached, nothing would ever cause them to be reattached. The system
 * also doesn't provide us a way or any guarantees around making sure that we're
 * attached to all such devices before we detach. As a result, unfortunately,
 * it's easier to basically have detach always fail.
 *
 * To deal with both development and any issues that arise in the field, there
 * is a knob, amdf17nbdf_allow_detach, which, if set to a non-zero value, will
 * allow instances to detach.
 *
 * ---------------
 * Exposed Devices
 * ---------------
 *
 * Currently we expose a single set of character devices which represent
 * temperature sensors for this family of processors. Because temperature
 * sensors exist on a per-processor node basis, we create a single minor node
 * for each one. Because our naming matches the cpuid naming, FMA can match that
 * up to logical CPUs and take care of matching the sensors appropriately. We
 * internally rate limit the sensor updates to 100ms, which is controlled by the
 * global amdf17nbdf_cache_ms.
 */

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/types.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/cred.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/list.h>
#include <sys/pci.h>
#include <sys/stddef.h>
#include <sys/stat.h>
#include <sys/x86_archext.h>
#include <sys/cpuvar.h>
#include <sys/sensors.h>

/*
 * The range of minors that we'll allow.
 */
#define	AMDF17_MINOR_LOW	1
#define	AMDF17_MINOR_HIGH	INT32_MAX

/*
 * This is the value of the first PCI data fabric device that globally exists.
 * It always maps to AMD's first nodeid (what we call cpi_procnodeid).
 */
#define	AMDF17_DF_FIRST_DEVICE	0x18

/*
 * The data fabric devices are defined to always be on PCI bus zero.
 */
#define	AMDF17_DF_BUSNO		0x00

/*
 * This register contains the BUS A of the processor node that corresponds
 * to the data fabric device.
 */
#define	AMDF17_DF_CFG_ADDR_CTL		0x84
#define	AMDF17_DF_CFG_ADDR_CTL_MASK	0xff

/*
 * Northbridge registers that are related to accessing the SMN. One writes to
 * the SMN address register and then can read from the SMN data register.
 */
#define	AMDF17_NB_SMN_ADDR	0x60
#define	AMDF17_NB_SMN_DATA	0x64
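
/*
 * For illustration, the indirect SMN access pattern boils down to the
 * following sketch; the real implementation, which also asserts that the
 * per-driver lock is held, is amdf17nbdf_smn_read() below:
 *
 *	pci_config_put32(nb->amd_nb_cfgspace, AMDF17_NB_SMN_ADDR, addr);
 *	val = pci_config_get32(nb->amd_nb_cfgspace, AMDF17_NB_SMN_DATA);
 */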

/*
 * The following are register offsets and the meaning of their bits related to
 * temperature. These addresses are addresses in the System Management Network
 * which is accessed through the northbridge.  They are not addresses in PCI
 * configuration space.
 */
#define	AMDF17_SMU_THERMAL_CURTEMP			0x00059800
#define	AMDF17_SMU_THERMAL_CURTEMP_TEMPERATURE(x)	((x) >> 21)
#define	AMDF17_SMU_THERMAL_CURTEMP_RANGE_SEL		(1 << 19)

#define	AMDF17_SMU_THERMAL_CURTEMP_RANGE_ADJ		(-49)
#define	AMDF17_SMU_THERMAL_CURTEMP_DECIMAL_BITS		3
#define	AMDF17_SMU_THERMAL_CURTEMP_BITS_MASK		0x7

/*
 * The temperature sensor in family 17 is measured in terms of 0.125 C steps.
 */
#define	AMDF17_THERMAL_GRANULARITY	8
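
/*
 * As a worked example (the register value is made up for illustration): a
 * CURTEMP reading of 0x45200000 has bits 31:21 equal to 553. The low three
 * bits (1) are the 0.125 C fraction and the remainder (69) the whole degrees,
 * i.e. 69.125 C. The range select bit (19) is clear, so the -49 C adjustment
 * is not applied; any Tctl offset from amdf17nbdf_offsets[] below is then
 * added, and the result is reported in 0.125 C units with a granularity of 8.
 */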

struct amdf17nb;
struct amdf17df;

typedef struct amdf17nb {
	list_node_t		amd_nb_link;
	dev_info_t		*amd_nb_dip;
	ddi_acc_handle_t	amd_nb_cfgspace;
	uint_t			amd_nb_bus;
	uint_t			amd_nb_dev;
	uint_t			amd_nb_func;
	struct amdf17df		*amd_nb_df;
	uint_t			amd_nb_procnodeid;
	id_t			amd_nb_temp_minor;
	hrtime_t		amd_nb_temp_last_read;
	int			amd_nb_temp_off;
	uint32_t		amd_nb_temp_reg;
	/* Values derived from the above */
	int64_t			amd_nb_temp;
} amdf17nb_t;

typedef struct amdf17df {
	list_node_t		amd_df_link;
	dev_info_t		*amd_df_f0_dip;
	ddi_acc_handle_t	amd_df_f0_cfgspace;
	uint_t			amd_df_procnodeid;
	uint_t			amd_df_iobus;
	amdf17nb_t		*amd_df_nb;
} amdf17df_t;

typedef struct amdf17nbdf {
	kmutex_t	amd_nbdf_lock;
	id_space_t	*amd_nbdf_minors;
	list_t		amd_nbdf_nbs;
	list_t		amd_nbdf_dfs;
} amdf17nbdf_t;

typedef enum {
	AMD_NBDF_TYPE_UNKNOWN,
	AMD_NBDF_TYPE_NORTHBRIDGE,
	AMD_NBDF_TYPE_DATA_FABRIC
} amdf17nbdf_type_t;

typedef struct {
	uint16_t		amd_nbdft_pci_did;
	amdf17nbdf_type_t	amd_nbdft_type;
} amdf17nbdf_table_t;

static const amdf17nbdf_table_t amdf17nbdf_dev_map[] = {
	/* Family 17h Ryzen, Epyc Models 00h-0fh (Zen uarch) */
	{ 0x1450, AMD_NBDF_TYPE_NORTHBRIDGE },
	{ 0x1460, AMD_NBDF_TYPE_DATA_FABRIC },
	{ PCI_EINVAL16 }
};

typedef struct {
	const char	*amd_nbdfo_brand;
	uint_t		amd_nbdfo_family;
	int		amd_nbdfo_off;
} amdf17nbdf_offset_t;

/*
 * AMD processors report a control temperature (called Tctl) which may be
 * different from the junction temperature, which is the value that is actually
 * measured from the die (sometimes called Tdie or Tjct). This is done so that
 * socket-based environmental monitoring can be consistent from a platform
 * perspective, but doesn't help us. Unfortunately, these values aren't in
 * datasheets that we can find, but have been documented partially in a series
 * of blog posts by AMD when discussing their 'Ryzen Master' monitoring software
 * for Windows.
 *
 * The brand strings below may contain partial matches, such as in the
 * Threadripper cases, so we can match the entire family of processors. The
 * offset value is the quantity in degrees that we should adjust Tctl by to
 * reach Tdie.
 */
static const amdf17nbdf_offset_t amdf17nbdf_offsets[] = {
	{ "AMD Ryzen 5 1600X", 0x17, -20 },
	{ "AMD Ryzen 7 1700X", 0x17, -20 },
	{ "AMD Ryzen 7 1800X", 0x17, -20 },
	{ "AMD Ryzen 7 2700X", 0x17, -10 },
	{ "AMD Ryzen Threadripper 19", 0x17, -27 },
	{ "AMD Ryzen Threadripper 29", 0x17, -27 },
	{ NULL }
};

/*
 * This indicates a number of milliseconds that we should wait between reads.
 * This is somewhat arbitrary, but the goal is to reduce cross call activity
 * and reflect that the sensor may not update all the time.
 */
uint_t amdf17nbdf_cache_ms = 100;

/*
 * This indicates whether detach is allowed. It is not by default. See the
 * theory statement section 'Attach and Detach Complications' for more
 * information.
 */
uint_t amdf17nbdf_allow_detach = 0;

/*
 * Global data that we keep regarding the device.
 */
amdf17nbdf_t *amdf17nbdf;

static amdf17nb_t *
amdf17nbdf_lookup_nb(amdf17nbdf_t *nbdf, minor_t minor)
{
	ASSERT(MUTEX_HELD(&nbdf->amd_nbdf_lock));

	if (minor < AMDF17_MINOR_LOW || minor > AMDF17_MINOR_HIGH) {
		return (NULL);
	}

	for (amdf17nb_t *nb = list_head(&nbdf->amd_nbdf_nbs); nb != NULL;
	    nb = list_next(&nbdf->amd_nbdf_nbs, nb)) {
		if ((id_t)minor == nb->amd_nb_temp_minor) {
			return (nb);
		}
	}

	return (NULL);
}

static void
amdf17nbdf_cleanup_nb(amdf17nbdf_t *nbdf, amdf17nb_t *nb)
{
	if (nb == NULL)
		return;

	ddi_remove_minor_node(nb->amd_nb_dip, NULL);
	if (nb->amd_nb_temp_minor > 0) {
		id_free(nbdf->amd_nbdf_minors, nb->amd_nb_temp_minor);
	}
	if (nb->amd_nb_cfgspace != NULL) {
		pci_config_teardown(&nb->amd_nb_cfgspace);
	}
	kmem_free(nb, sizeof (amdf17nb_t));
}

static void
amdf17nbdf_cleanup_df(amdf17df_t *df)
{
	if (df == NULL)
		return;

	if (df->amd_df_f0_cfgspace != NULL) {
		pci_config_teardown(&df->amd_df_f0_cfgspace);
	}
	kmem_free(df, sizeof (amdf17df_t));
}

static int
amdf17nbdf_smn_read(amdf17nbdf_t *nbdf, amdf17nb_t *nb, uint32_t addr,
    uint32_t *valp)
{
	VERIFY(MUTEX_HELD(&nbdf->amd_nbdf_lock));

	pci_config_put32(nb->amd_nb_cfgspace, AMDF17_NB_SMN_ADDR, addr);
	*valp = pci_config_get32(nb->amd_nb_cfgspace, AMDF17_NB_SMN_DATA);

	return (0);
}

static int
amdf17nbdf_temp_read(amdf17nbdf_t *nbdf, amdf17nb_t *nb)
{
	int ret;
	uint32_t reg, rawtemp, decimal;

	ASSERT(MUTEX_HELD(&nbdf->amd_nbdf_lock));

	/*
	 * Update the last read time first. Even if this fails, we want to make
	 * sure that we latch the fact that we tried.
	 */
	nb->amd_nb_temp_last_read = gethrtime();
	if ((ret = amdf17nbdf_smn_read(nbdf, nb, AMDF17_SMU_THERMAL_CURTEMP,
	    &reg)) != 0) {
		return (ret);
	}

	nb->amd_nb_temp_reg = reg;

	/*
	 * Take the primary temperature value and break apart its decimal value
	 * from its main value.
	 */
	rawtemp = AMDF17_SMU_THERMAL_CURTEMP_TEMPERATURE(reg);
	decimal = rawtemp & AMDF17_SMU_THERMAL_CURTEMP_BITS_MASK;
	rawtemp = rawtemp >> AMDF17_SMU_THERMAL_CURTEMP_DECIMAL_BITS;

	if ((reg & AMDF17_SMU_THERMAL_CURTEMP_RANGE_SEL) != 0) {
		rawtemp += AMDF17_SMU_THERMAL_CURTEMP_RANGE_ADJ;
	}
	rawtemp += nb->amd_nb_temp_off;
	nb->amd_nb_temp = rawtemp << AMDF17_SMU_THERMAL_CURTEMP_DECIMAL_BITS;
	nb->amd_nb_temp += decimal;

	return (0);
}

static int
amdf17nbdf_temp_init(amdf17nbdf_t *nbdf, amdf17nb_t *nb)
{
	uint_t i, family;
	char buf[256];

	if (cpuid_getbrandstr(CPU, buf, sizeof (buf)) >= sizeof (buf)) {
		dev_err(nb->amd_nb_dip, CE_WARN, "!failed to read processor "
		    "brand string, brand larger than internal buffer");
		return (EOVERFLOW);
	}

	family = cpuid_getfamily(CPU);

	for (i = 0; amdf17nbdf_offsets[i].amd_nbdfo_brand != NULL; i++) {
		if (family != amdf17nbdf_offsets[i].amd_nbdfo_family)
			continue;
		if (strncmp(buf, amdf17nbdf_offsets[i].amd_nbdfo_brand,
		    strlen(amdf17nbdf_offsets[i].amd_nbdfo_brand)) == 0) {
			nb->amd_nb_temp_off =
			    amdf17nbdf_offsets[i].amd_nbdfo_off;
			break;
		}
	}

	return (amdf17nbdf_temp_read(nbdf, nb));
}

static amdf17nbdf_type_t
amdf17nbdf_dip_type(uint16_t dev)
{
	uint_t i;
	const amdf17nbdf_table_t *tp = amdf17nbdf_dev_map;

	for (i = 0; tp[i].amd_nbdft_pci_did != PCI_EINVAL16; i++) {
		if (tp[i].amd_nbdft_pci_did == dev) {
			return (tp[i].amd_nbdft_type);
		}
	}

	return (AMD_NBDF_TYPE_UNKNOWN);
}

static boolean_t
amdf17nbdf_map(amdf17nbdf_t *nbdf, amdf17nb_t *nb, amdf17df_t *df)
{
	int ret;
	char buf[128];

	ASSERT(MUTEX_HELD(&nbdf->amd_nbdf_lock));

	/*
	 * This means that we encountered a duplicate. We're going to stop
	 * processing, but we're not going to fail its attach at this point.
	 */
	if (nb->amd_nb_df != NULL) {
		dev_err(nb->amd_nb_dip, CE_WARN, "!trying to map NB %u/%u/%u "
		    "to DF procnode %u, but NB is already mapped to DF "
		    "procnode %u!",
		    nb->amd_nb_bus, nb->amd_nb_dev, nb->amd_nb_func,
		    df->amd_df_procnodeid, nb->amd_nb_df->amd_df_procnodeid);
		return (B_TRUE);
	}

	/*
	 * Now that we have found a mapping, initialize our temperature
	 * information and create the minor node.
	 */
	nb->amd_nb_procnodeid = df->amd_df_procnodeid;
	nb->amd_nb_temp_minor = id_alloc(nbdf->amd_nbdf_minors);

	if ((ret = amdf17nbdf_temp_init(nbdf, nb)) != 0) {
		dev_err(nb->amd_nb_dip, CE_WARN, "!failed to init SMN "
		    "temperature data on node %u: %d", nb->amd_nb_procnodeid,
		    ret);
		return (B_FALSE);
	}

	if (snprintf(buf, sizeof (buf), "procnode.%u", nb->amd_nb_procnodeid) >=
	    sizeof (buf)) {
		dev_err(nb->amd_nb_dip, CE_WARN, "!unexpected buffer name "
		    "overrun assembling temperature minor %u",
		    nb->amd_nb_procnodeid);
		return (B_FALSE);
	}

	if (ddi_create_minor_node(nb->amd_nb_dip, buf, S_IFCHR,
	    nb->amd_nb_temp_minor, DDI_NT_SENSOR_TEMP_CPU, 0) != DDI_SUCCESS) {
		dev_err(nb->amd_nb_dip, CE_WARN, "!failed to create minor node "
		    "%s", buf);
		return (B_FALSE);
	}

	/*
	 * Now that it's all done, note that they're mapped to each other.
	 */
	nb->amd_nb_df = df;
	df->amd_df_nb = nb;

	return (B_TRUE);
}

static boolean_t
amdf17nbdf_add_nb(amdf17nbdf_t *nbdf, amdf17nb_t *nb)
{
	amdf17df_t *df;
	boolean_t ret = B_TRUE;

	mutex_enter(&nbdf->amd_nbdf_lock);
	list_insert_tail(&nbdf->amd_nbdf_nbs, nb);
	for (df = list_head(&nbdf->amd_nbdf_dfs); df != NULL;
	    df = list_next(&nbdf->amd_nbdf_dfs, df)) {
		if (nb->amd_nb_bus == df->amd_df_iobus) {
			ret = amdf17nbdf_map(nbdf, nb, df);
			break;
		}
	}
	mutex_exit(&nbdf->amd_nbdf_lock);

	return (ret);
}

static boolean_t
amdf17nbdf_add_df(amdf17nbdf_t *nbdf, amdf17df_t *df)
{
	amdf17nb_t *nb;
	boolean_t ret = B_TRUE;

	mutex_enter(&nbdf->amd_nbdf_lock);
	list_insert_tail(&nbdf->amd_nbdf_dfs, df);
	for (nb = list_head(&nbdf->amd_nbdf_nbs); nb != NULL;
	    nb = list_next(&nbdf->amd_nbdf_nbs, nb)) {
		if (nb->amd_nb_bus == df->amd_df_iobus) {
			ret = amdf17nbdf_map(nbdf, nb, df);
			break;
		}
	}
	mutex_exit(&nbdf->amd_nbdf_lock);

	return (ret);
}

static boolean_t
amdf17nbdf_attach_nb(amdf17nbdf_t *nbdf, dev_info_t *dip, ddi_acc_handle_t hdl,
    uint_t bus, uint_t dev, uint_t func)
{
	amdf17nb_t *nb;

	nb = kmem_zalloc(sizeof (amdf17nb_t), KM_SLEEP);
	nb->amd_nb_dip = dip;
	nb->amd_nb_cfgspace = hdl;
	nb->amd_nb_bus = bus;
	nb->amd_nb_dev = dev;
	nb->amd_nb_func = func;
	/*
	 * Set this to a value we won't get from the processor.
	 */
	nb->amd_nb_procnodeid = UINT_MAX;

	if (!amdf17nbdf_add_nb(nbdf, nb)) {
		amdf17nbdf_cleanup_nb(nbdf, nb);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static boolean_t
amdf17nbdf_attach_df(amdf17nbdf_t *nbdf, dev_info_t *dip, ddi_acc_handle_t hdl,
    uint_t bus, uint_t dev, uint_t func)
{
	amdf17df_t *df;

	if (bus != AMDF17_DF_BUSNO) {
		dev_err(dip, CE_WARN, "!encountered data fabric device with "
		    "unexpected PCI bus assignment, found 0x%x, expected 0x%x",
		    bus, AMDF17_DF_BUSNO);
		return (B_FALSE);
	}

	if (dev < AMDF17_DF_FIRST_DEVICE) {
		dev_err(dip, CE_WARN, "!encountered data fabric device with "
		    "PCI device assignment below the first minimum device "
		    "(0x%x): 0x%x", AMDF17_DF_FIRST_DEVICE, dev);
		return (B_FALSE);
	}

	/*
	 * At the moment we only care about function 0, though we may care
	 * about function 4 in the future, which has access to the FICAA.
	 * Only function zero should ever be attached, so this is just an
	 * extra precaution.
	 */
	if (func != 0) {
		dev_err(dip, CE_WARN, "!encountered data fabric device with "
		    "unexpected PCI function assignment, found 0x%x, expected "
		    "0x0", func);
		return (B_FALSE);
	}

	df = kmem_zalloc(sizeof (amdf17df_t), KM_SLEEP);
	df->amd_df_f0_dip = dip;
	df->amd_df_f0_cfgspace = hdl;
	df->amd_df_procnodeid = dev - AMDF17_DF_FIRST_DEVICE;
	df->amd_df_iobus = pci_config_get32(hdl, AMDF17_DF_CFG_ADDR_CTL) &
	    AMDF17_DF_CFG_ADDR_CTL_MASK;

	if (!amdf17nbdf_add_df(nbdf, df)) {
		amdf17nbdf_cleanup_df(df);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static int
amdf17nbdf_open(dev_t *devp, int flags, int otype, cred_t *credp)
{
	amdf17nbdf_t *nbdf = amdf17nbdf;
	minor_t m;

	if (crgetzoneid(credp) != GLOBAL_ZONEID || drv_priv(credp)) {
		return (EPERM);
	}

	if ((flags & (FEXCL | FNDELAY | FWRITE)) != 0) {
		return (EINVAL);
	}

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	m = getminor(*devp);

	/*
	 * Sanity check the minor
	 */
	mutex_enter(&nbdf->amd_nbdf_lock);
	if (amdf17nbdf_lookup_nb(nbdf, m) == NULL) {
		mutex_exit(&nbdf->amd_nbdf_lock);
		return (ENXIO);
	}
	mutex_exit(&nbdf->amd_nbdf_lock);

	return (0);
}

static int
amdf17nbdf_ioctl_kind(intptr_t arg, int mode)
{
	sensor_ioctl_kind_t kind;

	bzero(&kind, sizeof (sensor_ioctl_kind_t));
	kind.sik_kind = SENSOR_KIND_TEMPERATURE;

	if (ddi_copyout((void *)&kind, (void *)arg,
	    sizeof (sensor_ioctl_kind_t), mode & FKIOCTL) != 0) {
		return (EFAULT);
	}

	return (0);
}

static int
amdf17nbdf_ioctl_temp(amdf17nbdf_t *nbdf, minor_t minor, intptr_t arg, int mode)
{
	amdf17nb_t *nb;
	hrtime_t diff;
	sensor_ioctl_temperature_t temp;

	bzero(&temp, sizeof (temp));

	mutex_enter(&nbdf->amd_nbdf_lock);
	nb = amdf17nbdf_lookup_nb(nbdf, minor);
	if (nb == NULL) {
		mutex_exit(&nbdf->amd_nbdf_lock);
		return (ENXIO);
	}

	diff = NSEC2MSEC(gethrtime() - nb->amd_nb_temp_last_read);
	if (diff > 0 && diff > (hrtime_t)amdf17nbdf_cache_ms) {
		int ret;

		ret = amdf17nbdf_temp_read(nbdf, nb);
		if (ret != 0) {
			mutex_exit(&nbdf->amd_nbdf_lock);
			return (ret);
		}
	}

	temp.sit_unit = SENSOR_UNIT_CELSIUS;
	temp.sit_temp = nb->amd_nb_temp;
	temp.sit_gran = AMDF17_THERMAL_GRANULARITY;
	mutex_exit(&nbdf->amd_nbdf_lock);

	if (ddi_copyout(&temp, (void *)arg, sizeof (temp),
	    mode & FKIOCTL) != 0) {
		return (EFAULT);
	}

	return (0);
}

static int
amdf17nbdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	minor_t m;
	amdf17nbdf_t *nbdf = amdf17nbdf;

	if ((mode & FREAD) == 0) {
		return (EINVAL);
	}

	m = getminor(dev);

	switch (cmd) {
	case SENSOR_IOCTL_TYPE:
		return (amdf17nbdf_ioctl_kind(arg, mode));
	case SENSOR_IOCTL_TEMPERATURE:
		return (amdf17nbdf_ioctl_temp(nbdf, m, arg, mode));
	default:
		return (ENOTTY);
	}
}
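
/*
 * For illustration, a userland consumer could read one of these sensors
 * roughly as follows (a hedged sketch: error handling is omitted and the
 * /dev path shown is an example of where the minor node may appear, not a
 * guaranteed name):
 *
 *	int fd = open("/dev/sensors/temperature/cpu/procnode.0", O_RDONLY);
 *	sensor_ioctl_temperature_t sit;
 *
 *	if (ioctl(fd, SENSOR_IOCTL_TEMPERATURE, &sit) == 0) {
 *		printf("%lld (in 1/%u C steps)\n",
 *		    (long long)sit.sit_temp, sit.sit_gran);
 *	}
 */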

/*
 * We don't really do any state tracking on close, so for now, just allow it to
 * always succeed.
 */
static int
amdf17nbdf_close(dev_t dev, int flags, int otype, cred_t *credp)
{
	return (0);
}

static int
amdf17nbdf_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	uint_t nregs;
	int *regs;
	uint_t bus, dev, func;
	uint16_t pci_did;
	ddi_acc_handle_t pci_hdl;
	amdf17nbdf_type_t type;
	amdf17nbdf_t *nbdf = amdf17nbdf;

	if (cmd == DDI_RESUME)
		return (DDI_SUCCESS);
	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, 0, "reg",
	    &regs, &nregs) != DDI_PROP_SUCCESS) {
		dev_err(dip, CE_WARN, "!failed to find pci 'reg' property");
		return (DDI_FAILURE);
	}

	if (nregs < 1) {
		ddi_prop_free(regs);
		return (DDI_FAILURE);
	}

	bus = PCI_REG_BUS_G(regs[0]);
	dev = PCI_REG_DEV_G(regs[0]);
	func = PCI_REG_FUNC_G(regs[0]);

	ddi_prop_free(regs);

	if (pci_config_setup(dip, &pci_hdl) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "!failed to map pci devices");
		return (DDI_FAILURE);
	}

	pci_did = pci_config_get16(pci_hdl, PCI_CONF_DEVID);

	type = amdf17nbdf_dip_type(pci_did);
	switch (type) {
	case AMD_NBDF_TYPE_NORTHBRIDGE:
		if (!amdf17nbdf_attach_nb(nbdf, dip, pci_hdl, bus, dev, func)) {
			return (DDI_FAILURE);
		}
		break;
	case AMD_NBDF_TYPE_DATA_FABRIC:
		if (!amdf17nbdf_attach_df(nbdf, dip, pci_hdl, bus, dev, func)) {
			return (DDI_FAILURE);
		}
		break;
	default:
		pci_config_teardown(&pci_hdl);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Unfortunately, it's hard for us to really support detach here. The problem is
 * that we need both the data fabric devices and the northbridges to make sure
 * that we map everything. However, only the northbridges actually create minor
 * nodes that'll be opened and thus trigger them to reattach when accessed. What
 * we should probably look at doing in the future is making this into a nexus
 * driver that enumerates children like a temperature driver.
 */
static int
amdf17nbdf_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	amdf17nbdf_t *nbdf = amdf17nbdf;

	if (cmd == DDI_SUSPEND)
		return (DDI_SUCCESS);

	if (nbdf == NULL) {
		return (DDI_FAILURE);
	}

	if (amdf17nbdf_allow_detach == 0) {
		return (DDI_FAILURE);
	}

	mutex_enter(&nbdf->amd_nbdf_lock);
	for (amdf17nb_t *nb = list_head(&nbdf->amd_nbdf_nbs); nb != NULL;
	    nb = list_next(&nbdf->amd_nbdf_nbs, nb)) {
		if (dip == nb->amd_nb_dip) {
			list_remove(&nbdf->amd_nbdf_nbs, nb);
			if (nb->amd_nb_df != NULL) {
				ASSERT3P(nb->amd_nb_df->amd_df_nb, ==, nb);
				nb->amd_nb_df->amd_df_nb = NULL;
			}
			amdf17nbdf_cleanup_nb(nbdf, nb);
			mutex_exit(&nbdf->amd_nbdf_lock);
			return (DDI_SUCCESS);
		}
	}

	for (amdf17df_t *df = list_head(&nbdf->amd_nbdf_dfs); df != NULL;
	    df = list_next(&nbdf->amd_nbdf_dfs, df)) {
		if (dip == df->amd_df_f0_dip) {
			list_remove(&nbdf->amd_nbdf_dfs, df);
			if (df->amd_df_nb != NULL) {
				ASSERT3P(df->amd_df_nb->amd_nb_df, ==, df);
				df->amd_df_nb->amd_nb_df = NULL;
			}
			amdf17nbdf_cleanup_df(df);
			mutex_exit(&nbdf->amd_nbdf_lock);
			return (DDI_SUCCESS);
		}
	}
	mutex_exit(&nbdf->amd_nbdf_lock);

	return (DDI_FAILURE);
}

static int
amdf17nbdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **resultp)
{
	dev_t dev;
	minor_t minor;
	amdf17nbdf_t *nbdf;
	amdf17nb_t *nb;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
	case DDI_INFO_DEVT2INSTANCE:
		break;
	default:
		return (DDI_FAILURE);
	}

	dev = (dev_t)arg;
	minor = getminor(dev);
	nbdf = amdf17nbdf;

	mutex_enter(&nbdf->amd_nbdf_lock);
	nb = amdf17nbdf_lookup_nb(nbdf, (id_t)minor);
	if (nb == NULL) {
		mutex_exit(&nbdf->amd_nbdf_lock);
		return (DDI_FAILURE);
	}
	if (cmd == DDI_INFO_DEVT2DEVINFO) {
		*resultp = nb->amd_nb_dip;
	} else {
		int inst = ddi_get_instance(nb->amd_nb_dip);
		*resultp = (void *)(uintptr_t)inst;
	}
	mutex_exit(&nbdf->amd_nbdf_lock);

	return (DDI_SUCCESS);
}

static void
amdf17nbdf_destroy(amdf17nbdf_t *nbdf)
{
	amdf17nb_t *nb;
	amdf17df_t *df;

	while ((nb = list_remove_head(&nbdf->amd_nbdf_nbs)) != NULL) {
		amdf17nbdf_cleanup_nb(nbdf, nb);
	}
	list_destroy(&nbdf->amd_nbdf_nbs);

	while ((df = list_remove_head(&nbdf->amd_nbdf_dfs)) != NULL) {
		amdf17nbdf_cleanup_df(df);
	}
	list_destroy(&nbdf->amd_nbdf_dfs);

	if (nbdf->amd_nbdf_minors != NULL) {
		id_space_destroy(nbdf->amd_nbdf_minors);
	}

	mutex_destroy(&nbdf->amd_nbdf_lock);
	kmem_free(nbdf, sizeof (amdf17nbdf_t));
}

static amdf17nbdf_t *
amdf17nbdf_create(void)
{
	amdf17nbdf_t *nbdf;

	nbdf = kmem_zalloc(sizeof (amdf17nbdf_t), KM_SLEEP);
	mutex_init(&nbdf->amd_nbdf_lock, NULL, MUTEX_DRIVER, NULL);
	list_create(&nbdf->amd_nbdf_nbs, sizeof (amdf17nb_t),
	    offsetof(amdf17nb_t, amd_nb_link));
	list_create(&nbdf->amd_nbdf_dfs, sizeof (amdf17df_t),
	    offsetof(amdf17df_t, amd_df_link));
	if ((nbdf->amd_nbdf_minors = id_space_create("amdf17nbdf_minors",
	    AMDF17_MINOR_LOW, AMDF17_MINOR_HIGH)) == NULL) {
		amdf17nbdf_destroy(nbdf);
		return (NULL);
	}

	return (nbdf);
}

static struct cb_ops amdf17nbdf_cb_ops = {
	.cb_open = amdf17nbdf_open,
	.cb_close = amdf17nbdf_close,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = amdf17nbdf_ioctl,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};

static struct dev_ops amdf17nbdf_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = amdf17nbdf_getinfo,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = amdf17nbdf_attach,
	.devo_detach = amdf17nbdf_detach,
	.devo_reset = nodev,
	.devo_power = ddi_power,
	.devo_quiesce = ddi_quiesce_not_needed,
	.devo_cb_ops = &amdf17nbdf_cb_ops
};

static struct modldrv amdf17nbdf_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "AMD Family 17h Driver",
	.drv_dev_ops = &amdf17nbdf_dev_ops
};

static struct modlinkage amdf17nbdf_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &amdf17nbdf_modldrv, NULL }
};

int
_init(void)
{
	int ret;
	amdf17nbdf_t *nbdf;

	if ((nbdf = amdf17nbdf_create()) == NULL) {
		return (ENOMEM);
	}

	amdf17nbdf = nbdf;

	if ((ret = mod_install(&amdf17nbdf_modlinkage)) != 0) {
		amdf17nbdf_destroy(nbdf);
		amdf17nbdf = NULL;
		return (ret);
	}

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&amdf17nbdf_modlinkage, modinfop));
}

int
_fini(void)
{
	int ret;

	if ((ret = mod_remove(&amdf17nbdf_modlinkage)) != 0) {
		return (ret);
	}

	amdf17nbdf_destroy(amdf17nbdf);
	amdf17nbdf = NULL;
	return (ret);
}