1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019, Joyent, Inc.
14  * Copyright 2021 Oxide Computer Company
15  */
16 
/*
 * Intel CPU Thermal sensor driver
 *
 * The MSRs used here were introduced with the 'Core' family processors and
 * have since spread beyond there, even to the Atom line. Currently,
 * temperature sensors exist on a per-core basis and optionally on a
 * per-package basis. The temperature sensor exposes a reading that's relative
 * to the processor's maximum junction temperature, often referred to as Tj. We
 * currently only support models where we can determine that junction
 * temperature programmatically. For older processors, we would need to track
 * down the datasheet. Unfortunately, the values here are often on a per-brand
 * string basis. That is, two CPUs with the same model and stepping that have
 * been binned differently may have different junction temperatures.
 *
 * The temperature is exposed through /dev and uses a semi-standard sensor
 * framework. We expose one minor node per CPU core and one minor node per CPU
 * package, if that is supported. Reads are rate-limited in the driver at 100ms
 * by default per the global variable coretemp_cache_ms.
 */
36 
37 #include <sys/modctl.h>
38 #include <sys/conf.h>
39 #include <sys/devops.h>
40 #include <sys/types.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/stat.h>
44 #include <sys/cred.h>
45 #include <sys/ddi.h>
46 #include <sys/sunddi.h>
47 #include <sys/list.h>
48 #include <sys/stddef.h>
49 #include <sys/cmn_err.h>
50 #include <sys/x86_archext.h>
51 #include <sys/cpu_module.h>
52 #include <sys/ontrap.h>
53 #include <sys/cpuvar.h>
54 #include <sys/x_call.h>
55 #include <sys/sensors.h>
56 
/*
 * The Intel SDM says that the measurements we get are always in degrees
 * Celsius. This value is what coretemp_read() reports as the scalar's
 * granularity (sis_gran) alongside the SENSOR_UNIT_CELSIUS unit.
 */
#define	CORETEMP_GRANULARITY	1
62 
/*
 * The kind of sensor a coretemp_sensor_t describes. The type selects which
 * status and interrupt MSRs are read and how the sensor is named.
 */
typedef enum coretemp_sensor_type {
	CORETEMP_S_CORE,	/* per-core sensor, named "chip%u.core%u" */
	CORETEMP_S_SOCKET	/* per-package sensor, named "chip%u" */
} coretemp_sensor_type_t;
67 
/*
 * Per-sensor state. One of these is allocated for each sensor created by
 * coretemp_create_sensor() and linked onto the coretemp_sensors list of the
 * owning coretemp_t. The cached MSR values are refreshed by coretemp_update().
 */
typedef struct coretemp_sensor {
	list_node_t		cs_link;	/* coretemp_sensors linkage */
	struct coretemp		*cs_coretemp;	/* owning driver state */
	char			cs_name[128];	/* name passed to ksensor */
	id_t			cs_sensor;	/* id from ksensor_create() */
	coretemp_sensor_type_t	cs_type;	/* core or socket sensor */
	enum cmi_hdl_class	cs_class;	/* cmi_hdl_lookup() key */
	uint_t			cs_chip;	/* cmi_hdl_lookup() key */
	uint_t			cs_core;	/* cmi_hdl_lookup() key */
	uint_t			cs_strand;	/* cmi_hdl_lookup() key */
	uint_t			cs_tjmax;	/* Tj Max given at creation */
	uint_t			cs_status_msr;	/* status MSR number */
	uint_t			cs_intr_msr;	/* interrupt MSR number */
	hrtime_t		cs_last_read;	/* gethrtime() at last update */
	uint64_t		cs_status;	/* raw status MSR value */
	uint64_t		cs_intr;	/* raw interrupt MSR value */
	/* The following fields are derived from above */
	uint_t			cs_temperature;	/* cs_tjmax minus the reading */
	uint_t			cs_resolution;	/* reported as sis_prec */
} coretemp_sensor_t;
88 
/*
 * Driver-wide state, allocated in coretemp_attach() and released by
 * coretemp_destroy().
 */
typedef struct coretemp {
	dev_info_t	*coretemp_dip;		/* devinfo node from attach */
	cpuset_t	*coretemp_cpuset;	/* CPU set used for xc_call() */
	boolean_t	coretemp_pkg;		/* X86FSET_PKG_THERMAL is set */
	kmutex_t	coretemp_mutex;		/* held for MSR reads/list ops */
	list_t		coretemp_sensors;	/* list of coretemp_sensor_t */
} coretemp_t;
96 
/*
 * The single driver state instance. It is set by coretemp_attach() once the
 * sensors have been created and cleared again by coretemp_detach(). An attach
 * request that arrives while it is non-NULL is refused.
 */
coretemp_t *coretemp;

/*
 * This indicates a number of milliseconds that we should wait between reads.
 * This is somewhat arbitrary, but the goal is to reduce cross call activity
 * and reflect that the sensor may not update all the time.
 */
uint_t coretemp_cache_ms = 100;
105 
106 static int
coretemp_rdmsr_xc(xc_arg_t arg1,xc_arg_t arg2,xc_arg_t arg3)107 coretemp_rdmsr_xc(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3)
108 {
109 	uint_t msr = (uint_t)arg1;
110 	uint64_t *valp = (uint64_t *)arg2;
111 	cmi_errno_t *errp = (cmi_errno_t *)arg3;
112 
113 	on_trap_data_t otd;
114 
115 	if (on_trap(&otd, OT_DATA_ACCESS) == 0) {
116 		if (checked_rdmsr(msr, valp) == 0) {
117 			*errp = CMI_SUCCESS;
118 		} else {
119 			*errp = CMIERR_NOTSUP;
120 		}
121 	} else {
122 		*errp = CMIERR_MSRGPF;
123 	}
124 	no_trap();
125 
126 	return (0);
127 }
128 
129 /*
130  * This really should just be a call to the CMI handle to provide us the MSR.
131  * However, that routine, cmi_hdl_rdmsr(), cannot be safely used until it is
132  * fixed for use outside of a panic-like context.
133  */
134 static int
coretemp_rdmsr(coretemp_t * ct,cmi_hdl_t hdl,uint_t msr,uint64_t * valp)135 coretemp_rdmsr(coretemp_t *ct, cmi_hdl_t hdl, uint_t msr, uint64_t *valp)
136 {
137 	id_t cpu = cmi_hdl_logical_id(hdl);
138 	int ret = CMI_SUCCESS;
139 
140 	ASSERT(MUTEX_HELD(&ct->coretemp_mutex));
141 	kpreempt_disable();
142 	if (CPU->cpu_id == cpu) {
143 		(void) coretemp_rdmsr_xc((xc_arg_t)msr, (xc_arg_t)valp,
144 		    (xc_arg_t)&ret);
145 	} else {
146 		cpuset_only(ct->coretemp_cpuset, (uint_t)cpu);
147 		xc_call((xc_arg_t)msr, (xc_arg_t)valp, (xc_arg_t)&ret,
148 		    (ulong_t *)ct->coretemp_cpuset, coretemp_rdmsr_xc);
149 	}
150 	kpreempt_enable();
151 
152 	return (ret);
153 }
154 
155 static int
coretemp_cmi_errno(cmi_errno_t e)156 coretemp_cmi_errno(cmi_errno_t e)
157 {
158 	switch (e) {
159 	case CMIERR_NOTSUP:
160 		return (ENOTSUP);
161 	default:
162 		return (EIO);
163 	}
164 }
165 
166 /*
167  * Answer the question of whether or not the driver can support the CPU in
168  * question. Right now we have the following constraints for supporting the CPU:
169  *
170  *   o The CPU is made by Intel
171  *   o The CPU has the Digital Thermal Sensor
172  *   o The CPU family is 6, which is usually implicit from the above
173  *   o We can determine its junction temperature through an MSR
174  *
175  * If we can't determine the junction temperature programmatically, then we need
176  * to set up tables of CPUs to do so. This can be fleshed out and improved.
177  */
178 static boolean_t
coretemp_supported(void)179 coretemp_supported(void)
180 {
181 	uint_t model;
182 
183 	if (cpuid_getvendor(CPU) != X86_VENDOR_Intel) {
184 		return (B_FALSE);
185 	}
186 
187 	if (!is_x86_feature(x86_featureset, X86FSET_CORE_THERMAL)) {
188 		return (B_FALSE);
189 	}
190 
191 	if (cpuid_getfamily(CPU) != 6) {
192 		return (B_FALSE);
193 	}
194 
195 	model = cpuid_getmodel(CPU);
196 	if (model <= INTC_MODEL_PENRYN || model == INTC_MODEL_SILVERTHORNE ||
197 	    model == INTC_MODEL_LINCROFT || model == INTC_MODEL_PENWELL ||
198 	    model == INTC_MODEL_CLOVERVIEW || model == INTC_MODEL_CEDARVIEW) {
199 		return (B_FALSE);
200 	}
201 
202 	return (B_TRUE);
203 }
204 
205 /*
206  * We need to determine the value of Tj Max as all temperature sensors are
207  * derived from this value. The ease of this depends on how old the processor in
208  * question is. The Core family processors after Penryn have support for an MSR
209  * that tells us what to go for. In the Atom family, processors starting with
210  * Silvermont have support for an MSR that documents this value. For older
211  * processors, one needs to track down the datasheet for a specific processor.
212  * Two processors in the same family/model may have different values of Tj Max.
213  * At the moment, we only support this on processors that have that MSR.
214  */
215 static int
coretemp_calculate_tjmax(coretemp_t * ct,cmi_hdl_t hdl,uint_t * tjmax)216 coretemp_calculate_tjmax(coretemp_t *ct, cmi_hdl_t hdl, uint_t *tjmax)
217 {
218 	cmi_errno_t e;
219 	uint64_t val = 0;
220 
221 	e = coretemp_rdmsr(ct, hdl, MSR_TEMPERATURE_TARGET, &val);
222 	if (e != CMI_SUCCESS) {
223 		return (coretemp_cmi_errno(e));
224 	} else if (val == 0) {
225 		return (EINVAL);
226 	}
227 
228 	*tjmax = MSR_TEMPERATURE_TARGET_TARGET(val);
229 	return (0);
230 }
231 
232 static int
coretemp_update(coretemp_t * ct,coretemp_sensor_t * sensor,cmi_hdl_t hdl)233 coretemp_update(coretemp_t *ct, coretemp_sensor_t *sensor, cmi_hdl_t hdl)
234 {
235 	cmi_errno_t e;
236 	int err = 0;
237 	uint64_t intr, status;
238 
239 	if ((e = coretemp_rdmsr(ct, hdl, sensor->cs_status_msr, &status)) !=
240 	    CMI_SUCCESS) {
241 		err = coretemp_cmi_errno(e);
242 		dev_err(ct->coretemp_dip, CE_WARN, "!failed to get thermal "
243 		    "status on %s: %d", sensor->cs_name, err);
244 		return (err);
245 	}
246 
247 	if ((e = coretemp_rdmsr(ct, hdl, sensor->cs_intr_msr, &intr)) !=
248 	    CMI_SUCCESS) {
249 		err = coretemp_cmi_errno(e);
250 		dev_err(ct->coretemp_dip, CE_WARN, "!failed to get thermal "
251 		    "interrupt on %s: %d", sensor->cs_name, err);
252 		return (err);
253 	}
254 
255 	sensor->cs_status = status;
256 	sensor->cs_intr = intr;
257 	sensor->cs_last_read = gethrtime();
258 	return (0);
259 }
260 
261 static int
coretemp_read(void * arg,sensor_ioctl_scalar_t * scalar)262 coretemp_read(void *arg, sensor_ioctl_scalar_t *scalar)
263 {
264 	coretemp_sensor_t *sensor = arg;
265 	coretemp_t *ct = sensor->cs_coretemp;
266 	hrtime_t diff;
267 	uint_t reading, resolution;
268 
269 	mutex_enter(&ct->coretemp_mutex);
270 	diff = NSEC2MSEC(gethrtime() - sensor->cs_last_read);
271 	if (diff > 0 && diff > (hrtime_t)coretemp_cache_ms) {
272 		int ret;
273 		cmi_hdl_t hdl;
274 
275 		if ((hdl = cmi_hdl_lookup(sensor->cs_class, sensor->cs_chip,
276 		    sensor->cs_core, sensor->cs_strand)) == NULL) {
277 			mutex_exit(&ct->coretemp_mutex);
278 			return (ENXIO);
279 		}
280 		ret = coretemp_update(ct, sensor, hdl);
281 		cmi_hdl_rele(hdl);
282 		if (ret != 0) {
283 			mutex_exit(&ct->coretemp_mutex);
284 			return (ret);
285 		}
286 	}
287 
288 	switch (sensor->cs_type) {
289 	case CORETEMP_S_CORE:
290 		if ((sensor->cs_status & IA32_THERM_STATUS_READ_VALID) == 0) {
291 			mutex_exit(&ct->coretemp_mutex);
292 			return (EIO);
293 		}
294 		reading = IA32_THERM_STATUS_READING(sensor->cs_status);
295 		resolution = IA32_THERM_STATUS_RESOLUTION(sensor->cs_status);
296 		break;
297 	case CORETEMP_S_SOCKET:
298 		reading = IA32_PKG_THERM_STATUS_READING(sensor->cs_status);
299 		resolution = 0;
300 		break;
301 	default:
302 		mutex_exit(&ct->coretemp_mutex);
303 		return (ENXIO);
304 	}
305 	if (reading >= sensor->cs_tjmax) {
306 		dev_err(ct->coretemp_dip, CE_WARN, "!found invalid temperature "
307 		    "on sensor %s: readout: %u, tjmax: %u, raw: 0x%"
308 		    PRIx64, sensor->cs_name, reading, sensor->cs_tjmax,
309 		    sensor->cs_status);
310 		mutex_exit(&ct->coretemp_mutex);
311 		return (EIO);
312 	}
313 	sensor->cs_temperature = sensor->cs_tjmax - reading;
314 	sensor->cs_resolution = resolution;
315 
316 	scalar->sis_unit = SENSOR_UNIT_CELSIUS;
317 	scalar->sis_value = sensor->cs_temperature;
318 	scalar->sis_gran = CORETEMP_GRANULARITY;
319 	scalar->sis_prec = sensor->cs_resolution;
320 	mutex_exit(&ct->coretemp_mutex);
321 
322 	return (0);
323 }
324 
/*
 * Sensor operations handed to ksensor_create() for every sensor this driver
 * registers; scalar reads are serviced by coretemp_read().
 */
static const ksensor_ops_t coretemp_temp_ops = {
	.kso_kind = ksensor_kind_temperature,
	.kso_scalar = coretemp_read
};
329 
330 static void
coretemp_destroy(coretemp_t * ct)331 coretemp_destroy(coretemp_t *ct)
332 {
333 	coretemp_sensor_t *sensor;
334 
335 	(void) ksensor_remove(ct->coretemp_dip, KSENSOR_ALL_IDS);
336 	while ((sensor = list_remove_head(&ct->coretemp_sensors)) != NULL) {
337 		kmem_free(sensor, sizeof (coretemp_sensor_t));
338 	}
339 	list_destroy(&ct->coretemp_sensors);
340 
341 	if (ct->coretemp_cpuset != NULL) {
342 		cpuset_free(ct->coretemp_cpuset);
343 	}
344 
345 	mutex_destroy(&ct->coretemp_mutex);
346 	kmem_free(ct, sizeof (coretemp_t));
347 }
348 
349 static boolean_t
coretemp_create_sensor(coretemp_t * ct,cmi_hdl_t hdl,uint_t tjmax,coretemp_sensor_type_t type)350 coretemp_create_sensor(coretemp_t *ct, cmi_hdl_t hdl, uint_t tjmax,
351     coretemp_sensor_type_t type)
352 {
353 	int err;
354 	coretemp_sensor_t *sensor;
355 
356 	sensor = kmem_zalloc(sizeof (coretemp_sensor_t), KM_SLEEP);
357 	sensor->cs_coretemp = ct;
358 	sensor->cs_type = type;
359 	sensor->cs_class = cmi_hdl_class(hdl);
360 	sensor->cs_chip = cmi_hdl_chipid(hdl);
361 	sensor->cs_core = cmi_hdl_coreid(hdl);
362 	sensor->cs_strand = 0;
363 	sensor->cs_tjmax = tjmax;
364 
365 	switch (sensor->cs_type) {
366 	case CORETEMP_S_CORE:
367 		if (snprintf(sensor->cs_name, sizeof (sensor->cs_name),
368 		    "chip%u.core%u", sensor->cs_chip, sensor->cs_core) >=
369 		    sizeof (sensor->cs_name)) {
370 			goto err;
371 		}
372 		sensor->cs_status_msr = MSR_IA32_THERM_STATUS;
373 		sensor->cs_intr_msr = MSR_IA32_THERM_INTERRUPT;
374 		break;
375 	case CORETEMP_S_SOCKET:
376 		if (snprintf(sensor->cs_name, sizeof (sensor->cs_name),
377 		    "chip%u", sensor->cs_chip) >= sizeof (sensor->cs_name)) {
378 			goto err;
379 		}
380 		sensor->cs_status_msr = MSR_IA32_PACKAGE_THERM_STATUS;
381 		sensor->cs_intr_msr = MSR_IA32_PACKAGE_THERM_INTERRUPT;
382 		break;
383 	}
384 
385 	if ((err = ksensor_create(ct->coretemp_dip, &coretemp_temp_ops, sensor,
386 	    sensor->cs_name, DDI_NT_SENSOR_TEMP_CPU, &sensor->cs_sensor)) !=
387 	    0) {
388 		dev_err(ct->coretemp_dip, CE_WARN, "failed to create ksensor "
389 		    "for %s: %d", sensor->cs_name, err);
390 	}
391 
392 	ASSERT(MUTEX_HELD(&ct->coretemp_mutex));
393 	list_insert_tail(&ct->coretemp_sensors, sensor);
394 
395 	return (B_TRUE);
396 err:
397 	kmem_free(sensor, sizeof (coretemp_sensor_t));
398 	return (B_FALSE);
399 }
400 
401 static int
coretemp_walk(cmi_hdl_t hdl,void * arg1,void * arg2,void * arg3)402 coretemp_walk(cmi_hdl_t hdl, void *arg1, void *arg2, void *arg3)
403 {
404 	coretemp_t *ct = arg1;
405 	boolean_t *walkerr = arg2;
406 	uint_t tjmax;
407 	int err;
408 
409 	/*
410 	 * The temperature sensor only exists on a per-core basis. Therefore we
411 	 * ignore any non-zero strand.
412 	 */
413 	if (cmi_hdl_strandid(hdl) != 0) {
414 		return (CMI_HDL_WALK_NEXT);
415 	}
416 
417 	if ((err = coretemp_calculate_tjmax(ct, hdl, &tjmax)) != 0) {
418 		dev_err(ct->coretemp_dip, CE_WARN,
419 		    "failed to read Tj Max on %u/%u: %d", cmi_hdl_chipid(hdl),
420 		    cmi_hdl_coreid(hdl), err);
421 		*walkerr = B_TRUE;
422 		return (CMI_HDL_WALK_DONE);
423 	}
424 
425 	if (!coretemp_create_sensor(ct, hdl, tjmax, CORETEMP_S_CORE)) {
426 		*walkerr = B_TRUE;
427 		return (CMI_HDL_WALK_DONE);
428 	}
429 
430 	if (ct->coretemp_pkg && cmi_hdl_coreid(hdl) == 0 &&
431 	    !coretemp_create_sensor(ct, hdl, tjmax, CORETEMP_S_SOCKET)) {
432 		*walkerr = B_TRUE;
433 		return (CMI_HDL_WALK_DONE);
434 	}
435 
436 	return (CMI_HDL_WALK_NEXT);
437 }
438 
439 static int
coretemp_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)440 coretemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
441 {
442 	boolean_t walkerr;
443 	coretemp_t *ct = NULL;
444 
445 	if (cmd == DDI_RESUME) {
446 		return (DDI_SUCCESS);
447 	} else if (cmd != DDI_ATTACH) {
448 		return (DDI_FAILURE);
449 	}
450 
451 	if (coretemp != NULL) {
452 		return (DDI_FAILURE);
453 	}
454 
455 	ct = kmem_zalloc(sizeof (coretemp_t), KM_SLEEP);
456 	ct->coretemp_dip = dip;
457 	ct->coretemp_pkg = is_x86_feature(x86_featureset, X86FSET_PKG_THERMAL);
458 	list_create(&ct->coretemp_sensors, sizeof (coretemp_sensor_t),
459 	    offsetof(coretemp_sensor_t, cs_link));
460 	mutex_init(&ct->coretemp_mutex, NULL, MUTEX_DRIVER, NULL);
461 	ct->coretemp_cpuset = cpuset_alloc(KM_SLEEP);
462 
463 	mutex_enter(&ct->coretemp_mutex);
464 	walkerr = B_FALSE;
465 	cmi_hdl_walk(coretemp_walk, ct, &walkerr, NULL);
466 
467 	if (walkerr) {
468 		mutex_exit(&ct->coretemp_mutex);
469 		goto fail;
470 	}
471 
472 	coretemp = ct;
473 	mutex_exit(&ct->coretemp_mutex);
474 	return (DDI_SUCCESS);
475 fail:
476 	coretemp = NULL;
477 	coretemp_destroy(ct);
478 	return (DDI_FAILURE);
479 
480 }
481 
482 static int
coretemp_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)483 coretemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
484 {
485 	if (cmd == DDI_SUSPEND) {
486 		return (DDI_SUCCESS);
487 	} else if (cmd != DDI_DETACH) {
488 		return (DDI_FAILURE);
489 	}
490 
491 	if (coretemp == NULL) {
492 		return (DDI_FAILURE);
493 	}
494 
495 	coretemp_destroy(coretemp);
496 	coretemp = NULL;
497 
498 	return (DDI_SUCCESS);
499 }
500 
/*
 * Device operations vector. Only attach and detach are implemented by this
 * driver; the remaining entry points use the generic kernel routines named
 * below.
 */
static struct dev_ops coretemp_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = nodev,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = coretemp_attach,
	.devo_detach = coretemp_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_needed
};
512 
/*
 * Driver linkage: ties the description string and coretemp_dev_ops to the
 * generic driver module operations.
 */
static struct modldrv coretemp_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Intel CPU/Package thermal sensor",
	.drv_dev_ops = &coretemp_dev_ops
};
518 
/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove(); its
 * only linkage element is coretemp_modldrv.
 */
static struct modlinkage coretemp_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &coretemp_modldrv, NULL }
};
523 
524 int
_init(void)525 _init(void)
526 {
527 	if (!coretemp_supported()) {
528 		return (ENOTSUP);
529 	}
530 
531 	return (mod_install(&coretemp_modlinkage));
532 }
533 
534 int
_info(struct modinfo * modinfop)535 _info(struct modinfo *modinfop)
536 {
537 	return (mod_info(&coretemp_modlinkage, modinfop));
538 }
539 
540 int
_fini(void)541 _fini(void)
542 {
543 	return (mod_remove(&coretemp_modlinkage));
544 }
545