xref: /illumos-gate/usr/src/uts/sun4u/io/sysioerr.c (revision 2a1fd0ff)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 1990-2002 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /*
27  * Copyright 2019 Peter Tribble.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/ddi_impldefs.h>
35 #include <sys/cmn_err.h>
36 #include <sys/async.h>
37 #include <sys/sysiosbus.h>
38 #include <sys/sysioerr.h>
39 #include <sys/x_call.h>
40 #include <sys/machsystm.h>
41 #include <sys/sysmacros.h>
42 #include <sys/vmsystm.h>
43 #include <sys/cpu_module.h>
44 
/*
 * Set the following variable in /etc/system to tell the kernel
 * not to shut down the machine if the temperature reaches
 * the Thermal Warning limit.  When it is non-zero the thermal warning
 * interrupt handler only logs a notice and returns.
 */
int oven_test = 0;

/*
 * Indicates whether the PROM has the "thermal-interrupt" property.
 * Holds the value returned by ddi_getprop() for that property (-1 is
 * the default passed for a missing property); the thermal warning
 * interrupt is only installed and programmed when the value is 1.
 */
static int thermal_interrupt_enabled = 0;

/*
 * Use adb to set debug_sysio_errs to 1 if you don't want your system to
 * panic on sbus ue errors.  Use adb to set sysio_err_flag to 0 if you
 * don't want your system to check for sysio errors at all.
 */
int sysio_err_flag = 1;
uint_t debug_sysio_errs = 0;

/*
 * bto_cnt = number of bus errors and timeouts allowed within bto_secs
 * seconds.  Use /etc/system to change bto_cnt to a very large number
 * if it's a problem!
 */
int bto_secs = 10;
int bto_cnt = 10;
72 
/*
 * Forward declarations for the file-local interrupt handlers, logging
 * callbacks and hardware enable/disable helpers defined below.
 */
static uint_t
sysio_ue_intr(struct sbus_soft_state *softsp);

static uint_t
sysio_ce_intr(struct sbus_soft_state *softsp);

static uint_t
sbus_err_intr(struct sbus_soft_state *softsp);

static void
sysio_log_ce_err(struct async_flt *ecc, char *unum);

static void
sysio_log_ue_err(struct async_flt *ecc, char *unum);

static void
sbus_clear_intr(struct sbus_soft_state *softsp, uint64_t *pafsr);

static void
sbus_log_error(struct sbus_soft_state *softsp, uint64_t *pafsr, uint64_t *pafar,
    ushort_t id, ushort_t inst, int cleared,
    on_trap_data_t *ontrap_data);

static int
sbus_check_bto(struct sbus_soft_state *softsp);

static void
sbus_log_csr_error(struct async_flt *aflt, char *unum);

static uint_t
sbus_ctrl_ecc_err(struct sbus_soft_state *softsp);

static uint_t
sysio_dis_err(struct sbus_soft_state *softsp);

static uint_t
sysio_init_err(struct sbus_soft_state *softsp);

static uint_t
sysio_thermal_warn_intr(struct sbus_soft_state *softsp);
113 
/*
 * PIL values published as the "interrupt-priorities" property by
 * sysio_err_init() when the device node does not already have one.
 */
static int sbus_pil[] = {SBUS_UE_PIL, SBUS_CE_PIL, SBUS_ERR_PIL, SBUS_PF_PIL,
	SBUS_THERMAL_PIL, SBUS_PM_PIL};
116 int
sysio_err_init(struct sbus_soft_state * softsp,caddr_t address)117 sysio_err_init(struct sbus_soft_state *softsp, caddr_t address)
118 {
119 	if (sysio_err_flag == 0) {
120 		cmn_err(CE_CONT, "Warning: sysio errors not initialized\n");
121 		return (DDI_SUCCESS);
122 	}
123 
124 	/*
125 	 * Get the address of the already mapped-in sysio/sbus error registers.
126 	 * Simply add each registers offset to the already mapped in address
127 	 * that was retrieved from the device node's "address" property,
128 	 * and passed as an argument to this function.
129 	 *
130 	 * Define a macro for the pointer arithmetic ...
131 	 */
132 
133 #define	REG_ADDR(b, o)	(uint64_t *)((caddr_t)(b) + (o))
134 
135 	softsp->sysio_ecc_reg = REG_ADDR(address, OFF_SYSIO_ECC_REGS);
136 	softsp->sysio_ue_reg = REG_ADDR(address, OFF_SYSIO_UE_REGS);
137 	softsp->sysio_ce_reg = REG_ADDR(address, OFF_SYSIO_CE_REGS);
138 	softsp->sbus_err_reg = REG_ADDR(address, OFF_SBUS_ERR_REGS);
139 
140 #undef	REG_ADDR
141 
142 	/*
143 	 * create the interrupt-priorities property if it doesn't
144 	 * already exist to provide a hint as to the PIL level for
145 	 * our interrupt.
146 	 */
147 	{
148 		int len;
149 
150 		if (ddi_getproplen(DDI_DEV_T_ANY, softsp->dip,
151 		    DDI_PROP_DONTPASS, "interrupt-priorities",
152 		    &len) != DDI_PROP_SUCCESS) {
153 				/* Create the interrupt-priorities property. */
154 			(void) ddi_prop_update_int_array(DDI_DEV_T_NONE,
155 			    softsp->dip, "interrupt-priorities",
156 			    (int *)sbus_pil, sizeof (sbus_pil) / sizeof (int));
157 		}
158 	}
159 
160 	(void) ddi_add_intr(softsp->dip, 0, NULL, NULL,
161 	    (uint_t (*)())sysio_ue_intr, (caddr_t)softsp);
162 	(void) ddi_add_intr(softsp->dip, 1, NULL, NULL,
163 	    (uint_t (*)())sysio_ce_intr, (caddr_t)softsp);
164 	(void) ddi_add_intr(softsp->dip, 2, NULL, NULL,
165 	    (uint_t (*)())sbus_err_intr, (caddr_t)softsp);
166 	/*
167 	 * If the thermal-interrupt property is in place,
168 	 * then register the thermal warning interrupt handler and
169 	 * program its mapping register
170 	 */
171 	thermal_interrupt_enabled = ddi_getprop(DDI_DEV_T_ANY, softsp->dip,
172 		DDI_PROP_DONTPASS, "thermal-interrupt", -1);
173 
174 	if (thermal_interrupt_enabled == 1) {
175 		(void) ddi_add_intr(softsp->dip, 4, NULL, NULL,
176 		    (uint_t (*)())sysio_thermal_warn_intr, (caddr_t)softsp);
177 	}
178 
179 	bus_func_register(BF_TYPE_UE, (busfunc_t)sbus_ctrl_ecc_err, softsp);
180 	bus_func_register(BF_TYPE_ERRDIS, (busfunc_t)sysio_dis_err, softsp);
181 
182 	(void) sysio_init_err(softsp);
183 
184 	return (DDI_SUCCESS);
185 }
186 
/*
 * Re-run the hardware part of error initialization (interrupt mapping
 * registers and error enables) through sysio_init_err().  No handlers
 * or bus functions are registered here.  Always returns DDI_SUCCESS.
 */
int
sysio_err_resume_init(struct sbus_soft_state *softsp)
{
	(void) sysio_init_err(softsp);
	return (DDI_SUCCESS);
}
193 
194 int
sysio_err_uninit(struct sbus_soft_state * softsp)195 sysio_err_uninit(struct sbus_soft_state *softsp)
196 {
197 	/* remove the interrupts from the interrupt list */
198 	(void) sysio_dis_err(softsp);
199 
200 	ddi_remove_intr(softsp->dip, 0, NULL);
201 	ddi_remove_intr(softsp->dip, 1, NULL);
202 	ddi_remove_intr(softsp->dip, 2, NULL);
203 
204 	if (thermal_interrupt_enabled == 1) {
205 		ddi_remove_intr(softsp->dip, 4, NULL);
206 	}
207 
208 	bus_func_unregister(BF_TYPE_UE, (busfunc_t)sbus_ctrl_ecc_err, softsp);
209 	bus_func_unregister(BF_TYPE_ERRDIS, (busfunc_t)sysio_dis_err, softsp);
210 
211 	return (DDI_SUCCESS);
212 }
213 
214 static uint_t
sysio_init_err(struct sbus_soft_state * softsp)215 sysio_init_err(struct sbus_soft_state *softsp)
216 {
217 	volatile uint64_t tmp_mondo_vec, tmpreg;
218 	volatile uint64_t *mondo_vec_reg;
219 	uint_t cpu_id, acpu_id;
220 
221 	acpu_id = intr_dist_cpuid();
222 	/*
223 	 * Program the mondo vector accordingly.  This MUST be the
224 	 * last thing we do.  Once we program the mondo, the device
225 	 * may begin to interrupt. Store it in the hardware reg.
226 	 */
227 	mondo_vec_reg = (uint64_t *)(softsp->intr_mapping_reg + UE_ECC_MAPREG);
228 	cpu_id = acpu_id;
229 	tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
230 	*mondo_vec_reg = tmp_mondo_vec;
231 
232 	mondo_vec_reg = (uint64_t *)(softsp->intr_mapping_reg + CE_ECC_MAPREG);
233 	cpu_id = acpu_id;
234 	tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
235 	*mondo_vec_reg = tmp_mondo_vec;
236 
237 	mondo_vec_reg =
238 	    (uint64_t *)(softsp->intr_mapping_reg + SBUS_ERR_MAPREG);
239 	cpu_id = acpu_id;
240 
241 	tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) | INTERRUPT_VALID;
242 	*mondo_vec_reg = tmp_mondo_vec;
243 
244 	if (thermal_interrupt_enabled == 1) {
245 		mondo_vec_reg = (softsp->intr_mapping_reg + THERMAL_MAPREG);
246 		cpu_id = acpu_id;
247 		tmp_mondo_vec = (cpu_id << INTERRUPT_CPU_FIELD) |
248 			INTERRUPT_VALID;
249 		*mondo_vec_reg = tmp_mondo_vec;
250 	}
251 
252 	/* Flush store buffers */
253 	tmpreg = *softsp->sbus_ctrl_reg;
254 
255 	/*
256 	 * XXX - This may already be set by the OBP.
257 	 */
258 	tmpreg = SYSIO_APCKEN;
259 	*softsp->sysio_ctrl_reg |= tmpreg;
260 	tmpreg = (SECR_ECC_EN | SECR_UE_INTEN | SECR_CE_INTEN);
261 	*softsp->sysio_ecc_reg = tmpreg;
262 	tmpreg = SB_CSR_ERRINT_EN;
263 	*softsp->sbus_err_reg |= tmpreg;
264 
265 	/* Initialize timeout/bus error counter */
266 	softsp->bto_timestamp = 0;
267 	softsp->bto_ctr = 0;
268 
269 	return (0);
270 }
271 
272 static uint_t
sysio_dis_err(struct sbus_soft_state * softsp)273 sysio_dis_err(struct sbus_soft_state *softsp)
274 {
275 	volatile uint64_t tmpreg;
276 	volatile uint64_t *mondo_vec_reg, *clear_vec_reg;
277 
278 	*softsp->sysio_ctrl_reg &= ~SYSIO_APCKEN;
279 	*softsp->sysio_ecc_reg = 0;
280 	*softsp->sbus_err_reg &= ~SB_CSR_ERRINT_EN;
281 
282 	/* Flush store buffers */
283 	tmpreg = *softsp->sbus_ctrl_reg;
284 #ifdef lint
285 	tmpreg = tmpreg;
286 #endif
287 
288 	/* Unmap mapping registers */
289 	mondo_vec_reg = (softsp->intr_mapping_reg + UE_ECC_MAPREG);
290 	clear_vec_reg = (softsp->clr_intr_reg + UE_ECC_CLEAR);
291 
292 	*mondo_vec_reg = 0;
293 
294 	*clear_vec_reg = 0;
295 
296 	mondo_vec_reg = (softsp->intr_mapping_reg + CE_ECC_MAPREG);
297 	clear_vec_reg = (softsp->clr_intr_reg + CE_ECC_CLEAR);
298 
299 	*mondo_vec_reg = 0;
300 
301 	*clear_vec_reg = 0;
302 
303 	mondo_vec_reg = (softsp->intr_mapping_reg + SBUS_ERR_MAPREG);
304 	clear_vec_reg = (softsp->clr_intr_reg + SBUS_ERR_CLEAR);
305 
306 	*mondo_vec_reg = 0;
307 
308 	*clear_vec_reg = 0;
309 
310 	/* Flush store buffers */
311 	tmpreg = *softsp->sbus_ctrl_reg;
312 
313 	return (BF_NONE);
314 }
315 
316 /*
317  * Gather information about the error into an async_flt structure, and then
318  * enqueue the error for reporting and processing and panic.
319  */
320 static uint_t
sysio_ue_intr(struct sbus_soft_state * softsp)321 sysio_ue_intr(struct sbus_soft_state *softsp)
322 {
323 	volatile uint64_t t_afsr;
324 	volatile uint64_t t_afar;
325 	volatile uint64_t *ue_reg, *afar_reg, *clear_reg;
326 	struct async_flt ecc;
327 	uint64_t offset;
328 
329 	/*
330 	 * Disable all further sbus errors, for this sbus instance, for
331 	 * what is guaranteed to be a fatal error. And grab any other cpus.
332 	 */
333 	(void) sysio_dis_err(softsp);		/* disabled sysio errors */
334 
335 	/*
336 	 * Then read and clear the afsr/afar and clear interrupt regs.
337 	 */
338 	ue_reg = (uint64_t *)softsp->sysio_ue_reg;
339 	t_afsr = *ue_reg;
340 	afar_reg = (uint64_t *)ue_reg + 1;
341 	t_afar = *afar_reg;
342 	*ue_reg = t_afsr;
343 
344 	clear_reg = (softsp->clr_intr_reg + UE_ECC_CLEAR);
345 	*clear_reg = 0;
346 
347 	/*
348 	 * The AFSR DW_OFFSET field contains the offset of the doubleword with
349 	 * the ECC error relative to the 64-byte aligned PA.  We multiply by 8
350 	 * to convert to a byte offset, and then add this to flt_addr.
351 	 */
352 	offset = ((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT) * 8;
353 
354 	bzero(&ecc, sizeof (ecc));
355 	ecc.flt_id = gethrtime();
356 	ecc.flt_stat = t_afsr;
357 	ecc.flt_addr = P2ALIGN(t_afar, 64) + offset;
358 	ecc.flt_func = sysio_log_ue_err;
359 	ecc.flt_bus_id = softsp->upa_id;
360 	ecc.flt_inst = ddi_get_instance(softsp->dip);
361 	ecc.flt_status = ECC_IOBUS;
362 	ecc.flt_in_memory = (pf_is_memory(t_afar >> MMU_PAGESHIFT)) ? 1: 0;
363 	ecc.flt_class = BUS_FAULT;
364 	ecc.flt_panic = (debug_sysio_errs == 0);
365 
366 	errorq_dispatch(ue_queue, &ecc, sizeof (ecc), ecc.flt_panic);
367 
368 	/*
369 	 * If the UE is in memory and fatal, save the fault info so the
370 	 * panic code will know to check for copyback errors.
371 	 */
372 	if (ecc.flt_panic && ecc.flt_in_memory)
373 		panic_aflt = ecc;
374 
375 	/*
376 	 * We must also check for other bus UE errors, and panic if
377 	 * any fatal ones are detected at this point.
378 	 */
379 	if (bus_func_invoke(BF_TYPE_UE) == BF_FATAL)
380 		ecc.flt_panic = 1;
381 
382 	if (ecc.flt_panic)
383 		cmn_err(CE_PANIC, "Fatal Sbus%d UE Error", ecc.flt_inst);
384 
385 	return (DDI_INTR_CLAIMED);
386 }
387 
388 /*
389  * callback logging function from the common error handling code
390  */
391 static void
sysio_log_ue_err(struct async_flt * ecc,char * unum)392 sysio_log_ue_err(struct async_flt *ecc, char *unum)
393 {
394 	uint64_t t_afsr = ecc->flt_stat;
395 	uint64_t t_afar = ecc->flt_addr;
396 
397 	ushort_t id = ecc->flt_bus_id;
398 	ushort_t inst = ecc->flt_inst;
399 
400 	if (t_afsr & SB_UE_AFSR_P_PIO) {
401 		cmn_err(CE_WARN, "SBus%d UE Primary Error from PIO: "
402 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
403 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
404 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
405 	}
406 	if (t_afsr & SB_UE_AFSR_P_DRD) {
407 		cmn_err(CE_WARN, "SBus%d UE Primary Error DMA read: "
408 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
409 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
410 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
411 	}
412 	if (t_afsr & SB_UE_AFSR_P_DWR) {
413 		cmn_err(CE_WARN, "SBus%d UE Primary Error DVMA write: "
414 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
415 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
416 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
417 	}
418 	/*
419 	 * We should never hit the secondary error panics.
420 	 */
421 	if (t_afsr & SB_UE_AFSR_S_PIO) {
422 		cmn_err(CE_WARN, "SBus%d UE Secondary Error from PIO: "
423 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
424 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
425 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
426 	}
427 	if (t_afsr & SB_UE_AFSR_S_DRD) {
428 		cmn_err(CE_WARN, "SBus%d UE Secondary Error DMA read: "
429 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
430 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
431 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
432 	}
433 	if (t_afsr & SB_UE_AFSR_S_DWR) {
434 		cmn_err(CE_WARN, "SBus%d UE Secondary  Error DMA write: "
435 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d",
436 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
437 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
438 	}
439 
440 	if ((debug_sysio_errs) || (aft_verbose)) {
441 		(void) read_ecc_data(ecc, 1, 0);
442 		cmn_err(CE_CONT, "\tOffset 0x%x, Size %d, UPA MID 0x%x\n",
443 		    (uint32_t)((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT),
444 		    (uint32_t)((t_afsr & SB_UE_AFSR_SIZE) >> SB_UE_SIZE_SHIFT),
445 		    (uint32_t)((t_afsr & SB_UE_AFSR_MID) >> SB_UE_MID_SHIFT));
446 	}
447 }
448 
449 /*
450  * gather the information about the error, plus a pointer to
451  * the callback logging function, and call the generic ce_error handler.
452  */
453 static uint_t
sysio_ce_intr(struct sbus_soft_state * softsp)454 sysio_ce_intr(struct sbus_soft_state *softsp)
455 {
456 	volatile uint64_t t_afsr;
457 	volatile uint64_t t_afar;
458 	volatile uint64_t *afar_reg, *clear_reg, *ce_reg;
459 	struct async_flt ecc;
460 	uint64_t offset;
461 
462 	ce_reg = (uint64_t *)softsp->sysio_ce_reg;
463 	t_afsr = *ce_reg;
464 	afar_reg = (uint64_t *)ce_reg + 1;
465 	t_afar = *afar_reg;
466 	*ce_reg = t_afsr;
467 
468 	clear_reg = (softsp->clr_intr_reg + CE_ECC_CLEAR);
469 	*clear_reg = 0;
470 
471 	/*
472 	 * The AFSR DW_OFFSET field contains the offset of the doubleword with
473 	 * the ECC error relative to the 64-byte aligned PA.  We multiply by 8
474 	 * to convert to a byte offset, and then add this to flt_addr.
475 	 */
476 	offset = ((t_afsr & SB_UE_AFSR_OFF) >> SB_UE_DW_SHIFT) * 8;
477 
478 	bzero(&ecc, sizeof (ecc));
479 	ecc.flt_id = gethrtime();
480 	ecc.flt_stat = t_afsr;
481 	ecc.flt_addr = P2ALIGN(t_afar, 64) + offset;
482 	ecc.flt_func = sysio_log_ce_err;
483 	ecc.flt_bus_id = softsp->upa_id;
484 	ecc.flt_inst = ddi_get_instance(softsp->dip);
485 	ecc.flt_status = ECC_IOBUS;
486 
487 	ecc.flt_synd = (ushort_t)((t_afsr & SB_CE_AFSR_SYND) >>
488 	    SB_CE_SYND_SHIFT);
489 
490 	ecc.flt_in_memory = (pf_is_memory(t_afar >> MMU_PAGESHIFT)) ? 1: 0;
491 	ecc.flt_class = BUS_FAULT;
492 
493 	ce_scrub(&ecc);
494 	errorq_dispatch(ce_queue, &ecc, sizeof (ecc), ERRORQ_ASYNC);
495 
496 	return (DDI_INTR_CLAIMED);
497 }
498 
499 /*
500  * callback logging function from the common error handling code
501  */
502 static void
sysio_log_ce_err(struct async_flt * ecc,char * unum)503 sysio_log_ce_err(struct async_flt *ecc, char *unum)
504 {
505 	uint64_t t_afsr = ecc->flt_stat;
506 	uint64_t t_afar = ecc->flt_addr;
507 	ushort_t id = ecc->flt_bus_id;
508 	ushort_t inst = ecc->flt_inst;
509 	int ce_verbose = ce_verbose_memory;
510 	char *syndrome_str = "!\tSyndrome 0x%x, Offset 0x%x, Size %d, "
511 	    "UPA MID 0x%x\n";
512 
513 	if ((!ce_verbose_memory) && (!debug_sysio_errs))
514 		return;
515 
516 	if (t_afsr & SB_CE_AFSR_P_PIO) {
517 		char *fmtstr = "!SBus%d CE Primary Error from PIO: "
518 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n";
519 
520 		if ((debug_sysio_errs) || (ce_verbose > 1))
521 			fmtstr++;
522 
523 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
524 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32),
525 		    (uint32_t)t_afar, id);
526 	}
527 	if (t_afsr & SB_CE_AFSR_P_DRD) {
528 		char *fmtstr = "!SBus%d CE Primary Error DMA read: "
529 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
530 		    "Id %d\n";
531 
532 		if ((debug_sysio_errs) || (ce_verbose > 1))
533 			fmtstr++;
534 
535 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
536 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
537 		    unum, id);
538 	}
539 	if (t_afsr & SB_CE_AFSR_P_DWR) {
540 		char *fmtstr = "!SBus%d CE Primary Error DMA write: "
541 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s Id %d\n";
542 
543 		if ((debug_sysio_errs) || (ce_verbose > 1))
544 			fmtstr++;
545 
546 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
547 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
548 		    unum, id);
549 	}
550 
551 	if (t_afsr & SB_CE_AFSR_S_PIO) {
552 		char *fmtstr = "!SBus%d CE Secondary Error from PIO: "
553 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n";
554 
555 		if ((debug_sysio_errs) || (ce_verbose > 1))
556 			fmtstr++;
557 
558 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
559 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
560 		    id);
561 	}
562 	if (t_afsr & SB_CE_AFSR_S_DRD) {
563 		char *fmtstr = "!SBus%d CE Secondary Error DMA read: "
564 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
565 		    "Id %d\n";
566 
567 		if ((debug_sysio_errs) || (ce_verbose > 1))
568 			fmtstr++;
569 
570 		cmn_err(CE_CONT, fmtstr, inst, (uint32_t)(t_afsr>>32),
571 		    (uint32_t)t_afsr, (uint32_t)(t_afar>>32), (uint32_t)t_afar,
572 		    unum, id);
573 	}
574 	if (t_afsr & SB_CE_AFSR_S_DWR) {
575 		char *fmtstr = "!SBus%d CE Secondary Error DMA write: "
576 		    "AFSR 0x%08x.%08x AFAR 0x%08x.%08x MemMod %s "
577 		    "Id %d\n";
578 
579 		if ((debug_sysio_errs) || (ce_verbose > 1))
580 			fmtstr++;
581 
582 		cmn_err(CE_CONT, fmtstr,
583 		    inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
584 		    (uint32_t)(t_afar>>32), (uint32_t)t_afar, unum, id);
585 	}
586 
587 	if ((debug_sysio_errs) || (ce_verbose > 1))
588 		syndrome_str++;
589 
590 	cmn_err(CE_CONT, syndrome_str,
591 	    (uint32_t)((t_afsr & SB_CE_AFSR_SYND) >> SB_CE_SYND_SHIFT),
592 	    (uint32_t)((t_afsr & SB_CE_AFSR_OFF) >> SB_CE_OFFSET_SHIFT),
593 	    (uint32_t)((t_afsr & SB_CE_AFSR_SIZE) >> SB_CE_SIZE_SHIFT),
594 	    (uint32_t)((t_afsr & SB_CE_AFSR_MID) >> SB_CE_MID_SHIFT));
595 }
596 
597 static uint_t
sbus_err_intr(struct sbus_soft_state * softsp)598 sbus_err_intr(struct sbus_soft_state *softsp)
599 {
600 	volatile uint64_t t_afsr;
601 	volatile uint64_t t_afar;
602 	ushort_t id, inst;
603 	int cleared = 0;
604 	volatile uint64_t *afar_reg;
605 	on_trap_data_t *otp = softsp->ontrap_data;
606 
607 	t_afsr = *softsp->sbus_err_reg;
608 	afar_reg = (uint64_t *)softsp->sbus_err_reg + 1;
609 	t_afar = *afar_reg;
610 
611 	if (otp == NULL || !(otp->ot_prot & OT_DATA_ACCESS)) {
612 		sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
613 		cleared = 1;
614 	}
615 
616 	id = (ushort_t)softsp->upa_id;
617 	inst = (ushort_t)ddi_get_instance(softsp->dip);
618 
619 	if (debug_sysio_errs) {
620 		if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS))
621 			otp->ot_trap |= OT_DATA_ACCESS;
622 		if (!cleared)
623 			sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
624 
625 		cmn_err(CE_CONT, "SBus%d Error: AFSR 0x%08x.%08x "
626 			"AFAR 0x%08x.%08x Id %d\n",
627 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
628 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
629 
630 		debug_enter("sbus_err_intr");
631 	} else {
632 		sbus_log_error(softsp, (uint64_t *)&t_afsr,
633 		    (uint64_t *)&t_afar, id, inst, cleared, otp);
634 	}
635 	if (!cleared) {
636 		sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
637 	}
638 
639 	return (DDI_INTR_CLAIMED);
640 }
641 
642 static void
sbus_clear_intr(struct sbus_soft_state * softsp,uint64_t * pafsr)643 sbus_clear_intr(struct sbus_soft_state *softsp, uint64_t *pafsr)
644 {
645 	volatile uint64_t *clear_reg;
646 
647 	*softsp->sbus_err_reg = *pafsr;
648 	clear_reg = (softsp->clr_intr_reg + SBUS_ERR_CLEAR);
649 	*clear_reg = 0;
650 }
651 
652 static void
sbus_log_error(struct sbus_soft_state * softsp,uint64_t * pafsr,uint64_t * pafar,ushort_t id,ushort_t inst,int cleared,on_trap_data_t * otp)653 sbus_log_error(struct sbus_soft_state *softsp, uint64_t *pafsr, uint64_t *pafar,
654     ushort_t id, ushort_t inst, int cleared, on_trap_data_t *otp)
655 {
656 	uint64_t t_afsr;
657 	uint64_t t_afar;
658 	int level = CE_WARN;
659 
660 	t_afsr = *pafsr;
661 	t_afar = *pafar;
662 	if (t_afsr & SB_AFSR_P_LE) {
663 		if (!cleared)
664 			sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
665 		cmn_err(CE_PANIC, "SBus%d Primary Error Late PIO: "
666 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
667 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
668 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
669 	}
670 	if (t_afsr & SB_AFSR_P_TO) {
671 		if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) {
672 			otp->ot_trap |= OT_DATA_ACCESS;
673 			return;
674 		}
675 		if (sbus_check_bto(softsp)) {
676 			if (!cleared)
677 				sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
678 			level = CE_PANIC;
679 		}
680 		cmn_err(level, "SBus%d Primary Error Timeout: "
681 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
682 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
683 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
684 	}
685 	if (t_afsr & SB_AFSR_P_BERR) {
686 		if (otp != NULL && (otp->ot_prot & OT_DATA_ACCESS)) {
687 			otp->ot_trap |= OT_DATA_ACCESS;
688 			return;
689 		}
690 		if (sbus_check_bto(softsp)) {
691 			if (!cleared)
692 				sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
693 			level = CE_PANIC;
694 		}
695 		cmn_err(level, "SBus%d Primary Error Bus Error: "
696 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d\n",
697 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
698 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
699 	}
700 
701 	if (t_afsr & SB_AFSR_S_LE) {
702 		if (!cleared)
703 			sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
704 		cmn_err(CE_PANIC, "SBus%d Secondary Late PIO Error: "
705 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
706 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
707 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
708 	}
709 	if (t_afsr & SB_AFSR_S_TO) {
710 		if (sbus_check_bto(softsp)) {
711 			if (!cleared)
712 				sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
713 			level = CE_PANIC;
714 		}
715 		cmn_err(level, "SBus%d Secondary Timeout Error: "
716 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
717 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
718 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
719 	}
720 	if (t_afsr & SB_AFSR_S_BERR) {
721 		if (sbus_check_bto(softsp)) {
722 			if (!cleared)
723 				sbus_clear_intr(softsp, (uint64_t *)&t_afsr);
724 			level = CE_PANIC;
725 		}
726 		cmn_err(level, "SBus%d Secondary Bus Error: "
727 			"AFSR 0x%08x.%08x AFAR 0x%08x.%08x Id %d",
728 			inst, (uint32_t)(t_afsr>>32), (uint32_t)t_afsr,
729 			(uint32_t)(t_afar>>32), (uint32_t)t_afar, id);
730 	}
731 }
732 
733 
734 static int
sbus_check_bto(struct sbus_soft_state * softsp)735 sbus_check_bto(struct sbus_soft_state *softsp)
736 {
737 	hrtime_t now = gethrtime();		/* high PIL safe */
738 	hrtime_t diff = now - softsp->bto_timestamp;
739 
740 	if (diff > ((hrtime_t)bto_secs * NANOSEC) || diff < 0LL) {
741 		/*
742 		 * Reset error counter as this bus error has occurred
743 		 * after more than bto_secs duration.
744 		 */
745 		softsp->bto_timestamp = now;
746 		softsp->bto_ctr = 0;
747 	}
748 	if (softsp->bto_ctr++ >= bto_cnt)
749 		return (1);
750 	return (0);
751 }
752 
753 static uint_t
sbus_ctrl_ecc_err(struct sbus_soft_state * softsp)754 sbus_ctrl_ecc_err(struct sbus_soft_state *softsp)
755 {
756 	uint64_t t_sb_csr;
757 	ushort_t id, inst;
758 
759 	t_sb_csr = *softsp->sbus_ctrl_reg;
760 	id = (ushort_t)softsp->upa_id;
761 	inst = (ushort_t)ddi_get_instance(softsp->dip);
762 
763 	if (debug_sysio_errs) {
764 		cmn_err(CE_CONT, "sbus_ctrl_ecc_error: SBus%d Control Reg "
765 		    "0x%016llx Id %d\n", inst, (u_longlong_t)t_sb_csr, id);
766 	}
767 
768 	if (t_sb_csr & (SB_CSR_DPERR_S14|SB_CSR_DPERR_S13|SB_CSR_DPERR_S3|
769 	    SB_CSR_DPERR_S2|SB_CSR_DPERR_S1|SB_CSR_DPERR_S0|SB_CSR_PIO_PERRS)) {
770 		struct async_flt aflt;
771 
772 		*softsp->sbus_ctrl_reg = t_sb_csr; /* clear error bits */
773 
774 		bzero(&aflt, sizeof (aflt));
775 		aflt.flt_id = gethrtime();
776 		aflt.flt_stat = t_sb_csr;
777 		aflt.flt_func = sbus_log_csr_error;
778 		aflt.flt_bus_id = id;
779 		aflt.flt_inst = inst;
780 		aflt.flt_status = ECC_IOBUS;
781 		aflt.flt_class = BUS_FAULT;
782 		aflt.flt_panic = 1;
783 
784 		errorq_dispatch(ue_queue, &aflt, sizeof (aflt), aflt.flt_panic);
785 		return (BF_FATAL);
786 	}
787 
788 	return (BF_NONE);
789 }
790 
791 /*ARGSUSED*/
792 static void
sbus_log_csr_error(struct async_flt * aflt,char * unum)793 sbus_log_csr_error(struct async_flt *aflt, char *unum)
794 {
795 	uint64_t t_sb_csr = aflt->flt_stat;
796 	uint_t id = aflt->flt_bus_id;
797 	uint_t inst = aflt->flt_inst;
798 
799 	/*
800 	 * Print out SBus error information.
801 	 */
802 	if (t_sb_csr & SB_CSR_DPERR_S14) {
803 		cmn_err(CE_WARN,
804 		"SBus%d Slot 14 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
805 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
806 	}
807 	if (t_sb_csr & SB_CSR_DPERR_S13) {
808 		cmn_err(CE_WARN,
809 		"SBus%d Slot 13 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
810 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
811 	}
812 	if (t_sb_csr & SB_CSR_DPERR_S3) {
813 		cmn_err(CE_WARN,
814 		"SBus%d Slot 3 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
815 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
816 	}
817 	if (t_sb_csr & SB_CSR_DPERR_S2) {
818 		cmn_err(CE_WARN,
819 		"SBus%d Slot 2 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
820 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
821 	}
822 	if (t_sb_csr & SB_CSR_DPERR_S1) {
823 		cmn_err(CE_WARN,
824 		"SBus%d Slot 1 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
825 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
826 	}
827 	if (t_sb_csr & SB_CSR_DPERR_S0) {
828 		cmn_err(CE_WARN,
829 		"SBus%d Slot 0 DVMA Parity Error: AFSR 0x%08x.%08x Id %d",
830 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
831 	}
832 	if (t_sb_csr & SB_CSR_PPERR_S15) {
833 		cmn_err(CE_WARN,
834 		"SBus%d Slot 15 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
835 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
836 	}
837 	if (t_sb_csr & SB_CSR_PPERR_S14) {
838 		cmn_err(CE_WARN,
839 		"SBus%d Slot 14 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
840 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
841 	}
842 	if (t_sb_csr & SB_CSR_PPERR_S13) {
843 		cmn_err(CE_WARN,
844 		"SBus%d Slot 13 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
845 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
846 	}
847 	if (t_sb_csr & SB_CSR_PPERR_S3) {
848 		cmn_err(CE_WARN,
849 		"SBus%d Slot 3 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
850 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
851 	}
852 	if (t_sb_csr & SB_CSR_PPERR_S2) {
853 		cmn_err(CE_WARN,
854 		"SBus%d Slot 2 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
855 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
856 	}
857 	if (t_sb_csr & SB_CSR_PPERR_S1) {
858 		cmn_err(CE_WARN,
859 		"SBus%d Slot 1 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
860 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
861 	}
862 	if (t_sb_csr & SB_CSR_PPERR_S0) {
863 		cmn_err(CE_WARN,
864 		"SBus%d Slot 0 PIO Parity Error: AFSR 0x%08x.%08x Id %d",
865 			inst, (uint32_t)(t_sb_csr>>32), (uint32_t)t_sb_csr, id);
866 	}
867 }
868 
869 /*
870  * Sysio Thermal Warning interrupt handler
871  */
872 static uint_t
sysio_thermal_warn_intr(struct sbus_soft_state * softsp)873 sysio_thermal_warn_intr(struct sbus_soft_state *softsp)
874 {
875 	volatile uint64_t *clear_reg;
876 	volatile uint64_t tmp_mondo_vec;
877 	volatile uint64_t *mondo_vec_reg;
878 	const char thermal_warn_msg[] =
879 	    "Severe over-temperature condition detected!";
880 
881 	/*
882 	 * Take off the Thermal Warning interrupt and
883 	 * remove its interrupt handler.
884 	 */
885 	mondo_vec_reg = (softsp->intr_mapping_reg + THERMAL_MAPREG);
886 	tmp_mondo_vec = *mondo_vec_reg;
887 	tmp_mondo_vec &= ~INTERRUPT_VALID;
888 	*mondo_vec_reg = tmp_mondo_vec;
889 
890 	ddi_remove_intr(softsp->dip, 4, NULL);
891 
892 	clear_reg = (softsp->clr_intr_reg + THERMAL_CLEAR);
893 	*clear_reg = 0;
894 
895 	if (oven_test) {
896 		cmn_err(CE_NOTE, "OVEN TEST: %s", thermal_warn_msg);
897 		return (DDI_INTR_CLAIMED);
898 	}
899 
900 	cmn_err(CE_WARN, "%s", thermal_warn_msg);
901 	cmn_err(CE_WARN, "Powering down...");
902 
903 	do_shutdown();
904 
905 	/*
906 	 * just in case do_shutdown() fails
907 	 */
908 	(void) timeout((void(*)(void *))power_down, NULL,
909 	    thermal_powerdown_delay * hz);
910 
911 	return (DDI_INTR_CLAIMED);
912 }
913