xref: /illumos-gate/usr/src/uts/i86pc/cpu/generic_cpu/gcpu_mca.c (revision fa2e767ebc4f144f2041d566b0b7a9d45e8c6f1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/mca_x86.h>
30 #include <sys/cpu_module_impl.h>
31 #include <sys/cpu_module_ms.h>
32 #include <sys/cmn_err.h>
33 #include <sys/cpuvar.h>
34 #include <sys/pghw.h>
35 #include <sys/x86_archext.h>
36 #include <sys/sysmacros.h>
37 #include <sys/regset.h>
38 #include <sys/privregs.h>
39 #include <sys/systm.h>
40 #include <sys/types.h>
41 #include <sys/log.h>
42 #include <sys/psw.h>
43 #include <sys/fm/protocol.h>
44 #include <sys/fm/util.h>
45 #include <sys/errorq.h>
46 #include <sys/mca_x86.h>
47 #include <sys/fm/cpu/GMCA.h>
48 #include <sys/sysevent.h>
49 #include <sys/ontrap.h>
50 
51 #include "gcpu.h"
52 
/*
 * Set to suppress logging of telemetry found at initialization.
 * Defaults to 0 (no suppression).
 */
int gcpu_suppress_log_on_init = 0;

/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.  Both default to 0; when
 * gcpu_mca_stack_flag is 0 no stack is added to ereports regardless of
 * gcpu_mca_stack_ereport_include.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 * gcpu_mca_init() resets this to 5 if it is found outside 0..100.
 */
int gcpu_mca_telemetry_retries = 5;
72 
/*
 * Table of architectural error dispositions.  gcpu_disp_match() walks this
 * table from the first entry and returns the first entry whose "mask on"
 * bits are all set and whose "mask off" bits are all clear in the MCA
 * error code, so entry order matters.
 *
 * Each initializer appears to supply, in order: the ereport class leaf
 * (or a %n$s leaf format), the compound error name format (NULL when the
 * error type has no compound name), the ereport payload member flags, and
 * the "mask on"/"mask off" error code masks.  NOTE(review): member order
 * is defined by gcpu_error_disp_t in gcpu.h - confirm there.
 */
static gcpu_error_disp_t gcpu_errtypes[] = {

	/*
	 * Unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
	},

	/*
	 * Microcode ROM Parity Error
	 */
	{
		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
	},

	/*
	 * External - BINIT# from another processor during power-on config
	 */
	{
		FM_EREPORT_CPU_GENERIC_EXTERNAL,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_EXTERNAL_MASKON,
		MCAX86_SIMPLE_EXTERNAL_MASKOFF
	},

	/*
	 * Functional redundancy check master/slave error
	 */
	{
		FM_EREPORT_CPU_GENERIC_FRC,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_FRC_MASKON,
		MCAX86_SIMPLE_FRC_MASKOFF
	},

	/*
	 * Internal timer error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
	},

	/*
	 * Internal unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
	},

	/*
	 * Compound error codes - generic memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - TLB errors
	 * Compound name expands as {TT}TLB{LL}_ERR (see gcpu_mn_fmt).
	 */
	{
		FM_EREPORT_CPU_GENERIC_TLB,
		"%1$s" "TLB" "%2$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_TLB_MASKON,
		MCAX86_COMPOUND_TLB_MASKOFF
	},

	/*
	 * Compound error codes - memory hierarchy
	 * Compound name expands as {TT}CACHE{LL}_{RRRR}_ERR.
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMHIER,
		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMHIER_MASKON,
		MCAX86_COMPOUND_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - bus and interconnect errors
	 * Compound name expands as BUS{LL}_{PP}_{RRRR}_{II}_{T}_ERR.
	 */
	{
		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
	},
};
185 
/*
 * Disposition used for banks whose telemetry was flagged
 * CMI_ERRDISP_INCONSISTENT (see gcpu_mca_drain).  It is not part of
 * gcpu_errtypes, so gcpu_disp_match() never consults its zero masks.
 */
static gcpu_error_disp_t gcpu_unknown = {
	FM_EREPORT_CPU_GENERIC_UNKNOWN,
	"UNKNOWN",
	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
	0,
	0
};
193 
/*
 * Errorq created by gcpu_errorq_init() with gcpu_mca_drain() as its drain
 * function; it may be NULL if creation failed.  gcpu_mca_queue_lock is
 * held by gcpu_errorq_init() while it (re)creates the queue.
 */
static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;
196 
197 static const gcpu_error_disp_t *
198 gcpu_disp_match(uint16_t code)
199 {
200 	const gcpu_error_disp_t *ged = gcpu_errtypes;
201 	int i;
202 
203 	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
204 	    i++, ged++) {
205 		uint16_t on = ged->ged_errcode_mask_on;
206 		uint16_t off = ged->ged_errcode_mask_off;
207 
208 		if ((code & on) == on && (code & off) == 0)
209 			return (ged);
210 	}
211 
212 	return (NULL);
213 }
214 
/*
 * Extract a bit field from an MCA error code: mask 'code' with 'mask' and
 * shift the result right by 'shift' bits.
 *
 * The shift must be applied to the full-width masked value and only the
 * shifted result narrowed to uint8_t.  Narrowing before the shift would
 * discard any field that lies above bit 7, so such fields would always
 * read as zero.
 */
static uint8_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	return ((uint8_t)((code & mask) >> shift));
}
220 
/*
 * Extract the named field (TT, LL, RRRR, PP, II or T) from an error code
 * using the MCAX86_ERRCODE_<name>_MASK and MCAX86_ERRCODE_<name>_SHIFT
 * constants.
 */
#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)

/* Placeholder mnemonics for field values with no defined meaning. */
#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"
227 
228 /*
229  * Mappings of TT, LL, RRRR, PP, II and T values to compound error name
230  * mnemonics and to ereport class name components.
231  */
232 
/*
 * One mnemonic table row: the two spellings of a single field value,
 * selected by enum gcpu_mn_namespace in gcpu_mnemonic().
 */
struct gcpu_mnexp {
	const char *mne_compound;	/* used in expanding compound errname */
	const char *mne_ereport;	/* used in expanding ereport class */
};
237 
/*
 * TT field mnemonics, indexed by the extracted field value.  The fourth
 * row gives an explicit "undefined" compound mnemonic and an empty
 * ereport class component.
 */
static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
	{ GCPU_MNEMONIC_UNDEF, "" }
};
244 
245 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
246 	{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
247 	{ "L1",	FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
248 	{ "L2",	FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
249 	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
250 };
251 
/*
 * RRRR field mnemonics, indexed by the extracted field value.  Only nine
 * rows are defined; gcpu_mnemonic() returns GCPU_MNEMONIC_UNDEF for any
 * value at or beyond the end of the table.
 */
static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
	{ "RD",	FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
};

/*
 * PP field mnemonics, indexed by the extracted field value.  The last row
 * contributes an empty compound mnemonic.
 */
static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
};

/*
 * II field mnemonics, indexed by the extracted field value.  The second
 * row is marked reserved and has an empty ereport class component; the
 * last row contributes an empty compound mnemonic.
 */
static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
	{ GCPU_MNEMONIC_RESVD, "" },
	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
};

/*
 * T field mnemonics, indexed by the extracted field value.
 */
static struct gcpu_mnexp gcpu_T_mnemonics[] = {	 /* MCAX86_ERRCODE_T_* */
	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
};
282 
/*
 * Selects which member of a struct gcpu_mnexp row gcpu_mnemonic() returns.
 */
enum gcpu_mn_namespace {
	GCPU_MN_NAMESPACE_COMPOUND,	/* mne_compound: compound error name */
	GCPU_MN_NAMESPACE_EREPORT	/* mne_ereport: ereport class part */
};
287 
288 static const char *
289 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val,
290     enum gcpu_mn_namespace nspace)
291 {
292 	if (val >= tbl_sz)
293 		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */
294 
295 	switch (nspace) {
296 	case GCPU_MN_NAMESPACE_COMPOUND:
297 		return (tbl[val].mne_compound);
298 		/*NOTREACHED*/
299 
300 	case GCPU_MN_NAMESPACE_EREPORT:
301 		return (tbl[val].mne_ereport);
302 		/*NOTREACHED*/
303 
304 	default:
305 		return (GCPU_MNEMONIC_UNDEF);
306 		/*NOTREACHED*/
307 	}
308 }
309 
310 /*
311  * The ereport class leaf component is either a simple string with no
312  * format specifiers, or a string with one or more embedded %n$s specifiers -
313  * positional selection for string arguments.  The kernel snprintf does
314  * not support %n$ (and teaching it to do so is too big a headache) so
315  * we will expand this restricted format string ourselves.
316  */
317 
/* Number of positional string components available to gcpu_mn_fmt(). */
#define	GCPU_CLASS_VARCOMPS	7

/*
 * Look up the mnemonic for the named field (TT, LL, RRRR, PP, II or T) of
 * 'code' in the matching gcpu_<name>_mnemonics table, passing the table's
 * row count so that out-of-range values are caught by gcpu_mnemonic().
 */
#define	GCPU_MNEMONIC(code, name, nspace) \
	gcpu_mnemonic(gcpu_##name##_mnemonics, \
	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
	BIT_STRIP(code, name), nspace)
324 
325 static void
326 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
327     enum gcpu_mn_namespace nspace)
328 {
329 	uint16_t code = MCAX86_ERRCODE(status);
330 	const char *mn[GCPU_CLASS_VARCOMPS];
331 	char *p = buf;			/* current position in buf */
332 	char *q = buf + buflen;		/* pointer past last char in buf */
333 	int which, expfmtchar, error;
334 	char c;
335 
336 	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
337 	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
338 	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
339 	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
340 	mn[4] = GCPU_MNEMONIC(code, II, nspace);
341 	mn[5] = GCPU_MNEMONIC(code, T, nspace);
342 	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
343 
344 	while (p < q - 1 && (c = *fmt++) != '\0') {
345 		if (c != '%') {
346 			/* not the beginning of a format specifier - copy */
347 			*p++ = c;
348 			continue;
349 		}
350 
351 		error = 0;
352 		which = -1;
353 		expfmtchar = -1;
354 
355 nextfmt:
356 		if ((c = *fmt++) == '\0')
357 			break;	/* early termination of fmt specifier */
358 
359 		switch (c) {
360 		case '1':
361 		case '2':
362 		case '3':
363 		case '4':
364 		case '5':
365 		case '6':
366 		case '7':
367 			if (which != -1) { /* allow only one positional digit */
368 				error++;
369 				break;
370 			}
371 			which = c - '1';
372 			goto nextfmt;
373 			/*NOTREACHED*/
374 
375 		case '$':
376 			if (which == -1) { /* no position specified */
377 				error++;
378 				break;
379 			}
380 			expfmtchar = 's';
381 			goto nextfmt;
382 			/*NOTREACHED*/
383 
384 		case 's':
385 			if (expfmtchar != 's') {
386 				error++;
387 				break;
388 			}
389 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
390 			    mn[which]);
391 			p += strlen(p);
392 			break;
393 
394 		default:
395 			error++;
396 			break;
397 		}
398 
399 		if (error)
400 			break;
401 	}
402 
403 	*p = '\0';	/* NUL termination */
404 }
405 
406 static void
407 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
408     const char *cpuclass, const char *leafclass)
409 {
410 	char *p = buf;			/* current position in buf */
411 	char *q = buf + buflen;		/* pointer past last char in buf */
412 
413 	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
414 	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
415 
416 	p += strlen(p);
417 	if (p >= q)
418 		return;
419 
420 	if (leafclass == NULL) {
421 		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
422 		    GCPU_MN_NAMESPACE_EREPORT);
423 	} else {
424 		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
425 		    leafclass);
426 	}
427 }
428 
429 /*
430  * Create an "hc" scheme FMRI identifying the given cpu.  We don't know
431  * the actual topology/connectivity of cpus in the system, so we'll
432  * apply /motherboard=0/chip=.../cpu=... in all cases.
433  */
434 static nvlist_t *
435 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
436 {
437 	nvlist_t *nvl;
438 
439 	if ((nvl = fm_nvlist_create(nva)) == NULL)
440 		return (NULL);
441 
442 	fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 3,
443 	    "motherboard", 0,
444 	    "chip", cmi_hdl_chipid(hdl),
445 	    "cpu", cmi_hdl_coreid(hdl));
446 
447 	return (nvl);
448 }
449 
/*
 * Console throttling controls for gcpu_bleat(): the number of complaints
 * allowed back-to-back, and the minimum interval that must then pass
 * before further complaints are issued.  The interval is compared against
 * differences of gethrtime_waitfree() values.
 */
int gcpu_bleat_count_thresh = 5;
hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
452 
453 /*
454  * Called when we are unable to propogate a logout structure onto an
455  * errorq for subsequent ereport preparation and logging etc.  The caller
456  * should usually only decide to call this for severe errors - those we
457  * suspect we may need to panic for.
458  */
459 static void
460 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
461 {
462 	hrtime_t now  = gethrtime_waitfree();
463 	static hrtime_t gcpu_last_bleat;
464 	gcpu_bank_logout_t *gbl;
465 	static int bleatcount;
466 	int i;
467 
468 	/*
469 	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
470 	 * can come as fast as we like, but once we've spammed that many
471 	 * to the console we require a minimum interval to pass before
472 	 * any more complaints.
473 	 */
474 	if (++bleatcount > gcpu_bleat_count_thresh) {
475 		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
476 			return;
477 		else
478 			bleatcount = 0;
479 	}
480 	gcpu_last_bleat = now;
481 
482 	cmn_err(CE_WARN, "Machine-Check Errors unlogged on chip %d core %d, "
483 	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl));
484 	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
485 	    (u_longlong_t)gcl->gcl_mcg_status);
486 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
487 		uint64_t status = gbl->gbl_status;
488 
489 		if (!(status & MSR_MC_STATUS_VAL))
490 			continue;
491 
492 		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
493 		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
494 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
495 			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
496 			    i, IA32_MSR_MC(i, STATUS),
497 			    (u_longlong_t)status,
498 			    (u_longlong_t)gbl->gbl_addr,
499 			    (u_longlong_t)gbl->gbl_misc);
500 			break;
501 
502 		case MSR_MC_STATUS_ADDRV:
503 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
504 			    "STAT 0x%016llx ADDR 0x%016llx",
505 			    i, IA32_MSR_MC(i, STATUS),
506 			    (u_longlong_t)status,
507 			    (u_longlong_t)gbl->gbl_addr);
508 			break;
509 
510 		case MSR_MC_STATUS_MISCV:
511 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
512 			    "STAT 0x%016llx MISC 0x%016llx",
513 			    i, IA32_MSR_MC(i, STATUS),
514 			    (u_longlong_t)status,
515 			    (u_longlong_t)gbl->gbl_misc);
516 			break;
517 
518 		default:
519 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
520 			    "STAT 0x%016llx",
521 			    i, IA32_MSR_MC(i, STATUS),
522 			    (u_longlong_t)status);
523 			break;
524 
525 		}
526 	}
527 }
528 
/*
 * Expands to a name/type/value argument triple for fm_payload_set():
 * the payload name for MCi_STATUS bit 'what', a boolean value type, and
 * B_TRUE or B_FALSE according to whether that bit is set in 'status'.
 * Intended for use only inside an argument list.
 */
#define	_GCPU_BSTATUS(status, what) \
	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
532 
533 static void
534 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
535     uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
536 {
537 	uint64_t members = ged ? ged->ged_ereport_members :
538 	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
539 	uint64_t mcg = gcl->gcl_mcg_status;
540 	int mcip = mcg & MCG_STATUS_MCIP;
541 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
542 	uint64_t bstat = gbl->gbl_status;
543 
544 	/*
545 	 * Include the compound error name if requested and if this
546 	 * is a compound error type.
547 	 */
548 	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
549 	    ged->ged_compound_fmt != NULL) {
550 		char buf[FM_MAX_CLASS];
551 
552 		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
553 		    GCPU_MN_NAMESPACE_COMPOUND);
554 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
555 		    DATA_TYPE_STRING, buf, NULL);
556 	}
557 
558 	/*
559 	 * Include disposition information for this error
560 	 */
561 	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
562 	    gbl->gbl_disp != 0) {
563 		int i, empty = 1;
564 		char buf[128];
565 		char *p = buf, *q = buf + 128;
566 		static struct _gcpu_disp_name {
567 			uint64_t dv;
568 			const char *dn;
569 		} disp_names[] = {
570 			{ CMI_ERRDISP_CURCTXBAD,
571 			    "processor_context_corrupt" },
572 			{ CMI_ERRDISP_RIPV_INVALID,
573 			    "return_ip_invalid" },
574 			{ CMI_ERRDISP_UC_UNCONSTRAINED,
575 			    "unconstrained" },
576 			{ CMI_ERRDISP_FORCEFATAL,
577 			    "forcefatal" },
578 			{ CMI_ERRDISP_IGNORED,
579 			    "ignored" },
580 			{ CMI_ERRDISP_PCC_CLEARED,
581 			    "corrupt_context_cleared" },
582 			{ CMI_ERRDISP_UC_CLEARED,
583 			    "uncorrected_data_cleared" },
584 			{ CMI_ERRDISP_POISONED,
585 			    "poisoned" },
586 			{ CMI_ERRDISP_INCONSISTENT,
587 			    "telemetry_unstable" },
588 		};
589 
590 		for (i = 0; i < sizeof (disp_names) /
591 		    sizeof (struct _gcpu_disp_name); i++) {
592 			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
593 				continue;
594 
595 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
596 			    "%s%s", empty ? "" : ",", disp_names[i].dn);
597 			p += strlen(p);
598 			empty = 0;
599 		}
600 
601 		if (p != buf)
602 			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
603 			    DATA_TYPE_STRING, buf, NULL);
604 	}
605 
606 	/*
607 	 * If MCG_STATUS is included add that and an indication of whether
608 	 * this ereport was the result of a machine check or poll.
609 	 */
610 	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
611 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
612 		    DATA_TYPE_UINT64, mcg, NULL);
613 
614 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
615 		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
616 	}
617 
618 	/*
619 	 * If an instruction pointer is to be included add one provided
620 	 * MCG_STATUS indicated it is valid; meaningless for polled events.
621 	 */
622 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
623 	    mcg & MCG_STATUS_EIPV) {
624 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
625 		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
626 	}
627 
628 	/*
629 	 * Add an indication of whether the trap occured during privileged code.
630 	 */
631 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
632 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
633 		    DATA_TYPE_BOOLEAN_VALUE,
634 		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
635 	}
636 
637 	/*
638 	 * If requested, add the index of the MCA bank.  This indicates the
639 	 * n'th bank of 4 MCA registers, and does not necessarily correspond
640 	 * to MCi_* - use the bank offset to correlate
641 	 */
642 	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
643 		fm_payload_set(ereport,
644 		    /* Bank number */
645 		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
646 		    /* Offset of MCi_CTL */
647 		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
648 		    IA32_MSR_MC(bankno, CTL),
649 		    NULL);
650 	}
651 
652 	/*
653 	 * Add MCi_STATUS if requested, and decode it.
654 	 */
655 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
656 		const char *tbes[] = {
657 			"No tracking",			/* 00 */
658 			"Green - below threshold",	/* 01 */
659 			"Yellow - above threshold",	/* 10 */
660 			"Reserved"			/* 11 */
661 		};
662 
663 		fm_payload_set(ereport,
664 		    /* Bank MCi_STATUS */
665 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
666 		    /* Overflow? */
667 		    _GCPU_BSTATUS(bstat, OVER),
668 		    /* Uncorrected? */
669 		    _GCPU_BSTATUS(bstat, UC),
670 		    /* Enabled? */
671 		    _GCPU_BSTATUS(bstat, EN),
672 		    /* Processor context corrupt? */
673 		    _GCPU_BSTATUS(bstat, PCC),
674 		    /* Error code */
675 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
676 		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
677 		    /* Model-specific error code */
678 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
679 		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
680 		    NULL);
681 
682 		/*
683 		 * If MCG_CAP.TES_P indicates that that thresholding info
684 		 * is present in the architural component of the bank status
685 		 * then include threshold information for this bank.
686 		 */
687 		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
688 			fm_payload_set(ereport,
689 			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
690 			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
691 			    NULL);
692 		}
693 	}
694 
695 	/*
696 	 * MCi_ADDR info if requested and valid.
697 	 */
698 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
699 	    bstat & MSR_MC_STATUS_ADDRV) {
700 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
701 		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
702 	}
703 
704 	/*
705 	 * MCi_MISC if requested and MCi_STATUS.MISCV).
706 	 */
707 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
708 	    bstat & MSR_MC_STATUS_MISCV) {
709 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
710 		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
711 	}
712 
713 }
714 
715 /*
716  * Construct and post an ereport based on the logout information from a
717  * single MCA bank.  We are not necessarily running on the cpu that
718  * detected the error.
719  */
720 static void
721 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
722     const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
723 {
724 	gcpu_data_t *gcpu = gcl->gcl_gcpu;
725 	cmi_hdl_t hdl = gcpu->gcpu_hdl;
726 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
727 	const char *cpuclass = NULL, *leafclass = NULL;
728 	uint16_t code = MCAX86_ERRCODE(status);
729 	errorq_elem_t *eqep, *scr_eqep;
730 	nvlist_t *ereport, *detector;
731 	char buf[FM_MAX_CLASS];
732 	const char *classfmt;
733 	nv_alloc_t *nva;
734 
735 	if (panicstr) {
736 		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
737 			return;
738 		ereport = errorq_elem_nvl(ereport_errorq, eqep);
739 
740 		/*
741 		 * Allocate another element for scratch space, but fallback
742 		 * to the one we have if that fails.  We'd like to use the
743 		 * additional scratch space for nvlist construction.
744 		 */
745 		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
746 			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
747 		else
748 			nva = errorq_elem_nva(ereport_errorq, eqep);
749 	} else {
750 		ereport = fm_nvlist_create(NULL);
751 		nva = NULL;
752 	}
753 
754 	if (ereport == NULL)
755 		return;
756 
757 	/*
758 	 * Common payload data required by the protocol:
759 	 *	- ereport class
760 	 *	- detector
761 	 *	- ENA
762 	 */
763 
764 	/*
765 	 * Ereport class - call into model-specific support to allow it to
766 	 * provide a cpu class or leaf class, otherwise calculate our own.
767 	 */
768 	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
769 	classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
770 	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
771 	    leafclass);
772 
773 	/*
774 	 * The detector FMRI.
775 	 */
776 	if ((detector = cms_ereport_detector(hdl, mscookie, nva)) == NULL)
777 		detector = gcpu_fmri_create(hdl, nva);
778 
779 	/*
780 	 * Should we define a new ENA format 3?? for chip/core/strand?
781 	 * It will be better when virtualized.
782 	 */
783 	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
784 	    fm_ena_generate_cpu(gcl->gcl_timestamp,
785 	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
786 	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
787 
788 	if (panicstr) {
789 		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
790 		nv_alloc_reset(nva);
791 	} else {
792 		fm_nvlist_destroy(detector, FM_NVA_FREE);
793 	}
794 
795 	/*
796 	 * Add the architectural ereport class-specific payload data.
797 	 */
798 	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
799 
800 	/*
801 	 * Allow model-specific code to add ereport members.
802 	 */
803 	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
804 	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
805 
806 	/*
807 	 * Include stack if options is turned on and either selected in
808 	 * the payload member bitmask or inclusion is forced.
809 	 */
810 	if (gcpu_mca_stack_flag &&
811 	    (cms_ereport_includestack(hdl, mscookie) ==
812 	    B_TRUE || gcpu_mca_stack_ereport_include)) {
813 		fm_payload_stack_add(ereport, gcl->gcl_stack,
814 		    gcl->gcl_stackdepth);
815 	}
816 
817 	/*
818 	 * Post ereport.
819 	 */
820 	if (panicstr) {
821 		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
822 		if (scr_eqep)
823 			errorq_cancel(ereport_errorq, scr_eqep);
824 	} else {
825 		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
826 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
827 	}
828 
829 }
830 
831 /*ARGSUSED*/
832 void
833 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
834 {
835 	const gcpu_logout_t *gcl = data;
836 	const gcpu_bank_logout_t *gbl;
837 	int i;
838 
839 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
840 		const gcpu_error_disp_t *gened;
841 		cms_cookie_t mscookie;
842 
843 		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
844 		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
845 			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
846 
847 			/*
848 			 * Perform a match based on IA32 MCA architectural
849 			 * components alone.
850 			 */
851 			gened = gcpu_disp_match(code); /* may be NULL */
852 
853 			/*
854 			 * Now see if an model-specific match can be made.
855 			 */
856 			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i,
857 			    gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
858 			    gcl->gcl_ms_logout);
859 
860 			/*
861 			 * Prepare and dispatch an ereport for logging and
862 			 * diagnosis.
863 			 */
864 			gcpu_ereport_post(gcl, i, gened, mscookie,
865 			    gbl->gbl_status);
866 		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
867 		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
868 			/*
869 			 * Telemetry kept changing as we tried to read
870 			 * it.  Force an unknown ereport leafclass but
871 			 * keep the telemetry unchanged for logging.
872 			 */
873 			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
874 			    gbl->gbl_status);
875 		}
876 	}
877 }
878 
879 static size_t gcpu_mca_queue_datasz = 0;
880 
881 /*
882  * The following code is ready to make a weak attempt at growing the
883  * errorq structure size.  Since it is not foolproof (we don't know
884  * who may already be producing to the outgoing errorq) our caller
885  * instead assures that we'll always be called with no greater data
886  * size than on our first call.
887  */
888 static void
889 gcpu_errorq_init(size_t datasz)
890 {
891 	int slots;
892 
893 	mutex_enter(&gcpu_mca_queue_lock);
894 
895 	if (gcpu_mca_queue_datasz >= datasz) {
896 		mutex_exit(&gcpu_mca_queue_lock);
897 		return;
898 	}
899 
900 	membar_producer();
901 	if (gcpu_mca_queue) {
902 		gcpu_mca_queue_datasz = 0;
903 		errorq_destroy(gcpu_mca_queue);
904 	}
905 
906 	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
907 	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
908 
909 	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
910 	    NULL, slots, datasz, 1, ERRORQ_VITAL);
911 
912 	if (gcpu_mca_queue != NULL)
913 		gcpu_mca_queue_datasz = datasz;
914 
915 	mutex_exit(&gcpu_mca_queue_lock);
916 }
917 
918 /*
919  * Perform MCA initialization as described in section 14.6 of Intel 64
920  * and IA-32 Architectures Software Developer's Manual Volume 3A.
921  */
922 
/*
 * Bank count of the first cpu to report a nonzero count in
 * gcpu_mca_init(); cpus reporting more banks than this are not
 * initialized there.
 */
static uint_t global_nbanks;
924 
925 void
926 gcpu_mca_init(cmi_hdl_t hdl)
927 {
928 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
929 	uint64_t cap;
930 	uint_t vendor = cmi_hdl_vendor(hdl);
931 	uint_t family = cmi_hdl_family(hdl);
932 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
933 	int mcg_ctl_present;
934 	uint_t nbanks;
935 	size_t mslsz;
936 	int i;
937 
938 	if (gcpu == NULL)
939 		return;
940 
941 	/*
942 	 * Protect from some silly /etc/system settings.
943 	 */
944 	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
945 		gcpu_mca_telemetry_retries = 5;
946 
947 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
948 		return;
949 
950 	/*
951 	 * CPU startup code only calls cmi_mca_init if x86_feature indicates
	 * both MCA and MCE support (i.e., X86_MCA).  P5, K6, and earlier
	 * processors, which have their own more primitive way of doing
954 	 * machine checks, will not have cmi_mca_init called since their
955 	 * CPUID information will not indicate both MCA and MCE features.
956 	 */
957 #ifndef	__xpv
958 	ASSERT(x86_feature & X86_MCA);
959 #endif /* __xpv */
960 
961 	/*
962 	 * Determine whether the IA32_MCG_CTL register is present.  If it
963 	 * is we will enable all features by writing -1 to it towards
964 	 * the end of this initialization;  if it is absent then volume 3A
965 	 * says we must nonetheless continue to initialize the individual
966 	 * banks.
967 	 */
968 	mcg_ctl_present = cap & MCG_CAP_CTL_P;
969 
970 	/*
	 * We squirrel values away for inspection/debugging.
972 	 */
973 	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
974 	if (mcg_ctl_present)
975 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
976 		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
977 
978 	/*
979 	 * Determine the number of error-reporting banks implemented.
980 	 */
981 	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
982 
983 	if (nbanks != 0 && global_nbanks == 0)
984 		global_nbanks = nbanks;	/* no race - BSP will get here first */
985 
986 	/*
987 	 * If someone is hiding the number of banks (perhaps we are fully
988 	 * virtualized?) or if this processor has more banks than the
989 	 * first to set global_nbanks then bail.  The latter requirement
990 	 * is because we need to size our errorq data structure and we
991 	 * don't want to have to grow the errorq (destroy and recreate)
992 	 * which may just lose some telemetry.
993 	 */
994 	if (nbanks == 0 || nbanks > global_nbanks)
995 		return;
996 
997 	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
998 	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
999 
1000 	/*
1001 	 * Calculate the size we need to allocate for a gcpu_logout_t
1002 	 * with a gcl_data array big enough for all banks of this cpu.
1003 	 * Add any space requested by the model-specific logout support.
1004 	 */
1005 	mslsz = cms_logout_size(hdl);
1006 	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
1007 	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
1008 
1009 	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
1010 		gcpu_logout_t *gcl;
1011 
1012 		mca->gcpu_mca_logout[i] = gcl =
1013 		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
1014 		gcl->gcl_gcpu = gcpu;
1015 		gcl->gcl_nbanks = nbanks;
1016 		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
1017 		    (char *)(&gcl->gcl_data[0]) + nbanks *
1018 		    sizeof (gcpu_bank_logout_t);
1019 
1020 	}
1021 	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
1022 
1023 	/*
1024 	 * Create our errorq to transport the logout structures.  This
1025 	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
1026 	 */
1027 	gcpu_errorq_init(mca->gcpu_mca_lgsz);
1028 
1029 	/*
1030 	 * Not knowing which, if any, banks are shared between cores we
1031 	 * assure serialization of MCA bank initialization by each cpu
1032 	 * on the chip.  On chip architectures in which some banks are
1033 	 * shared this will mean the shared resource is initialized more
1034 	 * than once - we're simply aiming to avoid simultaneous MSR writes
1035 	 * to the shared resource.
1036 	 *
1037 	 * Even with these precautions, some platforms may yield a GP fault
1038 	 * if a core other than a designated master tries to write anything
1039 	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
1040 	 * those writes under on_trap protection.
1041 	 */
1042 	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1043 
1044 	/*
1045 	 * Initialize poller data, but don't start polling yet.
1046 	 */
1047 	gcpu_mca_poll_init(hdl);
1048 
1049 	/*
1050 	 * Work out which MCA banks we will initialize.  In MCA logout
1051 	 * code we will only read those banks which we initialize here.
1052 	 */
1053 	for (i = 0; i < nbanks; i++) {
1054 		/*
1055 		 * On Intel family 6 and AMD family 6 we must not enable
1056 		 * machine check from bank 0 detectors.  In the Intel
1057 		 * case bank 0 is reserved for the platform, while in the
1058 		 * AMD case reports are that enabling bank 0 (DC) produces
1059 		 * spurious machine checks.
1060 		 */
1061 		if (i == 0 && ((vendor == X86_VENDOR_Intel ||
1062 		    vendor == X86_VENDOR_AMD) && family == 6))
1063 			continue;
1064 
1065 		if (cms_bankctl_skipinit(hdl, i))
1066 			continue;
1067 
1068 		/*
1069 		 * Record which MCA banks were enabled, both from the
1070 		 * point of view of this core and accumulating for the
1071 		 * whole chip (if some cores share a bank we must be
1072 		 * sure either can logout from it).
1073 		 */
1074 		mca->gcpu_actv_banks |= 1 << i;
1075 		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
1076 	}
1077 
1078 	/*
1079 	 * Log any valid telemetry lurking in the MCA banks, but do not
1080 	 * clear the status registers.  Ignore the disposition returned -
1081 	 * we have already panicked or reset for any nasty errors found here.
1082 	 *
1083 	 * Intel vol 3A says that we should not do this on family 0x6,
1084 	 * and that for any extended family the BIOS clears things
1085 	 * on power-on reset so you'll only potentially find valid telemetry
1086 	 * on warm reset (we do it for both - on power-on reset we should
1087 	 * just see zeroes).
1088 	 *
1089 	 * AMD docs since K7 say we should process anything we find here.
1090 	 */
1091 	if (!gcpu_suppress_log_on_init &&
1092 	    (vendor == X86_VENDOR_Intel && family >= 0xf ||
1093 	    vendor == X86_VENDOR_AMD))
1094 		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE);
1095 
1096 	/*
1097 	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
1098 	 * model-specific module the power of veto.
1099 	 */
1100 	for (i = 0; i < nbanks; i++) {
1101 		struct gcpu_bios_bankcfg *bcfgp =
1102 		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;
1103 
1104 		/*
1105 		 * Stash inherited bank MCA state, even for banks we will
1106 		 * not initialize ourselves.  Do not read the MISC register
1107 		 * unconditionally - on some processors that will #GP on
1108 		 * banks that do not implement the MISC register (would be
1109 		 * caught by on_trap, anyway).
1110 		 */
1111 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
1112 		    &bcfgp->bios_bank_ctl);
1113 
1114 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1115 		    &bcfgp->bios_bank_status);
1116 
1117 		if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV)
1118 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
1119 			    &bcfgp->bios_bank_addr);
1120 
1121 		if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
1122 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
1123 			    &bcfgp->bios_bank_misc);
1124 
1125 		if (!(mca->gcpu_actv_banks & 1 << i))
1126 			continue;
1127 
1128 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
1129 		    cms_bankctl_val(hdl, i, -1ULL));
1130 
1131 		if (!cms_bankstatus_skipinit(hdl, i)) {
1132 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
1133 			    cms_bankstatus_val(hdl, i, 0ULL));
1134 		}
1135 	}
1136 
1137 	/*
1138 	 * Now let the model-specific support perform further initialization
1139 	 * of non-architectural features.
1140 	 */
1141 	cms_mca_init(hdl, nbanks);
1142 
1143 	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
1144 	membar_producer();
1145 
1146 	/* enable all machine-check features */
1147 	if (mcg_ctl_present)
1148 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
1149 		    cms_mcgctl_val(hdl, nbanks, -1ULL));
1150 
1151 	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1152 
1153 	/* enable machine-check exception in CR4 */
1154 	cmi_hdl_enable_mce(hdl);
1155 }
1156 
1157 static uint64_t
1158 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1159     gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1160 {
1161 	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1162 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1163 	int nbanks = mca->gcpu_mca_nbanks;
1164 	gcpu_mce_status_t mce;
1165 	gcpu_bank_logout_t *gbl;
1166 	uint64_t disp = 0;
1167 	int i;
1168 
1169 	if (mcesp == NULL)
1170 		mcesp = &mce;
1171 
1172 	mcesp->mce_nerr = nerr;
1173 
1174 	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1175 	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1176 	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1177 
1178 	/*
1179 	 * If this a machine check then if the return instruction pointer
1180 	 * is not valid the current context is lost.
1181 	 */
1182 	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1183 		disp |= CMI_ERRDISP_RIPV_INVALID;
1184 
1185 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1186 		uint64_t mcistatus = gbl->gbl_status;
1187 		uint32_t ms_scope;
1188 		int pcc, uc;
1189 		int poisoned;
1190 
1191 		if (!(mcistatus & MSR_MC_STATUS_VAL))
1192 			continue;
1193 
1194 		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1195 			continue;
1196 
1197 		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1198 		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1199 		mcesp->mce_npcc += pcc;
1200 		mcesp->mce_nuc += uc;
1201 
1202 		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1203 		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1204 
1205 		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1206 			pcc = 0;
1207 			mcesp->mce_npcc_ok++;
1208 			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1209 		}
1210 
1211 		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1212 			uc = 0;
1213 			mcesp->mce_nuc_ok++;
1214 			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1215 		}
1216 
1217 		if (uc) {
1218 			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1219 			if (poisoned) {
1220 				mcesp->mce_nuc_poisoned++;
1221 				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1222 			}
1223 		}
1224 
1225 		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1226 			/*
1227 			 * We're not being instructed to ignore the error,
1228 			 * so apply our standard disposition logic to it.
1229 			 */
1230 			if (uc && !poisoned) {
1231 				unconstrained++;
1232 				gbl->gbl_disp |= disp |
1233 				    CMI_ERRDISP_UC_UNCONSTRAINED;
1234 			}
1235 
1236 			if (pcc && ismc) {
1237 				curctxbad++;
1238 				gbl->gbl_disp |= disp |
1239 				    CMI_ERRDISP_CURCTXBAD;
1240 			}
1241 
1242 			/*
1243 			 * Even if the above may not indicate that the error
1244 			 * is terminal, model-specific support may insist
1245 			 * that we treat it as such.  Such errors wil be
1246 			 * fatal even if discovered via poll.
1247 			 */
1248 			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1249 				forcefatal++;
1250 				mcesp->mce_forcefatal++;
1251 				gbl->gbl_disp |= disp |
1252 				    CMI_ERRDISP_FORCEFATAL;
1253 			}
1254 		} else {
1255 			mcesp->mce_ignored++;
1256 			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1257 		}
1258 	}
1259 
1260 	if (unconstrained > 0)
1261 		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1262 
1263 	if (curctxbad > 0)
1264 		disp |= CMI_ERRDISP_CURCTXBAD;
1265 
1266 	if (forcefatal > 0)
1267 		disp |= CMI_ERRDISP_FORCEFATAL;
1268 
1269 	if (gcpu_mca_queue != NULL) {
1270 		int how;
1271 
1272 		if (ismc) {
1273 			how = cmi_mce_response(rp, disp) ?
1274 			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
1275 			    ERRORQ_SYNC;	/* panic flow will drain */
1276 		} else {
1277 			how = (disp & CMI_ERRDISP_FORCEFATAL &&
1278 			    cmi_panic_on_ue()) ?
1279 			    ERRORQ_SYNC :	/* poller will panic */
1280 			    ERRORQ_ASYNC;	/* no panic */
1281 		}
1282 
1283 		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1284 	} else if (disp != 0) {
1285 		gcpu_bleat(hdl, gcl);
1286 	}
1287 
1288 	mcesp->mce_disp = disp;
1289 
1290 	return (disp);
1291 }
1292 
1293 /*
1294  * Gather error telemetry from our source, and then submit it for
1295  * processing.
1296  */
1297 
/*
 * True (1) when a bank status has its ENable bit set and reports UC or PCC,
 * i.e. it looks like an error that may raise a machine check.
 */
#define	IS_MCE_CANDIDATE(status) \
	(((status) & MSR_MC_STATUS_EN) && \
	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)))

/*
 * True (1) when two bank status values differ in at most the OVerflow bit.
 */
#define	STATUS_EQV(s1, s2) \
	((((s1) ^ (s2)) & ~MSR_MC_STATUS_OVER) == 0)

/*
 * Count of polled observations for which clearing of MCi_STATUS was
 * deferred for the first time.
 */
static uint32_t gcpu_deferrred_polled_clears;
1305 
1306 void
1307 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
1308     gcpu_mce_status_t *mcesp, boolean_t clrstatus)
1309 {
1310 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1311 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1312 	int nbanks = mca->gcpu_mca_nbanks;
1313 	gcpu_bank_logout_t *gbl, *pgbl;
1314 	gcpu_logout_t *gcl, *pgcl;
1315 	int ismc = (rp != NULL);
1316 	int ispoll = !ismc;
1317 	int i, nerr = 0;
1318 	cmi_errno_t err;
1319 	uint64_t mcg_status;
1320 	uint64_t disp;
1321 	uint64_t cap;
1322 
1323 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1324 	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
1325 	    CMI_SUCCESS) {
1326 		if (mcesp != NULL)
1327 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1328 		return;
1329 	}
1330 
1331 	if (ismc) {
1332 		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
1333 	} else {
1334 		int pidx = mca->gcpu_mca_nextpoll_idx;
1335 		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
1336 		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
1337 
1338 		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
1339 		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
1340 		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
1341 	}
1342 
1343 	gcl->gcl_timestamp = gethrtime_waitfree();
1344 	gcl->gcl_mcg_status = mcg_status;
1345 	gcl->gcl_ip = rp ? rp->r_pc : 0;
1346 
1347 	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
1348 	if (cap & MCG_CAP_TES_P)
1349 		gcl->gcl_flags |= GCPU_GCL_F_TES_P;
1350 
1351 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1352 		uint64_t status, status2, addr, misc;
1353 		int retries = gcpu_mca_telemetry_retries;
1354 
1355 		gbl->gbl_status = 0;
1356 		gbl->gbl_disp = 0;
1357 		gbl->gbl_clrdefcnt = 0;
1358 
1359 		/*
1360 		 * Only logout from MCA banks we have initialized from at
1361 		 * least one core.  If a core shares an MCA bank with another
1362 		 * but perhaps lost the race to initialize it, then it must
1363 		 * still be allowed to logout from the shared bank.
1364 		 */
1365 		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
1366 			continue;
1367 
1368 		/*
1369 		 * On a poll look only at the banks we've been asked to check.
1370 		 */
1371 		if (rp == NULL && !(bankmask & 1 << i))
1372 			continue;
1373 
1374 
1375 		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
1376 		    CMI_SUCCESS)
1377 			continue;
1378 retry:
1379 		if (!(status & MSR_MC_STATUS_VAL))
1380 			continue;
1381 
1382 		addr = -1;
1383 		misc = 0;
1384 
1385 		if (status & MSR_MC_STATUS_ADDRV)
1386 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
1387 
1388 		if (status & MSR_MC_STATUS_MISCV)
1389 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
1390 
1391 		/*
1392 		 * Allow the model-specific code to extract bank telemetry.
1393 		 */
1394 		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
1395 
1396 		/*
1397 		 * Not all cpu models assure us that the status/address/misc
1398 		 * data will not change during the above sequence of MSR reads,
1399 		 * or that it can only change by the addition of the OVerflow
1400 		 * bit to the status register.  If the status has changed
1401 		 * other than in the overflow bit then we attempt to reread
1402 		 * for a consistent snapshot, but eventually give up and
1403 		 * go with what we've got.  We only perform this check
1404 		 * for a poll - a further #MC during a #MC will reset, and
1405 		 * polled errors should not overwrite higher-priority
1406 		 * trapping errors (but could set the overflow bit).
1407 		 */
1408 		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1409 		    &status2)) == CMI_SUCCESS) {
1410 			if (!STATUS_EQV(status, status2)) {
1411 				if (retries-- > 0) {
1412 					status = status2;
1413 					goto retry;
1414 				} else {
1415 					gbl->gbl_disp |=
1416 					    CMI_ERRDISP_INCONSISTENT;
1417 				}
1418 			}
1419 		} else if (ispoll && err != CMI_SUCCESS) {
1420 			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
1421 		}
1422 
1423 		nerr++;
1424 		gbl->gbl_status = status;
1425 		gbl->gbl_addr = addr;
1426 		gbl->gbl_misc = misc;
1427 
1428 		if (clrstatus == B_FALSE)
1429 			goto serialize;
1430 
1431 		/*
1432 		 * For machine checks we always clear status here.  For polls
1433 		 * we must be a little more cautious since there is an
1434 		 * outside chance that we may clear telemetry from a shared
1435 		 * MCA bank on which a sibling core is machine checking.
1436 		 *
1437 		 * For polled observations of errors that look like they may
1438 		 * produce a machine check (UC/PCC and ENabled, although these
1439 		 * do not guarantee a machine check on error occurence)
1440 		 * we will not clear the status at this wakeup unless
1441 		 * we saw the same status at the previous poll.  We will
1442 		 * always process and log the current observations - it
1443 		 * is only the clearing of MCi_STATUS which may be
1444 		 * deferred until the next wakeup.
1445 		 */
1446 		if (ismc || !IS_MCE_CANDIDATE(status)) {
1447 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1448 			goto serialize;
1449 		}
1450 
1451 		/*
1452 		 * We have a polled observation of a machine check
1453 		 * candidate.  If we saw essentially the same status at the
1454 		 * last poll then clear the status now since this appears
1455 		 * not to be a #MC candidate after all.  If we see quite
1456 		 * different status now then do not clear, but reconsider at
1457 		 * the next poll.  In no actual machine check clears
1458 		 * the status in the interim then the status should not
1459 		 * keep changing forever (meaning we'd never clear it)
1460 		 * since before long we'll simply have latched the highest-
1461 		 * priority error and set the OVerflow bit.  Nonetheless
1462 		 * we count how many times we defer clearing and after
1463 		 * a while insist on clearing the status.
1464 		 */
1465 		pgbl = &pgcl->gcl_data[i];
1466 		if (pgbl->gbl_clrdefcnt != 0) {
1467 			/* We deferred clear on this bank at last wakeup */
1468 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1469 			    pgbl->gbl_clrdefcnt > 5) {
1470 				/*
1471 				 * Status is unchanged so clear it now and,
1472 				 * since we have already logged this info,
1473 				 * avoid logging it again.
1474 				 */
1475 				gbl->gbl_status = 0;
1476 				nerr--;
1477 				(void) cmi_hdl_wrmsr(hdl,
1478 				    IA32_MSR_MC(i, STATUS), 0ULL);
1479 			} else {
1480 				/* Record deferral for next wakeup */
1481 				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1482 			}
1483 		} else {
1484 			/* Record initial deferral for next wakeup */
1485 			gbl->gbl_clrdefcnt = 1;
1486 			gcpu_deferrred_polled_clears++;
1487 		}
1488 
1489 serialize:
1490 		/*
1491 		 * Intel Vol 3A says to execute a serializing instruction
1492 		 * here, ie CPUID.  Well WRMSR is also defined to be
1493 		 * serializing, so the status clear above should suffice.
1494 		 * To be a good citizen, and since some clears are deferred,
1495 		 * we'll execute a CPUID instruction here.
1496 		 */
1497 		{
1498 			struct cpuid_regs tmp;
1499 			(void) __cpuid_insn(&tmp);
1500 		}
1501 	}
1502 
1503 	if (gcpu_mca_stack_flag)
1504 		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
1505 	else
1506 		gcl->gcl_stackdepth = 0;
1507 
1508 	/*
1509 	 * Decide our disposition for this error or errors, and submit for
1510 	 * logging and subsequent diagnosis.
1511 	 */
1512 	if (nerr != 0) {
1513 		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
1514 	} else {
1515 		disp = 0;
1516 		if (mcesp) {
1517 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1518 		}
1519 	}
1520 
1521 	/*
1522 	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
1523 	 * If a second #MC had occured before now the system would have
1524 	 * reset.  We can only do thise once gcpu_mca_process has copied
1525 	 * the logout structure.
1526 	 */
1527 	if (ismc && mcg_status & MCG_STATUS_MCIP)
1528 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
1529 
1530 	/*
1531 	 * At this point we have read and logged all telemetry that is visible
1532 	 * under the MCA.  On architectures for which the NorthBridge is
1533 	 * on-chip this may include NB-observed errors, but where the NB
1534 	 * is off chip it may have been the source of the #MC request and
1535 	 * so we must call into the memory-controller driver to give it
1536 	 * a chance to log errors.
1537 	 */
1538 	if (ismc) {
1539 		int willpanic = (cmi_mce_response(rp, disp) == 0);
1540 		cmi_mc_logout(hdl, 1, willpanic);
1541 	}
1542 }
1543 
1544 int gcpu_mca_trap_vomit_summary = 0;
1545 
1546 /*
1547  * On a native machine check exception we come here from mcetrap via
1548  * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
1549  * cpus of the chip, so it is possible that another cpu on this chip could
1550  * initiate a poll while we're in the #mc handler;  it is also possible that
1551  * this trap has occured during a poll on this cpu.  So we must acquire
1552  * the chip-wide poll lock, but be careful to avoid deadlock.
1553  *
1554  * The 'data' pointer cannot be NULL due to init order.
1555  */
1556 uint64_t
1557 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
1558 {
1559 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1560 	kmutex_t *poll_lock = NULL;
1561 	gcpu_mce_status_t mce;
1562 	uint64_t mcg_status;
1563 	int tooklock = 0;
1564 
1565 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1566 	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
1567 		return (0);
1568 
1569 	/*
1570 	 * Synchronize with any poller from another core that may happen
1571 	 * to share access to one or more of the MCA banks.
1572 	 */
1573 	if (gcpu->gcpu_shared != NULL)
1574 		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
1575 
1576 	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
1577 		/*
1578 		 * The lock is not owned by the thread we have
1579 		 * interrupted.  Spin for this adaptive lock.
1580 		 */
1581 		while (!mutex_tryenter(poll_lock)) {
1582 			while (mutex_owner(poll_lock) != NULL)
1583 				;
1584 		}
1585 		tooklock = 1;
1586 	}
1587 
1588 	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE);
1589 
1590 	if (tooklock)
1591 		mutex_exit(poll_lock);
1592 
1593 	/*
1594 	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
1595 	 */
1596 	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
1597 		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
1598 		    "%u PCC (%u ok), "
1599 		    "%u UC (%d ok, %u poisoned), "
1600 		    "%u forcefatal, %u ignored",
1601 		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
1602 		    mce.mce_npcc, mce.mce_npcc_ok,
1603 		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
1604 		    mce.mce_forcefatal, mce.mce_ignored);
1605 	}
1606 
1607 	return (mce.mce_disp);
1608 }
1609 
1610 /*ARGSUSED*/
1611 void
1612 gcpu_faulted_enter(cmi_hdl_t hdl)
1613 {
1614 	/* Nothing to do here */
1615 }
1616 
1617 /*ARGSUSED*/
1618 void
1619 gcpu_faulted_exit(cmi_hdl_t hdl)
1620 {
1621 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1622 
1623 	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
1624 }
1625 
1626 /*
1627  * Write the requested values to the indicated MSRs.  Having no knowledge
1628  * of the model-specific requirements for writing to these model-specific
1629  * registers, we will only blindly write to those MSRs if the 'force'
1630  * argument is nonzero.  That option should only be used in prototyping
1631  * and debugging.
1632  */
1633 /*ARGSUSED*/
1634 cmi_errno_t
1635 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
1636     int force)
1637 {
1638 	int i, errs = 0;
1639 
1640 	for (i = 0; i < nregs; i++) {
1641 		uint_t msr = regs[i].cmr_msrnum;
1642 		uint64_t val = regs[i].cmr_msrval;
1643 
1644 		if (cms_present(hdl)) {
1645 			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
1646 				errs++;
1647 		} else if (force) {
1648 			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
1649 		} else {
1650 			errs++;
1651 		}
1652 	}
1653 
1654 	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
1655 }
1656