1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/mca_x86.h>
30 #include <sys/cpu_module_impl.h>
31 #include <sys/cpu_module_ms.h>
32 #include <sys/cmn_err.h>
33 #include <sys/cpuvar.h>
34 #include <sys/pghw.h>
35 #include <sys/x86_archext.h>
36 #include <sys/sysmacros.h>
37 #include <sys/regset.h>
38 #include <sys/privregs.h>
39 #include <sys/systm.h>
40 #include <sys/types.h>
41 #include <sys/log.h>
42 #include <sys/psw.h>
43 #include <sys/fm/protocol.h>
44 #include <sys/fm/util.h>
45 #include <sys/errorq.h>
46 #include <sys/mca_x86.h>
47 #include <sys/fm/cpu/GMCA.h>
48 #include <sys/sysevent.h>
49 #include <sys/ontrap.h>
50 
51 #include "gcpu.h"
52 
/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.
 *
 * Both default to 0.  While gcpu_mca_stack_flag is 0 no stack is added to
 * any ereport, whatever the value of gcpu_mca_stack_ereport_include.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;
61 
/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 * gcpu_mca_init() puts this back to 5 if it finds a value that is
 * negative or greater than 100.
 */
int gcpu_mca_telemetry_retries = 5;
67 
/*
 * Table of architectural error dispositions.  gcpu_disp_match() walks it
 * in order and returns the first entry for which every "mask on" bit is
 * set and every "mask off" bit is clear in the error code, so the order of
 * entries is significant.
 *
 * NOTE(review): the initializers are positional.  Judging by how the
 * members are consumed in this file they are, in order: ereport class
 * format, compound error name format (NULL for none), ereport payload
 * member flags, errcode mask-on and errcode mask-off - confirm against the
 * gcpu_error_disp_t definition in gcpu.h.
 */
static gcpu_error_disp_t gcpu_errtypes[] = {

	/*
	 * Unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
	},

	/*
	 * Microcode ROM Parity Error
	 */
	{
		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
	},

	/*
	 * External - BINIT# from another processor during power-on config
	 */
	{
		FM_EREPORT_CPU_GENERIC_EXTERNAL,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_EXTERNAL_MASKON,
		MCAX86_SIMPLE_EXTERNAL_MASKOFF
	},

	/*
	 * Functional redundancy check master/slave error
	 */
	{
		FM_EREPORT_CPU_GENERIC_FRC,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_FRC_MASKON,
		MCAX86_SIMPLE_FRC_MASKOFF
	},

	/*
	 * Internal timer error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
	},

	/*
	 * Internal unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
	},

	/*
	 * Compound error codes - generic memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - TLB errors
	 * (compound name expands as <TT>TLB<LL>_ERR)
	 */
	{
		FM_EREPORT_CPU_GENERIC_TLB,
		"%1$s" "TLB" "%2$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_TLB_MASKON,
		MCAX86_COMPOUND_TLB_MASKOFF
	},

	/*
	 * Compound error codes - memory hierarchy
	 * (compound name expands as <TT>CACHE<LL>_<RRRR>_ERR)
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMHIER,
		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMHIER_MASKON,
		MCAX86_COMPOUND_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - bus and interconnect errors
	 * (compound name expands as BUS<LL>_<PP>_<RRRR>_<II>_<T>_ERR)
	 */
	{
		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
	},
};
180 
/*
 * Disposition that gcpu_mca_drain() substitutes for banks whose telemetry
 * was flagged CMI_ERRDISP_INCONSISTENT.  Its masks are both 0; it is not
 * part of gcpu_errtypes[] and so is never returned by gcpu_disp_match().
 */
static gcpu_error_disp_t gcpu_unknown = {
	FM_EREPORT_CPU_GENERIC_UNKNOWN,
	"UNKNOWN",
	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
	0,
	0
};
188 
/* errorq set up by gcpu_errorq_init(); NULL until (and unless) created */
static errorq_t *gcpu_mca_queue;
/* held by gcpu_errorq_init() while it checks and (re)creates the queue */
static kmutex_t gcpu_mca_queue_lock;
191 
192 static const gcpu_error_disp_t *
193 gcpu_disp_match(uint16_t code)
194 {
195 	const gcpu_error_disp_t *ged = gcpu_errtypes;
196 	int i;
197 
198 	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
199 	    i++, ged++) {
200 		uint16_t on = ged->ged_errcode_mask_on;
201 		uint16_t off = ged->ged_errcode_mask_off;
202 
203 		if ((code & on) == on && (code & off) == 0)
204 			return (ged);
205 	}
206 
207 	return (NULL);
208 }
209 
/*
 * Extract a bit field from an MCA error code: mask 'code' with 'mask' and
 * shift the result right by 'shift' bits.
 *
 * The narrowing to uint8_t must be applied to the shifted value.  A cast
 * binds more tightly than >>, so casting the masked value first would
 * discard any field that lives above bit 7 before it could be shifted
 * down into range.
 */
static uint8_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	return ((uint8_t)((code & mask) >> shift));
}
215 
/*
 * Extract the named field (TT, LL, RRRR, PP, II or T) from an error code
 * using the matching MCAX86_ERRCODE_<name>_MASK and _SHIFT constants.
 */
#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)

/* Placeholder mnemonics for field values with no defined meaning */
#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"
222 
223 /*
224  * Mappings of TT, LL, RRRR, PP, II and T values to compound error name
225  * mnemonics and to ereport class name components.
226  */
227 
/*
 * One mnemonic table entry: the two spellings of a field value, selected
 * by gcpu_mnemonic() according to the requested namespace.
 */
struct gcpu_mnexp {
	const char *mne_compound;	/* used in expanding compound errname */
	const char *mne_ereport;	/* used in expanding ereport class */
};
232 
/*
 * TT field mnemonics, indexed by the extracted field value.  The last
 * entry has no name: "undefined" in compound names and an empty ereport
 * class component.
 */
static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
	{ GCPU_MNEMONIC_UNDEF, "" }
};
239 
240 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
241 	{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
242 	{ "L1",	FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
243 	{ "L2",	FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
244 	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
245 };
246 
/*
 * RRRR field mnemonics, indexed by the extracted field value.  Only nine
 * values are named; gcpu_mnemonic() reports anything beyond the end of
 * the table as "undefined".
 */
static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
	{ "RD",	FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
};
258 
/*
 * PP field mnemonics, indexed by the extracted field value.  The GEN
 * entry contributes an empty string to compound error names.
 */
static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
};
265 
/*
 * II field mnemonics, indexed by the extracted field value.  Value 1 has
 * no name ("reserved" in compound names, empty ereport class component);
 * the GEN entry contributes an empty string to compound error names.
 */
static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
	{ GCPU_MNEMONIC_RESVD, "" },
	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
};
272 
/* T (timeout) field mnemonics, indexed by the extracted field value. */
static struct gcpu_mnexp gcpu_T_mnemonics[] = {	 /* MCAX86_ERRCODE_T_* */
	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
};
277 
/* Selects which member of a struct gcpu_mnexp gcpu_mnemonic() returns. */
enum gcpu_mn_namespace {
	GCPU_MN_NAMESPACE_COMPOUND,	/* mne_compound */
	GCPU_MN_NAMESPACE_EREPORT	/* mne_ereport */
};
282 
283 static const char *
284 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val,
285     enum gcpu_mn_namespace nspace)
286 {
287 	if (val >= tbl_sz)
288 		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */
289 
290 	switch (nspace) {
291 	case GCPU_MN_NAMESPACE_COMPOUND:
292 		return (tbl[val].mne_compound);
293 		/*NOTREACHED*/
294 
295 	case GCPU_MN_NAMESPACE_EREPORT:
296 		return (tbl[val].mne_ereport);
297 		/*NOTREACHED*/
298 
299 	default:
300 		return (GCPU_MNEMONIC_UNDEF);
301 		/*NOTREACHED*/
302 	}
303 }
304 
/*
 * The ereport class leaf component is either a simple string with no
 * format specifiers, or a string with one or more embedded %n$s specifiers -
 * positional selection for string arguments.  The kernel snprintf does
 * not support %n$ (and teaching it to do so is too big a headache) so
 * we will expand this restricted format string ourselves.
 */

/* Number of positional string arguments (%1$s .. %7$s) available */
#define	GCPU_CLASS_VARCOMPS	7

/*
 * Look up the mnemonic for the named field of 'code' in the matching
 * gcpu_<name>_mnemonics table, for the given namespace.
 */
#define	GCPU_MNEMONIC(code, name, nspace) \
	gcpu_mnemonic(gcpu_##name##_mnemonics, \
	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
	BIT_STRIP(code, name), nspace)
319 
320 static void
321 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
322     enum gcpu_mn_namespace nspace)
323 {
324 	uint16_t code = MCAX86_ERRCODE(status);
325 	const char *mn[GCPU_CLASS_VARCOMPS];
326 	char *p = buf;			/* current position in buf */
327 	char *q = buf + buflen;		/* pointer past last char in buf */
328 	int which, expfmtchar, error;
329 	char c;
330 
331 	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
332 	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
333 	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
334 	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
335 	mn[4] = GCPU_MNEMONIC(code, II, nspace);
336 	mn[5] = GCPU_MNEMONIC(code, T, nspace);
337 	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
338 
339 	while (p < q - 1 && (c = *fmt++) != '\0') {
340 		if (c != '%') {
341 			/* not the beginning of a format specifier - copy */
342 			*p++ = c;
343 			continue;
344 		}
345 
346 		error = 0;
347 		which = -1;
348 		expfmtchar = -1;
349 
350 nextfmt:
351 		if ((c = *fmt++) == '\0')
352 			break;	/* early termination of fmt specifier */
353 
354 		switch (c) {
355 		case '1':
356 		case '2':
357 		case '3':
358 		case '4':
359 		case '5':
360 		case '6':
361 		case '7':
362 			if (which != -1) { /* allow only one positional digit */
363 				error++;
364 				break;
365 			}
366 			which = c - '1';
367 			goto nextfmt;
368 			/*NOTREACHED*/
369 
370 		case '$':
371 			if (which == -1) { /* no position specified */
372 				error++;
373 				break;
374 			}
375 			expfmtchar = 's';
376 			goto nextfmt;
377 			/*NOTREACHED*/
378 
379 		case 's':
380 			if (expfmtchar != 's') {
381 				error++;
382 				break;
383 			}
384 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
385 			    mn[which]);
386 			p += strlen(p);
387 			break;
388 
389 		default:
390 			error++;
391 			break;
392 		}
393 
394 		if (error)
395 			break;
396 	}
397 
398 	*p = '\0';	/* NUL termination */
399 }
400 
401 static void
402 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
403     const char *cpuclass, const char *leafclass)
404 {
405 	char *p = buf;			/* current position in buf */
406 	char *q = buf + buflen;		/* pointer past last char in buf */
407 
408 	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
409 	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
410 
411 	p += strlen(p);
412 	if (p >= q)
413 		return;
414 
415 	if (leafclass == NULL) {
416 		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
417 		    GCPU_MN_NAMESPACE_EREPORT);
418 	} else {
419 		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
420 		    leafclass);
421 	}
422 }
423 
424 /*
425  * Create an "hc" scheme FMRI identifying the given cpu.  We don't know
426  * the actual topology/connectivity of cpus in the system, so we'll
427  * apply /motherboard=0/chip=.../cpu=... in all cases.
428  */
429 static nvlist_t *
430 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
431 {
432 	nvlist_t *nvl;
433 
434 	if ((nvl = fm_nvlist_create(nva)) == NULL)
435 		return (NULL);
436 
437 	fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 3,
438 	    "motherboard", 0,
439 	    "chip", cmi_hdl_chipid(hdl),
440 	    "cpu", cmi_hdl_coreid(hdl));
441 
442 	return (nvl);
443 }
444 
/*
 * Console throttling for gcpu_bleat(): this many complaints may be
 * issued back to back, after which a further complaint is suppressed
 * unless gcpu_bleat_min_interval has passed since the last one that was
 * printed.  The interval is compared against gethrtime_waitfree() deltas.
 */
int gcpu_bleat_count_thresh = 5;
hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
447 
448 /*
449  * Called when we are unable to propogate a logout structure onto an
450  * errorq for subsequent ereport preparation and logging etc.  The caller
451  * should usually only decide to call this for severe errors - those we
452  * suspect we may need to panic for.
453  */
454 static void
455 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
456 {
457 	hrtime_t now  = gethrtime_waitfree();
458 	static hrtime_t gcpu_last_bleat;
459 	gcpu_bank_logout_t *gbl;
460 	static int bleatcount;
461 	int i;
462 
463 	/*
464 	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
465 	 * can come as fast as we like, but once we've spammed that many
466 	 * to the console we require a minimum interval to pass before
467 	 * any more complaints.
468 	 */
469 	if (++bleatcount > gcpu_bleat_count_thresh) {
470 		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
471 			return;
472 		else
473 			bleatcount = 0;
474 	}
475 	gcpu_last_bleat = now;
476 
477 	cmn_err(CE_WARN, "Machine-Check Errors unlogged on chip %d core %d, "
478 	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl));
479 	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
480 	    (u_longlong_t)gcl->gcl_mcg_status);
481 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
482 		uint64_t status = gbl->gbl_status;
483 
484 		if (!(status & MSR_MC_STATUS_VAL))
485 			continue;
486 
487 		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
488 		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
489 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
490 			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
491 			    i, IA32_MSR_MC(i, STATUS),
492 			    (u_longlong_t)status,
493 			    (u_longlong_t)gbl->gbl_addr,
494 			    (u_longlong_t)gbl->gbl_misc);
495 			break;
496 
497 		case MSR_MC_STATUS_ADDRV:
498 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
499 			    "STAT 0x%016llx ADDR 0x%016llx",
500 			    i, IA32_MSR_MC(i, STATUS),
501 			    (u_longlong_t)status,
502 			    (u_longlong_t)gbl->gbl_addr);
503 			break;
504 
505 		case MSR_MC_STATUS_MISCV:
506 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
507 			    "STAT 0x%016llx MISC 0x%016llx",
508 			    i, IA32_MSR_MC(i, STATUS),
509 			    (u_longlong_t)status,
510 			    (u_longlong_t)gbl->gbl_misc);
511 			break;
512 
513 		default:
514 			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
515 			    "STAT 0x%016llx",
516 			    i, IA32_MSR_MC(i, STATUS),
517 			    (u_longlong_t)status);
518 			break;
519 
520 		}
521 	}
522 }
523 
/*
 * Expands to the three fm_payload_set() arguments (name, type, value)
 * describing whether MCi_STATUS bit 'what' is set in 'status'.
 */
#define	_GCPU_BSTATUS(status, what) \
	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
527 
528 static void
529 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
530     uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
531 {
532 	uint64_t members = ged ? ged->ged_ereport_members :
533 	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
534 	uint64_t mcg = gcl->gcl_mcg_status;
535 	int mcip = mcg & MCG_STATUS_MCIP;
536 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
537 	uint64_t bstat = gbl->gbl_status;
538 
539 	/*
540 	 * Include the compound error name if requested and if this
541 	 * is a compound error type.
542 	 */
543 	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
544 	    ged->ged_compound_fmt != NULL) {
545 		char buf[FM_MAX_CLASS];
546 
547 		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
548 		    GCPU_MN_NAMESPACE_COMPOUND);
549 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
550 		    DATA_TYPE_STRING, buf, NULL);
551 	}
552 
553 	/*
554 	 * Include disposition information for this error
555 	 */
556 	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
557 	    gbl->gbl_disp != 0) {
558 		int i, empty = 1;
559 		char buf[128];
560 		char *p = buf, *q = buf + 128;
561 		static struct _gcpu_disp_name {
562 			uint64_t dv;
563 			const char *dn;
564 		} disp_names[] = {
565 			{ CMI_ERRDISP_CURCTXBAD,
566 			    "processor_context_corrupt" },
567 			{ CMI_ERRDISP_RIPV_INVALID,
568 			    "return_ip_invalid" },
569 			{ CMI_ERRDISP_UC_UNCONSTRAINED,
570 			    "unconstrained" },
571 			{ CMI_ERRDISP_FORCEFATAL,
572 			    "forcefatal" },
573 			{ CMI_ERRDISP_IGNORED,
574 			    "ignored" },
575 			{ CMI_ERRDISP_PCC_CLEARED,
576 			    "corrupt_context_cleared" },
577 			{ CMI_ERRDISP_UC_CLEARED,
578 			    "uncorrected_data_cleared" },
579 			{ CMI_ERRDISP_POISONED,
580 			    "poisoned" },
581 			{ CMI_ERRDISP_INCONSISTENT,
582 			    "telemetry_unstable" },
583 		};
584 
585 		for (i = 0; i < sizeof (disp_names) /
586 		    sizeof (struct _gcpu_disp_name); i++) {
587 			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
588 				continue;
589 
590 			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
591 			    "%s%s", empty ? "" : ",", disp_names[i].dn);
592 			p += strlen(p);
593 			empty = 0;
594 		}
595 
596 		if (p != buf)
597 			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
598 			    DATA_TYPE_STRING, buf, NULL);
599 	}
600 
601 	/*
602 	 * If MCG_STATUS is included add that and an indication of whether
603 	 * this ereport was the result of a machine check or poll.
604 	 */
605 	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
606 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
607 		    DATA_TYPE_UINT64, mcg, NULL);
608 
609 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
610 		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
611 	}
612 
613 	/*
614 	 * If an instruction pointer is to be included add one provided
615 	 * MCG_STATUS indicated it is valid; meaningless for polled events.
616 	 */
617 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
618 	    mcg & MCG_STATUS_EIPV) {
619 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
620 		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
621 	}
622 
623 	/*
624 	 * Add an indication of whether the trap occured during privileged code.
625 	 */
626 	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
627 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
628 		    DATA_TYPE_BOOLEAN_VALUE,
629 		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
630 	}
631 
632 	/*
633 	 * If requested, add the index of the MCA bank.  This indicates the
634 	 * n'th bank of 4 MCA registers, and does not necessarily correspond
635 	 * to MCi_* - use the bank offset to correlate
636 	 */
637 	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
638 		fm_payload_set(ereport,
639 		    /* Bank number */
640 		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
641 		    /* Offset of MCi_CTL */
642 		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
643 		    IA32_MSR_MC(bankno, CTL),
644 		    NULL);
645 	}
646 
647 	/*
648 	 * Add MCi_STATUS if requested, and decode it.
649 	 */
650 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
651 		const char *tbes[] = {
652 			"No tracking",			/* 00 */
653 			"Green - below threshold",	/* 01 */
654 			"Yellow - above threshold",	/* 10 */
655 			"Reserved"			/* 11 */
656 		};
657 
658 		fm_payload_set(ereport,
659 		    /* Bank MCi_STATUS */
660 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
661 		    /* Overflow? */
662 		    _GCPU_BSTATUS(bstat, OVER),
663 		    /* Uncorrected? */
664 		    _GCPU_BSTATUS(bstat, UC),
665 		    /* Enabled? */
666 		    _GCPU_BSTATUS(bstat, EN),
667 		    /* Processor context corrupt? */
668 		    _GCPU_BSTATUS(bstat, PCC),
669 		    /* Error code */
670 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
671 		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
672 		    /* Model-specific error code */
673 		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
674 		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
675 		    NULL);
676 
677 		/*
678 		 * If MCG_CAP.TES_P indicates that that thresholding info
679 		 * is present in the architural component of the bank status
680 		 * then include threshold information for this bank.
681 		 */
682 		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
683 			fm_payload_set(ereport,
684 			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
685 			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
686 			    NULL);
687 		}
688 	}
689 
690 	/*
691 	 * MCi_ADDR info if requested and valid.
692 	 */
693 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
694 	    bstat & MSR_MC_STATUS_ADDRV) {
695 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
696 		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
697 	}
698 
699 	/*
700 	 * MCi_MISC if requested and MCi_STATUS.MISCV).
701 	 */
702 	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
703 	    bstat & MSR_MC_STATUS_MISCV) {
704 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
705 		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
706 	}
707 
708 }
709 
710 /*
711  * Construct and post an ereport based on the logout information from a
712  * single MCA bank.  We are not necessarily running on the cpu that
713  * detected the error.
714  */
715 static void
716 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
717     const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
718 {
719 	gcpu_data_t *gcpu = gcl->gcl_gcpu;
720 	cmi_hdl_t hdl = gcpu->gcpu_hdl;
721 	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
722 	const char *cpuclass = NULL, *leafclass = NULL;
723 	uint16_t code = MCAX86_ERRCODE(status);
724 	errorq_elem_t *eqep, *scr_eqep;
725 	nvlist_t *ereport, *detector;
726 	char buf[FM_MAX_CLASS];
727 	const char *classfmt;
728 	nv_alloc_t *nva;
729 
730 	if (panicstr) {
731 		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
732 			return;
733 		ereport = errorq_elem_nvl(ereport_errorq, eqep);
734 
735 		/*
736 		 * Allocate another element for scratch space, but fallback
737 		 * to the one we have if that fails.  We'd like to use the
738 		 * additional scratch space for nvlist construction.
739 		 */
740 		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
741 			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
742 		else
743 			nva = errorq_elem_nva(ereport_errorq, eqep);
744 	} else {
745 		ereport = fm_nvlist_create(NULL);
746 		nva = NULL;
747 	}
748 
749 	if (ereport == NULL)
750 		return;
751 
752 	/*
753 	 * Common payload data required by the protocol:
754 	 *	- ereport class
755 	 *	- detector
756 	 *	- ENA
757 	 */
758 
759 	/*
760 	 * Ereport class - call into model-specific support to allow it to
761 	 * provide a cpu class or leaf class, otherwise calculate our own.
762 	 */
763 	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
764 	classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
765 	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
766 	    leafclass);
767 
768 	/*
769 	 * The detector FMRI.
770 	 */
771 	if ((detector = cms_ereport_detector(hdl, mscookie, nva)) == NULL)
772 		detector = gcpu_fmri_create(hdl, nva);
773 
774 	/*
775 	 * Should we define a new ENA format 3?? for chip/core/strand?
776 	 * It will be better when virtualized.
777 	 */
778 	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
779 	    fm_ena_generate_cpu(gcl->gcl_timestamp,
780 	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
781 	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
782 
783 	if (panicstr) {
784 		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
785 		nv_alloc_reset(nva);
786 	} else {
787 		fm_nvlist_destroy(detector, FM_NVA_FREE);
788 	}
789 
790 	/*
791 	 * Add the architectural ereport class-specific payload data.
792 	 */
793 	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
794 
795 	/*
796 	 * Allow model-specific code to add ereport members.
797 	 */
798 	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
799 	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
800 
801 	/*
802 	 * Include stack if options is turned on and either selected in
803 	 * the payload member bitmask or inclusion is forced.
804 	 */
805 	if (gcpu_mca_stack_flag &&
806 	    (cms_ereport_includestack(hdl, mscookie) ==
807 	    B_TRUE || gcpu_mca_stack_ereport_include)) {
808 		fm_payload_stack_add(ereport, gcl->gcl_stack,
809 		    gcl->gcl_stackdepth);
810 	}
811 
812 	/*
813 	 * Post ereport.
814 	 */
815 	if (panicstr) {
816 		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
817 		if (scr_eqep)
818 			errorq_cancel(ereport_errorq, scr_eqep);
819 	} else {
820 		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
821 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
822 	}
823 
824 }
825 
826 /*ARGSUSED*/
827 void
828 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
829 {
830 	const gcpu_logout_t *gcl = data;
831 	const gcpu_bank_logout_t *gbl;
832 	int i;
833 
834 	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
835 		const gcpu_error_disp_t *gened;
836 		cms_cookie_t mscookie;
837 
838 		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
839 		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
840 			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
841 
842 			/*
843 			 * Perform a match based on IA32 MCA architectural
844 			 * components alone.
845 			 */
846 			gened = gcpu_disp_match(code); /* may be NULL */
847 
848 			/*
849 			 * Now see if an model-specific match can be made.
850 			 */
851 			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i,
852 			    gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
853 			    gcl->gcl_ms_logout);
854 
855 			/*
856 			 * Prepare and dispatch an ereport for logging and
857 			 * diagnosis.
858 			 */
859 			gcpu_ereport_post(gcl, i, gened, mscookie,
860 			    gbl->gbl_status);
861 		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
862 		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
863 			/*
864 			 * Telemetry kept changing as we tried to read
865 			 * it.  Force an unknown ereport leafclass but
866 			 * keep the telemetry unchanged for logging.
867 			 */
868 			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
869 			    gbl->gbl_status);
870 		}
871 	}
872 }
873 
/*
 * Element data size gcpu_mca_queue was created with; 0 while there is no
 * queue.  Updated by gcpu_errorq_init() under gcpu_mca_queue_lock.
 */
static size_t gcpu_mca_queue_datasz = 0;
875 
876 /*
877  * The following code is ready to make a weak attempt at growing the
878  * errorq structure size.  Since it is not foolproof (we don't know
879  * who may already be producing to the outgoing errorq) our caller
880  * instead assures that we'll always be called with no greater data
881  * size than on our first call.
882  */
883 static void
884 gcpu_errorq_init(size_t datasz)
885 {
886 	int slots;
887 
888 	mutex_enter(&gcpu_mca_queue_lock);
889 
890 	if (gcpu_mca_queue_datasz >= datasz) {
891 		mutex_exit(&gcpu_mca_queue_lock);
892 		return;
893 	}
894 
895 	membar_producer();
896 	if (gcpu_mca_queue) {
897 		gcpu_mca_queue_datasz = 0;
898 		errorq_destroy(gcpu_mca_queue);
899 	}
900 
901 	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
902 	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
903 
904 	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
905 	    NULL, slots, datasz, 1, ERRORQ_VITAL);
906 
907 	if (gcpu_mca_queue != NULL)
908 		gcpu_mca_queue_datasz = datasz;
909 
910 	mutex_exit(&gcpu_mca_queue_lock);
911 }
912 
913 /*
914  * Perform MCA initialization as described in section 14.6 of Intel 64
915  * and IA-32 Architectures Software Developer's Manual Volume 3A.
916  */
917 
/*
 * Bank count of the first cpu to pass through gcpu_mca_init() with a
 * nonzero count.  A later cpu reporting more banks than this bails out
 * of gcpu_mca_init() rather than force the errorq to be resized.
 */
static uint_t global_nbanks;
919 
920 void
921 gcpu_mca_init(cmi_hdl_t hdl)
922 {
923 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
924 	uint64_t cap;
925 	uint_t vendor = cmi_hdl_vendor(hdl);
926 	uint_t family = cmi_hdl_family(hdl);
927 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
928 	int mcg_ctl_present;
929 	uint_t nbanks;
930 	size_t mslsz;
931 	int i;
932 
933 	if (gcpu == NULL)
934 		return;
935 
936 	/*
937 	 * Protect from some silly /etc/system settings.
938 	 */
939 	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
940 		gcpu_mca_telemetry_retries = 5;
941 
942 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
943 		return;
944 
945 	/*
946 	 * CPU startup code only calls cmi_mca_init if x86_feature indicates
947 	 * both MCA and MCE support (i.e., X86_MCA).  P5, K6, and earlier
	 * processors, which have their own more primitive way of doing
949 	 * machine checks, will not have cmi_mca_init called since their
950 	 * CPUID information will not indicate both MCA and MCE features.
951 	 */
952 #ifndef	__xpv
953 	ASSERT(x86_feature & X86_MCA);
954 #endif /* __xpv */
955 
956 	/*
957 	 * Determine whether the IA32_MCG_CTL register is present.  If it
958 	 * is we will enable all features by writing -1 to it towards
959 	 * the end of this initialization;  if it is absent then volume 3A
960 	 * says we must nonetheless continue to initialize the individual
961 	 * banks.
962 	 */
963 	mcg_ctl_present = cap & MCG_CAP_CTL_P;
964 
965 	/*
	 * We squirrel values away for inspection/debugging.
967 	 */
968 	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
969 	if (mcg_ctl_present)
970 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
971 		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
972 
973 	/*
974 	 * Determine the number of error-reporting banks implemented.
975 	 */
976 	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
977 
978 	if (nbanks != 0 && global_nbanks == 0)
979 		global_nbanks = nbanks;	/* no race - BSP will get here first */
980 
981 	/*
982 	 * If someone is hiding the number of banks (perhaps we are fully
983 	 * virtualized?) or if this processor has more banks than the
984 	 * first to set global_nbanks then bail.  The latter requirement
985 	 * is because we need to size our errorq data structure and we
986 	 * don't want to have to grow the errorq (destroy and recreate)
987 	 * which may just lose some telemetry.
988 	 */
989 	if (nbanks == 0 || nbanks > global_nbanks)
990 		return;
991 
992 	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
993 	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
994 
995 	/*
996 	 * Calculate the size we need to allocate for a gcpu_logout_t
997 	 * with a gcl_data array big enough for all banks of this cpu.
998 	 * Add any space requested by the model-specific logout support.
999 	 */
1000 	mslsz = cms_logout_size(hdl);
1001 	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
1002 	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
1003 
1004 	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
1005 		gcpu_logout_t *gcl;
1006 
1007 		mca->gcpu_mca_logout[i] = gcl =
1008 		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
1009 		gcl->gcl_gcpu = gcpu;
1010 		gcl->gcl_nbanks = nbanks;
1011 		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
1012 		    (char *)(&gcl->gcl_data[0]) + nbanks *
1013 		    sizeof (gcpu_bank_logout_t);
1014 
1015 	}
1016 	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
1017 
1018 	/*
1019 	 * Create our errorq to transport the logout structures.  This
1020 	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
1021 	 */
1022 	gcpu_errorq_init(mca->gcpu_mca_lgsz);
1023 
1024 	/*
1025 	 * Not knowing which, if any, banks are shared between cores we
1026 	 * assure serialization of MCA bank initialization by each cpu
1027 	 * on the chip.  On chip architectures in which some banks are
1028 	 * shared this will mean the shared resource is initialized more
1029 	 * than once - we're simply aiming to avoid simultaneous MSR writes
1030 	 * to the shared resource.
1031 	 *
1032 	 * Even with these precautions, some platforms may yield a GP fault
1033 	 * if a core other than a designated master tries to write anything
1034 	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
1035 	 * those writes under on_trap protection.
1036 	 */
1037 	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1038 
1039 	/*
1040 	 * Initialize poller data, but don't start polling yet.
1041 	 */
1042 	gcpu_mca_poll_init(hdl);
1043 
1044 	/*
1045 	 * Work out which MCA banks we will initialize.  In MCA logout
1046 	 * code we will only read those banks which we initialize here.
1047 	 */
1048 	for (i = 0; i < nbanks; i++) {
1049 		/*
1050 		 * On Intel family 6 and AMD family 6 we must not enable
1051 		 * machine check from bank 0 detectors.  In the Intel
1052 		 * case bank 0 is reserved for the platform, while in the
1053 		 * AMD case reports are that enabling bank 0 (DC) produces
1054 		 * spurious machine checks.
1055 		 */
1056 		if (i == 0 && ((vendor == X86_VENDOR_Intel ||
1057 		    vendor == X86_VENDOR_AMD) && family == 6))
1058 			continue;
1059 
1060 		if (cms_bankctl_skipinit(hdl, i))
1061 			continue;
1062 
1063 		/*
1064 		 * Record which MCA banks were enabled, both from the
1065 		 * point of view of this core and accumulating for the
1066 		 * whole chip (if some cores share a bank we must be
1067 		 * sure either can logout from it).
1068 		 */
1069 		mca->gcpu_actv_banks |= 1 << i;
1070 		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
1071 	}
1072 
1073 	/*
1074 	 * Log any valid telemetry lurking in the MCA banks, but do not
1075 	 * clear the status registers.  Ignore the disposition returned -
1076 	 * we have already panicked or reset for any nasty errors found here.
1077 	 */
1078 	gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE);
1079 
1080 	/*
1081 	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
1082 	 * model-specific module the power of veto.
1083 	 */
1084 	for (i = 0; i < nbanks; i++) {
1085 		struct gcpu_bios_bankcfg *bcfgp =
1086 		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;
1087 
1088 		/*
1089 		 * Stash inherited bank MCA state, even for banks we will
1090 		 * not initialize ourselves.  Do not read the MISC register
1091 		 * unconditionally - on some processors that will #GP on
1092 		 * banks that do not implement the MISC register (would be
1093 		 * caught by on_trap, anyway).
1094 		 */
1095 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
1096 		    &bcfgp->bios_bank_ctl);
1097 
1098 		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1099 		    &bcfgp->bios_bank_status);
1100 
1101 		if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV)
1102 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
1103 			    &bcfgp->bios_bank_addr);
1104 
1105 		if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
1106 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
1107 			    &bcfgp->bios_bank_misc);
1108 
1109 		if (!(mca->gcpu_actv_banks & 1 << i))
1110 			continue;
1111 
1112 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
1113 		    cms_bankctl_val(hdl, i, -1ULL));
1114 
1115 		if (!cms_bankstatus_skipinit(hdl, i)) {
1116 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
1117 			    cms_bankstatus_val(hdl, i, 0ULL));
1118 		}
1119 	}
1120 
1121 	/*
1122 	 * Now let the model-specific support perform further initialization
1123 	 * of non-architectural features.
1124 	 */
1125 	cms_mca_init(hdl, nbanks);
1126 
1127 	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
1128 	membar_producer();
1129 
1130 	/* enable all machine-check features */
1131 	if (mcg_ctl_present)
1132 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
1133 		    cms_mcgctl_val(hdl, nbanks, -1ULL));
1134 
1135 	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1136 
1137 	/* enable machine-check exception in CR4 */
1138 	cmi_hdl_enable_mce(hdl);
1139 }
1140 
1141 static uint64_t
1142 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1143     gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1144 {
1145 	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1146 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1147 	int nbanks = mca->gcpu_mca_nbanks;
1148 	gcpu_mce_status_t mce;
1149 	gcpu_bank_logout_t *gbl;
1150 	uint64_t disp = 0;
1151 	int i;
1152 
1153 	if (mcesp == NULL)
1154 		mcesp = &mce;
1155 
1156 	mcesp->mce_nerr = nerr;
1157 
1158 	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1159 	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1160 	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1161 
1162 	/*
1163 	 * If this a machine check then if the return instruction pointer
1164 	 * is not valid the current context is lost.
1165 	 */
1166 	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1167 		disp |= CMI_ERRDISP_RIPV_INVALID;
1168 
1169 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1170 		uint64_t mcistatus = gbl->gbl_status;
1171 		uint32_t ms_scope;
1172 		int pcc, uc;
1173 		int poisoned;
1174 
1175 		if (!(mcistatus & MSR_MC_STATUS_VAL))
1176 			continue;
1177 
1178 		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1179 			continue;
1180 
1181 		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1182 		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1183 		mcesp->mce_npcc += pcc;
1184 		mcesp->mce_nuc += uc;
1185 
1186 		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1187 		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1188 
1189 		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1190 			pcc = 0;
1191 			mcesp->mce_npcc_ok++;
1192 			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1193 		}
1194 
1195 		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1196 			uc = 0;
1197 			mcesp->mce_nuc_ok++;
1198 			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1199 		}
1200 
1201 		if (uc) {
1202 			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1203 			if (poisoned) {
1204 				mcesp->mce_nuc_poisoned++;
1205 				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1206 			}
1207 		}
1208 
1209 		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1210 			/*
1211 			 * We're not being instructed to ignore the error,
1212 			 * so apply our standard disposition logic to it.
1213 			 */
1214 			if (uc && !poisoned) {
1215 				unconstrained++;
1216 				gbl->gbl_disp |= disp |
1217 				    CMI_ERRDISP_UC_UNCONSTRAINED;
1218 			}
1219 
1220 			if (pcc && ismc) {
1221 				curctxbad++;
1222 				gbl->gbl_disp |= disp |
1223 				    CMI_ERRDISP_CURCTXBAD;
1224 			}
1225 
1226 			/*
1227 			 * Even if the above may not indicate that the error
1228 			 * is terminal, model-specific support may insist
1229 			 * that we treat it as such.  Such errors wil be
1230 			 * fatal even if discovered via poll.
1231 			 */
1232 			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1233 				forcefatal++;
1234 				mcesp->mce_forcefatal++;
1235 				gbl->gbl_disp |= disp |
1236 				    CMI_ERRDISP_FORCEFATAL;
1237 			}
1238 		} else {
1239 			mcesp->mce_ignored++;
1240 			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1241 		}
1242 	}
1243 
1244 	if (unconstrained > 0)
1245 		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1246 
1247 	if (curctxbad > 0)
1248 		disp |= CMI_ERRDISP_CURCTXBAD;
1249 
1250 	if (forcefatal > 0)
1251 		disp |= CMI_ERRDISP_FORCEFATAL;
1252 
1253 	if (gcpu_mca_queue != NULL) {
1254 		int how;
1255 
1256 		if (ismc) {
1257 			how = cmi_mce_response(rp, disp) ?
1258 			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
1259 			    ERRORQ_SYNC;	/* panic flow will drain */
1260 		} else {
1261 			how = (disp & CMI_ERRDISP_FORCEFATAL &&
1262 			    cmi_panic_on_ue()) ?
1263 			    ERRORQ_SYNC :	/* poller will panic */
1264 			    ERRORQ_ASYNC;	/* no panic */
1265 		}
1266 
1267 		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1268 	} else if (disp != 0) {
1269 		gcpu_bleat(hdl, gcl);
1270 	}
1271 
1272 	mcesp->mce_disp = disp;
1273 
1274 	return (disp);
1275 }
1276 
1277 /*
1278  * Gather error telemetry from our source, and then submit it for
1279  * processing.
1280  */
1281 
/*
 * A bank status is a machine-check candidate when the error is flagged as
 * ENabled and at least one of the UC or PCC bits is set.
 */
#define	IS_MCE_CANDIDATE(status) \
	((((status) & MSR_MC_STATUS_EN) && \
	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC))) ? 1 : 0)

/*
 * Two status values are equivalent if they differ in nothing other than
 * the OVerflow bit.
 */
#define	STATUS_EQV(s1, s2) \
	((((s1) ^ (s2)) & ~MSR_MC_STATUS_OVER) == 0)

/*
 * Number of times a polled logout recorded an initial deferral of the
 * MCi_STATUS clear for a machine-check candidate.
 */
static uint32_t gcpu_deferrred_polled_clears;
1289 
1290 void
1291 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
1292     gcpu_mce_status_t *mcesp, boolean_t clrstatus)
1293 {
1294 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1295 	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1296 	int nbanks = mca->gcpu_mca_nbanks;
1297 	gcpu_bank_logout_t *gbl, *pgbl;
1298 	gcpu_logout_t *gcl, *pgcl;
1299 	int ismc = (rp != NULL);
1300 	int ispoll = !ismc;
1301 	int i, nerr = 0;
1302 	cmi_errno_t err;
1303 	uint64_t mcg_status;
1304 	uint64_t disp;
1305 	uint64_t cap;
1306 
1307 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1308 	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
1309 	    CMI_SUCCESS) {
1310 		if (mcesp != NULL)
1311 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1312 		return;
1313 	}
1314 
1315 	if (ismc) {
1316 		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
1317 	} else {
1318 		int pidx = mca->gcpu_mca_nextpoll_idx;
1319 		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
1320 		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
1321 
1322 		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
1323 		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
1324 		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
1325 	}
1326 
1327 	gcl->gcl_timestamp = gethrtime_waitfree();
1328 	gcl->gcl_mcg_status = mcg_status;
1329 	gcl->gcl_ip = rp ? rp->r_pc : 0;
1330 
1331 	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
1332 	if (cap & MCG_CAP_TES_P)
1333 		gcl->gcl_flags |= GCPU_GCL_F_TES_P;
1334 
1335 	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1336 		uint64_t status, status2, addr, misc;
1337 		int retries = gcpu_mca_telemetry_retries;
1338 
1339 		gbl->gbl_status = 0;
1340 		gbl->gbl_disp = 0;
1341 		gbl->gbl_clrdefcnt = 0;
1342 
1343 		/*
1344 		 * Only logout from MCA banks we have initialized from at
1345 		 * least one core.  If a core shares an MCA bank with another
1346 		 * but perhaps lost the race to initialize it, then it must
1347 		 * still be allowed to logout from the shared bank.
1348 		 */
1349 		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
1350 			continue;
1351 
1352 		/*
1353 		 * On a poll look only at the banks we've been asked to check.
1354 		 */
1355 		if (rp == NULL && !(bankmask & 1 << i))
1356 			continue;
1357 
1358 
1359 		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
1360 		    CMI_SUCCESS)
1361 			continue;
1362 retry:
1363 		if (!(status & MSR_MC_STATUS_VAL))
1364 			continue;
1365 
1366 		addr = -1;
1367 		misc = 0;
1368 
1369 		if (status & MSR_MC_STATUS_ADDRV)
1370 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
1371 
1372 		if (status & MSR_MC_STATUS_MISCV)
1373 			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
1374 
1375 		/*
1376 		 * Allow the model-specific code to extract bank telemetry.
1377 		 */
1378 		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
1379 
1380 		/*
1381 		 * Not all cpu models assure us that the status/address/misc
1382 		 * data will not change during the above sequence of MSR reads,
1383 		 * or that it can only change by the addition of the OVerflow
1384 		 * bit to the status register.  If the status has changed
1385 		 * other than in the overflow bit then we attempt to reread
1386 		 * for a consistent snapshot, but eventually give up and
1387 		 * go with what we've got.  We only perform this check
1388 		 * for a poll - a further #MC during a #MC will reset, and
1389 		 * polled errors should not overwrite higher-priority
1390 		 * trapping errors (but could set the overflow bit).
1391 		 */
1392 		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1393 		    &status2)) == CMI_SUCCESS) {
1394 			if (!STATUS_EQV(status, status2)) {
1395 				if (retries-- > 0) {
1396 					status = status2;
1397 					goto retry;
1398 				} else {
1399 					gbl->gbl_disp |=
1400 					    CMI_ERRDISP_INCONSISTENT;
1401 				}
1402 			}
1403 		} else if (ispoll && err != CMI_SUCCESS) {
1404 			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
1405 		}
1406 
1407 		nerr++;
1408 		gbl->gbl_status = status;
1409 		gbl->gbl_addr = addr;
1410 		gbl->gbl_misc = misc;
1411 
1412 		if (clrstatus == B_FALSE)
1413 			goto serialize;
1414 
1415 		/*
1416 		 * For machine checks we always clear status here.  For polls
1417 		 * we must be a little more cautious since there is an
1418 		 * outside chance that we may clear telemetry from a shared
1419 		 * MCA bank on which a sibling core is machine checking.
1420 		 *
1421 		 * For polled observations of errors that look like they may
1422 		 * produce a machine check (UC/PCC and ENabled, although these
1423 		 * do not guarantee a machine check on error occurence)
1424 		 * we will not clear the status at this wakeup unless
1425 		 * we saw the same status at the previous poll.  We will
1426 		 * always process and log the current observations - it
1427 		 * is only the clearing of MCi_STATUS which may be
1428 		 * deferred until the next wakeup.
1429 		 */
1430 		if (ismc || !IS_MCE_CANDIDATE(status)) {
1431 			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1432 			goto serialize;
1433 		}
1434 
1435 		/*
1436 		 * We have a polled observation of a machine check
1437 		 * candidate.  If we saw essentially the same status at the
1438 		 * last poll then clear the status now since this appears
1439 		 * not to be a #MC candidate after all.  If we see quite
1440 		 * different status now then do not clear, but reconsider at
1441 		 * the next poll.  In no actual machine check clears
1442 		 * the status in the interim then the status should not
1443 		 * keep changing forever (meaning we'd never clear it)
1444 		 * since before long we'll simply have latched the highest-
1445 		 * priority error and set the OVerflow bit.  Nonetheless
1446 		 * we count how many times we defer clearing and after
1447 		 * a while insist on clearing the status.
1448 		 */
1449 		pgbl = &pgcl->gcl_data[i];
1450 		if (pgbl->gbl_clrdefcnt != 0) {
1451 			/* We deferred clear on this bank at last wakeup */
1452 			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1453 			    pgbl->gbl_clrdefcnt > 5) {
1454 				/*
1455 				 * Status is unchanged so clear it now and,
1456 				 * since we have already logged this info,
1457 				 * avoid logging it again.
1458 				 */
1459 				gbl->gbl_status = 0;
1460 				nerr--;
1461 				(void) cmi_hdl_wrmsr(hdl,
1462 				    IA32_MSR_MC(i, STATUS), 0ULL);
1463 			} else {
1464 				/* Record deferral for next wakeup */
1465 				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1466 			}
1467 		} else {
1468 			/* Record initial deferral for next wakeup */
1469 			gbl->gbl_clrdefcnt = 1;
1470 			gcpu_deferrred_polled_clears++;
1471 		}
1472 
1473 serialize:
1474 		/*
1475 		 * Intel Vol 3A says to execute a serializing instruction
1476 		 * here, ie CPUID.  Well WRMSR is also defined to be
1477 		 * serializing, so the status clear above should suffice.
1478 		 * To be a good citizen, and since some clears are deferred,
1479 		 * we'll execute a CPUID instruction here.
1480 		 */
1481 		{
1482 			struct cpuid_regs tmp;
1483 			(void) __cpuid_insn(&tmp);
1484 		}
1485 	}
1486 
1487 	if (gcpu_mca_stack_flag)
1488 		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
1489 	else
1490 		gcl->gcl_stackdepth = 0;
1491 
1492 	/*
1493 	 * Decide our disposition for this error or errors, and submit for
1494 	 * logging and subsequent diagnosis.
1495 	 */
1496 	if (nerr != 0) {
1497 		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
1498 	} else {
1499 		disp = 0;
1500 		if (mcesp) {
1501 			mcesp->mce_nerr = mcesp->mce_disp = 0;
1502 		}
1503 	}
1504 
1505 	/*
1506 	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
1507 	 * If a second #MC had occured before now the system would have
1508 	 * reset.  We can only do thise once gcpu_mca_process has copied
1509 	 * the logout structure.
1510 	 */
1511 	if (ismc && mcg_status & MCG_STATUS_MCIP)
1512 		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
1513 
1514 	/*
1515 	 * At this point we have read and logged all telemetry that is visible
1516 	 * under the MCA.  On architectures for which the NorthBridge is
1517 	 * on-chip this may include NB-observed errors, but where the NB
1518 	 * is off chip it may have been the source of the #MC request and
1519 	 * so we must call into the memory-controller driver to give it
1520 	 * a chance to log errors.
1521 	 */
1522 	if (ismc) {
1523 		int willpanic = (cmi_mce_response(rp, disp) == 0);
1524 		cmi_mc_logout(hdl, 1, willpanic);
1525 	}
1526 }
1527 
1528 int gcpu_mca_trap_vomit_summary = 0;
1529 
1530 /*
1531  * On a native machine check exception we come here from mcetrap via
1532  * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
1533  * cpus of the chip, so it is possible that another cpu on this chip could
1534  * initiate a poll while we're in the #mc handler;  it is also possible that
1535  * this trap has occured during a poll on this cpu.  So we must acquire
1536  * the chip-wide poll lock, but be careful to avoid deadlock.
1537  *
1538  * The 'data' pointer cannot be NULL due to init order.
1539  */
1540 uint64_t
1541 gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
1542 {
1543 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1544 	kmutex_t *poll_lock = NULL;
1545 	gcpu_mce_status_t mce;
1546 	uint64_t mcg_status;
1547 	int tooklock = 0;
1548 
1549 	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1550 	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
1551 		return (0);
1552 
1553 	/*
1554 	 * Synchronize with any poller from another core that may happen
1555 	 * to share access to one or more of the MCA banks.
1556 	 */
1557 	if (gcpu->gcpu_shared != NULL)
1558 		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
1559 
1560 	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
1561 		/*
1562 		 * The lock is not owned by the thread we have
1563 		 * interrupted.  Spin for this adaptive lock.
1564 		 */
1565 		while (!mutex_tryenter(poll_lock)) {
1566 			while (mutex_owner(poll_lock) != NULL)
1567 				;
1568 		}
1569 		tooklock = 1;
1570 	}
1571 
1572 	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE);
1573 
1574 	if (tooklock)
1575 		mutex_exit(poll_lock);
1576 
1577 	/*
1578 	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
1579 	 */
1580 	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
1581 		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
1582 		    "%u PCC (%u ok), "
1583 		    "%u UC (%d ok, %u poisoned), "
1584 		    "%u forcefatal, %u ignored",
1585 		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
1586 		    mce.mce_npcc, mce.mce_npcc_ok,
1587 		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
1588 		    mce.mce_forcefatal, mce.mce_ignored);
1589 	}
1590 
1591 	return (mce.mce_disp);
1592 }
1593 
1594 /*ARGSUSED*/
1595 void
1596 gcpu_faulted_enter(cmi_hdl_t hdl)
1597 {
1598 	/* Nothing to do here */
1599 }
1600 
1601 /*ARGSUSED*/
1602 void
1603 gcpu_faulted_exit(cmi_hdl_t hdl)
1604 {
1605 	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1606 
1607 	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
1608 }
1609 
1610 /*
1611  * Write the requested values to the indicated MSRs.  Having no knowledge
1612  * of the model-specific requirements for writing to these model-specific
1613  * registers, we will only blindly write to those MSRs if the 'force'
1614  * argument is nonzero.  That option should only be used in prototyping
1615  * and debugging.
1616  */
1617 /*ARGSUSED*/
1618 cmi_errno_t
1619 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
1620     int force)
1621 {
1622 	int i, errs = 0;
1623 
1624 	for (i = 0; i < nregs; i++) {
1625 		uint_t msr = regs[i].cmr_msrnum;
1626 		uint64_t val = regs[i].cmr_msrval;
1627 
1628 		if (cms_present(hdl)) {
1629 			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
1630 				errs++;
1631 		} else if (force) {
1632 			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
1633 		} else {
1634 			errs++;
1635 		}
1636 	}
1637 
1638 	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
1639 }
1640