1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2018, Joyent, Inc.
25 */
26/*
27 * Copyright (c) 2010, Intel Corporation.
28 * All rights reserved.
29 */
30
31#include <sys/mca_x86.h>
32#include <sys/cpu_module_impl.h>
33#include <sys/cpu_module_ms.h>
34#include <sys/cmn_err.h>
35#include <sys/cpuvar.h>
36#include <sys/pghw.h>
37#include <sys/x86_archext.h>
38#include <sys/sysmacros.h>
39#include <sys/regset.h>
40#include <sys/privregs.h>
41#include <sys/systm.h>
42#include <sys/types.h>
43#include <sys/log.h>
44#include <sys/psw.h>
45#include <sys/fm/protocol.h>
46#include <sys/fm/util.h>
47#include <sys/errorq.h>
48#include <sys/mca_x86.h>
49#include <sys/fm/cpu/GMCA.h>
50#include <sys/fm/smb/fmsmb.h>
51#include <sys/sysevent.h>
52#include <sys/ontrap.h>
53#include <sys/smp_impldefs.h>
54
55#include "gcpu.h"
56
/*
 * Defined outside this file.  gcpu_fmri_create() tests it: when zero the
 * detector FMRI is built from cmi_hdl_smb_bboard()/cmi_hdl_smb_chipid(),
 * otherwise a fixed "motherboard" 0 / chip / core / strand FMRI is used.
 */
extern int x86gentopo_legacy;	/* x86 generic topology support */

/*
 * When nonzero, MCi_ADDR is reported by gcpu_bleat() and
 * gcpu_ereport_add_logout() even if MCi_STATUS.ADDRV is clear.
 */
static uint_t gcpu_force_addr_in_payload = 0;

/*
 * Clear to log telemetry found at initialization.  While processor docs
 * say you should process this telemetry on all but Intel family 0x6
 * there are way too many exceptions and we want to avoid bogus
 * diagnoses.
 */
int gcpu_suppress_log_on_init = 1;

/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 */
int gcpu_mca_telemetry_retries = 5;

#ifndef __xpv
/*
 * NOTE(review): the consumers of these two tunables are not in this part
 * of the file; presumably they are the CMCI counts at which CMCI delivery
 * is throttled and later re-enabled -- confirm against the CMCI/poll code.
 */
int gcpu_mca_cmci_throttling_threshold = 10;
int gcpu_mca_cmci_reenable_threshold = 1000;

/*
 * This is used to determine whether or not we have registered the CMCI CPU
 * setup function. This is protected by cpu_lock.
 */
static boolean_t gcpu_mca_cpu_registered = B_FALSE;
#endif
94
/*
 * Table of architectural MCA error-code dispositions.  gcpu_disp_match()
 * walks it in order and returns the first entry whose "mask on" bits are
 * all set and whose "mask off" bits are all clear in the 16-bit error code,
 * so entry order is significant.
 *
 * The initializers are positional.  Judging by how the members are consumed
 * in this file they are, in order: the ereport class leaf format
 * (ged_class_fmt), the compound error name format or NULL
 * (ged_compound_fmt), the ereport payload member flags
 * (ged_ereport_members), and the mask-on / mask-off pair
 * (ged_errcode_mask_on, ged_errcode_mask_off).  The structure itself is
 * declared outside this file -- confirm the layout there before reordering.
 */
static gcpu_error_disp_t gcpu_errtypes[] = {

	/*
	 * Unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
	},

	/*
	 * Microcode ROM Parity Error
	 */
	{
		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
	},

	/*
	 * External - BINIT# from another processor during power-on config
	 */
	{
		FM_EREPORT_CPU_GENERIC_EXTERNAL,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_EXTERNAL_MASKON,
		MCAX86_SIMPLE_EXTERNAL_MASKOFF
	},

	/*
	 * Functional redundancy check master/slave error
	 */
	{
		FM_EREPORT_CPU_GENERIC_FRC,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_FRC_MASKON,
		MCAX86_SIMPLE_FRC_MASKOFF
	},

	/*
	 * Internal parity error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKON,
		MCAX86_SIMPLE_INTERNAL_PARITY_MASKOFF
	},


	/*
	 * Internal timer error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
	},

	/*
	 * Internal unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
	},

	/*
	 * Compound error codes - generic memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - TLB errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_TLB,
		"%1$s" "TLB" "%2$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_TLB_MASKON,
		MCAX86_COMPOUND_TLB_MASKOFF
	},

	/*
	 * Compound error codes - memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMHIER,
		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMHIER_MASKON,
		MCAX86_COMPOUND_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - bus and interconnect errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
	},
	/*
	 * Compound error codes - memory controller errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMORY_CONTROLLER,
		"MC" "_" "%8$s" "_" "%9$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKON,
		MCAX86_COMPOUND_MEMORY_CONTROLLER_MASKOFF
	},
};
229
/*
 * Disposition that gcpu_mca_drain() passes to gcpu_ereport_post() for a bank
 * whose telemetry was flagged CMI_ERRDISP_INCONSISTENT.  Both masks are zero;
 * this entry is not part of gcpu_errtypes[] and so is never returned by
 * gcpu_disp_match().
 */
static gcpu_error_disp_t gcpu_unknown = {
	FM_EREPORT_CPU_GENERIC_UNKNOWN,
	"UNKNOWN",
	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
	0,
	0
};

/*
 * The errorq created by gcpu_errorq_init() (drained by gcpu_mca_drain()) and
 * the mutex that gcpu_errorq_init() holds while (re)creating it.
 */
static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;

/* Compile-time flag: 1 when built with __xpv defined, 0 otherwise. */
#ifdef __xpv
static int isxpv = 1;
#else
static int isxpv = 0;
#endif
246
247static const gcpu_error_disp_t *
248gcpu_disp_match(uint16_t code)
249{
250	const gcpu_error_disp_t *ged = gcpu_errtypes;
251	int i;
252
253	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
254	    i++, ged++) {
255		uint16_t on = ged->ged_errcode_mask_on;
256		uint16_t off = ged->ged_errcode_mask_off;
257
258		if ((code & on) == on && (code & off) == 0)
259			return (ged);
260	}
261
262	return (NULL);
263}
264
/*
 * Extract a bit field from code: keep only the bits selected by mask, then
 * shift the result right by shift bit positions.
 */
static uint16_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	uint16_t field = code & mask;

	return (field >> shift);
}
270
/*
 * Extract the named field from an error code by pasting the field name into
 * the MCAX86_ERRCODE_<name>_MASK and MCAX86_ERRCODE_<name>_SHIFT constants
 * and handing them to bit_strip().
 */
#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)

/*
 * Placeholder mnemonics: UNDEF is what gcpu_mnemonic() returns for a value
 * outside a table (and fills the unused TT slot); RESVD marks table slots
 * for reserved encodings.
 */
#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"
277
/*
 * Mappings of TT, LL, RRRR, PP, II, T, CCCC and MMM values to compound
 * error name mnemonics and to ereport class name components.  Each table
 * below is indexed directly by the extracted field value.
 */

struct gcpu_mnexp {
	const char *mne_compound;	/* used in expanding compound errname */
	const char *mne_ereport;	/* used in expanding ereport class */
};
287
/* Indexed by the TT field of the error code; the last slot has no meaning. */
static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
	{ GCPU_MNEMONIC_UNDEF, "" }
};
294
/*
 * Indexed by the LL field of the error code.
 *
 * NOTE(review): the first compound mnemonic is "LO" (letter O) where the
 * pattern of the other entries suggests "L0" (digit zero).  It looks like a
 * typo, but the string is emitted in ereport payloads, so confirm that
 * nothing downstream matches on it before changing it.
 */
static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
	{ "LO", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
	{ "L1",	FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
	{ "L2",	FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
};
301
/*
 * Indexed by the RRRR field of the error code.  Only nine encodings are
 * listed; gcpu_mnemonic() returns GCPU_MNEMONIC_UNDEF for larger values.
 */
static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
	{ "RD",	FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
};
313
/*
 * Indexed by the PP field of the error code.  The generic encoding
 * contributes an empty string to the compound error name.
 */
static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
};
320
/*
 * Indexed by the II field of the error code.  Value 1 is a reserved
 * encoding; the generic encoding contributes an empty string to the
 * compound error name.
 */
static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
	{ GCPU_MNEMONIC_RESVD, "" },
	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
};
327
/* Indexed by the T (timeout) field of the error code. */
static struct gcpu_mnexp gcpu_T_mnemonics[] = {	 /* MCAX86_ERRCODE_T_* */
	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
};
332
/*
 * Indexed by the CCCC (channel) field of the error code.  Every entry uses
 * the same ereport class component; only the compound mnemonic carries the
 * channel number, and the last entry is the channel-unspecified form.
 */
static struct gcpu_mnexp gcpu_CCCC_mnemonics[] = { /* MCAX86_ERRCODE_CCCC_* */
	{ "CH0", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH0 */
	{ "CH1", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH1 */
	{ "CH2", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH2 */
	{ "CH3", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH3 */
	{ "CH4", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH4 */
	{ "CH5", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH5 */
	{ "CH6", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH6 */
	{ "CH7", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH7 */
	{ "CH8", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH8 */
	{ "CH9", FM_EREPORT_CPU_GENERIC_CCCC },		/* CH9 */
	{ "CH10", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH10 */
	{ "CH11", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH11 */
	{ "CH12", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH12 */
	{ "CH13", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH13 */
	{ "CH14", FM_EREPORT_CPU_GENERIC_CCCC },	/* CH14 */
	{ "CH", FM_EREPORT_CPU_GENERIC_CCCC }		/* GEN */
};
351
/*
 * Indexed by the MMM (memory transaction type) field of the error code.
 * The final three encodings are reserved.
 */
static struct gcpu_mnexp gcpu_MMM_mnemonics[] = { /* MCAX86_ERRCODE_MMM_* */
	{ "GEN", FM_EREPORT_CPU_GENERIC_MMM_ERR },	/* GEN ERR */
	{ "RD", FM_EREPORT_CPU_GENERIC_MMM_RD },	/* READ  */
	{ "WR", FM_EREPORT_CPU_GENERIC_MMM_WR },	/* WRITE  */
	{ "ADDR_CMD", FM_EREPORT_CPU_GENERIC_MMM_ADRCMD },	/* ADDR, CMD  */
	{ "SCRUB", FM_EREPORT_CPU_GENERIC_MMM_SCRUB },
	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
	{ GCPU_MNEMONIC_RESVD, ""},			/* RESERVED  */
	{ GCPU_MNEMONIC_RESVD, ""}			/* RESERVED  */
};
362
/*
 * Selects which string of a struct gcpu_mnexp entry gcpu_mnemonic() returns:
 * COMPOUND picks mne_compound, EREPORT picks mne_ereport.
 */
enum gcpu_mn_namespace {
	GCPU_MN_NAMESPACE_COMPOUND,
	GCPU_MN_NAMESPACE_EREPORT
};
367
368static const char *
369gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint16_t val,
370    enum gcpu_mn_namespace nspace)
371{
372	if (val >= tbl_sz || val > 0xff)
373		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */
374
375	switch (nspace) {
376	case GCPU_MN_NAMESPACE_COMPOUND:
377		return (tbl[val].mne_compound);
378		/*NOTREACHED*/
379
380	case GCPU_MN_NAMESPACE_EREPORT:
381		return (tbl[val].mne_ereport);
382		/*NOTREACHED*/
383
384	default:
385		return (GCPU_MNEMONIC_UNDEF);
386		/*NOTREACHED*/
387	}
388}
389
/*
 * The ereport class leaf component is either a simple string with no
 * format specifiers, or a string with one or more embedded %n$s specifiers -
 * positional selection for string arguments.  The kernel snprintf does
 * not support %n$ (and teaching it to do so is too big a headache) so
 * we will expand this restricted format string ourselves.
 */

/* Number of positional string arguments (%1$s .. %9$s) gcpu_mn_fmt() fills */
#define	GCPU_CLASS_VARCOMPS	9

/*
 * Look up the mnemonic for field `name' of `code': the field is extracted
 * with BIT_STRIP() and used as the index into gcpu_<name>_mnemonics[], whose
 * element count is computed here with sizeof.
 */
#define	GCPU_MNEMONIC(code, name, nspace) \
	gcpu_mnemonic(gcpu_##name##_mnemonics, \
	sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \
	BIT_STRIP(code, name), nspace)
404
405static void
406gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
407    enum gcpu_mn_namespace nspace)
408{
409	uint16_t code = MCAX86_ERRCODE(status);
410	const char *mn[GCPU_CLASS_VARCOMPS];
411	char *p = buf;			/* current position in buf */
412	char *q = buf + buflen;		/* pointer past last char in buf */
413	int which, expfmtchar, error;
414	char c;
415
416	mn[0] = GCPU_MNEMONIC(code, TT, nspace);
417	mn[1] = GCPU_MNEMONIC(code, LL, nspace);
418	mn[2] = GCPU_MNEMONIC(code, RRRR, nspace);
419	mn[3] = GCPU_MNEMONIC(code, PP, nspace);
420	mn[4] = GCPU_MNEMONIC(code, II, nspace);
421	mn[5] = GCPU_MNEMONIC(code, T, nspace);
422	mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : "";
423	mn[7] = GCPU_MNEMONIC(code, CCCC, nspace);
424	mn[8] = GCPU_MNEMONIC(code, MMM, nspace);
425
426	while (p < q - 1 && (c = *fmt++) != '\0') {
427		if (c != '%') {
428			/* not the beginning of a format specifier - copy */
429			*p++ = c;
430			continue;
431		}
432
433		error = 0;
434		which = -1;
435		expfmtchar = -1;
436
437nextfmt:
438		if ((c = *fmt++) == '\0')
439			break;	/* early termination of fmt specifier */
440
441		switch (c) {
442		case '1':
443		case '2':
444		case '3':
445		case '4':
446		case '5':
447		case '6':
448		case '7':
449		case '8':
450		case '9':
451			if (which != -1) { /* allow only one positional digit */
452				error++;
453				break;
454			}
455			which = c - '1';
456			goto nextfmt;
457			/*NOTREACHED*/
458
459		case '$':
460			if (which == -1) { /* no position specified */
461				error++;
462				break;
463			}
464			expfmtchar = 's';
465			goto nextfmt;
466			/*NOTREACHED*/
467
468		case 's':
469			if (expfmtchar != 's') {
470				error++;
471				break;
472			}
473			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
474			    mn[which]);
475			p += strlen(p);
476			break;
477
478		default:
479			error++;
480			break;
481		}
482
483		if (error)
484			break;
485	}
486
487	*p = '\0';	/* NUL termination */
488}
489
490static void
491gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status,
492    const char *cpuclass, const char *leafclass)
493{
494	char *p = buf;			/* current position in buf */
495	char *q = buf + buflen;		/* pointer past last char in buf */
496
497	(void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.",
498	    FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC);
499
500	p += strlen(p);
501	if (p >= q)
502		return;
503
504	if (leafclass == NULL) {
505		gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status,
506		    GCPU_MN_NAMESPACE_EREPORT);
507	} else {
508		(void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s",
509		    leafclass);
510	}
511}
512
513/*
514 * Create an "hc" scheme FMRI identifying the given cpu with
515 * motherboard/chip/core/strand instance numbers.
516 */
517static nvlist_t *
518gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva)
519{
520	nvlist_t *nvl, *fmri;
521
522	if ((nvl = fm_nvlist_create(nva)) == NULL)
523		return (NULL);
524
525	if (!x86gentopo_legacy) {
526		fmri = cmi_hdl_smb_bboard(hdl);
527		if (fmri == NULL)
528			return (NULL);
529
530		fm_fmri_hc_create(nvl, FM_HC_SCHEME_VERSION,
531		    NULL, NULL, fmri, 3,
532		    "chip", cmi_hdl_smb_chipid(hdl),
533		    "core", cmi_hdl_coreid(hdl),
534		    "strand", cmi_hdl_strandid(hdl));
535	} else {
536		fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 4,
537		    "motherboard", 0,
538		    "chip", cmi_hdl_chipid(hdl),
539		    "core", cmi_hdl_coreid(hdl),
540		    "strand", cmi_hdl_strandid(hdl));
541	}
542
543	return (nvl);
544}
545
/*
 * Console-spam throttle for gcpu_bleat(): the number of complaints allowed
 * back-to-back, and the minimum gap required before further complaints once
 * that many have been issued.  The interval is compared against differences
 * of gethrtime_waitfree() values (presumably nanoseconds, i.e. 10 seconds --
 * confirm).
 */
int gcpu_bleat_count_thresh = 5;
hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL;
548
549/*
550 * Called when we are unable to propogate a logout structure onto an
551 * errorq for subsequent ereport preparation and logging etc.  The caller
552 * should usually only decide to call this for severe errors - those we
553 * suspect we may need to panic for.
554 */
555static void
556gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
557{
558	hrtime_t now  = gethrtime_waitfree();
559	static hrtime_t gcpu_last_bleat;
560	gcpu_bank_logout_t *gbl;
561	static int bleatcount;
562	int i;
563
564	/*
565	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
566	 * can come as fast as we like, but once we've spammed that many
567	 * to the console we require a minimum interval to pass before
568	 * any more complaints.
569	 */
570	if (++bleatcount > gcpu_bleat_count_thresh) {
571		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
572			return;
573		else
574			bleatcount = 0;
575	}
576	gcpu_last_bleat = now;
577
578	cmn_err(CE_WARN,
579	    "Machine-Check Errors unlogged on chip %d core %d strand %d, "
580	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl),
581	    cmi_hdl_strandid(hdl));
582	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
583	    (u_longlong_t)gcl->gcl_mcg_status);
584	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
585		uint64_t status = gbl->gbl_status;
586
587		if (!(status & MSR_MC_STATUS_VAL))
588			continue;
589
590		/* Force ADDRV for AMD Family 0xf and above */
591		if (gcpu_force_addr_in_payload)
592			status = status | MSR_MC_STATUS_ADDRV;
593
594		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
595		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
596			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
597			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
598			    i, IA32_MSR_MC(i, STATUS),
599			    (u_longlong_t)gbl->gbl_status,
600			    (u_longlong_t)gbl->gbl_addr,
601			    (u_longlong_t)gbl->gbl_misc);
602			break;
603
604		case MSR_MC_STATUS_ADDRV:
605			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
606			    "STAT 0x%016llx ADDR 0x%016llx",
607			    i, IA32_MSR_MC(i, STATUS),
608			    (u_longlong_t)gbl->gbl_status,
609			    (u_longlong_t)gbl->gbl_addr);
610			break;
611
612		case MSR_MC_STATUS_MISCV:
613			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
614			    "STAT 0x%016llx MISC 0x%016llx",
615			    i, IA32_MSR_MC(i, STATUS),
616			    (u_longlong_t)gbl->gbl_status,
617			    (u_longlong_t)gbl->gbl_misc);
618			break;
619
620		default:
621			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
622			    "STAT 0x%016llx",
623			    i, IA32_MSR_MC(i, STATUS),
624			    (u_longlong_t)gbl->gbl_status);
625			break;
626
627		}
628	}
629}
630
/*
 * Expands to the three fm_payload_set() arguments (member name, data type,
 * value) that report whether the MSR_MC_STATUS_<what> bit is set in status
 * as a boolean payload member.
 */
#define	_GCPU_BSTATUS(status, what) \
	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
634
635static void
636gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl,
637    uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code)
638{
639	uint64_t members = ged ? ged->ged_ereport_members :
640	    FM_EREPORT_PAYLOAD_FLAGS_COMMON;
641	uint64_t mcg = gcl->gcl_mcg_status;
642	int mcip = mcg & MCG_STATUS_MCIP;
643	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno];
644	uint64_t bstat = gbl->gbl_status;
645
646	/*
647	 * Include the compound error name if requested and if this
648	 * is a compound error type.
649	 */
650	if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged &&
651	    ged->ged_compound_fmt != NULL) {
652		char buf[FM_MAX_CLASS];
653
654		gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code,
655		    GCPU_MN_NAMESPACE_COMPOUND);
656		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR,
657		    DATA_TYPE_STRING, buf, NULL);
658	}
659
660	/*
661	 * Include disposition information for this error
662	 */
663	if (members & FM_EREPORT_PAYLOAD_FLAG_DISP &&
664	    gbl->gbl_disp != 0) {
665		int i, empty = 1;
666		char buf[128];
667		char *p = buf, *q = buf + 128;
668		static struct _gcpu_disp_name {
669			uint64_t dv;
670			const char *dn;
671		} disp_names[] = {
672			{ CMI_ERRDISP_CURCTXBAD,
673			    "processor_context_corrupt" },
674			{ CMI_ERRDISP_RIPV_INVALID,
675			    "return_ip_invalid" },
676			{ CMI_ERRDISP_UC_UNCONSTRAINED,
677			    "unconstrained" },
678			{ CMI_ERRDISP_FORCEFATAL,
679			    "forcefatal" },
680			{ CMI_ERRDISP_IGNORED,
681			    "ignored" },
682			{ CMI_ERRDISP_PCC_CLEARED,
683			    "corrupt_context_cleared" },
684			{ CMI_ERRDISP_UC_CLEARED,
685			    "uncorrected_data_cleared" },
686			{ CMI_ERRDISP_POISONED,
687			    "poisoned" },
688			{ CMI_ERRDISP_INCONSISTENT,
689			    "telemetry_unstable" },
690		};
691
692		for (i = 0; i < sizeof (disp_names) /
693		    sizeof (struct _gcpu_disp_name); i++) {
694			if ((gbl->gbl_disp & disp_names[i].dv) == 0)
695				continue;
696
697			(void) snprintf(p, (uintptr_t)q - (uintptr_t)p,
698			    "%s%s", empty ? "" : ",", disp_names[i].dn);
699			p += strlen(p);
700			empty = 0;
701		}
702
703		if (p != buf)
704			fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP,
705			    DATA_TYPE_STRING, buf, NULL);
706	}
707
708	/*
709	 * If MCG_STATUS is included add that and an indication of whether
710	 * this ereport was the result of a machine check or poll.
711	 */
712	if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) {
713		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS,
714		    DATA_TYPE_UINT64, mcg, NULL);
715
716		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP,
717		    DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL);
718	}
719
720	/*
721	 * If an instruction pointer is to be included add one provided
722	 * MCG_STATUS indicated it is valid; meaningless for polled events.
723	 */
724	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP &&
725	    mcg & MCG_STATUS_EIPV) {
726		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP,
727		    DATA_TYPE_UINT64, gcl->gcl_ip, NULL);
728	}
729
730	/*
731	 * Add an indication of whether the trap occured during privileged code.
732	 */
733	if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) {
734		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV,
735		    DATA_TYPE_BOOLEAN_VALUE,
736		    gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL);
737	}
738
739	/*
740	 * If requested, add the index of the MCA bank.  This indicates the
741	 * n'th bank of 4 MCA registers, and does not necessarily correspond
742	 * to MCi_* - use the bank offset to correlate
743	 */
744	if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) {
745		fm_payload_set(ereport,
746		    /* Bank number */
747		    FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno,
748		    /* Offset of MCi_CTL */
749		    FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64,
750		    IA32_MSR_MC(bankno, CTL),
751		    NULL);
752	}
753
754	/*
755	 * Add MCi_STATUS if requested, and decode it.
756	 */
757	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) {
758		const char *tbes[] = {
759			"No tracking",			/* 00 */
760			"Green - below threshold",	/* 01 */
761			"Yellow - above threshold",	/* 10 */
762			"Reserved"			/* 11 */
763		};
764
765		fm_payload_set(ereport,
766		    /* Bank MCi_STATUS */
767		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat,
768		    /* Overflow? */
769		    _GCPU_BSTATUS(bstat, OVER),
770		    /* Uncorrected? */
771		    _GCPU_BSTATUS(bstat, UC),
772		    /* Enabled? */
773		    _GCPU_BSTATUS(bstat, EN),
774		    /* Processor context corrupt? */
775		    _GCPU_BSTATUS(bstat, PCC),
776		    /* Error code */
777		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE,
778		    DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat),
779		    /* Model-specific error code */
780		    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE,
781		    DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat),
782		    NULL);
783
784		/*
785		 * If MCG_CAP.TES_P indicates that that thresholding info
786		 * is present in the architural component of the bank status
787		 * then include threshold information for this bank.
788		 */
789		if (gcl->gcl_flags & GCPU_GCL_F_TES_P) {
790			fm_payload_set(ereport,
791			    FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES,
792			    DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)],
793			    NULL);
794		}
795	}
796
797	/*
798	 * Add MCi_ADDR info if requested and valid. We force addition of
799	 * MCi_ADDR, even if its not valid on AMD family 0xf and above,
800	 * to aid in analysis of ereports, for WatchDog errors.
801	 */
802	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR &&
803	    ((bstat & MSR_MC_STATUS_ADDRV) ||
804	    gcpu_force_addr_in_payload)) {
805		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR,
806		    DATA_TYPE_UINT64, gbl->gbl_addr, NULL);
807	}
808
809	/*
810	 * MCi_MISC if requested and MCi_STATUS.MISCV).
811	 */
812	if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC &&
813	    bstat & MSR_MC_STATUS_MISCV) {
814		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC,
815		    DATA_TYPE_UINT64, gbl->gbl_misc, NULL);
816	}
817
818}
819
820/*
821 * Construct and post an ereport based on the logout information from a
822 * single MCA bank.  We are not necessarily running on the cpu that
823 * detected the error.
824 */
825static void
826gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx,
827    const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status)
828{
829	gcpu_data_t *gcpu = gcl->gcl_gcpu;
830	cmi_hdl_t hdl = gcpu->gcpu_hdl;
831	const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx];
832	const char *cpuclass = NULL, *leafclass = NULL;
833	uint16_t code = MCAX86_ERRCODE(status);
834	errorq_elem_t *eqep, *scr_eqep;
835	nvlist_t *ereport, *detector;
836	char buf[FM_MAX_CLASS];
837	const char *classfmt;
838	nv_alloc_t *nva;
839
840	if (panicstr) {
841		if ((eqep = errorq_reserve(ereport_errorq)) == NULL)
842			return;
843		ereport = errorq_elem_nvl(ereport_errorq, eqep);
844
845		/*
846		 * Allocate another element for scratch space, but fallback
847		 * to the one we have if that fails.  We'd like to use the
848		 * additional scratch space for nvlist construction.
849		 */
850		if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL)
851			nva = errorq_elem_nva(ereport_errorq, scr_eqep);
852		else
853			nva = errorq_elem_nva(ereport_errorq, eqep);
854	} else {
855		ereport = fm_nvlist_create(NULL);
856		nva = NULL;
857	}
858
859	if (ereport == NULL)
860		return;
861
862	/*
863	 * Common payload data required by the protocol:
864	 *	- ereport class
865	 *	- detector
866	 *	- ENA
867	 */
868
869	/*
870	 * Ereport class - call into model-specific support to allow it to
871	 * provide a cpu class or leaf class, otherwise calculate our own.
872	 */
873	cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass);
874	classfmt = ged ?  ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
875	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
876	    leafclass);
877
878	/*
879	 * The detector FMRI.
880	 */
881	if ((detector = cms_ereport_detector(hdl, bankidx, mscookie,
882	    nva)) == NULL)
883		detector = gcpu_fmri_create(hdl, nva);
884
885	/*
886	 * Should we define a new ENA format 3?? for chip/core/strand?
887	 * It will be better when virtualized.
888	 */
889	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
890	    fm_ena_generate_cpu(gcl->gcl_timestamp,
891	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
892	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);
893
894	if (panicstr) {
895		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
896		nv_alloc_reset(nva);
897	} else {
898		fm_nvlist_destroy(detector, FM_NVA_FREE);
899	}
900
901	/*
902	 * Add the architectural ereport class-specific payload data.
903	 */
904	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);
905
906	/*
907	 * Allow model-specific code to add ereport members.
908	 */
909	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
910	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);
911
912	/*
913	 * Include stack if options is turned on and either selected in
914	 * the payload member bitmask or inclusion is forced.
915	 */
916	if (gcpu_mca_stack_flag &&
917	    (cms_ereport_includestack(hdl, mscookie) ==
918	    B_TRUE || gcpu_mca_stack_ereport_include)) {
919		fm_payload_stack_add(ereport, gcl->gcl_stack,
920		    gcl->gcl_stackdepth);
921	}
922
923	/*
924	 * If injection has taken place anytime in the past then note this
925	 * on the ereport.
926	 */
927	if (cmi_inj_tainted() == B_TRUE) {
928		fm_payload_set(ereport, "__injected", DATA_TYPE_BOOLEAN_VALUE,
929		    B_TRUE, NULL);
930	}
931
932	/*
933	 * Post ereport.
934	 */
935	if (panicstr) {
936		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
937		if (scr_eqep)
938			errorq_cancel(ereport_errorq, scr_eqep);
939	} else {
940		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
941		fm_nvlist_destroy(ereport, FM_NVA_FREE);
942	}
943
944}
945
946/*ARGSUSED*/
947void
948gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
949{
950	const gcpu_logout_t *gcl = data;
951	const gcpu_bank_logout_t *gbl;
952	int ismc;
953	int i;
954
955	ismc = gcl->ismc;
956	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
957		const gcpu_error_disp_t *gened;
958		cms_cookie_t mscookie;
959
960		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
961		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
962			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);
963
964			/*
965			 * Perform a match based on IA32 MCA architectural
966			 * components alone.
967			 */
968			gened = gcpu_disp_match(code); /* may be NULL */
969
970			/*
971			 * Now see if an model-specific match can be made.
972			 */
973			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, ismc,
974			    i, gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
975			    gcl->gcl_ms_logout);
976
977			/*
978			 * Prepare and dispatch an ereport for logging and
979			 * diagnosis.
980			 */
981			gcpu_ereport_post(gcl, i, gened, mscookie,
982			    gbl->gbl_status);
983		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
984		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
985			/*
986			 * Telemetry kept changing as we tried to read
987			 * it.  Force an unknown ereport leafclass but
988			 * keep the telemetry unchanged for logging.
989			 */
990			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
991			    gbl->gbl_status);
992		}
993	}
994}
995
996static size_t gcpu_mca_queue_datasz = 0;
997
998/*
999 * The following code is ready to make a weak attempt at growing the
1000 * errorq structure size.  Since it is not foolproof (we don't know
1001 * who may already be producing to the outgoing errorq) our caller
1002 * instead assures that we'll always be called with no greater data
1003 * size than on our first call.
1004 */
1005static void
1006gcpu_errorq_init(size_t datasz)
1007{
1008	int slots;
1009
1010	mutex_enter(&gcpu_mca_queue_lock);
1011
1012	if (gcpu_mca_queue_datasz >= datasz) {
1013		mutex_exit(&gcpu_mca_queue_lock);
1014		return;
1015	}
1016
1017	membar_producer();
1018	if (gcpu_mca_queue) {
1019		gcpu_mca_queue_datasz = 0;
1020		errorq_destroy(gcpu_mca_queue);
1021	}
1022
1023	slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS);
1024	slots = MIN(slots, GCPU_MCA_MAX_ERRORS);
1025
1026	gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain,
1027	    NULL, slots, datasz, 1, ERRORQ_VITAL);
1028
1029	if (gcpu_mca_queue != NULL)
1030		gcpu_mca_queue_datasz = datasz;
1031
1032	mutex_exit(&gcpu_mca_queue_lock);
1033}
1034
1035/*
1036 * Perform MCA initialization as described in section 14.6 of Intel 64
1037 * and IA-32 Architectures Software Developer's Manual Volume 3A.
1038 */
1039
/*
 * NOTE(review): nothing in this part of the file reads or writes this;
 * presumably it records an MCA bank count shared across cpus -- confirm
 * against the initialization code.
 */
static uint_t global_nbanks;
1041
1042#ifndef __xpv
1043/*ARGSUSED*/
1044int
1045gcpu_cmci_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
1046{
1047	/*
1048	 * In general, we'd expect that in a multi-socket configuration, either
1049	 * all CPUs would support CMCI or none of them would.  Unfortunately,
1050	 * that may not be the case in the wild.  While we'd rather check the
1051	 * handle's enablement state here, that itself is a bit complicated. We
1052	 * don't have a guarantee in a heterogenous situation that the CPU in
1053	 * question is using the generic CPU module or not, even though we've
1054	 * been registered. As such, we allow the interrupt to be registered and
1055	 * written to the local apic anyways. We won't have a CMCI interrupt
1056	 * generated anyways because the MCA banks will not be programmed as
1057	 * such for that CPU by the polling thread.
1058	 */
1059	switch (what) {
1060	case CPU_ON:
1061		psm_cmci_setup(cpuid, B_TRUE);
1062		break;
1063	case CPU_OFF:
1064		psm_cmci_setup(cpuid, B_FALSE);
1065		break;
1066	default:
1067		break;
1068	}
1069
1070	return (0);
1071}
1072
1073void
1074gcpu_mca_cmci_enable(cmi_hdl_t hdl)
1075{
1076	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1077	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1078
1079	/*
1080	 * If this CPU doesn't support CMCI, don't do anything.
1081	 */
1082	if ((mca->gcpu_mca_flags & GCPU_MCA_F_CMCI_CAPABLE) == 0)
1083		return;
1084
1085	/*
1086	 * If we don't have support from the PSM module, then there's nothing we
1087	 * can do. Note that this changes as we start up the system. The only
1088	 * case where it may be mistakenly NULL is for the boot CPU. The boot
1089	 * CPU will have this taken care of for it in gcpu_post_startup(), once
1090	 * we know for certain whether or not the PSM module supports CMCI.
1091	 */
1092	if (psm_cmci_setup == NULL) {
1093		return;
1094	}
1095
1096	mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_ENABLE;
1097	if (MUTEX_HELD(&cpu_lock)) {
1098		if (!gcpu_mca_cpu_registered) {
1099			register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL);
1100			gcpu_mca_cpu_registered = B_TRUE;
1101		}
1102	} else {
1103		mutex_enter(&cpu_lock);
1104		if (!gcpu_mca_cpu_registered) {
1105			register_cpu_setup_func(gcpu_cmci_cpu_setup, NULL);
1106			gcpu_mca_cpu_registered = B_TRUE;
1107		}
1108		mutex_exit(&cpu_lock);
1109	}
1110
1111	/*
1112	 * Call the PSM op to make sure that we initialize things on
1113	 * this CPU.
1114	 */
1115	psm_cmci_setup(cmi_hdl_logical_id(hdl), B_TRUE);
1116}
1117#endif	/* !__xpv */
1118
1119void
1120gcpu_mca_init(cmi_hdl_t hdl)
1121{
1122	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1123	uint64_t cap;
1124	uint_t vendor = cmi_hdl_vendor(hdl);
1125	uint_t family = cmi_hdl_family(hdl);
1126	uint_t rev = cmi_hdl_chiprev(hdl);
1127	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1128	int mcg_ctl_present;
1129	uint_t nbanks;
1130	uint32_t ctl_skip_mask = 0;
1131	uint32_t status_skip_mask = 0;
1132	size_t mslsz;
1133	int i;
1134#ifndef __xpv
1135	int mcg_ctl2_present;
1136	uint32_t cmci_capable = 0;
1137#endif
1138	if (gcpu == NULL)
1139		return;
1140
1141	/* We add MCi_ADDR always for AMD Family 0xf and above */
1142	if (X86_CHIPFAM_ATLEAST(rev, X86_CHIPREV_AMD_F_REV_B))
1143		gcpu_force_addr_in_payload = 1;
1144
1145	/*
1146	 * Protect from some silly /etc/system settings.
1147	 */
1148	if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100)
1149		gcpu_mca_telemetry_retries = 5;
1150
1151	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS)
1152		return;
1153
1154	/*
1155	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
1156	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
1157	 * processors, which have their own more primitive way of doing
1158	 * machine checks, will not have cmi_mca_init called since their
1159	 * CPUID information will not indicate both MCA and MCE features.
1160	 */
1161	ASSERT(is_x86_feature(x86_featureset, X86FSET_MCA));
1162
1163	/*
1164	 * Determine whether the IA32_MCG_CTL register is present.  If it
1165	 * is we will enable all features by writing -1 to it towards
1166	 * the end of this initialization;  if it is absent then volume 3A
1167	 * says we must nonetheless continue to initialize the individual
1168	 * banks.
1169	 */
1170	mcg_ctl_present = cap & MCG_CAP_CTL_P;
1171#ifndef __xpv
1172	mcg_ctl2_present = cap & MCG_CAP_CTL2_P;
1173#endif
1174
1175	/*
1176	 * We squirell values away for inspection/debugging.
1177	 */
1178	mca->gcpu_mca_bioscfg.bios_mcg_cap = cap;
1179	if (mcg_ctl_present)
1180		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL,
1181		    &mca->gcpu_mca_bioscfg.bios_mcg_ctl);
1182
1183	/*
1184	 * Determine the number of error-reporting banks implemented.
1185	 */
1186	mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK;
1187
1188	if (nbanks != 0 && global_nbanks == 0)
1189		global_nbanks = nbanks;	/* no race - BSP will get here first */
1190
1191	/*
1192	 * If someone is hiding the number of banks (perhaps we are fully
1193	 * virtualized?) or if this processor has more banks than the
1194	 * first to set global_nbanks then bail.  The latter requirement
1195	 * is because we need to size our errorq data structure and we
1196	 * don't want to have to grow the errorq (destroy and recreate)
1197	 * which may just lose some telemetry.
1198	 */
1199	if (nbanks == 0 || nbanks > global_nbanks)
1200		return;
1201
1202	mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks *
1203	    sizeof (struct gcpu_bios_bankcfg), KM_SLEEP);
1204
1205	/*
1206	 * Calculate the size we need to allocate for a gcpu_logout_t
1207	 * with a gcl_data array big enough for all banks of this cpu.
1208	 * Add any space requested by the model-specific logout support.
1209	 */
1210	mslsz = cms_logout_size(hdl);
1211	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
1212	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;
1213
1214	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
1215		gcpu_logout_t *gcl;
1216
1217		mca->gcpu_mca_logout[i] = gcl =
1218		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
1219		gcl->gcl_gcpu = gcpu;
1220		gcl->gcl_nbanks = nbanks;
1221		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
1222		    (char *)(&gcl->gcl_data[0]) + nbanks *
1223		    sizeof (gcpu_bank_logout_t);
1224
1225	}
1226
1227#ifdef __xpv
1228	gcpu_xpv_mca_init(nbanks);
1229#endif
1230
1231	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;
1232
1233#ifndef __xpv
1234	mca->gcpu_bank_cmci = kmem_zalloc(sizeof (gcpu_mca_cmci_t) * nbanks,
1235	    KM_SLEEP);
1236#endif
1237
1238	/*
1239	 * Create our errorq to transport the logout structures.  This
1240	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
1241	 */
1242	gcpu_errorq_init(mca->gcpu_mca_lgsz);
1243
1244	/*
1245	 * Not knowing which, if any, banks are shared between cores we
1246	 * assure serialization of MCA bank initialization by each cpu
1247	 * on the chip.  On chip architectures in which some banks are
1248	 * shared this will mean the shared resource is initialized more
1249	 * than once - we're simply aiming to avoid simultaneous MSR writes
1250	 * to the shared resource.
1251	 *
1252	 * Even with these precautions, some platforms may yield a GP fault
1253	 * if a core other than a designated master tries to write anything
1254	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
1255	 * those writes under on_trap protection.
1256	 */
1257	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
1258
1259	/*
1260	 * Initialize poller data, but don't start polling yet.
1261	 */
1262	gcpu_mca_poll_init(hdl);
1263
1264	/*
1265	 * Work out which MCA banks we will initialize.  In MCA logout
1266	 * code we will only read those banks which we initialize here.
1267	 */
1268	for (i = 0; i < nbanks; i++) {
1269		boolean_t skipctl = cms_bankctl_skipinit(hdl, i);
1270		boolean_t skipstatus = cms_bankstatus_skipinit(hdl, i);
1271
1272		if (!cms_present(hdl)) {
1273			/*
1274			 * Model-specific support is not present, try to use
1275			 * sane defaults.
1276			 *
1277			 * On AMD family 6 processors, reports about spurious
1278			 * machine checks indicate that bank 0 should be
1279			 * skipped.
1280			 *
1281			 * On Intel family 6 processors, the documentation tells
1282			 * us not to write to MC0_CTL.
1283			 *
1284			 */
1285			if (i == 0 && family == 6) {
1286				switch (vendor) {
1287				case X86_VENDOR_AMD:
1288					skipstatus = B_TRUE;
1289					/*FALLTHRU*/
1290				case X86_VENDOR_Intel:
1291					skipctl = B_TRUE;
1292					break;
1293				}
1294			}
1295		}
1296
1297		ctl_skip_mask |= skipctl << i;
1298		status_skip_mask |= skipstatus << i;
1299
1300		if (skipctl && skipstatus)
1301			continue;
1302
1303		/*
1304		 * Record which MCA banks were enabled, from the point of view
1305		 * of the whole chip (if some cores share a bank we must be
1306		 * sure either can logout from it).
1307		 */
1308		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
1309
1310#ifndef __xpv
1311		/*
1312		 * check CMCI capability
1313		 */
1314		if (mcg_ctl2_present) {
1315			uint64_t ctl2;
1316			uint32_t cap = 0;
1317			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1318			if (ctl2 & MSR_MC_CTL2_EN)
1319				continue;
1320			ctl2 |= MSR_MC_CTL2_EN;
1321			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1322			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(i), &ctl2);
1323			mca->gcpu_bank_cmci[i].cmci_cap = cap =
1324			    (ctl2 & MSR_MC_CTL2_EN) ? 1 : 0;
1325			if (cap)
1326				cmci_capable ++;
1327			/*
1328			 * Set threshold to 1 while unset the en field, to avoid
1329			 * CMCI trigged before APIC LVT entry init.
1330			 */
1331			ctl2 = ctl2 & (~MSR_MC_CTL2_EN) | 1;
1332			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(i), ctl2);
1333
1334			/*
1335			 * init cmci related count
1336			 */
1337			mca->gcpu_bank_cmci[i].cmci_enabled = 0;
1338			mca->gcpu_bank_cmci[i].drtcmci = 0;
1339			mca->gcpu_bank_cmci[i].ncmci = 0;
1340		}
1341#endif
1342	}
1343
1344#ifndef __xpv
1345	if (cmci_capable) {
1346		mca->gcpu_mca_flags |= GCPU_MCA_F_CMCI_CAPABLE;
1347		gcpu_mca_cmci_enable(hdl);
1348	}
1349#endif
1350
1351#ifndef __xpv
1352	/*
1353	 * Log any valid telemetry lurking in the MCA banks, but do not
1354	 * clear the status registers.  Ignore the disposition returned -
1355	 * we have already paniced or reset for any nasty errors found here.
1356	 *
1357	 * Intel vol 3A says that we should not do this on family 0x6,
1358	 * and that for any extended family the BIOS clears things
1359	 * on power-on reset so you'll only potentially find valid telemetry
1360	 * on warm reset (we do it for both - on power-on reset we should
1361	 * just see zeroes).
1362	 *
1363	 * AMD docs since K7 say we should process anything we find here.
1364	 */
1365	if (!gcpu_suppress_log_on_init &&
1366	    (vendor == X86_VENDOR_Intel && family >= 0xf ||
1367	    vendor == X86_VENDOR_AMD))
1368		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE,
1369		    GCPU_MPT_WHAT_POKE_ERR);
1370
1371	/*
1372	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
1373	 * model-specific module the power of veto.
1374	 */
1375	for (i = 0; i < nbanks; i++) {
1376		struct gcpu_bios_bankcfg *bcfgp =
1377		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;
1378
1379		/*
1380		 * Stash inherited bank MCA state, even for banks we will
1381		 * not initialize ourselves.  Do not read the MISC register
1382		 * unconditionally - on some processors that will #GP on
1383		 * banks that do not implement the MISC register (would be
1384		 * caught by on_trap, anyway).
1385		 */
1386		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
1387		    &bcfgp->bios_bank_ctl);
1388
1389		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1390		    &bcfgp->bios_bank_status);
1391
1392		if ((bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) ||
1393		    gcpu_force_addr_in_payload) {
1394			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
1395			    &bcfgp->bios_bank_addr);
1396		}
1397
1398		/*
1399		 * In some old BIOS the status value after boot can indicate
1400		 * MISCV when there is actually no MISC register for
1401		 * that bank.  The following read could therefore
1402		 * aggravate a general protection fault.  This should be
1403		 * caught by on_trap, but the #GP fault handler is busted
1404		 * and can suffer a double fault even before we get to
1405		 * trap() to check for on_trap protection.  Until that
1406		 * issue is fixed we remove the one access that we know
1407		 * can cause a #GP.
1408		 *
1409		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
1410		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
1411		 *	    &bcfgp->bios_bank_misc);
1412		 */
1413		bcfgp->bios_bank_misc = 0;
1414
1415		if (!(ctl_skip_mask & (1 << i))) {
1416			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
1417			    cms_bankctl_val(hdl, i, -1ULL));
1418		}
1419
1420		if (!(status_skip_mask & (1 << i))) {
1421			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
1422			    cms_bankstatus_val(hdl, i, 0ULL));
1423		}
1424	}
1425#endif
1426	/*
1427	 * Now let the model-specific support perform further initialization
1428	 * of non-architectural features.
1429	 */
1430	cms_mca_init(hdl, nbanks);
1431
1432#ifndef __xpv
1433	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
1434	membar_producer();
1435
1436	/* enable all machine-check features */
1437	if (mcg_ctl_present)
1438		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
1439		    cms_mcgctl_val(hdl, nbanks, -1ULL));
1440#endif
1441
1442	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
1443
1444#ifndef __xpv
1445	/* enable machine-check exception in CR4 */
1446	cmi_hdl_enable_mce(hdl);
1447#endif
1448}
1449
1450static uint64_t
1451gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
1452    gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
1453{
1454	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
1455	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1456	int nbanks = mca->gcpu_mca_nbanks;
1457	gcpu_mce_status_t mce;
1458	gcpu_bank_logout_t *gbl;
1459	uint64_t disp = 0;
1460	int i;
1461
1462	if (mcesp == NULL)
1463		mcesp = &mce;
1464
1465	mcesp->mce_nerr = nerr;
1466
1467	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
1468	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
1469	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;
1470
1471	/*
1472	 * If this a machine check then if the return instruction pointer
1473	 * is not valid the current context is lost.
1474	 */
1475	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
1476		disp |= CMI_ERRDISP_RIPV_INVALID;
1477	gcl->ismc = ismc;
1478
1479	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1480		uint64_t mcistatus = gbl->gbl_status;
1481		uint32_t ms_scope;
1482		int pcc, uc;
1483		int poisoned;
1484
1485		if (!(mcistatus & MSR_MC_STATUS_VAL))
1486			continue;
1487
1488		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
1489			continue;
1490
1491		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
1492		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
1493		mcesp->mce_npcc += pcc;
1494		mcesp->mce_nuc += uc;
1495
1496		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
1497		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);
1498
1499		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
1500			pcc = 0;
1501			mcesp->mce_npcc_ok++;
1502			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
1503		}
1504
1505		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
1506			uc = 0;
1507			mcesp->mce_nuc_ok++;
1508			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
1509		}
1510
1511		if (uc) {
1512			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
1513			if (poisoned) {
1514				mcesp->mce_nuc_poisoned++;
1515				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
1516			}
1517		}
1518
1519		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
1520			/*
1521			 * We're not being instructed to ignore the error,
1522			 * so apply our standard disposition logic to it.
1523			 */
1524			if (uc && !poisoned) {
1525				unconstrained++;
1526				gbl->gbl_disp |= disp |
1527				    CMI_ERRDISP_UC_UNCONSTRAINED;
1528			}
1529
1530			if (pcc && ismc) {
1531				curctxbad++;
1532				gbl->gbl_disp |= disp |
1533				    CMI_ERRDISP_CURCTXBAD;
1534			}
1535
1536			/*
1537			 * Even if the above may not indicate that the error
1538			 * is terminal, model-specific support may insist
1539			 * that we treat it as such.  Such errors wil be
1540			 * fatal even if discovered via poll.
1541			 */
1542			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
1543				forcefatal++;
1544				mcesp->mce_forcefatal++;
1545				gbl->gbl_disp |= disp |
1546				    CMI_ERRDISP_FORCEFATAL;
1547			}
1548		} else {
1549			mcesp->mce_ignored++;
1550			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
1551		}
1552	}
1553
1554	if (unconstrained > 0)
1555		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;
1556
1557	if (curctxbad > 0)
1558		disp |= CMI_ERRDISP_CURCTXBAD;
1559
1560	if (forcefatal > 0)
1561		disp |= CMI_ERRDISP_FORCEFATAL;
1562
1563	if (gcpu_mca_queue != NULL) {
1564		int how;
1565
1566		if (ismc) {
1567			how = cmi_mce_response(rp, disp) ?
1568			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
1569			    ERRORQ_SYNC;	/* panic flow will drain */
1570		} else {
1571			how = (disp & CMI_ERRDISP_FORCEFATAL &&
1572			    cmi_panic_on_ue()) ?
1573			    ERRORQ_SYNC :	/* poller will panic */
1574			    ERRORQ_ASYNC;	/* no panic */
1575		}
1576
1577		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
1578	} else if (disp != 0) {
1579		gcpu_bleat(hdl, gcl);
1580	}
1581
1582	mcesp->mce_disp = disp;
1583
1584	return (disp);
1585}
1586
/*
 * Gather error telemetry from our source, and then submit it for
 * processing.
 */

/*
 * True if an MCi_STATUS value has the EN bit set together with at
 * least one of the UC or PCC bits, i.e. it looks like an error that
 * may produce a machine check.
 */
#define	IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)

/* True if two MCi_STATUS values are equal once the OVER bit is ignored. */
#define	STATUS_EQV(s1, s2) \
	(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))

/* Incremented by clear_mc() each time it first defers a status clear. */
static uint32_t gcpu_deferrred_polled_clears;
1599
1600#ifndef __xpv
1601static void
1602gcpu_cmci_logout(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1603    uint64_t status, int what)
1604{
1605	uint64_t ctl2;
1606
1607	if (bank_cmci_p->cmci_cap && (what == GCPU_MPT_WHAT_CYC_ERR) &&
1608	    (!(status & MSR_MC_STATUS_VAL) || ((status & MSR_MC_STATUS_VAL) &&
1609	    !(status & MSR_MC_STATUS_CEC_MASK)))) {
1610
1611		if (!(bank_cmci_p->cmci_enabled)) {
1612			/*
1613			 * when cmci is disabled, and the bank has no error or
1614			 * no corrected error for
1615			 * gcpu_mca_cmci_reenable_threshold consecutive polls,
1616			 * turn on this bank's cmci.
1617			 */
1618
1619			bank_cmci_p->drtcmci ++;
1620
1621			if (bank_cmci_p->drtcmci >=
1622			    gcpu_mca_cmci_reenable_threshold) {
1623
1624				/* turn on cmci */
1625
1626				(void) cmi_hdl_rdmsr(hdl,
1627				    IA32_MSR_MC_CTL2(bank), &ctl2);
1628				ctl2 |= MSR_MC_CTL2_EN;
1629				(void) cmi_hdl_wrmsr(hdl,
1630				    IA32_MSR_MC_CTL2(bank), ctl2);
1631
1632				/* reset counter and set flag */
1633				bank_cmci_p->drtcmci = 0;
1634				bank_cmci_p->cmci_enabled = 1;
1635			}
1636		} else {
1637			/*
1638			 * when cmci is enabled,if is in cyclic poll and the
1639			 * bank has no error or no corrected error, reset ncmci
1640			 * counter
1641			 */
1642			bank_cmci_p->ncmci = 0;
1643		}
1644	}
1645}
1646
1647static void
1648gcpu_cmci_throttle(cmi_hdl_t hdl, int bank, gcpu_mca_cmci_t *bank_cmci_p,
1649    int what)
1650{
1651	uint64_t ctl2 = 0;
1652
1653	/*
1654	 * if cmci of this bank occurred beyond
1655	 * gcpu_mca_cmci_throttling_threshold between 2 polls,
1656	 * turn off this bank's CMCI;
1657	 */
1658	if (bank_cmci_p->cmci_enabled && what == GCPU_MPT_WHAT_CMCI_ERR) {
1659
1660		/* if it is cmci trap, increase the count */
1661		bank_cmci_p->ncmci++;
1662
1663		if (bank_cmci_p->ncmci >= gcpu_mca_cmci_throttling_threshold) {
1664
1665			/* turn off cmci */
1666
1667			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC_CTL2(bank),
1668			    &ctl2);
1669			ctl2 &= ~MSR_MC_CTL2_EN;
1670			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC_CTL2(bank),
1671			    ctl2);
1672
1673			/* clear the flag and count */
1674
1675			bank_cmci_p->cmci_enabled = 0;
1676			bank_cmci_p->ncmci = 0;
1677		}
1678	}
1679}
1680#endif
1681
1682static void
1683clear_mc(int first, int last, int ismc, boolean_t clrstatus,
1684    cmi_hdl_t hdl, gcpu_logout_t *gcl, gcpu_logout_t *pgcl)
1685{
1686	int i;
1687	gcpu_bank_logout_t *gbl, *pgbl;
1688	uint64_t status;
1689
1690	if (first < 0 || last < 0)
1691		return;
1692
1693	for (i = first, gbl = &gcl->gcl_data[first]; i <= last; i++, gbl++) {
1694		status = gbl->gbl_status;
1695		if (status == 0)
1696			continue;
1697		if (clrstatus == B_FALSE)
1698			goto serialize;
1699
1700		/*
1701		 * For i86xpv we always clear status in order to invalidate
1702		 * the interposed telemetry.
1703		 *
1704		 * For native machine checks we always clear status here.  For
1705		 * native polls we must be a little more cautious since there
1706		 * is an outside chance that we may clear telemetry from a
1707		 * shared MCA bank on which a sibling core is machine checking.
1708		 *
1709		 * For polled observations of errors that look like they may
1710		 * produce a machine check (UC/PCC and ENabled, although these
1711		 * do not guarantee a machine check on error occurence)
1712		 * we will not clear the status at this wakeup unless
1713		 * we saw the same status at the previous poll.	 We will
1714		 * always process and log the current observations - it
1715		 * is only the clearing of MCi_STATUS which may be
1716		 * deferred until the next wakeup.
1717		 */
1718		if (isxpv || ismc || !IS_MCE_CANDIDATE(status)) {
1719			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
1720			goto serialize;
1721		}
1722
1723		/*
1724		 * We have a polled observation of a machine check
1725		 * candidate.  If we saw essentially the same status at the
1726		 * last poll then clear the status now since this appears
1727		 * not to be a #MC candidate after all.	 If we see quite
1728		 * different status now then do not clear, but reconsider at
1729		 * the next poll.  In no actual machine check clears
1730		 * the status in the interim then the status should not
1731		 * keep changing forever (meaning we'd never clear it)
1732		 * since before long we'll simply have latched the highest-
1733		 * priority error and set the OVerflow bit.  Nonetheless
1734		 * we count how many times we defer clearing and after
1735		 * a while insist on clearing the status.
1736		 */
1737		pgbl = &pgcl->gcl_data[i];
1738		if (pgbl->gbl_clrdefcnt != 0) {
1739			/* We deferred clear on this bank at last wakeup */
1740			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
1741			    pgbl->gbl_clrdefcnt > 5) {
1742				/*
1743				 * Status is unchanged so clear it now and,
1744				 * since we have already logged this info,
1745				 * avoid logging it again.
1746				 */
1747				gbl->gbl_status = 0;
1748				(void) cmi_hdl_wrmsr(hdl,
1749				    IA32_MSR_MC(i, STATUS), 0ULL);
1750			} else {
1751				/* Record deferral for next wakeup */
1752				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
1753			}
1754		} else {
1755			/* Record initial deferral for next wakeup */
1756			gbl->gbl_clrdefcnt = 1;
1757			gcpu_deferrred_polled_clears++;
1758		}
1759
1760serialize:
1761		{
1762#ifdef __xpv
1763			;
1764#else
1765			/*
1766			 * Intel Vol 3A says to execute a serializing
1767			 * instruction here, ie CPUID.	Well WRMSR is also
1768			 * defined to be serializing, so the status clear above
1769			 * should suffice.  To be a good citizen, and since
1770			 * some clears are deferred, we'll execute a CPUID
1771			 * instruction here.
1772			 */
1773			struct cpuid_regs tmp;
1774			(void) __cpuid_insn(&tmp);
1775#endif
1776		}
1777	}
1778}
1779
1780/*ARGSUSED5*/
1781void
1782gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
1783    gcpu_mce_status_t *mcesp, boolean_t clrstatus, int what)
1784{
1785	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
1786	gcpu_mca_t *mca = &gcpu->gcpu_mca;
1787	int nbanks = mca->gcpu_mca_nbanks;
1788	gcpu_bank_logout_t *gbl, *pgbl;
1789	gcpu_logout_t *gcl, *pgcl;
1790	int ismc = (rp != NULL);
1791	int ispoll = !ismc;
1792	int i, nerr = 0;
1793	cmi_errno_t err;
1794	uint64_t mcg_status;
1795	uint64_t disp;
1796	uint64_t cap;
1797	int first = -1;
1798	int last = -1;
1799	int willpanic = 0;
1800
1801	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
1802	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
1803	    CMI_SUCCESS) {
1804		if (mcesp != NULL)
1805			mcesp->mce_nerr = mcesp->mce_disp = 0;
1806		return;
1807	}
1808
1809	if (ismc) {
1810		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
1811	} else {
1812		int pidx = mca->gcpu_mca_nextpoll_idx;
1813		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
1814		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;
1815
1816		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
1817		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
1818		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
1819	}
1820
1821	gcl->gcl_timestamp = gethrtime_waitfree();
1822	gcl->gcl_mcg_status = mcg_status;
1823	gcl->gcl_ip = rp ? rp->r_pc : 0;
1824
1825	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
1826	if (cap & MCG_CAP_TES_P)
1827		gcl->gcl_flags |= GCPU_GCL_F_TES_P;
1828
1829	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
1830		uint64_t status, status2, addr, misc;
1831		int retries = gcpu_mca_telemetry_retries;
1832
1833		gbl->gbl_status = 0;
1834		gbl->gbl_disp = 0;
1835		gbl->gbl_clrdefcnt = 0;
1836
1837		/*
1838		 * Only logout from MCA banks we have initialized from at
1839		 * least one core.  If a core shares an MCA bank with another
1840		 * but perhaps lost the race to initialize it, then it must
1841		 * still be allowed to logout from the shared bank.
1842		 */
1843		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
1844			continue;
1845
1846		/*
1847		 * On a poll look only at the banks we've been asked to check.
1848		 */
1849		if (rp == NULL && !(bankmask & 1 << i))
1850			continue;
1851
1852
1853		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
1854		    CMI_SUCCESS)
1855			continue;
1856
1857#ifndef __xpv
1858		gcpu_cmci_logout(hdl, i, &mca->gcpu_bank_cmci[i], status, what);
1859#endif
1860
1861retry:
1862		if (!(status & MSR_MC_STATUS_VAL))
1863			continue;
1864
1865		/* First and last bank that have valid status */
1866		if (first < 0)
1867			first = i;
1868		last = i;
1869
1870		addr = -1;
1871		misc = 0;
1872
1873		if ((status & MSR_MC_STATUS_ADDRV) ||
1874		    gcpu_force_addr_in_payload)
1875			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);
1876
1877		if (status & MSR_MC_STATUS_MISCV)
1878			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);
1879
1880#ifndef __xpv
1881		gcpu_cmci_throttle(hdl, i, &mca->gcpu_bank_cmci[i], what);
1882#endif
1883
1884		/*
1885		 * Allow the model-specific code to extract bank telemetry.
1886		 */
1887		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);
1888
1889		/*
1890		 * Not all cpu models assure us that the status/address/misc
1891		 * data will not change during the above sequence of MSR reads,
1892		 * or that it can only change by the addition of the OVerflow
1893		 * bit to the status register.  If the status has changed
1894		 * other than in the overflow bit then we attempt to reread
1895		 * for a consistent snapshot, but eventually give up and
1896		 * go with what we've got.  We only perform this check
1897		 * for a poll - a further #MC during a #MC will reset, and
1898		 * polled errors should not overwrite higher-priority
1899		 * trapping errors (but could set the overflow bit).
1900		 */
1901		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
1902		    &status2)) == CMI_SUCCESS) {
1903			if (!STATUS_EQV(status, status2)) {
1904				if (retries-- > 0) {
1905					status = status2;
1906					goto retry;
1907				} else {
1908					gbl->gbl_disp |=
1909					    CMI_ERRDISP_INCONSISTENT;
1910				}
1911			}
1912		} else if (ispoll && err != CMI_SUCCESS) {
1913			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
1914		}
1915
1916		nerr++;
1917		gbl->gbl_status = status;
1918		gbl->gbl_addr = addr;
1919		gbl->gbl_misc = misc;
1920
1921		/*
1922		 * For polled observation, if the count of deferred status
1923		 * clears updated in the clear_mc() is nonzero and the
1924		 * MCi_STATUS has not changed, the last wakeup has produced
1925		 * the ereport of the error. Therefore, clear the status in
1926		 * this wakeup to avoid duplicate ereport.
1927		 */
1928		pgbl = &pgcl->gcl_data[i];
1929		if (!isxpv && ispoll && IS_MCE_CANDIDATE(status) &&
1930		    pgbl->gbl_clrdefcnt != 0) {
1931			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status)) {
1932				gbl->gbl_status = 0;
1933				(void) cmi_hdl_wrmsr(hdl,
1934				    IA32_MSR_MC(i, STATUS), 0ULL);
1935			}
1936		}
1937	}
1938
1939	if (gcpu_mca_stack_flag)
1940		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
1941	else
1942		gcl->gcl_stackdepth = 0;
1943
1944	/*
1945	 * Decide our disposition for this error or errors, and submit for
1946	 * logging and subsequent diagnosis.
1947	 */
1948	if (nerr != 0) {
1949		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
1950
1951		willpanic = (ismc && cmi_mce_response(rp, disp) == 0);
1952
1953		if (!willpanic)
1954			clear_mc(first, last, ismc, clrstatus, hdl, gcl, pgcl);
1955	} else {
1956		disp = 0;
1957		if (mcesp) {
1958			mcesp->mce_nerr = mcesp->mce_disp = 0;
1959		}
1960	}
1961
1962	/*
1963	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
1964	 * If a second #MC had occured before now the system would have
1965	 * reset.  We can only do thise once gcpu_mca_process has copied
1966	 * the logout structure.
1967	 */
1968	if (ismc && mcg_status & MCG_STATUS_MCIP)
1969		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);
1970
1971	/*
1972	 * At this point we have read and logged all telemetry that is visible
1973	 * under the MCA.  On architectures for which the NorthBridge is
1974	 * on-chip this may include NB-observed errors, but where the NB
1975	 * is off chip it may have been the source of the #MC request and
1976	 * so we must call into the memory-controller driver to give it
1977	 * a chance to log errors.
1978	 */
1979	if (ismc) {
1980		cmi_mc_logout(hdl, 1, willpanic);
1981	}
1982}
1983
1984#ifndef __xpv
1985int gcpu_mca_trap_vomit_summary = 0;
1986
1987/*
1988 * On a native machine check exception we come here from mcetrap via
1989 * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
1990 * cpus of the chip, so it is possible that another cpu on this chip could
1991 * initiate a poll while we're in the #mc handler;  it is also possible that
1992 * this trap has occured during a poll on this cpu.  So we must acquire
1993 * the chip-wide poll lock, but be careful to avoid deadlock.
1994 *
1995 * The 'data' pointer cannot be NULL due to init order.
1996 */
1997uint64_t
1998gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
1999{
2000	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2001	kmutex_t *poll_lock = NULL;
2002	gcpu_mce_status_t mce;
2003	uint64_t mcg_status;
2004	int tooklock = 0;
2005
2006	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
2007	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
2008		return (0);
2009
2010	/*
2011	 * Synchronize with any poller from another core that may happen
2012	 * to share access to one or more of the MCA banks.
2013	 */
2014	if (gcpu->gcpu_shared != NULL)
2015		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;
2016
2017	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
2018		/*
2019		 * The lock is not owned by the thread we have
2020		 * interrupted.  Spin for this adaptive lock.
2021		 */
2022		while (!mutex_tryenter(poll_lock)) {
2023			while (mutex_owner(poll_lock) != NULL)
2024				;
2025		}
2026		tooklock = 1;
2027	}
2028
2029	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE, GCPU_MPT_WHAT_MC_ERR);
2030
2031	if (tooklock)
2032		mutex_exit(poll_lock);
2033
2034	/*
2035	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
2036	 */
2037	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
2038		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
2039		    "%u PCC (%u ok), "
2040		    "%u UC (%d ok, %u poisoned), "
2041		    "%u forcefatal, %u ignored",
2042		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
2043		    mce.mce_npcc, mce.mce_npcc_ok,
2044		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
2045		    mce.mce_forcefatal, mce.mce_ignored);
2046	}
2047
2048	return (mce.mce_disp);
2049}
2050#endif
2051
2052/*ARGSUSED*/
2053void
2054gcpu_faulted_enter(cmi_hdl_t hdl)
2055{
2056	/* Nothing to do here */
2057}
2058
2059/*ARGSUSED*/
2060void
2061gcpu_faulted_exit(cmi_hdl_t hdl)
2062{
2063	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2064
2065	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
2066}
2067
2068/*
2069 * Write the requested values to the indicated MSRs.  Having no knowledge
2070 * of the model-specific requirements for writing to these model-specific
2071 * registers, we will only blindly write to those MSRs if the 'force'
2072 * argument is nonzero.  That option should only be used in prototyping
2073 * and debugging.
2074 */
2075/*ARGSUSED*/
2076cmi_errno_t
2077gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
2078    int force)
2079{
2080	int i, errs = 0;
2081
2082	for (i = 0; i < nregs; i++) {
2083		uint_t msr = regs[i].cmr_msrnum;
2084		uint64_t val = regs[i].cmr_msrval;
2085
2086		if (cms_present(hdl)) {
2087			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
2088				errs++;
2089		} else if (force) {
2090			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
2091		} else {
2092			errs++;
2093		}
2094	}
2095
2096	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
2097}
2098
2099/* deconfigure gcpu_mca_init() */
2100void
2101gcpu_mca_fini(cmi_hdl_t hdl)
2102{
2103	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
2104	gcpu_mca_t *mca = &gcpu->gcpu_mca;
2105	int i;
2106
2107	/*
2108	 * CPU startup code only calls cmi_mca_init if x86_featureset indicates
2109	 * both MCA and MCE support (i.e., X86FSET_MCA).  P5, K6, and earlier
2110	 * processors, which have their own more primitive way of doing
2111	 * machine checks, will not have cmi_mca_init called since their
2112	 * CPUID information will not indicate both MCA and MCE features.
2113	 */
2114	if (!is_x86_feature(x86_featureset, X86FSET_MCA))
2115		return;
2116#ifndef __xpv
2117	/*
2118	 * disable machine check in CR4
2119	 */
2120	cmi_ntv_hwdisable_mce(hdl);
2121#endif
2122	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);
2123	gcpu_mca_poll_fini(hdl);
2124	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);
2125
2126	/*
2127	 * free resources allocated during init
2128	 */
2129	if (mca->gcpu_bank_cmci != NULL) {
2130		kmem_free(mca->gcpu_bank_cmci, sizeof (gcpu_mca_cmci_t) *
2131		    mca->gcpu_mca_nbanks);
2132	}
2133
2134	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
2135		if (mca->gcpu_mca_logout[i] != NULL) {
2136			kmem_free(mca->gcpu_mca_logout[i], mca->gcpu_mca_lgsz);
2137		}
2138	}
2139
2140	if (mca->gcpu_mca_bioscfg.bios_bankcfg != NULL) {
2141		kmem_free(mca->gcpu_mca_bioscfg.bios_bankcfg,
2142		    sizeof (struct gcpu_bios_bankcfg) * mca->gcpu_mca_nbanks);
2143	}
2144}
2145