/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/mca_x86.h>
#include <sys/cpu_module_impl.h>
#include <sys/cpu_module_ms.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/pghw.h>
#include <sys/x86_archext.h>
#include <sys/sysmacros.h>
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/log.h>
#include <sys/psw.h>
#include <sys/fm/protocol.h>
#include <sys/fm/util.h>
#include <sys/errorq.h>
#include <sys/fm/cpu/GMCA.h>
#include <sys/sysevent.h>
#include <sys/ontrap.h>

#include "gcpu.h"

/*
 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at
 * error logout time.  The stack will be included in the ereport if the
 * error type selects stack inclusion, or in all cases if
 * gcpu_mca_stack_ereport_include is nonzero.
 */
int gcpu_mca_stack_flag = 0;
int gcpu_mca_stack_ereport_include = 0;

/*
 * The number of times to re-read MCA telemetry to try to obtain a
 * consistent snapshot if we find it to be changing under our feet.
 */
int gcpu_mca_telemetry_retries = 5;
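
/*
 * All three of the above are plain global variables and so can be
 * tuned from /etc/system or with a debugger; gcpu_mca_init() below
 * resets gcpu_mca_telemetry_retries to its default if a setting lies
 * outside the range 0 to 100.
 */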

static gcpu_error_disp_t gcpu_errtypes[] = {

	/*
	 * Unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_UNCLASSIFIED,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKON,
		MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF
	},

	/*
	 * Microcode ROM Parity Error
	 */
	{
		FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKON,
		MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF
	},

	/*
	 * External - BINIT# from another processor during power-on config
	 */
	{
		FM_EREPORT_CPU_GENERIC_EXTERNAL,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_EXTERNAL_MASKON,
		MCAX86_SIMPLE_EXTERNAL_MASKOFF
	},

	/*
	 * Functional redundancy check master/slave error
	 */
	{
		FM_EREPORT_CPU_GENERIC_FRC,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_FRC_MASKON,
		MCAX86_SIMPLE_FRC_MASKOFF
	},

	/*
	 * Internal timer error
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKON,
		MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF
	},

	/*
	 * Internal unclassified
	 */
	{
		FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON,
		MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF
	},

	/*
	 * Compound error codes - generic memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_GENMEMHIER,
		NULL,
		FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON,
		MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - TLB errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_TLB,
		"%1$s" "TLB" "%2$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_TLB_MASKON,
		MCAX86_COMPOUND_TLB_MASKOFF
	},

	/*
	 * Compound error codes - memory hierarchy
	 */
	{
		FM_EREPORT_CPU_GENERIC_MEMHIER,
		"%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_MEMHIER_MASKON,
		MCAX86_COMPOUND_MEMHIER_MASKOFF
	},

	/*
	 * Compound error codes - bus and interconnect errors
	 */
	{
		FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT,
		"BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR",
		FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON,
		MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF
	},
};

static gcpu_error_disp_t gcpu_unknown = {
	FM_EREPORT_CPU_GENERIC_UNKNOWN,
	"UNKNOWN",
	FM_EREPORT_PAYLOAD_FLAGS_COMMON,
	0,
	0
};

static errorq_t *gcpu_mca_queue;
static kmutex_t gcpu_mca_queue_lock;

static const gcpu_error_disp_t *
gcpu_disp_match(uint16_t code)
{
	const gcpu_error_disp_t *ged = gcpu_errtypes;
	int i;

	for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t);
	    i++, ged++) {
		uint16_t on = ged->ged_errcode_mask_on;
		uint16_t off = ged->ged_errcode_mask_off;

		if ((code & on) == on && (code & off) == 0)
			return (ged);
	}

	return (NULL);
}

static uint8_t
bit_strip(uint16_t code, uint16_t mask, uint16_t shift)
{
	return ((uint8_t)((code & mask) >> shift));
}
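
/*
 * BIT_STRIP() pairs bit_strip() with the architectural
 * MCAX86_ERRCODE_* mask/shift definitions; BIT_STRIP(code, RRRR),
 * for example, extracts the 4-bit memory transaction request type
 * from a compound error code.
 */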

#define	BIT_STRIP(code, name) \
	bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \
	MCAX86_ERRCODE_##name##_SHIFT)

#define	GCPU_MNEMONIC_UNDEF	"undefined"
#define	GCPU_MNEMONIC_RESVD	"reserved"

/*
 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name
 * mnemonics and to ereport class name components.
 */

struct gcpu_mnexp {
	const char *mne_compound;	/* used in expanding compound errname */
	const char *mne_ereport;	/* used in expanding ereport class */
};

static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */
	{ "I", FM_EREPORT_CPU_GENERIC_TT_INSTR },		/* INSTR */
	{ "D", FM_EREPORT_CPU_GENERIC_TT_DATA },		/* DATA */
	{ "G", FM_EREPORT_CPU_GENERIC_TT_GEN },			/* GEN */
	{ GCPU_MNEMONIC_UNDEF, "" }
};

static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */
	{ "L0", FM_EREPORT_CPU_GENERIC_LL_L0 },			/* L0 */
	{ "L1", FM_EREPORT_CPU_GENERIC_LL_L1 },			/* L1 */
	{ "L2", FM_EREPORT_CPU_GENERIC_LL_L2 },			/* L2 */
	{ "LG", FM_EREPORT_CPU_GENERIC_LL_LG }			/* LG */
};

static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */
	{ "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR },		/* ERR */
	{ "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD },		/* RD */
	{ "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR },		/* WR */
	{ "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD },		/* DRD */
	{ "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR },		/* DWR */
	{ "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD },		/* IRD */
	{ "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH },	/* PREFETCH */
	{ "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT },		/* EVICT */
	{ "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP },		/* SNOOP */
};

static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */
	{ "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC },		/* SRC */
	{ "RES", FM_EREPORT_CPU_GENERIC_PP_RES },		/* RES */
	{ "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS },		/* OBS */
	{ "", FM_EREPORT_CPU_GENERIC_PP_GEN }			/* GEN */
};

static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */
	{ "M", FM_EREPORT_CPU_GENERIC_II_MEM },			/* MEM */
	{ GCPU_MNEMONIC_RESVD, "" },
	{ "IO", FM_EREPORT_CPU_GENERIC_II_IO },			/* IO */
	{ "", FM_EREPORT_CPU_GENERIC_II_GEN }			/* GEN */
};

static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */
	{ "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT },	/* NONE */
	{ "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT }		/* TIMEOUT */
};

enum gcpu_mn_namespace {
	GCPU_MN_NAMESPACE_COMPOUND,
	GCPU_MN_NAMESPACE_EREPORT
};

static const char *
gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val,
    enum gcpu_mn_namespace nspace)
{
	if (val >= tbl_sz)
		return (GCPU_MNEMONIC_UNDEF);	/* for all namespaces */

	switch (nspace) {
	case GCPU_MN_NAMESPACE_COMPOUND:
		return (tbl[val].mne_compound);
		/*NOTREACHED*/

	case GCPU_MN_NAMESPACE_EREPORT:
		return (tbl[val].mne_ereport);
		/*NOTREACHED*/

	default:
		return (GCPU_MNEMONIC_UNDEF);
		/*NOTREACHED*/
	}
}

/*
 * The ereport class leaf component is either a simple string with no
 * format specifiers, or a string with one or more embedded %n$s specifiers -
 * positional selection for string arguments.  The kernel snprintf does
 * not support %n$ (and teaching it to do so is too big a headache) so
 * we will expand this restricted format string ourselves.
 */
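
/*
 * As a worked example: a memory hierarchy error code with
 * TT=instruction, LL=L1 and RRRR=read expands the MEMHIER format
 * "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR" above to the compound
 * name "ICACHEL1_RD_ERR".
 */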
311 */ 312 313 #define GCPU_CLASS_VARCOMPS 7 314 315 #define GCPU_MNEMONIC(code, name, nspace) \ 316 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 317 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 318 BIT_STRIP(code, name), nspace) 319 320 static void 321 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 322 enum gcpu_mn_namespace nspace) 323 { 324 uint16_t code = MCAX86_ERRCODE(status); 325 const char *mn[GCPU_CLASS_VARCOMPS]; 326 char *p = buf; /* current position in buf */ 327 char *q = buf + buflen; /* pointer past last char in buf */ 328 int which, expfmtchar, error; 329 char c; 330 331 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 332 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 333 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 334 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 335 mn[4] = GCPU_MNEMONIC(code, II, nspace); 336 mn[5] = GCPU_MNEMONIC(code, T, nspace); 337 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 338 339 while (p < q - 1 && (c = *fmt++) != '\0') { 340 if (c != '%') { 341 /* not the beginning of a format specifier - copy */ 342 *p++ = c; 343 continue; 344 } 345 346 error = 0; 347 which = -1; 348 expfmtchar = -1; 349 350 nextfmt: 351 if ((c = *fmt++) == '\0') 352 break; /* early termination of fmt specifier */ 353 354 switch (c) { 355 case '1': 356 case '2': 357 case '3': 358 case '4': 359 case '5': 360 case '6': 361 case '7': 362 if (which != -1) { /* allow only one positional digit */ 363 error++; 364 break; 365 } 366 which = c - '1'; 367 goto nextfmt; 368 /*NOTREACHED*/ 369 370 case '$': 371 if (which == -1) { /* no position specified */ 372 error++; 373 break; 374 } 375 expfmtchar = 's'; 376 goto nextfmt; 377 /*NOTREACHED*/ 378 379 case 's': 380 if (expfmtchar != 's') { 381 error++; 382 break; 383 } 384 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 385 mn[which]); 386 p += strlen(p); 387 break; 388 389 default: 390 error++; 391 break; 392 } 393 394 if (error) 395 break; 396 } 397 398 *p = '\0'; /* NUL termination */ 399 } 400 401 static void 402 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 403 const char *cpuclass, const char *leafclass) 404 { 405 char *p = buf; /* current position in buf */ 406 char *q = buf + buflen; /* pointer past last char in buf */ 407 408 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 409 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 410 411 p += strlen(p); 412 if (p >= q) 413 return; 414 415 if (leafclass == NULL) { 416 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 417 GCPU_MN_NAMESPACE_EREPORT); 418 } else { 419 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 420 leafclass); 421 } 422 } 423 424 /* 425 * Create an "hc" scheme FMRI identifying the given cpu. We don't know 426 * the actual topology/connectivity of cpus in the system, so we'll 427 * apply /motherboard=0/chip=.../cpu=... in all cases. 428 */ 429 static nvlist_t * 430 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 431 { 432 nvlist_t *nvl; 433 434 if ((nvl = fm_nvlist_create(nva)) == NULL) 435 return (NULL); 436 437 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 3, 438 "motherboard", 0, 439 "chip", cmi_hdl_chipid(hdl), 440 "cpu", cmi_hdl_coreid(hdl)); 441 442 return (nvl); 443 } 444 445 int gcpu_bleat_count_thresh = 5; 446 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 447 448 /* 449 * Called when we are unable to propogate a logout structure onto an 450 * errorq for subsequent ereport preparation and logging etc. 

/*
 * Called when we are unable to propagate a logout structure onto an
 * errorq for subsequent ereport preparation and logging etc.  The caller
 * should usually only decide to call this for severe errors - those we
 * suspect we may need to panic for.
 */
static void
gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl)
{
	hrtime_t now = gethrtime_waitfree();
	static hrtime_t gcpu_last_bleat;
	gcpu_bank_logout_t *gbl;
	static int bleatcount;
	int i;

	/*
	 * Throttle spamming of the console.  The first gcpu_bleat_count_thresh
	 * complaints can come as fast as we like, but once we've spammed that
	 * many to the console we require a minimum interval to pass before
	 * any more complaints.
	 */
	if (++bleatcount > gcpu_bleat_count_thresh) {
		if (now - gcpu_last_bleat < gcpu_bleat_min_interval)
			return;
		else
			bleatcount = 0;
	}
	gcpu_last_bleat = now;

	cmn_err(CE_WARN, "Machine-Check Errors unlogged on chip %d core %d, "
	    "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl));
	cmn_err(CE_WARN, "MCG_STATUS 0x%016llx",
	    (u_longlong_t)gcl->gcl_mcg_status);
	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
		uint64_t status = gbl->gbl_status;

		if (!(status & MSR_MC_STATUS_VAL))
			continue;

		switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) {
		case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)status,
			    (u_longlong_t)gbl->gbl_addr,
			    (u_longlong_t)gbl->gbl_misc);
			break;

		case MSR_MC_STATUS_ADDRV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx ADDR 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)status,
			    (u_longlong_t)gbl->gbl_addr);
			break;

		case MSR_MC_STATUS_MISCV:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx MISC 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)status,
			    (u_longlong_t)gbl->gbl_misc);
			break;

		default:
			cmn_err(CE_WARN, "Bank %d (offset 0x%llx) "
			    "STAT 0x%016llx",
			    i, IA32_MSR_MC(i, STATUS),
			    (u_longlong_t)status);
			break;
		}
	}
}

#define	_GCPU_BSTATUS(status, what) \
	FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \
	(status) & MSR_MC_STATUS_##what ? B_TRUE : B_FALSE
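
/*
 * _GCPU_BSTATUS(status, OVER), for example, expands to the payload
 * name, data type and boolean value triple that fm_payload_set()
 * expects for one member, reporting whether the OVER bit is set in
 * the given MCi_STATUS value.
 */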
542 */ 543 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 544 ged->ged_compound_fmt != NULL) { 545 char buf[FM_MAX_CLASS]; 546 547 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 548 GCPU_MN_NAMESPACE_COMPOUND); 549 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 550 DATA_TYPE_STRING, buf, NULL); 551 } 552 553 /* 554 * Include disposition information for this error 555 */ 556 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 557 gbl->gbl_disp != 0) { 558 int i, empty = 1; 559 char buf[128]; 560 char *p = buf, *q = buf + 128; 561 static struct _gcpu_disp_name { 562 uint64_t dv; 563 const char *dn; 564 } disp_names[] = { 565 { CMI_ERRDISP_CURCTXBAD, 566 "processor_context_corrupt" }, 567 { CMI_ERRDISP_RIPV_INVALID, 568 "return_ip_invalid" }, 569 { CMI_ERRDISP_UC_UNCONSTRAINED, 570 "unconstrained" }, 571 { CMI_ERRDISP_FORCEFATAL, 572 "forcefatal" }, 573 { CMI_ERRDISP_IGNORED, 574 "ignored" }, 575 { CMI_ERRDISP_PCC_CLEARED, 576 "corrupt_context_cleared" }, 577 { CMI_ERRDISP_UC_CLEARED, 578 "uncorrected_data_cleared" }, 579 { CMI_ERRDISP_POISONED, 580 "poisoned" }, 581 { CMI_ERRDISP_INCONSISTENT, 582 "telemetry_unstable" }, 583 }; 584 585 for (i = 0; i < sizeof (disp_names) / 586 sizeof (struct _gcpu_disp_name); i++) { 587 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 588 continue; 589 590 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 591 "%s%s", empty ? "" : ",", disp_names[i].dn); 592 p += strlen(p); 593 empty = 0; 594 } 595 596 if (p != buf) 597 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 598 DATA_TYPE_STRING, buf, NULL); 599 } 600 601 /* 602 * If MCG_STATUS is included add that and an indication of whether 603 * this ereport was the result of a machine check or poll. 604 */ 605 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 606 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 607 DATA_TYPE_UINT64, mcg, NULL); 608 609 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 610 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 611 } 612 613 /* 614 * If an instruction pointer is to be included add one provided 615 * MCG_STATUS indicated it is valid; meaningless for polled events. 616 */ 617 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 618 mcg & MCG_STATUS_EIPV) { 619 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 620 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 621 } 622 623 /* 624 * Add an indication of whether the trap occured during privileged code. 625 */ 626 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 627 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 628 DATA_TYPE_BOOLEAN_VALUE, 629 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 630 } 631 632 /* 633 * If requested, add the index of the MCA bank. This indicates the 634 * n'th bank of 4 MCA registers, and does not necessarily correspond 635 * to MCi_* - use the bank offset to correlate 636 */ 637 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 638 fm_payload_set(ereport, 639 /* Bank number */ 640 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 641 /* Offset of MCi_CTL */ 642 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 643 IA32_MSR_MC(bankno, CTL), 644 NULL); 645 } 646 647 /* 648 * Add MCi_STATUS if requested, and decode it. 
649 */ 650 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 651 const char *tbes[] = { 652 "No tracking", /* 00 */ 653 "Green - below threshold", /* 01 */ 654 "Yellow - above threshold", /* 10 */ 655 "Reserved" /* 11 */ 656 }; 657 658 fm_payload_set(ereport, 659 /* Bank MCi_STATUS */ 660 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 661 /* Overflow? */ 662 _GCPU_BSTATUS(bstat, OVER), 663 /* Uncorrected? */ 664 _GCPU_BSTATUS(bstat, UC), 665 /* Enabled? */ 666 _GCPU_BSTATUS(bstat, EN), 667 /* Processor context corrupt? */ 668 _GCPU_BSTATUS(bstat, PCC), 669 /* Error code */ 670 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 671 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 672 /* Model-specific error code */ 673 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 674 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 675 NULL); 676 677 /* 678 * If MCG_CAP.TES_P indicates that that thresholding info 679 * is present in the architural component of the bank status 680 * then include threshold information for this bank. 681 */ 682 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 683 fm_payload_set(ereport, 684 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 685 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 686 NULL); 687 } 688 } 689 690 /* 691 * MCi_ADDR info if requested and valid. 692 */ 693 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 694 bstat & MSR_MC_STATUS_ADDRV) { 695 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 696 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 697 } 698 699 /* 700 * MCi_MISC if requested and MCi_STATUS.MISCV). 701 */ 702 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 703 bstat & MSR_MC_STATUS_MISCV) { 704 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 705 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 706 } 707 708 } 709 710 /* 711 * Construct and post an ereport based on the logout information from a 712 * single MCA bank. We are not necessarily running on the cpu that 713 * detected the error. 714 */ 715 static void 716 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 717 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 718 { 719 gcpu_data_t *gcpu = gcl->gcl_gcpu; 720 cmi_hdl_t hdl = gcpu->gcpu_hdl; 721 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 722 const char *cpuclass = NULL, *leafclass = NULL; 723 uint16_t code = MCAX86_ERRCODE(status); 724 errorq_elem_t *eqep, *scr_eqep; 725 nvlist_t *ereport, *detector; 726 char buf[FM_MAX_CLASS]; 727 const char *classfmt; 728 nv_alloc_t *nva; 729 730 if (panicstr) { 731 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 732 return; 733 ereport = errorq_elem_nvl(ereport_errorq, eqep); 734 735 /* 736 * Allocate another element for scratch space, but fallback 737 * to the one we have if that fails. We'd like to use the 738 * additional scratch space for nvlist construction. 739 */ 740 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 741 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 742 else 743 nva = errorq_elem_nva(ereport_errorq, eqep); 744 } else { 745 ereport = fm_nvlist_create(NULL); 746 nva = NULL; 747 } 748 749 if (ereport == NULL) 750 return; 751 752 /* 753 * Common payload data required by the protocol: 754 * - ereport class 755 * - detector 756 * - ENA 757 */ 758 759 /* 760 * Ereport class - call into model-specific support to allow it to 761 * provide a cpu class or leaf class, otherwise calculate our own. 762 */ 763 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 764 classfmt = ged ? 
	classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN;
	gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass,
	    leafclass);

	/*
	 * The detector FMRI.
	 */
	if ((detector = cms_ereport_detector(hdl, mscookie, nva)) == NULL)
		detector = gcpu_fmri_create(hdl, nva);

	/*
	 * Should we define a new ENA format (a format 3?) for
	 * chip/core/strand?  It would be better when virtualized.
	 */
	fm_ereport_set(ereport, FM_EREPORT_VERSION, buf,
	    fm_ena_generate_cpu(gcl->gcl_timestamp,
	    cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 |
	    cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL);

	if (panicstr) {
		fm_nvlist_destroy(detector, FM_NVA_RETAIN);
		nv_alloc_reset(nva);
	} else {
		fm_nvlist_destroy(detector, FM_NVA_FREE);
	}

	/*
	 * Add the architectural ereport class-specific payload data.
	 */
	gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code);

	/*
	 * Allow model-specific code to add ereport members.
	 */
	cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status,
	    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie);

	/*
	 * Include a stack if the option is turned on and either selected in
	 * the payload member bitmask or inclusion is forced.
	 */
	if (gcpu_mca_stack_flag &&
	    (cms_ereport_includestack(hdl, mscookie) ==
	    B_TRUE || gcpu_mca_stack_ereport_include)) {
		fm_payload_stack_add(ereport, gcl->gcl_stack,
		    gcl->gcl_stackdepth);
	}

	/*
	 * Post ereport.
	 */
	if (panicstr) {
		errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC);
		if (scr_eqep)
			errorq_cancel(ereport_errorq, scr_eqep);
	} else {
		(void) fm_ereport_post(ereport, EVCH_TRYHARD);
		fm_nvlist_destroy(ereport, FM_NVA_FREE);
	}
}

/*ARGSUSED*/
void
gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe)
{
	const gcpu_logout_t *gcl = data;
	const gcpu_bank_logout_t *gbl;
	int i;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) {
		const gcpu_error_disp_t *gened;
		cms_cookie_t mscookie;

		if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
		    !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
			uint16_t code = MCAX86_ERRCODE(gbl->gbl_status);

			/*
			 * Perform a match based on IA32 MCA architectural
			 * components alone.
			 */
			gened = gcpu_disp_match(code);	/* may be NULL */

			/*
			 * Now see if a model-specific match can be made.
			 */
			mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i,
			    gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc,
			    gcl->gcl_ms_logout);

			/*
			 * Prepare and dispatch an ereport for logging and
			 * diagnosis.
			 */
			gcpu_ereport_post(gcl, i, gened, mscookie,
			    gbl->gbl_status);
		} else if (gbl->gbl_status & MSR_MC_STATUS_VAL &&
		    (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) {
			/*
			 * Telemetry kept changing as we tried to read
			 * it.  Force an unknown ereport leafclass but
			 * keep the telemetry unchanged for logging.
			 */
			gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL,
			    gbl->gbl_status);
		}
	}
}

static size_t gcpu_mca_queue_datasz = 0;

/*
 * The following code is ready to make a weak attempt at growing the
 * errorq structure size.  Since it is not foolproof (we don't know
 * who may already be producing to the outgoing errorq) our caller
 * instead assures that we'll always be called with no greater data
 * size than on our first call.
 */
882 */ 883 static void 884 gcpu_errorq_init(size_t datasz) 885 { 886 int slots; 887 888 mutex_enter(&gcpu_mca_queue_lock); 889 890 if (gcpu_mca_queue_datasz >= datasz) { 891 mutex_exit(&gcpu_mca_queue_lock); 892 return; 893 } 894 895 membar_producer(); 896 if (gcpu_mca_queue) { 897 gcpu_mca_queue_datasz = 0; 898 errorq_destroy(gcpu_mca_queue); 899 } 900 901 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 902 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 903 904 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 905 NULL, slots, datasz, 1, ERRORQ_VITAL); 906 907 if (gcpu_mca_queue != NULL) 908 gcpu_mca_queue_datasz = datasz; 909 910 mutex_exit(&gcpu_mca_queue_lock); 911 } 912 913 /* 914 * Perform MCA initialization as described in section 14.6 of Intel 64 915 * and IA-32 Architectures Software Developer's Manual Volume 3A. 916 */ 917 918 static uint_t global_nbanks; 919 920 void 921 gcpu_mca_init(cmi_hdl_t hdl) 922 { 923 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 924 uint64_t cap; 925 uint_t vendor = cmi_hdl_vendor(hdl); 926 uint_t family = cmi_hdl_family(hdl); 927 gcpu_mca_t *mca = &gcpu->gcpu_mca; 928 int mcg_ctl_present; 929 uint_t nbanks; 930 size_t mslsz; 931 int i; 932 933 if (gcpu == NULL) 934 return; 935 936 /* 937 * Protect from some silly /etc/system settings. 938 */ 939 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 940 gcpu_mca_telemetry_retries = 5; 941 942 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 943 return; 944 945 /* 946 * CPU startup code only calls cmi_mca_init if x86_feature indicates 947 * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier 948 * processors, which have their own * more primitive way of doing 949 * machine checks, will not have cmi_mca_init called since their 950 * CPUID information will not indicate both MCA and MCE features. 951 */ 952 #ifndef __xpv 953 ASSERT(x86_feature & X86_MCA); 954 #endif /* __xpv */ 955 956 /* 957 * Determine whether the IA32_MCG_CTL register is present. If it 958 * is we will enable all features by writing -1 to it towards 959 * the end of this initialization; if it is absent then volume 3A 960 * says we must nonetheless continue to initialize the individual 961 * banks. 962 */ 963 mcg_ctl_present = cap & MCG_CAP_CTL_P; 964 965 /* 966 * We squirell values away for inspection/debugging. 967 */ 968 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 969 if (mcg_ctl_present) 970 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 971 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 972 973 /* 974 * Determine the number of error-reporting banks implemented. 975 */ 976 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 977 978 if (nbanks != 0 && global_nbanks == 0) 979 global_nbanks = nbanks; /* no race - BSP will get here first */ 980 981 /* 982 * If someone is hiding the number of banks (perhaps we are fully 983 * virtualized?) or if this processor has more banks than the 984 * first to set global_nbanks then bail. The latter requirement 985 * is because we need to size our errorq data structure and we 986 * don't want to have to grow the errorq (destroy and recreate) 987 * which may just lose some telemetry. 988 */ 989 if (nbanks == 0 || nbanks > global_nbanks) 990 return; 991 992 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 993 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 994 995 /* 996 * Calculate the size we need to allocate for a gcpu_logout_t 997 * with a gcl_data array big enough for all banks of this cpu. 

	/*
	 * Calculate the size we need to allocate for a gcpu_logout_t
	 * with a gcl_data array big enough for all banks of this cpu.
	 * Add any space requested by the model-specific logout support.
	 */
	mslsz = cms_logout_size(hdl);
	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;

	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
		gcpu_logout_t *gcl;

		mca->gcpu_mca_logout[i] = gcl =
		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
		gcl->gcl_gcpu = gcpu;
		gcl->gcl_nbanks = nbanks;
		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
		    (char *)(&gcl->gcl_data[0]) + nbanks *
		    sizeof (gcpu_bank_logout_t);
	}
	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;

	/*
	 * Create our errorq to transport the logout structures.  This
	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
	 */
	gcpu_errorq_init(mca->gcpu_mca_lgsz);

	/*
	 * Not knowing which, if any, banks are shared between cores we
	 * assure serialization of MCA bank initialization by each cpu
	 * on the chip.  On chip architectures in which some banks are
	 * shared this will mean the shared resource is initialized more
	 * than once - we're simply aiming to avoid simultaneous MSR writes
	 * to the shared resource.
	 *
	 * Even with these precautions, some platforms may yield a GP fault
	 * if a core other than a designated master tries to write anything
	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
	 * those writes under on_trap protection.
	 */
	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);

	/*
	 * Initialize poller data, but don't start polling yet.
	 */
	gcpu_mca_poll_init(hdl);

	/*
	 * Work out which MCA banks we will initialize.  In MCA logout
	 * code we will only read those banks which we initialize here.
	 */
	for (i = 0; i < nbanks; i++) {
		/*
		 * On Intel family 6 and AMD family 6 we must not enable
		 * machine check from bank 0 detectors.  In the Intel
		 * case bank 0 is reserved for the platform, while in the
		 * AMD case reports are that enabling bank 0 (DC) produces
		 * spurious machine checks.
		 */
		if (i == 0 && ((vendor == X86_VENDOR_Intel ||
		    vendor == X86_VENDOR_AMD) && family == 6))
			continue;

		if (cms_bankctl_skipinit(hdl, i))
			continue;

		/*
		 * Record which MCA banks were enabled, both from the
		 * point of view of this core and accumulating for the
		 * whole chip (if some cores share a bank we must be
		 * sure either can logout from it).
		 */
		mca->gcpu_actv_banks |= 1 << i;
		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
	}

	/*
	 * Log any valid telemetry lurking in the MCA banks, but do not
	 * clear the status registers.  Ignore the disposition returned -
	 * we have already panicked or reset for any nasty errors found here.
	 */
	gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE);
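	/*
	 * (A NULL regs pointer marks that call as a poll rather than a
	 * machine check, and the -1ULL bankmask selects every bank we
	 * may log out from.)
	 */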
1094 */ 1095 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL), 1096 &bcfgp->bios_bank_ctl); 1097 1098 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), 1099 &bcfgp->bios_bank_status); 1100 1101 if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV) 1102 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), 1103 &bcfgp->bios_bank_addr); 1104 1105 if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV) 1106 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), 1107 &bcfgp->bios_bank_misc); 1108 1109 if (!(mca->gcpu_actv_banks & 1 << i)) 1110 continue; 1111 1112 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL), 1113 cms_bankctl_val(hdl, i, -1ULL)); 1114 1115 if (!cms_bankstatus_skipinit(hdl, i)) { 1116 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 1117 cms_bankstatus_val(hdl, i, 0ULL)); 1118 } 1119 } 1120 1121 /* 1122 * Now let the model-specific support perform further initialization 1123 * of non-architectural features. 1124 */ 1125 cms_mca_init(hdl, nbanks); 1126 1127 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL); 1128 membar_producer(); 1129 1130 /* enable all machine-check features */ 1131 if (mcg_ctl_present) 1132 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL, 1133 cms_mcgctl_val(hdl, nbanks, -1ULL)); 1134 1135 mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock); 1136 1137 /* enable machine-check exception in CR4 */ 1138 cmi_hdl_enable_mce(hdl); 1139 } 1140 1141 static uint64_t 1142 gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu, 1143 gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp) 1144 { 1145 int curctxbad = 0, unconstrained = 0, forcefatal = 0; 1146 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1147 int nbanks = mca->gcpu_mca_nbanks; 1148 gcpu_mce_status_t mce; 1149 gcpu_bank_logout_t *gbl; 1150 uint64_t disp = 0; 1151 int i; 1152 1153 if (mcesp == NULL) 1154 mcesp = &mce; 1155 1156 mcesp->mce_nerr = nerr; 1157 1158 mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc = 1159 mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned = 1160 mcesp->mce_forcefatal = mcesp->mce_ignored = 0; 1161 1162 /* 1163 * If this a machine check then if the return instruction pointer 1164 * is not valid the current context is lost. 1165 */ 1166 if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV)) 1167 disp |= CMI_ERRDISP_RIPV_INVALID; 1168 1169 for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) { 1170 uint64_t mcistatus = gbl->gbl_status; 1171 uint32_t ms_scope; 1172 int pcc, uc; 1173 int poisoned; 1174 1175 if (!(mcistatus & MSR_MC_STATUS_VAL)) 1176 continue; 1177 1178 if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT) 1179 continue; 1180 1181 pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0; 1182 uc = (mcistatus & MSR_MC_STATUS_UC) != 0; 1183 mcesp->mce_npcc += pcc; 1184 mcesp->mce_nuc += uc; 1185 1186 ms_scope = cms_error_action(hdl, ismc, i, mcistatus, 1187 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout); 1188 1189 if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) { 1190 pcc = 0; 1191 mcesp->mce_npcc_ok++; 1192 gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED; 1193 } 1194 1195 if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) { 1196 uc = 0; 1197 mcesp->mce_nuc_ok++; 1198 gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED; 1199 } 1200 1201 if (uc) { 1202 poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0; 1203 if (poisoned) { 1204 mcesp->mce_nuc_poisoned++; 1205 gbl->gbl_disp |= CMI_ERRDISP_POISONED; 1206 } 1207 } 1208 1209 if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) { 1210 /* 1211 * We're not being instructed to ignore the error, 1212 * so apply our standard disposition logic to it. 
1213 */ 1214 if (uc && !poisoned) { 1215 unconstrained++; 1216 gbl->gbl_disp |= disp | 1217 CMI_ERRDISP_UC_UNCONSTRAINED; 1218 } 1219 1220 if (pcc && ismc) { 1221 curctxbad++; 1222 gbl->gbl_disp |= disp | 1223 CMI_ERRDISP_CURCTXBAD; 1224 } 1225 1226 /* 1227 * Even if the above may not indicate that the error 1228 * is terminal, model-specific support may insist 1229 * that we treat it as such. Such errors wil be 1230 * fatal even if discovered via poll. 1231 */ 1232 if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) { 1233 forcefatal++; 1234 mcesp->mce_forcefatal++; 1235 gbl->gbl_disp |= disp | 1236 CMI_ERRDISP_FORCEFATAL; 1237 } 1238 } else { 1239 mcesp->mce_ignored++; 1240 gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED; 1241 } 1242 } 1243 1244 if (unconstrained > 0) 1245 disp |= CMI_ERRDISP_UC_UNCONSTRAINED; 1246 1247 if (curctxbad > 0) 1248 disp |= CMI_ERRDISP_CURCTXBAD; 1249 1250 if (forcefatal > 0) 1251 disp |= CMI_ERRDISP_FORCEFATAL; 1252 1253 if (gcpu_mca_queue != NULL) { 1254 int how; 1255 1256 if (ismc) { 1257 how = cmi_mce_response(rp, disp) ? 1258 ERRORQ_ASYNC : /* no panic, so arrange drain */ 1259 ERRORQ_SYNC; /* panic flow will drain */ 1260 } else { 1261 how = (disp & CMI_ERRDISP_FORCEFATAL && 1262 cmi_panic_on_ue()) ? 1263 ERRORQ_SYNC : /* poller will panic */ 1264 ERRORQ_ASYNC; /* no panic */ 1265 } 1266 1267 errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how); 1268 } else if (disp != 0) { 1269 gcpu_bleat(hdl, gcl); 1270 } 1271 1272 mcesp->mce_disp = disp; 1273 1274 return (disp); 1275 } 1276 1277 /* 1278 * Gather error telemetry from our source, and then submit it for 1279 * processing. 1280 */ 1281 1282 #define IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \ 1283 ((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0) 1284 1285 #define STATUS_EQV(s1, s2) \ 1286 (((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER)) 1287 1288 static uint32_t gcpu_deferrred_polled_clears; 1289 1290 void 1291 gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask, 1292 gcpu_mce_status_t *mcesp, boolean_t clrstatus) 1293 { 1294 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 1295 gcpu_mca_t *mca = &gcpu->gcpu_mca; 1296 int nbanks = mca->gcpu_mca_nbanks; 1297 gcpu_bank_logout_t *gbl, *pgbl; 1298 gcpu_logout_t *gcl, *pgcl; 1299 int ismc = (rp != NULL); 1300 int ispoll = !ismc; 1301 int i, nerr = 0; 1302 cmi_errno_t err; 1303 uint64_t mcg_status; 1304 uint64_t disp; 1305 uint64_t cap; 1306 1307 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) != 1308 CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != 1309 CMI_SUCCESS) { 1310 if (mcesp != NULL) 1311 mcesp->mce_nerr = mcesp->mce_disp = 0; 1312 return; 1313 } 1314 1315 if (ismc) { 1316 gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION]; 1317 } else { 1318 int pidx = mca->gcpu_mca_nextpoll_idx; 1319 int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ? 1320 GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1; 1321 1322 gcl = mca->gcpu_mca_logout[pidx]; /* current logout */ 1323 pgcl = mca->gcpu_mca_logout[ppidx]; /* previous logout */ 1324 mca->gcpu_mca_nextpoll_idx = ppidx; /* switch next time */ 1325 } 1326 1327 gcl->gcl_timestamp = gethrtime_waitfree(); 1328 gcl->gcl_mcg_status = mcg_status; 1329 gcl->gcl_ip = rp ? rp->r_pc : 0; 1330 1331 gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? 

static uint32_t gcpu_deferred_polled_clears;

void
gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
    gcpu_mce_status_t *mcesp, boolean_t clrstatus)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int nbanks = mca->gcpu_mca_nbanks;
	gcpu_bank_logout_t *gbl, *pgbl;
	gcpu_logout_t *gcl, *pgcl;
	int ismc = (rp != NULL);
	int ispoll = !ismc;
	int i, nerr = 0;
	cmi_errno_t err;
	uint64_t mcg_status;
	uint64_t disp;
	uint64_t cap;

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
	    CMI_SUCCESS) {
		if (mcesp != NULL)
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		return;
	}

	if (ismc) {
		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
	} else {
		int pidx = mca->gcpu_mca_nextpoll_idx;
		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;

		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
	}

	gcl->gcl_timestamp = gethrtime_waitfree();
	gcl->gcl_mcg_status = mcg_status;
	gcl->gcl_ip = rp ? rp->r_pc : 0;

	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
	if (cap & MCG_CAP_TES_P)
		gcl->gcl_flags |= GCPU_GCL_F_TES_P;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
		uint64_t status, status2, addr, misc;
		int retries = gcpu_mca_telemetry_retries;

		gbl->gbl_status = 0;
		gbl->gbl_disp = 0;
		gbl->gbl_clrdefcnt = 0;

		/*
		 * Only logout from MCA banks we have initialized from at
		 * least one core.  If a core shares an MCA bank with another
		 * but perhaps lost the race to initialize it, then it must
		 * still be allowed to logout from the shared bank.
		 */
		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
			continue;

		/*
		 * On a poll look only at the banks we've been asked to check.
		 */
		if (rp == NULL && !(bankmask & 1 << i))
			continue;

		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
		    CMI_SUCCESS)
			continue;
retry:
		if (!(status & MSR_MC_STATUS_VAL))
			continue;

		addr = -1;
		misc = 0;

		if (status & MSR_MC_STATUS_ADDRV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);

		if (status & MSR_MC_STATUS_MISCV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);

		/*
		 * Allow the model-specific code to extract bank telemetry.
		 */
		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);

		/*
		 * Not all cpu models assure us that the status/address/misc
		 * data will not change during the above sequence of MSR reads,
		 * or that it can only change by the addition of the OVerflow
		 * bit to the status register.  If the status has changed
		 * other than in the overflow bit then we attempt to reread
		 * for a consistent snapshot, but eventually give up and
		 * go with what we've got.  We only perform this check
		 * for a poll - a further #MC during a #MC will reset, and
		 * polled errors should not overwrite higher-priority
		 * trapping errors (but could set the overflow bit).
		 */
		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &status2)) == CMI_SUCCESS) {
			if (!STATUS_EQV(status, status2)) {
				if (retries-- > 0) {
					status = status2;
					goto retry;
				} else {
					gbl->gbl_disp |=
					    CMI_ERRDISP_INCONSISTENT;
				}
			}
		} else if (ispoll && err != CMI_SUCCESS) {
			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
		}

		nerr++;
		gbl->gbl_status = status;
		gbl->gbl_addr = addr;
		gbl->gbl_misc = misc;

		if (clrstatus == B_FALSE)
			goto serialize;

		/*
		 * For machine checks we always clear status here.  For polls
		 * we must be a little more cautious since there is an
		 * outside chance that we may clear telemetry from a shared
		 * MCA bank on which a sibling core is machine checking.
		 *
		 * For polled observations of errors that look like they may
		 * produce a machine check (UC/PCC and ENabled, although these
		 * do not guarantee a machine check on error occurrence)
		 * we will not clear the status at this wakeup unless
		 * we saw the same status at the previous poll.  We will
		 * always process and log the current observations - it
		 * is only the clearing of MCi_STATUS which may be
		 * deferred until the next wakeup.
		 */
1429 */ 1430 if (ismc || !IS_MCE_CANDIDATE(status)) { 1431 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL); 1432 goto serialize; 1433 } 1434 1435 /* 1436 * We have a polled observation of a machine check 1437 * candidate. If we saw essentially the same status at the 1438 * last poll then clear the status now since this appears 1439 * not to be a #MC candidate after all. If we see quite 1440 * different status now then do not clear, but reconsider at 1441 * the next poll. In no actual machine check clears 1442 * the status in the interim then the status should not 1443 * keep changing forever (meaning we'd never clear it) 1444 * since before long we'll simply have latched the highest- 1445 * priority error and set the OVerflow bit. Nonetheless 1446 * we count how many times we defer clearing and after 1447 * a while insist on clearing the status. 1448 */ 1449 pgbl = &pgcl->gcl_data[i]; 1450 if (pgbl->gbl_clrdefcnt != 0) { 1451 /* We deferred clear on this bank at last wakeup */ 1452 if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) || 1453 pgbl->gbl_clrdefcnt > 5) { 1454 /* 1455 * Status is unchanged so clear it now and, 1456 * since we have already logged this info, 1457 * avoid logging it again. 1458 */ 1459 gbl->gbl_status = 0; 1460 nerr--; 1461 (void) cmi_hdl_wrmsr(hdl, 1462 IA32_MSR_MC(i, STATUS), 0ULL); 1463 } else { 1464 /* Record deferral for next wakeup */ 1465 gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1; 1466 } 1467 } else { 1468 /* Record initial deferral for next wakeup */ 1469 gbl->gbl_clrdefcnt = 1; 1470 gcpu_deferrred_polled_clears++; 1471 } 1472 1473 serialize: 1474 /* 1475 * Intel Vol 3A says to execute a serializing instruction 1476 * here, ie CPUID. Well WRMSR is also defined to be 1477 * serializing, so the status clear above should suffice. 1478 * To be a good citizen, and since some clears are deferred, 1479 * we'll execute a CPUID instruction here. 1480 */ 1481 { 1482 struct cpuid_regs tmp; 1483 (void) __cpuid_insn(&tmp); 1484 } 1485 } 1486 1487 if (gcpu_mca_stack_flag) 1488 gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH); 1489 else 1490 gcl->gcl_stackdepth = 0; 1491 1492 /* 1493 * Decide our disposition for this error or errors, and submit for 1494 * logging and subsequent diagnosis. 1495 */ 1496 if (nerr != 0) { 1497 disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp); 1498 } else { 1499 disp = 0; 1500 if (mcesp) { 1501 mcesp->mce_nerr = mcesp->mce_disp = 0; 1502 } 1503 } 1504 1505 /* 1506 * Clear MCG_STATUS if MCIP is set (machine check in progress). 1507 * If a second #MC had occured before now the system would have 1508 * reset. We can only do thise once gcpu_mca_process has copied 1509 * the logout structure. 1510 */ 1511 if (ismc && mcg_status & MCG_STATUS_MCIP) 1512 (void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0); 1513 1514 /* 1515 * At this point we have read and logged all telemetry that is visible 1516 * under the MCA. On architectures for which the NorthBridge is 1517 * on-chip this may include NB-observed errors, but where the NB 1518 * is off chip it may have been the source of the #MC request and 1519 * so we must call into the memory-controller driver to give it 1520 * a chance to log errors. 1521 */ 1522 if (ismc) { 1523 int willpanic = (cmi_mce_response(rp, disp) == 0); 1524 cmi_mc_logout(hdl, 1, willpanic); 1525 } 1526 } 1527 1528 int gcpu_mca_trap_vomit_summary = 0; 1529 1530 /* 1531 * On a native machine check exception we come here from mcetrap via 1532 * cmi_mca_trap. 

/*
 * On a native machine check exception we come here from mcetrap via
 * cmi_mca_trap.  A machine check on one cpu of a chip does not trap other
 * cpus of the chip, so it is possible that another cpu on this chip could
 * initiate a poll while we're in the #mc handler; it is also possible that
 * this trap has occurred during a poll on this cpu.  So we must acquire
 * the chip-wide poll lock, but be careful to avoid deadlock.
 *
 * The 'data' pointer cannot be NULL due to init order.
 */
uint64_t
gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	kmutex_t *poll_lock = NULL;
	gcpu_mce_status_t mce;
	uint64_t mcg_status;
	int tooklock = 0;

	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
		return (0);

	/*
	 * Synchronize with any poller from another core that may happen
	 * to share access to one or more of the MCA banks.
	 */
	if (gcpu->gcpu_shared != NULL)
		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;

	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
		/*
		 * The lock is not owned by the thread we have
		 * interrupted.  Spin for this adaptive lock.
		 */
		while (!mutex_tryenter(poll_lock)) {
			while (mutex_owner(poll_lock) != NULL)
				;
		}
		tooklock = 1;
	}

	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE);

	if (tooklock)
		mutex_exit(poll_lock);

	/*
	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
	 */
	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
		    "%u PCC (%u ok), "
		    "%u UC (%u ok, %u poisoned), "
		    "%u forcefatal, %u ignored",
		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
		    mce.mce_npcc, mce.mce_npcc_ok,
		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
		    mce.mce_forcefatal, mce.mce_ignored);
	}

	return (mce.mce_disp);
}

/*ARGSUSED*/
void
gcpu_faulted_enter(cmi_hdl_t hdl)
{
	/* Nothing to do here */
}

/*ARGSUSED*/
void
gcpu_faulted_exit(cmi_hdl_t hdl)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);

	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
}

/*
 * Write the requested values to the indicated MSRs.  Having no knowledge
 * of the model-specific requirements for writing to these model-specific
 * registers, we will only blindly write to those MSRs if the 'force'
 * argument is nonzero.  That option should only be used in prototyping
 * and debugging.
 */
/*ARGSUSED*/
cmi_errno_t
gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs,
    int force)
{
	int i, errs = 0;

	for (i = 0; i < nregs; i++) {
		uint_t msr = regs[i].cmr_msrnum;
		uint64_t val = regs[i].cmr_msrval;

		if (cms_present(hdl)) {
			if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS)
				errs++;
		} else if (force) {
			errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS);
		} else {
			errs++;
		}
	}

	return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN);
}