1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/mca_x86.h> 30 #include <sys/cpu_module_impl.h> 31 #include <sys/cpu_module_ms.h> 32 #include <sys/cmn_err.h> 33 #include <sys/cpuvar.h> 34 #include <sys/pghw.h> 35 #include <sys/x86_archext.h> 36 #include <sys/sysmacros.h> 37 #include <sys/regset.h> 38 #include <sys/privregs.h> 39 #include <sys/systm.h> 40 #include <sys/types.h> 41 #include <sys/log.h> 42 #include <sys/psw.h> 43 #include <sys/fm/protocol.h> 44 #include <sys/fm/util.h> 45 #include <sys/errorq.h> 46 #include <sys/mca_x86.h> 47 #include <sys/fm/cpu/GMCA.h> 48 #include <sys/sysevent.h> 49 #include <sys/ontrap.h> 50 51 #include "gcpu.h" 52 53 /* 54 * Set to suppress logging of telemetry found at initialization. 55 */ 56 int gcpu_suppress_log_on_init = 0; 57 58 /* 59 * gcpu_mca_stack_flag is a debug assist option to capture a stack trace at 60 * error logout time. 
The stack will be included in the ereport if the 61 * error type selects stack inclusion, or in all cases if 62 * gcpu_mca_stack_ereport_include is nonzero. 63 */ 64 int gcpu_mca_stack_flag = 0; 65 int gcpu_mca_stack_ereport_include = 0; 66 67 /* 68 * The number of times to re-read MCA telemetry to try to obtain a 69 * consistent snapshot if we find it to be changing under our feet. 70 */ 71 int gcpu_mca_telemetry_retries = 5; 72 73 static gcpu_error_disp_t gcpu_errtypes[] = { 74 75 /* 76 * Unclassified 77 */ 78 { 79 FM_EREPORT_CPU_GENERIC_UNCLASSIFIED, 80 NULL, 81 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 82 MCAX86_SIMPLE_UNCLASSIFIED_MASKON, 83 MCAX86_SIMPLE_UNCLASSIFIED_MASKOFF 84 }, 85 86 /* 87 * Microcode ROM Parity Error 88 */ 89 { 90 FM_EREPORT_CPU_GENERIC_MC_CODE_PARITY, 91 NULL, 92 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 93 MCAX86_SIMPLE_MC_CODE_PARITY_MASKON, 94 MCAX86_SIMPLE_MC_CODE_PARITY_MASKOFF 95 }, 96 97 /* 98 * External - BINIT# from another processor during power-on config 99 */ 100 { 101 FM_EREPORT_CPU_GENERIC_EXTERNAL, 102 NULL, 103 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 104 MCAX86_SIMPLE_EXTERNAL_MASKON, 105 MCAX86_SIMPLE_EXTERNAL_MASKOFF 106 }, 107 108 /* 109 * Functional redundancy check master/slave error 110 */ 111 { 112 FM_EREPORT_CPU_GENERIC_FRC, 113 NULL, 114 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 115 MCAX86_SIMPLE_FRC_MASKON, 116 MCAX86_SIMPLE_FRC_MASKOFF 117 }, 118 119 /* 120 * Internal timer error 121 */ 122 { 123 FM_EREPORT_CPU_GENERIC_INTERNAL_TIMER, 124 NULL, 125 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 126 MCAX86_SIMPLE_INTERNAL_TIMER_MASKON, 127 MCAX86_SIMPLE_INTERNAL_TIMER_MASKOFF 128 }, 129 130 /* 131 * Internal unclassified 132 */ 133 { 134 FM_EREPORT_CPU_GENERIC_INTERNAL_UNCLASS, 135 NULL, 136 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 137 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKON, 138 MCAX86_SIMPLE_INTERNAL_UNCLASS_MASK_MASKOFF 139 }, 140 141 /* 142 * Compound error codes - generic memory hierarchy 143 */ 144 { 145 FM_EREPORT_CPU_GENERIC_GENMEMHIER, 146 NULL, 147 
FM_EREPORT_PAYLOAD_FLAGS_COMMON, /* yes, no compound name */ 148 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKON, 149 MCAX86_COMPOUND_GENERIC_MEMHIER_MASKOFF 150 }, 151 152 /* 153 * Compound error codes - TLB errors 154 */ 155 { 156 FM_EREPORT_CPU_GENERIC_TLB, 157 "%1$s" "TLB" "%2$s" "_ERR", 158 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 159 MCAX86_COMPOUND_TLB_MASKON, 160 MCAX86_COMPOUND_TLB_MASKOFF 161 }, 162 163 /* 164 * Compound error codes - memory hierarchy 165 */ 166 { 167 FM_EREPORT_CPU_GENERIC_MEMHIER, 168 "%1$s" "CACHE" "%2$s" "_" "%3$s" "_ERR", 169 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 170 MCAX86_COMPOUND_MEMHIER_MASKON, 171 MCAX86_COMPOUND_MEMHIER_MASKOFF 172 }, 173 174 /* 175 * Compound error codes - bus and interconnect errors 176 */ 177 { 178 FM_EREPORT_CPU_GENERIC_BUS_INTERCONNECT, 179 "BUS" "%2$s" "_" "%4$s" "_" "%3$s" "_" "%5$s" "_" "%6$s" "_ERR", 180 FM_EREPORT_PAYLOAD_FLAGS_COMPOUND_ERR, 181 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKON, 182 MCAX86_COMPOUND_BUS_INTERCONNECT_MASKOFF 183 }, 184 }; 185 186 static gcpu_error_disp_t gcpu_unknown = { 187 FM_EREPORT_CPU_GENERIC_UNKNOWN, 188 "UNKNOWN", 189 FM_EREPORT_PAYLOAD_FLAGS_COMMON, 190 0, 191 0 192 }; 193 194 static errorq_t *gcpu_mca_queue; 195 static kmutex_t gcpu_mca_queue_lock; 196 197 static const gcpu_error_disp_t * 198 gcpu_disp_match(uint16_t code) 199 { 200 const gcpu_error_disp_t *ged = gcpu_errtypes; 201 int i; 202 203 for (i = 0; i < sizeof (gcpu_errtypes) / sizeof (gcpu_error_disp_t); 204 i++, ged++) { 205 uint16_t on = ged->ged_errcode_mask_on; 206 uint16_t off = ged->ged_errcode_mask_off; 207 208 if ((code & on) == on && (code & off) == 0) 209 return (ged); 210 } 211 212 return (NULL); 213 } 214 215 static uint8_t 216 bit_strip(uint16_t code, uint16_t mask, uint16_t shift) 217 { 218 return ((uint8_t)(code & mask) >> shift); 219 } 220 221 #define BIT_STRIP(code, name) \ 222 bit_strip(code, MCAX86_ERRCODE_##name##_MASK, \ 223 MCAX86_ERRCODE_##name##_SHIFT) 224 225 #define GCPU_MNEMONIC_UNDEF 
"undefined" 226 #define GCPU_MNEMONIC_RESVD "reserved" 227 228 /* 229 * Mappings of TT, LL, RRRR, PP, II and T values to compound error name 230 * mnemonics and to ereport class name components. 231 */ 232 233 struct gcpu_mnexp { 234 const char *mne_compound; /* used in expanding compound errname */ 235 const char *mne_ereport; /* used in expanding ereport class */ 236 }; 237 238 static struct gcpu_mnexp gcpu_TT_mnemonics[] = { /* MCAX86_ERRCODE_TT_* */ 239 { "I", FM_EREPORT_CPU_GENERIC_TT_INSTR }, /* INSTR */ 240 { "D", FM_EREPORT_CPU_GENERIC_TT_DATA }, /* DATA */ 241 { "G", FM_EREPORT_CPU_GENERIC_TT_GEN }, /* GEN */ 242 { GCPU_MNEMONIC_UNDEF, "" } 243 }; 244 245 static struct gcpu_mnexp gcpu_LL_mnemonics[] = { /* MCAX86_ERRCODE_LL_* */ 246 { "LO", FM_EREPORT_CPU_GENERIC_LL_L0 }, /* L0 */ 247 { "L1", FM_EREPORT_CPU_GENERIC_LL_L1 }, /* L1 */ 248 { "L2", FM_EREPORT_CPU_GENERIC_LL_L2 }, /* L2 */ 249 { "LG", FM_EREPORT_CPU_GENERIC_LL_LG } /* LG */ 250 }; 251 252 static struct gcpu_mnexp gcpu_RRRR_mnemonics[] = { /* MCAX86_ERRCODE_RRRR_* */ 253 { "ERR", FM_EREPORT_CPU_GENERIC_RRRR_ERR }, /* ERR */ 254 { "RD", FM_EREPORT_CPU_GENERIC_RRRR_RD }, /* RD */ 255 { "WR", FM_EREPORT_CPU_GENERIC_RRRR_WR }, /* WR */ 256 { "DRD", FM_EREPORT_CPU_GENERIC_RRRR_DRD }, /* DRD */ 257 { "DWR", FM_EREPORT_CPU_GENERIC_RRRR_DWR }, /* DWR */ 258 { "IRD", FM_EREPORT_CPU_GENERIC_RRRR_IRD }, /* IRD */ 259 { "PREFETCH", FM_EREPORT_CPU_GENERIC_RRRR_PREFETCH }, /* PREFETCH */ 260 { "EVICT", FM_EREPORT_CPU_GENERIC_RRRR_EVICT }, /* EVICT */ 261 { "SNOOP", FM_EREPORT_CPU_GENERIC_RRRR_SNOOP }, /* SNOOP */ 262 }; 263 264 static struct gcpu_mnexp gcpu_PP_mnemonics[] = { /* MCAX86_ERRCODE_PP_* */ 265 { "SRC", FM_EREPORT_CPU_GENERIC_PP_SRC }, /* SRC */ 266 { "RES", FM_EREPORT_CPU_GENERIC_PP_RES }, /* RES */ 267 { "OBS", FM_EREPORT_CPU_GENERIC_PP_OBS }, /* OBS */ 268 { "", FM_EREPORT_CPU_GENERIC_PP_GEN } /* GEN */ 269 }; 270 271 static struct gcpu_mnexp gcpu_II_mnemonics[] = { /* MCAX86_ERRCODE_II_* */ 272 
{ "M", FM_EREPORT_CPU_GENERIC_II_MEM }, /* MEM */ 273 { GCPU_MNEMONIC_RESVD, "" }, 274 { "IO", FM_EREPORT_CPU_GENERIC_II_IO }, /* IO */ 275 { "", FM_EREPORT_CPU_GENERIC_II_GEN } /* GEN */ 276 }; 277 278 static struct gcpu_mnexp gcpu_T_mnemonics[] = { /* MCAX86_ERRCODE_T_* */ 279 { "NOTIMEOUT", FM_EREPORT_CPU_GENERIC_T_NOTIMEOUT }, /* NONE */ 280 { "TIMEOUT", FM_EREPORT_CPU_GENERIC_T_TIMEOUT } /* TIMEOUT */ 281 }; 282 283 enum gcpu_mn_namespace { 284 GCPU_MN_NAMESPACE_COMPOUND, 285 GCPU_MN_NAMESPACE_EREPORT 286 }; 287 288 static const char * 289 gcpu_mnemonic(const struct gcpu_mnexp *tbl, size_t tbl_sz, uint8_t val, 290 enum gcpu_mn_namespace nspace) 291 { 292 if (val >= tbl_sz) 293 return (GCPU_MNEMONIC_UNDEF); /* for all namespaces */ 294 295 switch (nspace) { 296 case GCPU_MN_NAMESPACE_COMPOUND: 297 return (tbl[val].mne_compound); 298 /*NOTREACHED*/ 299 300 case GCPU_MN_NAMESPACE_EREPORT: 301 return (tbl[val].mne_ereport); 302 /*NOTREACHED*/ 303 304 default: 305 return (GCPU_MNEMONIC_UNDEF); 306 /*NOTREACHED*/ 307 } 308 } 309 310 /* 311 * The ereport class leaf component is either a simple string with no 312 * format specifiers, or a string with one or more embedded %n$s specifiers - 313 * positional selection for string arguments. The kernel snprintf does 314 * not support %n$ (and teaching it to do so is too big a headache) so 315 * we will expand this restricted format string ourselves. 
316 */ 317 318 #define GCPU_CLASS_VARCOMPS 7 319 320 #define GCPU_MNEMONIC(code, name, nspace) \ 321 gcpu_mnemonic(gcpu_##name##_mnemonics, \ 322 sizeof (gcpu_##name##_mnemonics) / sizeof (struct gcpu_mnexp), \ 323 BIT_STRIP(code, name), nspace) 324 325 static void 326 gcpu_mn_fmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 327 enum gcpu_mn_namespace nspace) 328 { 329 uint16_t code = MCAX86_ERRCODE(status); 330 const char *mn[GCPU_CLASS_VARCOMPS]; 331 char *p = buf; /* current position in buf */ 332 char *q = buf + buflen; /* pointer past last char in buf */ 333 int which, expfmtchar, error; 334 char c; 335 336 mn[0] = GCPU_MNEMONIC(code, TT, nspace); 337 mn[1] = GCPU_MNEMONIC(code, LL, nspace); 338 mn[2] = GCPU_MNEMONIC(code, RRRR, nspace); 339 mn[3] = GCPU_MNEMONIC(code, PP, nspace); 340 mn[4] = GCPU_MNEMONIC(code, II, nspace); 341 mn[5] = GCPU_MNEMONIC(code, T, nspace); 342 mn[6] = (status & MSR_MC_STATUS_UC) ? "_uc" : ""; 343 344 while (p < q - 1 && (c = *fmt++) != '\0') { 345 if (c != '%') { 346 /* not the beginning of a format specifier - copy */ 347 *p++ = c; 348 continue; 349 } 350 351 error = 0; 352 which = -1; 353 expfmtchar = -1; 354 355 nextfmt: 356 if ((c = *fmt++) == '\0') 357 break; /* early termination of fmt specifier */ 358 359 switch (c) { 360 case '1': 361 case '2': 362 case '3': 363 case '4': 364 case '5': 365 case '6': 366 case '7': 367 if (which != -1) { /* allow only one positional digit */ 368 error++; 369 break; 370 } 371 which = c - '1'; 372 goto nextfmt; 373 /*NOTREACHED*/ 374 375 case '$': 376 if (which == -1) { /* no position specified */ 377 error++; 378 break; 379 } 380 expfmtchar = 's'; 381 goto nextfmt; 382 /*NOTREACHED*/ 383 384 case 's': 385 if (expfmtchar != 's') { 386 error++; 387 break; 388 } 389 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 390 mn[which]); 391 p += strlen(p); 392 break; 393 394 default: 395 error++; 396 break; 397 } 398 399 if (error) 400 break; 401 } 402 403 *p = '\0'; /* NUL 
termination */ 404 } 405 406 static void 407 gcpu_erpt_clsfmt(const char *fmt, char *buf, size_t buflen, uint64_t status, 408 const char *cpuclass, const char *leafclass) 409 { 410 char *p = buf; /* current position in buf */ 411 char *q = buf + buflen; /* pointer past last char in buf */ 412 413 (void) snprintf(buf, (uintptr_t)q - (uintptr_t)p, "%s.%s.", 414 FM_ERROR_CPU, cpuclass ? cpuclass : FM_EREPORT_CPU_GENERIC); 415 416 p += strlen(p); 417 if (p >= q) 418 return; 419 420 if (leafclass == NULL) { 421 gcpu_mn_fmt(fmt, p, (uintptr_t)q - (uintptr_t)p, status, 422 GCPU_MN_NAMESPACE_EREPORT); 423 } else { 424 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, "%s", 425 leafclass); 426 } 427 } 428 429 /* 430 * Create an "hc" scheme FMRI identifying the given cpu. We don't know 431 * the actual topology/connectivity of cpus in the system, so we'll 432 * apply /motherboard=0/chip=.../cpu=... in all cases. 433 */ 434 static nvlist_t * 435 gcpu_fmri_create(cmi_hdl_t hdl, nv_alloc_t *nva) 436 { 437 nvlist_t *nvl; 438 439 if ((nvl = fm_nvlist_create(nva)) == NULL) 440 return (NULL); 441 442 fm_fmri_hc_set(nvl, FM_HC_SCHEME_VERSION, NULL, NULL, 3, 443 "motherboard", 0, 444 "chip", cmi_hdl_chipid(hdl), 445 "cpu", cmi_hdl_coreid(hdl)); 446 447 return (nvl); 448 } 449 450 int gcpu_bleat_count_thresh = 5; 451 hrtime_t gcpu_bleat_min_interval = 10 * 1000000000ULL; 452 453 /* 454 * Called when we are unable to propogate a logout structure onto an 455 * errorq for subsequent ereport preparation and logging etc. The caller 456 * should usually only decide to call this for severe errors - those we 457 * suspect we may need to panic for. 458 */ 459 static void 460 gcpu_bleat(cmi_hdl_t hdl, gcpu_logout_t *gcl) 461 { 462 hrtime_t now = gethrtime_waitfree(); 463 static hrtime_t gcpu_last_bleat; 464 gcpu_bank_logout_t *gbl; 465 static int bleatcount; 466 int i; 467 468 /* 469 * Throttle spamming of the console. 
The first gcpu_bleat_count_thresh 470 * can come as fast as we like, but once we've spammed that many 471 * to the console we require a minimum interval to pass before 472 * any more complaints. 473 */ 474 if (++bleatcount > gcpu_bleat_count_thresh) { 475 if (now - gcpu_last_bleat < gcpu_bleat_min_interval) 476 return; 477 else 478 bleatcount = 0; 479 } 480 gcpu_last_bleat = now; 481 482 cmn_err(CE_WARN, "Machine-Check Errors unlogged on chip %d core %d, " 483 "raw dump follows", cmi_hdl_chipid(hdl), cmi_hdl_coreid(hdl)); 484 cmn_err(CE_WARN, "MCG_STATUS 0x%016llx", 485 (u_longlong_t)gcl->gcl_mcg_status); 486 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 487 uint64_t status = gbl->gbl_status; 488 489 if (!(status & MSR_MC_STATUS_VAL)) 490 continue; 491 492 switch (status & (MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV)) { 493 case MSR_MC_STATUS_ADDRV | MSR_MC_STATUS_MISCV: 494 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 495 "STAT 0x%016llx ADDR 0x%016llx MISC 0x%016llx", 496 i, IA32_MSR_MC(i, STATUS), 497 (u_longlong_t)status, 498 (u_longlong_t)gbl->gbl_addr, 499 (u_longlong_t)gbl->gbl_misc); 500 break; 501 502 case MSR_MC_STATUS_ADDRV: 503 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 504 "STAT 0x%016llx ADDR 0x%016llx", 505 i, IA32_MSR_MC(i, STATUS), 506 (u_longlong_t)status, 507 (u_longlong_t)gbl->gbl_addr); 508 break; 509 510 case MSR_MC_STATUS_MISCV: 511 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 512 "STAT 0x%016llx MISC 0x%016llx", 513 i, IA32_MSR_MC(i, STATUS), 514 (u_longlong_t)status, 515 (u_longlong_t)gbl->gbl_misc); 516 break; 517 518 default: 519 cmn_err(CE_WARN, "Bank %d (offset 0x%llx) " 520 "STAT 0x%016llx", 521 i, IA32_MSR_MC(i, STATUS), 522 (u_longlong_t)status); 523 break; 524 525 } 526 } 527 } 528 529 #define _GCPU_BSTATUS(status, what) \ 530 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_##what, DATA_TYPE_BOOLEAN_VALUE, \ 531 (status) & MSR_MC_STATUS_##what ? 
B_TRUE : B_FALSE 532 533 static void 534 gcpu_ereport_add_logout(nvlist_t *ereport, const gcpu_logout_t *gcl, 535 uint_t bankno, const gcpu_error_disp_t *ged, uint16_t code) 536 { 537 uint64_t members = ged ? ged->ged_ereport_members : 538 FM_EREPORT_PAYLOAD_FLAGS_COMMON; 539 uint64_t mcg = gcl->gcl_mcg_status; 540 int mcip = mcg & MCG_STATUS_MCIP; 541 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankno]; 542 uint64_t bstat = gbl->gbl_status; 543 544 /* 545 * Include the compound error name if requested and if this 546 * is a compound error type. 547 */ 548 if (members & FM_EREPORT_PAYLOAD_FLAG_COMPOUND_ERR && ged && 549 ged->ged_compound_fmt != NULL) { 550 char buf[FM_MAX_CLASS]; 551 552 gcpu_mn_fmt(ged->ged_compound_fmt, buf, sizeof (buf), code, 553 GCPU_MN_NAMESPACE_COMPOUND); 554 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_COMPOUND_ERR, 555 DATA_TYPE_STRING, buf, NULL); 556 } 557 558 /* 559 * Include disposition information for this error 560 */ 561 if (members & FM_EREPORT_PAYLOAD_FLAG_DISP && 562 gbl->gbl_disp != 0) { 563 int i, empty = 1; 564 char buf[128]; 565 char *p = buf, *q = buf + 128; 566 static struct _gcpu_disp_name { 567 uint64_t dv; 568 const char *dn; 569 } disp_names[] = { 570 { CMI_ERRDISP_CURCTXBAD, 571 "processor_context_corrupt" }, 572 { CMI_ERRDISP_RIPV_INVALID, 573 "return_ip_invalid" }, 574 { CMI_ERRDISP_UC_UNCONSTRAINED, 575 "unconstrained" }, 576 { CMI_ERRDISP_FORCEFATAL, 577 "forcefatal" }, 578 { CMI_ERRDISP_IGNORED, 579 "ignored" }, 580 { CMI_ERRDISP_PCC_CLEARED, 581 "corrupt_context_cleared" }, 582 { CMI_ERRDISP_UC_CLEARED, 583 "uncorrected_data_cleared" }, 584 { CMI_ERRDISP_POISONED, 585 "poisoned" }, 586 { CMI_ERRDISP_INCONSISTENT, 587 "telemetry_unstable" }, 588 }; 589 590 for (i = 0; i < sizeof (disp_names) / 591 sizeof (struct _gcpu_disp_name); i++) { 592 if ((gbl->gbl_disp & disp_names[i].dv) == 0) 593 continue; 594 595 (void) snprintf(p, (uintptr_t)q - (uintptr_t)p, 596 "%s%s", empty ? 
"" : ",", disp_names[i].dn); 597 p += strlen(p); 598 empty = 0; 599 } 600 601 if (p != buf) 602 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_DISP, 603 DATA_TYPE_STRING, buf, NULL); 604 } 605 606 /* 607 * If MCG_STATUS is included add that and an indication of whether 608 * this ereport was the result of a machine check or poll. 609 */ 610 if (members & FM_EREPORT_PAYLOAD_FLAG_MCG_STATUS) { 611 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS, 612 DATA_TYPE_UINT64, mcg, NULL); 613 614 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MCG_STATUS_MCIP, 615 DATA_TYPE_BOOLEAN_VALUE, mcip ? B_TRUE : B_FALSE, NULL); 616 } 617 618 /* 619 * If an instruction pointer is to be included add one provided 620 * MCG_STATUS indicated it is valid; meaningless for polled events. 621 */ 622 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_IP && 623 mcg & MCG_STATUS_EIPV) { 624 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_IP, 625 DATA_TYPE_UINT64, gcl->gcl_ip, NULL); 626 } 627 628 /* 629 * Add an indication of whether the trap occured during privileged code. 630 */ 631 if (mcip && members & FM_EREPORT_PAYLOAD_FLAG_PRIV) { 632 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_PRIV, 633 DATA_TYPE_BOOLEAN_VALUE, 634 gcl->gcl_flags & GCPU_GCL_F_PRIV ? B_TRUE : B_FALSE, NULL); 635 } 636 637 /* 638 * If requested, add the index of the MCA bank. This indicates the 639 * n'th bank of 4 MCA registers, and does not necessarily correspond 640 * to MCi_* - use the bank offset to correlate 641 */ 642 if (members & FM_EREPORT_PAYLOAD_FLAG_BANK_NUM) { 643 fm_payload_set(ereport, 644 /* Bank number */ 645 FM_EREPORT_PAYLOAD_NAME_BANK_NUM, DATA_TYPE_UINT8, bankno, 646 /* Offset of MCi_CTL */ 647 FM_EREPORT_PAYLOAD_NAME_BANK_MSR_OFFSET, DATA_TYPE_UINT64, 648 IA32_MSR_MC(bankno, CTL), 649 NULL); 650 } 651 652 /* 653 * Add MCi_STATUS if requested, and decode it. 
654 */ 655 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_STATUS) { 656 const char *tbes[] = { 657 "No tracking", /* 00 */ 658 "Green - below threshold", /* 01 */ 659 "Yellow - above threshold", /* 10 */ 660 "Reserved" /* 11 */ 661 }; 662 663 fm_payload_set(ereport, 664 /* Bank MCi_STATUS */ 665 FM_EREPORT_PAYLOAD_NAME_MC_STATUS, DATA_TYPE_UINT64, bstat, 666 /* Overflow? */ 667 _GCPU_BSTATUS(bstat, OVER), 668 /* Uncorrected? */ 669 _GCPU_BSTATUS(bstat, UC), 670 /* Enabled? */ 671 _GCPU_BSTATUS(bstat, EN), 672 /* Processor context corrupt? */ 673 _GCPU_BSTATUS(bstat, PCC), 674 /* Error code */ 675 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_ERRCODE, 676 DATA_TYPE_UINT16, MCAX86_ERRCODE(bstat), 677 /* Model-specific error code */ 678 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_EXTERRCODE, 679 DATA_TYPE_UINT16, MCAX86_MSERRCODE(bstat), 680 NULL); 681 682 /* 683 * If MCG_CAP.TES_P indicates that that thresholding info 684 * is present in the architural component of the bank status 685 * then include threshold information for this bank. 686 */ 687 if (gcl->gcl_flags & GCPU_GCL_F_TES_P) { 688 fm_payload_set(ereport, 689 FM_EREPORT_PAYLOAD_NAME_MC_STATUS_TES, 690 DATA_TYPE_STRING, tbes[MCAX86_TBES_VALUE(bstat)], 691 NULL); 692 } 693 } 694 695 /* 696 * MCi_ADDR info if requested and valid. 697 */ 698 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_ADDR && 699 bstat & MSR_MC_STATUS_ADDRV) { 700 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_ADDR, 701 DATA_TYPE_UINT64, gbl->gbl_addr, NULL); 702 } 703 704 /* 705 * MCi_MISC if requested and MCi_STATUS.MISCV). 706 */ 707 if (members & FM_EREPORT_PAYLOAD_FLAG_MC_MISC && 708 bstat & MSR_MC_STATUS_MISCV) { 709 fm_payload_set(ereport, FM_EREPORT_PAYLOAD_NAME_MC_MISC, 710 DATA_TYPE_UINT64, gbl->gbl_misc, NULL); 711 } 712 713 } 714 715 /* 716 * Construct and post an ereport based on the logout information from a 717 * single MCA bank. We are not necessarily running on the cpu that 718 * detected the error. 
719 */ 720 static void 721 gcpu_ereport_post(const gcpu_logout_t *gcl, int bankidx, 722 const gcpu_error_disp_t *ged, cms_cookie_t mscookie, uint64_t status) 723 { 724 gcpu_data_t *gcpu = gcl->gcl_gcpu; 725 cmi_hdl_t hdl = gcpu->gcpu_hdl; 726 const gcpu_bank_logout_t *gbl = &gcl->gcl_data[bankidx]; 727 const char *cpuclass = NULL, *leafclass = NULL; 728 uint16_t code = MCAX86_ERRCODE(status); 729 errorq_elem_t *eqep, *scr_eqep; 730 nvlist_t *ereport, *detector; 731 char buf[FM_MAX_CLASS]; 732 const char *classfmt; 733 nv_alloc_t *nva; 734 735 if (panicstr) { 736 if ((eqep = errorq_reserve(ereport_errorq)) == NULL) 737 return; 738 ereport = errorq_elem_nvl(ereport_errorq, eqep); 739 740 /* 741 * Allocate another element for scratch space, but fallback 742 * to the one we have if that fails. We'd like to use the 743 * additional scratch space for nvlist construction. 744 */ 745 if ((scr_eqep = errorq_reserve(ereport_errorq)) != NULL) 746 nva = errorq_elem_nva(ereport_errorq, scr_eqep); 747 else 748 nva = errorq_elem_nva(ereport_errorq, eqep); 749 } else { 750 ereport = fm_nvlist_create(NULL); 751 nva = NULL; 752 } 753 754 if (ereport == NULL) 755 return; 756 757 /* 758 * Common payload data required by the protocol: 759 * - ereport class 760 * - detector 761 * - ENA 762 */ 763 764 /* 765 * Ereport class - call into model-specific support to allow it to 766 * provide a cpu class or leaf class, otherwise calculate our own. 767 */ 768 cms_ereport_class(hdl, mscookie, &cpuclass, &leafclass); 769 classfmt = ged ? ged->ged_class_fmt : FM_EREPORT_CPU_GENERIC_UNKNOWN; 770 gcpu_erpt_clsfmt(classfmt, buf, sizeof (buf), status, cpuclass, 771 leafclass); 772 773 /* 774 * The detector FMRI. 775 */ 776 if ((detector = cms_ereport_detector(hdl, mscookie, nva)) == NULL) 777 detector = gcpu_fmri_create(hdl, nva); 778 779 /* 780 * Should we define a new ENA format 3?? for chip/core/strand? 781 * It will be better when virtualized. 
782 */ 783 fm_ereport_set(ereport, FM_EREPORT_VERSION, buf, 784 fm_ena_generate_cpu(gcl->gcl_timestamp, 785 cmi_hdl_chipid(hdl) << 6 | cmi_hdl_coreid(hdl) << 3 | 786 cmi_hdl_strandid(hdl), FM_ENA_FMT1), detector, NULL); 787 788 if (panicstr) { 789 fm_nvlist_destroy(detector, FM_NVA_RETAIN); 790 nv_alloc_reset(nva); 791 } else { 792 fm_nvlist_destroy(detector, FM_NVA_FREE); 793 } 794 795 /* 796 * Add the architectural ereport class-specific payload data. 797 */ 798 gcpu_ereport_add_logout(ereport, gcl, bankidx, ged, code); 799 800 /* 801 * Allow model-specific code to add ereport members. 802 */ 803 cms_ereport_add_logout(hdl, ereport, nva, bankidx, gbl->gbl_status, 804 gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout, mscookie); 805 806 /* 807 * Include stack if options is turned on and either selected in 808 * the payload member bitmask or inclusion is forced. 809 */ 810 if (gcpu_mca_stack_flag && 811 (cms_ereport_includestack(hdl, mscookie) == 812 B_TRUE || gcpu_mca_stack_ereport_include)) { 813 fm_payload_stack_add(ereport, gcl->gcl_stack, 814 gcl->gcl_stackdepth); 815 } 816 817 /* 818 * Post ereport. 819 */ 820 if (panicstr) { 821 errorq_commit(ereport_errorq, eqep, ERRORQ_SYNC); 822 if (scr_eqep) 823 errorq_cancel(ereport_errorq, scr_eqep); 824 } else { 825 (void) fm_ereport_post(ereport, EVCH_TRYHARD); 826 fm_nvlist_destroy(ereport, FM_NVA_FREE); 827 } 828 829 } 830 831 /*ARGSUSED*/ 832 void 833 gcpu_mca_drain(void *ignored, const void *data, const errorq_elem_t *eqe) 834 { 835 const gcpu_logout_t *gcl = data; 836 const gcpu_bank_logout_t *gbl; 837 int i; 838 839 for (i = 0, gbl = &gcl->gcl_data[0]; i < gcl->gcl_nbanks; i++, gbl++) { 840 const gcpu_error_disp_t *gened; 841 cms_cookie_t mscookie; 842 843 if (gbl->gbl_status & MSR_MC_STATUS_VAL && 844 !(gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 845 uint16_t code = MCAX86_ERRCODE(gbl->gbl_status); 846 847 /* 848 * Perform a match based on IA32 MCA architectural 849 * components alone. 
850 */ 851 gened = gcpu_disp_match(code); /* may be NULL */ 852 853 /* 854 * Now see if an model-specific match can be made. 855 */ 856 mscookie = cms_disp_match(gcl->gcl_gcpu->gcpu_hdl, i, 857 gbl->gbl_status, gbl->gbl_addr, gbl->gbl_misc, 858 gcl->gcl_ms_logout); 859 860 /* 861 * Prepare and dispatch an ereport for logging and 862 * diagnosis. 863 */ 864 gcpu_ereport_post(gcl, i, gened, mscookie, 865 gbl->gbl_status); 866 } else if (gbl->gbl_status & MSR_MC_STATUS_VAL && 867 (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)) { 868 /* 869 * Telemetry kept changing as we tried to read 870 * it. Force an unknown ereport leafclass but 871 * keep the telemetry unchanged for logging. 872 */ 873 gcpu_ereport_post(gcl, i, &gcpu_unknown, NULL, 874 gbl->gbl_status); 875 } 876 } 877 } 878 879 static size_t gcpu_mca_queue_datasz = 0; 880 881 /* 882 * The following code is ready to make a weak attempt at growing the 883 * errorq structure size. Since it is not foolproof (we don't know 884 * who may already be producing to the outgoing errorq) our caller 885 * instead assures that we'll always be called with no greater data 886 * size than on our first call. 
887 */ 888 static void 889 gcpu_errorq_init(size_t datasz) 890 { 891 int slots; 892 893 mutex_enter(&gcpu_mca_queue_lock); 894 895 if (gcpu_mca_queue_datasz >= datasz) { 896 mutex_exit(&gcpu_mca_queue_lock); 897 return; 898 } 899 900 membar_producer(); 901 if (gcpu_mca_queue) { 902 gcpu_mca_queue_datasz = 0; 903 errorq_destroy(gcpu_mca_queue); 904 } 905 906 slots = MAX(GCPU_MCA_ERRS_PERCPU * max_ncpus, GCPU_MCA_MIN_ERRORS); 907 slots = MIN(slots, GCPU_MCA_MAX_ERRORS); 908 909 gcpu_mca_queue = errorq_create("gcpu_mca_queue", gcpu_mca_drain, 910 NULL, slots, datasz, 1, ERRORQ_VITAL); 911 912 if (gcpu_mca_queue != NULL) 913 gcpu_mca_queue_datasz = datasz; 914 915 mutex_exit(&gcpu_mca_queue_lock); 916 } 917 918 /* 919 * Perform MCA initialization as described in section 14.6 of Intel 64 920 * and IA-32 Architectures Software Developer's Manual Volume 3A. 921 */ 922 923 static uint_t global_nbanks; 924 925 void 926 gcpu_mca_init(cmi_hdl_t hdl) 927 { 928 gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl); 929 uint64_t cap; 930 uint_t vendor = cmi_hdl_vendor(hdl); 931 uint_t family = cmi_hdl_family(hdl); 932 gcpu_mca_t *mca = &gcpu->gcpu_mca; 933 int mcg_ctl_present; 934 uint_t nbanks; 935 size_t mslsz; 936 int i; 937 938 if (gcpu == NULL) 939 return; 940 941 /* 942 * Protect from some silly /etc/system settings. 943 */ 944 if (gcpu_mca_telemetry_retries < 0 || gcpu_mca_telemetry_retries > 100) 945 gcpu_mca_telemetry_retries = 5; 946 947 if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) != CMI_SUCCESS) 948 return; 949 950 /* 951 * CPU startup code only calls cmi_mca_init if x86_feature indicates 952 * both MCA and MCE support (i.e., X86_MCA). P5, K6, and earlier 953 * processors, which have their own * more primitive way of doing 954 * machine checks, will not have cmi_mca_init called since their 955 * CPUID information will not indicate both MCA and MCE features. 
956 */ 957 #ifndef __xpv 958 ASSERT(x86_feature & X86_MCA); 959 #endif /* __xpv */ 960 961 /* 962 * Determine whether the IA32_MCG_CTL register is present. If it 963 * is we will enable all features by writing -1 to it towards 964 * the end of this initialization; if it is absent then volume 3A 965 * says we must nonetheless continue to initialize the individual 966 * banks. 967 */ 968 mcg_ctl_present = cap & MCG_CAP_CTL_P; 969 970 /* 971 * We squirell values away for inspection/debugging. 972 */ 973 mca->gcpu_mca_bioscfg.bios_mcg_cap = cap; 974 if (mcg_ctl_present) 975 (void) cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CTL, 976 &mca->gcpu_mca_bioscfg.bios_mcg_ctl); 977 978 /* 979 * Determine the number of error-reporting banks implemented. 980 */ 981 mca->gcpu_mca_nbanks = nbanks = cap & MCG_CAP_COUNT_MASK; 982 983 if (nbanks != 0 && global_nbanks == 0) 984 global_nbanks = nbanks; /* no race - BSP will get here first */ 985 986 /* 987 * If someone is hiding the number of banks (perhaps we are fully 988 * virtualized?) or if this processor has more banks than the 989 * first to set global_nbanks then bail. The latter requirement 990 * is because we need to size our errorq data structure and we 991 * don't want to have to grow the errorq (destroy and recreate) 992 * which may just lose some telemetry. 993 */ 994 if (nbanks == 0 || nbanks > global_nbanks) 995 return; 996 997 mca->gcpu_mca_bioscfg.bios_bankcfg = kmem_zalloc(nbanks * 998 sizeof (struct gcpu_bios_bankcfg), KM_SLEEP); 999 1000 /* 1001 * Calculate the size we need to allocate for a gcpu_logout_t 1002 * with a gcl_data array big enough for all banks of this cpu. 1003 * Add any space requested by the model-specific logout support. 
	 */
	mslsz = cms_logout_size(hdl);
	/*
	 * A logout buffer is a gcpu_logout_t with nbanks gcpu_bank_logout_t
	 * entries in gcl_data, followed by any model-specific logout area
	 * of mslsz bytes.  The "nbanks - 1" assumes sizeof (gcpu_logout_t)
	 * already includes one gcl_data element - TODO confirm against the
	 * gcpu_logout_t declaration in gcpu.h.
	 */
	mca->gcpu_mca_lgsz = sizeof (gcpu_logout_t) +
	    (nbanks - 1) * sizeof (gcpu_bank_logout_t) + mslsz;

	for (i = 0; i < GCPU_MCA_LOGOUT_NUM; i++) {
		gcpu_logout_t *gcl;

		mca->gcpu_mca_logout[i] = gcl =
		    kmem_zalloc(mca->gcpu_mca_lgsz, KM_SLEEP);
		gcl->gcl_gcpu = gcpu;
		gcl->gcl_nbanks = nbanks;
		/* Model-specific area lies immediately past the bank data */
		gcl->gcl_ms_logout = (mslsz == 0) ? NULL :
		    (char *)(&gcl->gcl_data[0]) + nbanks *
		    sizeof (gcpu_bank_logout_t);

	}
	mca->gcpu_mca_nextpoll_idx = GCPU_MCA_LOGOUT_POLLER_1;

	/*
	 * Create our errorq to transport the logout structures.  This
	 * can fail so users of gcpu_mca_queue must be prepared for NULL.
	 */
	gcpu_errorq_init(mca->gcpu_mca_lgsz);

	/*
	 * Not knowing which, if any, banks are shared between cores we
	 * assure serialization of MCA bank initialization by each cpu
	 * on the chip.  On chip architectures in which some banks are
	 * shared this will mean the shared resource is initialized more
	 * than once - we're simply aiming to avoid simultaneous MSR writes
	 * to the shared resource.
	 *
	 * Even with these precautions, some platforms may yield a GP fault
	 * if a core other than a designated master tries to write anything
	 * but all 0's to MCi_{STATUS,ADDR,CTL}.  So we will perform
	 * those writes under on_trap protection.
	 */
	mutex_enter(&gcpu->gcpu_shared->gcpus_cfglock);

	/*
	 * Initialize poller data, but don't start polling yet.
	 */
	gcpu_mca_poll_init(hdl);

	/*
	 * Work out which MCA banks we will initialize.  In MCA logout
	 * code we will only read those banks which we initialize here.
	 */
	for (i = 0; i < nbanks; i++) {
		/*
		 * On Intel family 6 and AMD family 6 we must not enable
		 * machine check from bank 0 detectors.  In the Intel
		 * case bank 0 is reserved for the platform, while in the
		 * AMD case reports are that enabling bank 0 (DC) produces
		 * spurious machine checks.
		 */
		if (i == 0 && ((vendor == X86_VENDOR_Intel ||
		    vendor == X86_VENDOR_AMD) && family == 6))
			continue;

		/* The model-specific module may also veto a bank */
		if (cms_bankctl_skipinit(hdl, i))
			continue;

		/*
		 * Record which MCA banks were enabled, both from the
		 * point of view of this core and accumulating for the
		 * whole chip (if some cores share a bank we must be
		 * sure either can logout from it).
		 */
		mca->gcpu_actv_banks |= 1 << i;
		atomic_or_32(&gcpu->gcpu_shared->gcpus_actv_banks, 1 << i);
	}

	/*
	 * Log any valid telemetry lurking in the MCA banks, but do not
	 * clear the status registers.  Ignore the disposition returned -
	 * we have already panicked or reset for any nasty errors found here.
	 *
	 * Intel vol 3A says that we should not do this on family 0x6,
	 * and that for any extended family the BIOS clears things
	 * on power-on reset so you'll only potentially find valid telemetry
	 * on warm reset (we do it for both - on power-on reset we should
	 * just see zeroes).
	 *
	 * AMD docs since K7 say we should process anything we find here.
	 */
	if (!gcpu_suppress_log_on_init &&
	    (vendor == X86_VENDOR_Intel && family >= 0xf ||
	    vendor == X86_VENDOR_AMD))
		gcpu_mca_logout(hdl, NULL, -1ULL, NULL, B_FALSE);

	/*
	 * Initialize all MCi_CTL and clear all MCi_STATUS, allowing the
	 * model-specific module the power of veto.
	 */
	for (i = 0; i < nbanks; i++) {
		struct gcpu_bios_bankcfg *bcfgp =
		    mca->gcpu_mca_bioscfg.bios_bankcfg + i;

		/*
		 * Stash inherited bank MCA state, even for banks we will
		 * not initialize ourselves.  Do not read the MISC register
		 * unconditionally - on some processors that will #GP on
		 * banks that do not implement the MISC register (would be
		 * caught by on_trap, anyway).
		 */
		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, CTL),
		    &bcfgp->bios_bank_ctl);

		(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &bcfgp->bios_bank_status);

		if (bcfgp->bios_bank_status & MSR_MC_STATUS_ADDRV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR),
			    &bcfgp->bios_bank_addr);

		/*
		 * In some old BIOS the status value after boot can indicate
		 * MISCV when there is actually no MISC register for
		 * that bank.  The following read could therefore
		 * aggravate a general protection fault.  This should be
		 * caught by on_trap, but the #GP fault handler is busted
		 * and can suffer a double fault even before we get to
		 * trap() to check for on_trap protection.  Until that
		 * issue is fixed we remove the one access that we know
		 * can cause a #GP.
		 *
		 * if (bcfgp->bios_bank_status & MSR_MC_STATUS_MISCV)
		 *	(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC),
		 *	    &bcfgp->bios_bank_misc);
		 */
		bcfgp->bios_bank_misc = 0;

		/* Only write CTL/STATUS of banks selected above */
		if (!(mca->gcpu_actv_banks & 1 << i))
			continue;

		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, CTL),
		    cms_bankctl_val(hdl, i, -1ULL));

		if (!cms_bankstatus_skipinit(hdl, i)) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS),
			    cms_bankstatus_val(hdl, i, 0ULL));
		}
	}

	/*
	 * Now let the model-specific support perform further initialization
	 * of non-architectural features.
	 */
	cms_mca_init(hdl, nbanks);

	(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0ULL);
	membar_producer();

	/* enable all machine-check features */
	if (mcg_ctl_present)
		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_CTL,
		    cms_mcgctl_val(hdl, nbanks, -1ULL));

	mutex_exit(&gcpu->gcpu_shared->gcpus_cfglock);

	/* enable machine-check exception in CR4 */
	cmi_hdl_enable_mce(hdl);
}

/*
 * Process the telemetry captured in gcl: classify each valid bank entry
 * (allowing model-specific reclassification via cms_error_action),
 * accumulate summary counts into *mcesp (or a local if mcesp is NULL),
 * and compute the overall error disposition.  The disposition is both
 * returned and stored in mcesp->mce_disp.  If gcpu_mca_queue exists the
 * logout is dispatched onto it for logging; otherwise a nonzero
 * disposition is reported via gcpu_bleat.  'ismc' is nonzero when called
 * for a #MC trap and zero for a poll.
 */
static uint64_t
gcpu_mca_process(cmi_hdl_t hdl, struct regs *rp, int nerr, gcpu_data_t *gcpu,
    gcpu_logout_t *gcl, int ismc, gcpu_mce_status_t *mcesp)
{
	int curctxbad = 0, unconstrained = 0, forcefatal = 0;
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int nbanks = mca->gcpu_mca_nbanks;
	gcpu_mce_status_t mce;
	gcpu_bank_logout_t *gbl;
	uint64_t disp = 0;
	int i;

	if (mcesp == NULL)
		mcesp = &mce;

	mcesp->mce_nerr = nerr;

	mcesp->mce_npcc = mcesp->mce_npcc_ok = mcesp->mce_nuc =
	    mcesp->mce_nuc_ok = mcesp->mce_nuc_poisoned =
	    mcesp->mce_forcefatal = mcesp->mce_ignored = 0;

	/*
	 * If this is a machine check then if the return instruction pointer
	 * is not valid the current context is lost.
	 */
	if (ismc && !(gcl->gcl_mcg_status & MCG_STATUS_RIPV))
		disp |= CMI_ERRDISP_RIPV_INVALID;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
		uint64_t mcistatus = gbl->gbl_status;
		uint32_t ms_scope;
		int pcc, uc;
		int poisoned;

		/* Skip banks with no valid telemetry */
		if (!(mcistatus & MSR_MC_STATUS_VAL))
			continue;

		/* Skip banks whose telemetry never stabilized during logout */
		if (gbl->gbl_disp & CMI_ERRDISP_INCONSISTENT)
			continue;

		pcc = (mcistatus & MSR_MC_STATUS_PCC) != 0;
		uc = (mcistatus & MSR_MC_STATUS_UC) != 0;
		mcesp->mce_npcc += pcc;
		mcesp->mce_nuc += uc;

		/* Let the model-specific module refine the error scope */
		ms_scope = cms_error_action(hdl, ismc, i, mcistatus,
		    gbl->gbl_addr, gbl->gbl_misc, gcl->gcl_ms_logout);

		if (pcc && ms_scope & CMS_ERRSCOPE_CURCONTEXT_OK) {
			pcc = 0;
			mcesp->mce_npcc_ok++;
			gbl->gbl_disp |= CMI_ERRDISP_PCC_CLEARED;
		}

		if (uc && ms_scope & CMS_ERRSCOPE_CLEARED_UC) {
			uc = 0;
			mcesp->mce_nuc_ok++;
			gbl->gbl_disp |= CMI_ERRDISP_UC_CLEARED;
		}

		/*
		 * 'poisoned' is only assigned when uc is nonzero; the
		 * short-circuit in the later "uc && !poisoned" test keeps
		 * it from being read uninitialized.
		 */
		if (uc) {
			poisoned = (ms_scope & CMS_ERRSCOPE_POISONED) != 0;
			if (poisoned) {
				mcesp->mce_nuc_poisoned++;
				gbl->gbl_disp |= CMI_ERRDISP_POISONED;
			}
		}

		if ((ms_scope & CMS_ERRSCOPE_IGNORE_ERR) == 0) {
			/*
			 * We're not being instructed to ignore the error,
			 * so apply our standard disposition logic to it.
			 */
			if (uc && !poisoned) {
				unconstrained++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_UC_UNCONSTRAINED;
			}

			if (pcc && ismc) {
				curctxbad++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_CURCTXBAD;
			}

			/*
			 * Even if the above may not indicate that the error
			 * is terminal, model-specific support may insist
			 * that we treat it as such.  Such errors will be
			 * fatal even if discovered via poll.
			 */
			if (ms_scope & CMS_ERRSCOPE_FORCE_FATAL) {
				forcefatal++;
				mcesp->mce_forcefatal++;
				gbl->gbl_disp |= disp |
				    CMI_ERRDISP_FORCEFATAL;
			}
		} else {
			mcesp->mce_ignored++;
			gbl->gbl_disp |= disp | CMI_ERRDISP_IGNORED;
		}
	}

	/* Fold the per-bank conclusions into the overall disposition */
	if (unconstrained > 0)
		disp |= CMI_ERRDISP_UC_UNCONSTRAINED;

	if (curctxbad > 0)
		disp |= CMI_ERRDISP_CURCTXBAD;

	if (forcefatal > 0)
		disp |= CMI_ERRDISP_FORCEFATAL;

	if (gcpu_mca_queue != NULL) {
		int how;

		if (ismc) {
			how = cmi_mce_response(rp, disp) ?
			    ERRORQ_ASYNC :	/* no panic, so arrange drain */
			    ERRORQ_SYNC;	/* panic flow will drain */
		} else {
			how = (disp & CMI_ERRDISP_FORCEFATAL &&
			    cmi_panic_on_ue()) ?
			    ERRORQ_SYNC :	/* poller will panic */
			    ERRORQ_ASYNC;	/* no panic */
		}

		errorq_dispatch(gcpu_mca_queue, gcl, mca->gcpu_mca_lgsz, how);
	} else if (disp != 0) {
		/* No errorq available - complain via cmn_err instead */
		gcpu_bleat(hdl, gcl);
	}

	mcesp->mce_disp = disp;

	return (disp);
}

/*
 * Gather error telemetry from our source, and then submit it for
 * processing.
 */

/* Enabled error that is uncorrected or has damaged processor context */
#define	IS_MCE_CANDIDATE(status) (((status) & MSR_MC_STATUS_EN) != 0 && \
	((status) & (MSR_MC_STATUS_UC | MSR_MC_STATUS_PCC)) != 0)

/* Status values equivalent modulo the OVerflow bit */
#define	STATUS_EQV(s1, s2) \
	(((s1) & ~MSR_MC_STATUS_OVER) == ((s2) & ~MSR_MC_STATUS_OVER))

/*
 * Count of MCi_STATUS clears we have deferred to a later poll.
 * NOTE(review): the 'deferrred' spelling is historical.
 */
static uint32_t gcpu_deferrred_polled_clears;

/*
 * Read MCG_STATUS/MCG_CAP and each active bank's STATUS/ADDR/MISC into
 * a logout structure, optionally clear bank status, and hand any errors
 * found to gcpu_mca_process.
 *
 * 'rp' is NULL for a poll and the trap regs for a #MC; 'bankmask'
 * restricts which banks a poll examines (ignored for a #MC); 'mcesp',
 * if non-NULL, receives a summary of what was found; 'clrstatus'
 * selects whether MCi_STATUS is cleared after capture.
 */
void
gcpu_mca_logout(cmi_hdl_t hdl, struct regs *rp, uint64_t bankmask,
    gcpu_mce_status_t *mcesp, boolean_t clrstatus)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	gcpu_mca_t *mca = &gcpu->gcpu_mca;
	int nbanks = mca->gcpu_mca_nbanks;
	gcpu_bank_logout_t *gbl, *pgbl;
	gcpu_logout_t *gcl, *pgcl;
	int ismc = (rp != NULL);
	int ispoll = !ismc;
	int i, nerr = 0;
	cmi_errno_t err;
	uint64_t mcg_status;
	uint64_t disp;
	uint64_t cap;

	/* Without MCG_STATUS and MCG_CAP we can do nothing useful */
	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
	    CMI_SUCCESS || cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_CAP, &cap) !=
	    CMI_SUCCESS) {
		if (mcesp != NULL)
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		return;
	}

	/*
	 * A #MC uses the dedicated exception logout; polls alternate
	 * between the two poller logouts so the previous observation
	 * remains available for the deferred-clear comparison below.
	 */
	if (ismc) {
		gcl = mca->gcpu_mca_logout[GCPU_MCA_LOGOUT_EXCEPTION];
	} else {
		int pidx = mca->gcpu_mca_nextpoll_idx;
		int ppidx = (pidx == GCPU_MCA_LOGOUT_POLLER_1) ?
		    GCPU_MCA_LOGOUT_POLLER_2 : GCPU_MCA_LOGOUT_POLLER_1;

		gcl = mca->gcpu_mca_logout[pidx];	/* current logout */
		pgcl = mca->gcpu_mca_logout[ppidx];	/* previous logout */
		mca->gcpu_mca_nextpoll_idx = ppidx;	/* switch next time */
	}

	gcl->gcl_timestamp = gethrtime_waitfree();
	gcl->gcl_mcg_status = mcg_status;
	gcl->gcl_ip = rp ? rp->r_pc : 0;

	/*
	 * NOTE(review): the flag name suggests privileged context but it
	 * is set when USERMODE() is true - looks inverted; verify against
	 * the consumers of GCPU_GCL_F_PRIV.
	 */
	gcl->gcl_flags = (rp && USERMODE(rp->r_cs)) ? GCPU_GCL_F_PRIV : 0;
	if (cap & MCG_CAP_TES_P)
		gcl->gcl_flags |= GCPU_GCL_F_TES_P;

	for (i = 0, gbl = &gcl->gcl_data[0]; i < nbanks; i++, gbl++) {
		uint64_t status, status2, addr, misc;
		int retries = gcpu_mca_telemetry_retries;

		gbl->gbl_status = 0;
		gbl->gbl_disp = 0;
		gbl->gbl_clrdefcnt = 0;

		/*
		 * Only logout from MCA banks we have initialized from at
		 * least one core.  If a core shares an MCA bank with another
		 * but perhaps lost the race to initialize it, then it must
		 * still be allowed to logout from the shared bank.
		 */
		if (!(gcpu->gcpu_shared->gcpus_actv_banks & 1 << i))
			continue;

		/*
		 * On a poll look only at the banks we've been asked to check.
		 */
		if (rp == NULL && !(bankmask & 1 << i))
			continue;


		if (cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS), &status) !=
		    CMI_SUCCESS)
			continue;
retry:
		if (!(status & MSR_MC_STATUS_VAL))
			continue;

		addr = -1;	/* all-ones sentinel: no address captured */
		misc = 0;

		if (status & MSR_MC_STATUS_ADDRV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, ADDR), &addr);

		if (status & MSR_MC_STATUS_MISCV)
			(void) cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, MISC), &misc);

		/*
		 * Allow the model-specific code to extract bank telemetry.
		 */
		cms_bank_logout(hdl, i, status, addr, misc, gcl->gcl_ms_logout);

		/*
		 * Not all cpu models assure us that the status/address/misc
		 * data will not change during the above sequence of MSR reads,
		 * or that it can only change by the addition of the OVerflow
		 * bit to the status register.  If the status has changed
		 * other than in the overflow bit then we attempt to reread
		 * for a consistent snapshot, but eventually give up and
		 * go with what we've got.  We only perform this check
		 * for a poll - a further #MC during a #MC will reset, and
		 * polled errors should not overwrite higher-priority
		 * trapping errors (but could set the overflow bit).
		 */
		if (ispoll && (err = cmi_hdl_rdmsr(hdl, IA32_MSR_MC(i, STATUS),
		    &status2)) == CMI_SUCCESS) {
			if (!STATUS_EQV(status, status2)) {
				if (retries-- > 0) {
					status = status2;
					goto retry;
				} else {
					gbl->gbl_disp |=
					    CMI_ERRDISP_INCONSISTENT;
				}
			}
		} else if (ispoll && err != CMI_SUCCESS) {
			gbl->gbl_disp |= CMI_ERRDISP_INCONSISTENT;
		}

		nerr++;
		gbl->gbl_status = status;
		gbl->gbl_addr = addr;
		gbl->gbl_misc = misc;

		if (clrstatus == B_FALSE)
			goto serialize;

		/*
		 * For machine checks we always clear status here.  For polls
		 * we must be a little more cautious since there is an
		 * outside chance that we may clear telemetry from a shared
		 * MCA bank on which a sibling core is machine checking.
		 *
		 * For polled observations of errors that look like they may
		 * produce a machine check (UC/PCC and ENabled, although these
		 * do not guarantee a machine check on error occurrence)
		 * we will not clear the status at this wakeup unless
		 * we saw the same status at the previous poll.  We will
		 * always process and log the current observations - it
		 * is only the clearing of MCi_STATUS which may be
		 * deferred until the next wakeup.
		 */
		if (ismc || !IS_MCE_CANDIDATE(status)) {
			(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MC(i, STATUS), 0ULL);
			goto serialize;
		}

		/*
		 * We have a polled observation of a machine check
		 * candidate.  If we saw essentially the same status at the
		 * last poll then clear the status now since this appears
		 * not to be a #MC candidate after all.  If we see quite
		 * different status now then do not clear, but reconsider at
		 * the next poll.  If no actual machine check clears
		 * the status in the interim then the status should not
		 * keep changing forever (meaning we'd never clear it)
		 * since before long we'll simply have latched the highest-
		 * priority error and set the OVerflow bit.  Nonetheless
		 * we count how many times we defer clearing and after
		 * a while insist on clearing the status.
		 *
		 * This path is only reached on a poll (the ismc case went
		 * to serialize above), so pgcl is always valid here.
		 */
		pgbl = &pgcl->gcl_data[i];
		if (pgbl->gbl_clrdefcnt != 0) {
			/* We deferred clear on this bank at last wakeup */
			if (STATUS_EQV(status, pgcl->gcl_data[i].gbl_status) ||
			    pgbl->gbl_clrdefcnt > 5) {
				/*
				 * Status is unchanged so clear it now and,
				 * since we have already logged this info,
				 * avoid logging it again.
				 */
				gbl->gbl_status = 0;
				nerr--;
				(void) cmi_hdl_wrmsr(hdl,
				    IA32_MSR_MC(i, STATUS), 0ULL);
			} else {
				/* Record deferral for next wakeup */
				gbl->gbl_clrdefcnt = pgbl->gbl_clrdefcnt + 1;
			}
		} else {
			/* Record initial deferral for next wakeup */
			gbl->gbl_clrdefcnt = 1;
			gcpu_deferrred_polled_clears++;
		}

serialize:
		/*
		 * Intel Vol 3A says to execute a serializing instruction
		 * here, ie CPUID.  Well WRMSR is also defined to be
		 * serializing, so the status clear above should suffice.
		 * To be a good citizen, and since some clears are deferred,
		 * we'll execute a CPUID instruction here.
		 */
		{
			struct cpuid_regs tmp;
			(void) __cpuid_insn(&tmp);
		}
	}

	/* Optionally capture a stack trace for debug assistance */
	if (gcpu_mca_stack_flag)
		gcl->gcl_stackdepth = getpcstack(gcl->gcl_stack, FM_STK_DEPTH);
	else
		gcl->gcl_stackdepth = 0;

	/*
	 * Decide our disposition for this error or errors, and submit for
	 * logging and subsequent diagnosis.
	 */
	if (nerr != 0) {
		disp = gcpu_mca_process(hdl, rp, nerr, gcpu, gcl, ismc, mcesp);
	} else {
		disp = 0;
		if (mcesp) {
			mcesp->mce_nerr = mcesp->mce_disp = 0;
		}
	}

	/*
	 * Clear MCG_STATUS if MCIP is set (machine check in progress).
	 * If a second #MC had occurred before now the system would have
	 * reset.  We can only do this once gcpu_mca_process has copied
	 * the logout structure.
	 */
	if (ismc && mcg_status & MCG_STATUS_MCIP)
		(void) cmi_hdl_wrmsr(hdl, IA32_MSR_MCG_STATUS, 0);

	/*
	 * At this point we have read and logged all telemetry that is visible
	 * under the MCA.  On architectures for which the NorthBridge is
	 * on-chip this may include NB-observed errors, but where the NB
	 * is off chip it may have been the source of the #MC request and
	 * so we must call into the memory-controller driver to give it
	 * a chance to log errors.
	 */
	if (ismc) {
		int willpanic = (cmi_mce_response(rp, disp) == 0);
		cmi_mc_logout(hdl, 1, willpanic);
	}
}

/* Debug assist: set nonzero to cmn_err a summary of each #MC handled */
int gcpu_mca_trap_vomit_summary = 0;

/*
 * On a native machine check exception we come here from mcetrap via
 * cmi_mca_trap.  A machine check on one cpu of a chip does not trap others
 * cpus of the chip, so it is possible that another cpu on this chip could
 * initiate a poll while we're in the #mc handler; it is also possible that
 * this trap has occurred during a poll on this cpu.  So we must acquire
 * the chip-wide poll lock, but be careful to avoid deadlock.
 *
 * The 'data' pointer cannot be NULL due to init order.
 */
uint64_t
gcpu_mca_trap(cmi_hdl_t hdl, struct regs *rp)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);
	kmutex_t *poll_lock = NULL;
	gcpu_mce_status_t mce;
	uint64_t mcg_status;
	int tooklock = 0;

	/*
	 * If we cannot read MCG_STATUS, or MCIP is not set (no machine
	 * check in progress), there is nothing for us to do here.
	 */
	if (cmi_hdl_rdmsr(hdl, IA32_MSR_MCG_STATUS, &mcg_status) !=
	    CMI_SUCCESS || !(mcg_status & MCG_STATUS_MCIP))
		return (0);

	/*
	 * Synchronize with any poller from another core that may happen
	 * to share access to one or more of the MCA banks.
	 */
	if (gcpu->gcpu_shared != NULL)
		poll_lock = &gcpu->gcpu_shared->gcpus_poll_lock;

	if (poll_lock != NULL && !mutex_owned(poll_lock)) {
		/*
		 * The lock is not owned by the thread we have
		 * interrupted.  Spin for this adaptive lock.
		 * (We cannot block in #MC context; if this thread
		 * already held the lock we simply proceed without it.)
		 */
		while (!mutex_tryenter(poll_lock)) {
			while (mutex_owner(poll_lock) != NULL)
				;
		}
		tooklock = 1;
	}

	/*
	 * bankmask of 0 is fine here - with rp non-NULL the logout code
	 * ignores bankmask and reads all active banks.
	 */
	gcpu_mca_logout(hdl, rp, 0, &mce, B_TRUE);

	if (tooklock)
		mutex_exit(poll_lock);

	/*
	 * gcpu_mca_trap_vomit_summary may be set for debug assistance.
	 * NOTE(review): mce_nuc_ok is printed with %d while its siblings
	 * use %u - harmless but inconsistent.
	 */
	if (mce.mce_nerr != 0 && gcpu_mca_trap_vomit_summary) {
		cmn_err(CE_WARN, "MCE: %u errors, disp=0x%llx, "
		    "%u PCC (%u ok), "
		    "%u UC (%d ok, %u poisoned), "
		    "%u forcefatal, %u ignored",
		    mce.mce_nerr, (u_longlong_t)mce.mce_disp,
		    mce.mce_npcc, mce.mce_npcc_ok,
		    mce.mce_nuc, mce.mce_nuc_ok, mce.mce_nuc_poisoned,
		    mce.mce_forcefatal, mce.mce_ignored);
	}

	return (mce.mce_disp);
}

/* Entry into the faulted state requires no generic action */
/*ARGSUSED*/
void
gcpu_faulted_enter(cmi_hdl_t hdl)
{
	/* Nothing to do here */
}

/*
 * Leaving the faulted state: flag the cpu so MCA state is reestablished.
 * NOTE(review): presumably the poller notices GCPU_MCA_F_UNFAULTING and
 * reinitializes - confirm against the consumer of this flag.
 */
/*ARGSUSED*/
void
gcpu_faulted_exit(cmi_hdl_t hdl)
{
	gcpu_data_t *gcpu = cmi_hdl_getcmidata(hdl);

	gcpu->gcpu_mca.gcpu_mca_flags |= GCPU_MCA_F_UNFAULTING;
}

/*
 * Write the requested values to the indicated MSRs.
Having no knowledge 1641 * of the model-specific requirements for writing to these model-specific 1642 * registers, we will only blindly write to those MSRs if the 'force' 1643 * argument is nonzero. That option should only be used in prototyping 1644 * and debugging. 1645 */ 1646 /*ARGSUSED*/ 1647 cmi_errno_t 1648 gcpu_msrinject(cmi_hdl_t hdl, cmi_mca_regs_t *regs, uint_t nregs, 1649 int force) 1650 { 1651 int i, errs = 0; 1652 1653 for (i = 0; i < nregs; i++) { 1654 uint_t msr = regs[i].cmr_msrnum; 1655 uint64_t val = regs[i].cmr_msrval; 1656 1657 if (cms_present(hdl)) { 1658 if (cms_msrinject(hdl, msr, val) != CMS_SUCCESS) 1659 errs++; 1660 } else if (force) { 1661 errs += (cmi_hdl_wrmsr(hdl, msr, val) != CMI_SUCCESS); 1662 } else { 1663 errs++; 1664 } 1665 } 1666 1667 return (errs == 0 ? CMI_SUCCESS : CMIERR_UNKNOWN); 1668 } 1669