amd64.esc (5f25dc2a) | amd64.esc (a307a255) |
---|---|
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 98 unchanged lines hidden (view full) --- 107 * - mem_ce : reported by nb for an access from a remote cpu 108 * 109 * Single-bit errors are fed into a per-DIMM SERD engine; if a SERD engine 110 * trips we diagnose a fault.memory.page so that the response agent can 111 * retire the page that caused the trip. If the total number of pages 112 * faulted in this way on a single DIMM exceeds a threshold we will 113 * diagnose a fault.memory.dimm_sb against the DIMM. 114 * | 1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE --- 98 unchanged lines hidden (view full) --- 107 * - mem_ce : reported by nb for an access from a remote cpu 108 * 109 * Single-bit errors are fed into a per-DIMM SERD engine; if a SERD engine 110 * trips we diagnose a fault.memory.page so that the response agent can 111 * retire the page that caused the trip. If the total number of pages 112 * faulted in this way on a single DIMM exceeds a threshold we will 113 * diagnose a fault.memory.dimm_sb against the DIMM. 114 * |
115 * Multibit ChipKill-correctable errors produce an immediate page fault 116 * and corresponding fault.memory.dimm_ck. This is achieved through 117 * SERD engines using N=0 so the facility is there to be a little more 118 * tolerant of these errors. | 115 * Multibit ChipKill-correctable errors produce an immediate page fault. 116 * This is achieved through SERD engines using N=0 so the facility is there 117 * to be a little more tolerant of these errors in future. |
119 * 120 * Uncorrectable errors produce an immediate page fault and corresponding 121 * fault.memory.dimm_ue. 122 * 123 * Page faults are essentially internal - action is only required when 124 * they are accompanied by a dimm fault. As such we include message=0 125 * on DIMM faults. 126 */ --- 98 unchanged lines hidden (view full) --- 225prop upset.memory.discard@dimm (1)-> 226 ereport.memory.dimm_sb_trip@dimm; 227 228prop fault.memory.dimm_sb@dimm (0)-> 229 ereport.memory.dimm_sb_trip@dimm { 230 count(stat.page_fault@dimm) >= DIMM_SB_THRESH }; 231 232/* #DIMM_CK# | 118 * 119 * Uncorrectable errors produce an immediate page fault and corresponding 120 * fault.memory.dimm_ue. 121 * 122 * Page faults are essentially internal - action is only required when 123 * they are accompanied by a dimm fault. As such we include message=0 124 * on DIMM faults. 125 */ --- 98 unchanged lines hidden (view full) --- 224prop upset.memory.discard@dimm (1)-> 225 ereport.memory.dimm_sb_trip@dimm; 226 227prop fault.memory.dimm_sb@dimm (0)-> 228 ereport.memory.dimm_sb_trip@dimm { 229 count(stat.page_fault@dimm) >= DIMM_SB_THRESH }; 230 231/* #DIMM_CK# |
233 * ChipKill-correctable multi-bit faults indicate a likely failing SDRAM 234 * part. We will SERD them but with a very low/zero tolerance. | 232 * ChipKill-correctable multi-bit errors produce immediate page faults. 233 * If the fault is indeed isolated to just a few cells then we have contained 234 * the error; if not, say if the SDRAM device is failing, then we will hit a 235 * number of other similar errors in a short space of time. Thus we will 236 * SERD these in diagnosing a fault.memory.dimm_ck and not simply fault 237 * the DIMM at the first instance. |
235 */ 236 237#define DIMM_CK_FIT 4000 | 238 */ 239 240#define DIMM_CK_FIT 4000 |
238#define DIMM_CK_COUNT 0 239#define DIMM_CK_TIME 1h | 241#define DIMM_CK_COUNT 2 242#define DIMM_CK_TIME 72h |
240 241event fault.memory.dimm_ck@dimm, FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm, 242 action=confcall("rewrite-ASRU"); 243 244event ereport.memory.dimm_ck_trip@dimm; 245engine serd.memory.dimm_ck@dimm, N=DIMM_CK_COUNT, T=DIMM_CK_TIME, 246 method=persistent, trip=ereport.memory.dimm_ck_trip@dimm; 247event upset.memory.dimm_ck@dimm, engine=serd.memory.dimm_ck@dimm; --- 489 unchanged lines hidden (view full) --- 737 738/* 739 * A single bit fault in the datapath between the NB and requesting core 740 * can cause: 741 * 742 * - inf_sys_ecc1 : reported by ic on access from a local cpu 743 * - inf_sys_ecc1 : reported by dc on access from a local cpu 744 * - s_ecc1 : reported by bu on access from a local cpu (hw prefetch etc) | 243 244event fault.memory.dimm_ck@dimm, FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm, 245 action=confcall("rewrite-ASRU"); 246 247event ereport.memory.dimm_ck_trip@dimm; 248engine serd.memory.dimm_ck@dimm, N=DIMM_CK_COUNT, T=DIMM_CK_TIME, 249 method=persistent, trip=ereport.memory.dimm_ck_trip@dimm; 250event upset.memory.dimm_ck@dimm, engine=serd.memory.dimm_ck@dimm; --- 489 unchanged lines hidden (view full) --- 740 741/* 742 * A single bit fault in the datapath between the NB and requesting core 743 * can cause: 744 * 745 * - inf_sys_ecc1 : reported by ic on access from a local cpu 746 * - inf_sys_ecc1 : reported by dc on access from a local cpu 747 * - s_ecc1 : reported by bu on access from a local cpu (hw prefetch etc) |
748 * 749 * Empirical observations show that in 64/8 ECC mode some memory CEs *can* 750 * travel past the DRAM controller and on to the IC/DC/BU to be reported 751 * via the above errors. This is not the case with ChipKill enabled. 752 * We should not be diagnosing datapath/chip errors for these. While 753 * this behaviour is clarified the serd parameters will be set to infinity 754 * (and the multibit counterparts will not be seen because of sync flood). |
|
745 */ 746 | 755 */ 756 |
747#define CPU_DP_COUNT 3 748#define CPU_DP_TIME 12h | 757#define CPU_DP_COUNT 5000 758#define CPU_DP_TIME 1m |
749 750event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)}; 751event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)}; 752event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)}; 753event upset.cpu.dp_sb@chip/cpu, engine=serd.cpu.dp_sb@chip/cpu; 754event ereport.cpu.amd.dp_sb_trip@chip/cpu; 755 756engine serd.cpu.dp_sb@chip/cpu, N=CPU_DP_COUNT, T=CPU_DP_TIME, --- 87 unchanged lines hidden --- | 759 760event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)}; 761event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)}; 762event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)}; 763event upset.cpu.dp_sb@chip/cpu, engine=serd.cpu.dp_sb@chip/cpu; 764event ereport.cpu.amd.dp_sb_trip@chip/cpu; 765 766engine serd.cpu.dp_sb@chip/cpu, N=CPU_DP_COUNT, T=CPU_DP_TIME, --- 87 unchanged lines hidden --- |