amd64.esc (5f25dc2a) amd64.esc (a307a255)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 98 unchanged lines hidden (view full) ---

107 * - mem_ce : reported by nb for an access from a remote cpu
108 *
109 * Single-bit errors are fed into a per-DIMM SERD engine; if a SERD engine
110 * trips we diagnose a fault.memory.page so that the response agent can
111 * retire the page that caused the trip. If the total number of pages
112 * faulted in this way on a single DIMM exceeds a threshold we will
113 * diagnose a fault.memory.dimm_sb against the DIMM.
114 *
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 98 unchanged lines hidden (view full) ---

107 * - mem_ce : reported by nb for an access from a remote cpu
108 *
109 * Single-bit errors are fed into a per-DIMM SERD engine; if a SERD engine
110 * trips we diagnose a fault.memory.page so that the response agent can
111 * retire the page that caused the trip. If the total number of pages
112 * faulted in this way on a single DIMM exceeds a threshold we will
113 * diagnose a fault.memory.dimm_sb against the DIMM.
114 *
115 * Multibit ChipKill-correctable errors produce an immediate page fault
116 * and corresponding fault.memory.dimm_ck. This is achieved through
117 * SERD engines using N=0 so the facility is there to be a little more
118 * tolerant of these errors.
115 * Multibit ChipKill-correctable errors produce an immediate page fault.
116 * This is achieved through SERD engines using N=0 so the facility is there
117 * to be a little more tolerant of these errors in future.
119 *
120 * Uncorrectable errors produce an immediate page fault and corresponding
121 * fault.memory.dimm_ue.
122 *
123 * Page faults are essentially internal - action is only required when
124 * they are accompanied by a dimm fault. As such we include message=0
125 * on DIMM faults.
126 */

--- 98 unchanged lines hidden (view full) ---

225prop upset.memory.discard@dimm (1)->
226 ereport.memory.dimm_sb_trip@dimm;
227
228prop fault.memory.dimm_sb@dimm (0)->
229 ereport.memory.dimm_sb_trip@dimm {
230 count(stat.page_fault@dimm) >= DIMM_SB_THRESH };
231
232/* #DIMM_CK#
118 *
119 * Uncorrectable errors produce an immediate page fault and corresponding
120 * fault.memory.dimm_ue.
121 *
122 * Page faults are essentially internal - action is only required when
123 * they are accompanied by a dimm fault. As such we include message=0
124 * on DIMM faults.
125 */

--- 98 unchanged lines hidden (view full) ---

224prop upset.memory.discard@dimm (1)->
225 ereport.memory.dimm_sb_trip@dimm;
226
227prop fault.memory.dimm_sb@dimm (0)->
228 ereport.memory.dimm_sb_trip@dimm {
229 count(stat.page_fault@dimm) >= DIMM_SB_THRESH };
230
231/* #DIMM_CK#
233 * ChipKill-correctable multi-bit faults indicate a likely failing SDRAM
234 * part. We will SERD them but with a very low/zero tolerance.
232 * ChipKill-correctable multi-bit errors produce immediate page faults.
233 * If the fault is indeed isolated to just a few cells then we have contained
234 * the error; if not (for example, if the SDRAM device is failing) then we
235 * will hit a number of other similar errors in a short space of time. Thus
236 * we SERD these when diagnosing a fault.memory.dimm_ck rather than simply
237 * faulting the DIMM at the first instance.
235 */
236
237#define DIMM_CK_FIT 4000
238 */
239
240#define DIMM_CK_FIT 4000
238#define DIMM_CK_COUNT 0
239#define DIMM_CK_TIME 1h
241#define DIMM_CK_COUNT 2
242#define DIMM_CK_TIME 72h
240
241event fault.memory.dimm_ck@dimm, FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm,
242 action=confcall("rewrite-ASRU");
243
244event ereport.memory.dimm_ck_trip@dimm;
245engine serd.memory.dimm_ck@dimm, N=DIMM_CK_COUNT, T=DIMM_CK_TIME,
246 method=persistent, trip=ereport.memory.dimm_ck_trip@dimm;
247event upset.memory.dimm_ck@dimm, engine=serd.memory.dimm_ck@dimm;

--- 489 unchanged lines hidden (view full) ---

737
738/*
739 * A single bit fault in the datapath between the NB and requesting core
740 * can cause:
741 *
742 * - inf_sys_ecc1 : reported by ic on access from a local cpu
743 * - inf_sys_ecc1 : reported by dc on access from a local cpu
744 * - s_ecc1 : reported by bu on access from a local cpu (hw prefetch etc)
243
244event fault.memory.dimm_ck@dimm, FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm,
245 action=confcall("rewrite-ASRU");
246
247event ereport.memory.dimm_ck_trip@dimm;
248engine serd.memory.dimm_ck@dimm, N=DIMM_CK_COUNT, T=DIMM_CK_TIME,
249 method=persistent, trip=ereport.memory.dimm_ck_trip@dimm;
250event upset.memory.dimm_ck@dimm, engine=serd.memory.dimm_ck@dimm;

--- 489 unchanged lines hidden (view full) ---

740
741/*
742 * A single bit fault in the datapath between the NB and requesting core
743 * can cause:
744 *
745 * - inf_sys_ecc1 : reported by ic on access from a local cpu
746 * - inf_sys_ecc1 : reported by dc on access from a local cpu
747 * - s_ecc1 : reported by bu on access from a local cpu (hw prefetch etc)
748 *
749 * Empirical observations show that in 64/8 ECC mode some memory CEs *can*
750 * travel past the DRAM controller and on to the IC/DC/BU to be reported
751 * via the above errors. This is not the case with ChipKill enabled.
752 * We should not be diagnosing datapath/chip errors for these. Until
753 * this behaviour is clarified the serd parameters will be set to infinity
754 * (and the multibit counterparts will not be seen because of sync flood).
745 */
746
755 */
756
747#define CPU_DP_COUNT 3
748#define CPU_DP_TIME 12h
757#define CPU_DP_COUNT 5000
758#define CPU_DP_TIME 1m
749
750event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
751event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
752event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
753event upset.cpu.dp_sb@chip/cpu, engine=serd.cpu.dp_sb@chip/cpu;
754event ereport.cpu.amd.dp_sb_trip@chip/cpu;
755
756engine serd.cpu.dp_sb@chip/cpu, N=CPU_DP_COUNT, T=CPU_DP_TIME,

--- 87 unchanged lines hidden ---
759
760event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
761event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
762event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
763event upset.cpu.dp_sb@chip/cpu, engine=serd.cpu.dp_sb@chip/cpu;
764event ereport.cpu.amd.dp_sb_trip@chip/cpu;
765
766engine serd.cpu.dp_sb@chip/cpu, N=CPU_DP_COUNT, T=CPU_DP_TIME,

--- 87 unchanged lines hidden ---