/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

fru dimm;
asru dimm;

fru chip;
asru chip/cpu;


/*								#MEM#
 * GET_ADDR relies on the fact that variables have global scope across an FME.
 * Thus for each FME the assignment only occurs for the first invocation
 * but the comparison happens on each. Thus if the new address matches the
 * address of an existing open FME, then we return true running in the context
 * of that FME. If the new address doesn't match the address of any existing
 * open FME, then we return true in the context of a newly opened FME.
 */
#define GET_ADDR (defined($addr) ? ($addr == payloadprop("addr")) :	\
	($addr = payloadprop("addr")))
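
/*
 * Illustrative walkthrough (not part of the rules): suppose, hypothetically,
 * that two nb.mem_ce ereports arrive with "addr" payloads 0x1000 and 0x2000.
 * For the first, $addr is undefined in every open FME, so GET_ADDR assigns
 * $addr = 0x1000 and evaluates true in a newly opened FME.  For the second,
 * the comparison 0x2000 == 0x1000 fails in that FME, so it is handled in a
 * fresh FME where $addr = 0x2000.  A later ereport for 0x1000 matches the
 * first FME and is folded into it.
 */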

#define	GET_OFFSET ($offset = payloadprop("resource[0].hc-specific.offset"))

/*
 * SET_ADDR is used to set a payload value in the fault that we diagnose
 * for page faults, to record the physical address of the faulting page.
 */
#define	SET_ADDR (setpayloadprop("asru-physaddr", $addr))

#define	SET_OFFSET (setpayloadprop("asru-offset", $offset))

/*
 * RESOURCE_EXISTS is true if a pair with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define	RESOURCE_EXISTS	(payloadprop_defined("resource"))

/*
 * CONTAINS_DIMM is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * dimm node.  Our memory propagations are of the form "foo@dimm -> blah@cpu"
 * since cpus detect memory errors;  in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms and cpus in the system.  We use CONTAINS_DIMM to constrain
 * the propagation such that it only happens if the payload resource
 * matches the dimm.
 */
#define	CONTAINS_DIMM (payloadprop_contains("resource", asru(dimm)))

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ecc1) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 */

#define	CBITMASK(synd) ((synd) & 0xf)

#define	CKSINGLE(synd)							\
	((synd) == 0 ||							\
	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 ||		\
	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

#define	SINGLE_BIT_CE							\
	(payloadprop("syndrome-type") == "E" ||				\
	(payloadprop("syndrome-type") == "C" &&				\
	CKSINGLE(payloadprop("syndrome"))))

#define	MULTI_BIT_CE							\
	(payloadprop("syndrome-type") == "C" &&				\
	!CKSINGLE(payloadprop("syndrome")))
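
/*
 * Illustrative examples (not part of the rules): a ChipKill syndrome of 0x11
 * has CBITMASK 0x1 (one bit set in the low nibble), so CKSINGLE is true and,
 * with syndrome-type "C", the ereport is classed as a single-bit CE.  A
 * syndrome of 0x13 has CBITMASK 0x3 (two bits set), so it is classed as a
 * multi-bit CE.  A syndrome-type of "E" is always treated as single-bit by
 * SINGLE_BIT_CE.
 */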

/*
 * A single bit fault in a memory dimm can cause:
 *
 *  - mem_ce : reported by nb for an access from a remote cpu
 *
 * Single-bit errors are fed into a per-DIMM SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single DIMM exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the DIMM.
 *
 * Multi-bit ChipKill-correctable errors produce an immediate page fault
 * and corresponding fault.memory.dimm_ck.  This is achieved through
 * SERD engines using N=0, so the facility is there to be a little more
 * tolerant of these errors.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.nb.mem_ce@cpu;

/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
 * We will discard such ereports.  An alternative may be to SERD them
 * on a per-MC basis and trip if we see too many such events.
 */

event upset.memory.discard@cpu;

/*								#PAGE#
 * Page faults of all types diagnose to a single fault class and are
 * counted with a stat.
 *
 * Single-bit errors are diagnosed as upsets and feed into per-DIMM
 * SERD engines which diagnose fault.memory.page if they trip.
 */

#define PAGE_FIT		1
#define PAGE_SB_COUNT		2
#define PAGE_SB_TIME		72h
#define	PAGE_CK_COUNT		0
#define	PAGE_CK_TIME		1h

engine stat.page_fault@dimm;
event fault.memory.page@dimm, FITrate=PAGE_FIT,
    ASRU=dimm, message=0, count=stat.page_fault@dimm,
    action=confcall("rewrite-ASRU");
event error.memory.page_sb@dimm;
event error.memory.page_ck@dimm;
event error.memory.page_ue@dimm;

prop fault.memory.page@dimm (1)->
    error.memory.page_sb@dimm,
    error.memory.page_ck@dimm,
    error.memory.page_ue@dimm;

event ereport.memory.page_sb_trip@dimm;
engine serd.memory.page_sb@dimm, N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
    method=persistent, trip=ereport.memory.page_sb_trip@dimm;
event upset.memory.page_sb@dimm, engine=serd.memory.page_sb@dimm;

event ereport.memory.page_ck_trip@dimm;
engine serd.memory.page_ck@dimm, N=PAGE_CK_COUNT, T=PAGE_CK_TIME,
    method=persistent, trip=ereport.memory.page_ck_trip@dimm;
event upset.memory.page_ck@dimm, engine=serd.memory.page_ck@dimm;

prop upset.memory.page_sb@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && SINGLE_BIT_CE };

prop upset.memory.page_ck@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE };

prop error.memory.page_sb@dimm (1)->
    ereport.memory.page_sb_trip@dimm;

prop error.memory.page_ck@dimm (1)->
    ereport.memory.page_ck_trip@dimm;

prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && GET_ADDR && GET_OFFSET };
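
/*
 * Note (illustrative, not part of the rules): when a page_sb or page_ck
 * engine trips and fault.memory.page is diagnosed, the zero-probability
 * propagation above evaluates GET_ADDR/GET_OFFSET against the matching
 * mem_ce ereport and SET_ADDR/SET_OFFSET record the values in the fault
 * as "asru-physaddr" and "asru-offset", identifying the page to be retired.
 */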

prop upset.memory.discard@cpu (1)->
    ereport.cpu.amd.nb.mem_ce@cpu { !RESOURCE_EXISTS };

/*								#DIMM_SB#
 * Single-bit DIMM faults are diagnosed when the number of page faults
 * (of all types, since they are all counted in a single per-DIMM stat engine)
 * reaches a threshold.  Since our tolerance of ChipKill and UE faults
 * is much lower than that for single-bit errors the threshold will only be
 * reached for repeated single-bit page faults.  We do not stop diagnosing
 * further single-bit page faults once we have declared a single-bit DIMM
 * fault - we continue diagnosing them and response agents can continue to
 * retire those pages up to the system-imposed retirement limit.
 *
 * We maintain a parallel SERD engine to the page_sb engine which trips
 * in unison, but on trip it generates a distinct ereport which we
 * diagnose to a dimm_sb fault if the threshold has been reached, or
 * to a throwaway upset if not.
 */

#define DIMM_SB_FIT		2000
#define DIMM_SB_THRESH		128

event fault.memory.dimm_sb@dimm, FITrate=DIMM_SB_FIT, FRU=dimm, ASRU=dimm;

event ereport.memory.dimm_sb_trip@dimm;
event upset.memory.discard@dimm;
engine serd.memory.dimm_sb@dimm, N=PAGE_SB_COUNT, T=PAGE_SB_TIME,
    method=persistent, trip=ereport.memory.dimm_sb_trip@dimm;
event upset.memory.dimm_sb@dimm, engine=serd.memory.dimm_sb@dimm;

prop upset.memory.dimm_sb@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM };	/* sb and ck */

prop upset.memory.discard@dimm (1)->
    ereport.memory.dimm_sb_trip@dimm;

prop fault.memory.dimm_sb@dimm (0)->
    ereport.memory.dimm_sb_trip@dimm {
	count(stat.page_fault@dimm) >= DIMM_SB_THRESH };
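
/*
 * Illustrative walkthrough (not part of the rules): each diagnosed
 * fault.memory.page increments stat.page_fault@dimm.  serd.memory.dimm_sb
 * runs in parallel with serd.memory.page_sb (same N and T); when it trips
 * it generates ereport.memory.dimm_sb_trip, and the constraint above then
 * compares the running page-fault count with DIMM_SB_THRESH (128).  Below
 * the threshold the trip is absorbed by upset.memory.discard@dimm; at or
 * above it, fault.memory.dimm_sb is also diagnosed against the DIMM.
 */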

/*								#DIMM_CK#
 * ChipKill-correctable multi-bit faults indicate a likely failing SDRAM
 * part.  We will SERD them but with a very low/zero tolerance.
 */

#define DIMM_CK_FIT		4000
#define	DIMM_CK_COUNT		0
#define	DIMM_CK_TIME		1h

event fault.memory.dimm_ck@dimm, FITrate=DIMM_CK_FIT, FRU=dimm, ASRU=dimm;

event ereport.memory.dimm_ck_trip@dimm;
engine serd.memory.dimm_ck@dimm, N=DIMM_CK_COUNT, T=DIMM_CK_TIME,
    method=persistent, trip=ereport.memory.dimm_ck_trip@dimm;
event upset.memory.dimm_ck@dimm, engine=serd.memory.dimm_ck@dimm;

prop upset.memory.dimm_ck@dimm (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE };

prop fault.memory.dimm_ck@dimm (1)->
    ereport.memory.dimm_ck_trip@dimm;

prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ce@cpu { CONTAINS_DIMM && MULTI_BIT_CE &&
    GET_ADDR && GET_OFFSET };

/* 								#DIMM_UE#
 * A multi-bit fault in a memory dimm can cause:
 *
 *  - ue    : reported by nb for an access from a remote cpu
 *
 * Note we use a SERD engine here simply as a way of ensuring that we get
 * both dimm and page faults reported.
 */

#define DIMM_UE_FIT		6000

event ereport.cpu.amd.nb.mem_ue@cpu;
event ereport.memory.page_ue_trip@dimm;
event ereport.memory.dimm_ue_trip@dimm;
event fault.memory.dimm_ue@dimm, FITrate=DIMM_UE_FIT, FRU=dimm, ASRU=dimm;
event upset.memory.page_ue@dimm, engine=serd.memory.page_ue@dimm;
event upset.memory.dimm_ue@dimm, engine=serd.memory.dimm_ue@dimm;

engine serd.memory.dimm_ue@dimm, N=0, T=1h,
    method=persistent, trip=ereport.memory.dimm_ue_trip@dimm;

engine serd.memory.page_ue@dimm, N=0, T=1h,
    method=persistent, trip=ereport.memory.page_ue_trip@dimm;

prop upset.memory.page_ue@dimm (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM };

prop upset.memory.dimm_ue@dimm (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM };

prop error.memory.page_ue@dimm (1)->
    ereport.memory.page_ue_trip@dimm;

prop fault.memory.page@dimm { SET_ADDR && SET_OFFSET } (0)->
    ereport.cpu.amd.nb.mem_ue@cpu { CONTAINS_DIMM && GET_ADDR && GET_OFFSET };

prop fault.memory.dimm_ue@dimm (1)->
    ereport.memory.dimm_ue_trip@dimm;

prop upset.memory.discard@cpu (1)->
    ereport.cpu.amd.nb.mem_ue@cpu { !RESOURCE_EXISTS };

/*								#L2D#
 * l2 cache data errors.
 */

#define L2CACHEDATA_FIT		1000
#define L2CACHEDATA_SB_COUNT	3
#define L2CACHEDATA_SB_TIME	12h

event fault.cpu.amd.l2cachedata@chip/cpu, FITrate=L2CACHEDATA_FIT,
	FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachedata_sb@chip/cpu;
event error.cpu.amd.l2cachedata_mb@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (1)->
    error.cpu.amd.l2cachedata_sb@chip/cpu,
    error.cpu.amd.l2cachedata_mb@chip/cpu;

/* 								#L2D_SINGLE#
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 *
 * Single-bit errors are diagnosed to cache upsets.  SERD engines are used
 * to count upsets resulting from CEs.
 */

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.l2d_sb_trip@chip/cpu;

engine serd.cpu.amd.l2d_sb@chip/cpu,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2d_sb_trip@chip/cpu;

event upset.cpu.amd.l2d_sb@chip/cpu,
	engine=serd.cpu.amd.l2d_sb@chip/cpu;

prop upset.cpu.amd.l2d_sb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

prop error.cpu.amd.l2cachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.l2d_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;
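
/*
 * Illustrative walkthrough (not part of the rules): each inf_l2_ecc1 or
 * l2d_ecc1 ereport feeds upset.cpu.amd.l2d_sb and hence the
 * serd.cpu.amd.l2d_sb engine.  When that engine's threshold
 * (N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME) is exceeded it trips,
 * generating ereport.cpu.amd.l2d_sb_trip, which propagates through
 * error.cpu.amd.l2cachedata_sb to diagnose fault.cpu.amd.l2cachedata.
 * The same upset/SERD/trip pattern is used for the l2 tag, icache,
 * dcache and datapath faults below.
 */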

/* 								#L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu;
event ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop error.cpu.amd.l2cachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

/*								#L2T#
 * l2 cache main tag errors
 */

#define L2CACHETAG_FIT		1000
#define L2CACHETAG_SB_COUNT	3
#define L2CACHETAG_SB_TIME	12h

event fault.cpu.amd.l2cachetag@chip/cpu, FITrate=L2CACHETAG_FIT,
	FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.l2cachetag_sb@chip/cpu;
event error.cpu.amd.l2cachetag_mb@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (1)->
    error.cpu.amd.l2cachetag_sb@chip/cpu,
    error.cpu.amd.l2cachetag_mb@chip/cpu;

/* 								#L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 *
 * Note that the bu.l2t_par ereport could be due to a single-bit or multi-bit
 * event.  If the l2t_sb SERD engine has already tripped it will be treated
 * as another CE; otherwise it will be treated as a UE event.
 */

event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/cpu;
event ereport.cpu.amd.l2t_sb_trip@chip/cpu;

engine serd.cpu.amd.l2t_sb@chip/cpu,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.l2t_sb_trip@chip/cpu;

event upset.cpu.amd.l2t_sb@chip/cpu,
	engine=serd.cpu.amd.l2t_sb@chip/cpu;

prop upset.cpu.amd.l2t_sb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop error.cpu.amd.l2cachetag_sb@chip/cpu (1)->
    ereport.cpu.amd.l2t_sb_trip@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* 								#L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/cpu;

prop error.cpu.amd.l2cachetag_mb@chip/cpu (1)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* 								#ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define ICACHEDATA_FIT		1000
#define ICACHEDATA_SB_COUNT	2
#define ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_dp_trip@chip/cpu;

event fault.cpu.amd.icachedata@chip/cpu, FITrate=ICACHEDATA_FIT,
	FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachedata@chip/cpu,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_dp_trip@chip/cpu;

event upset.cpu.amd.icachedata@chip/cpu,
	engine=serd.cpu.amd.icachedata@chip/cpu;

prop upset.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (1)->
    ereport.cpu.amd.ic_dp_trip@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

/* 								#ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define ICACHETAG_FIT		1000
#define ICACHETAG_SB_COUNT	2
#define ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_tp_trip@chip/cpu;

event fault.cpu.amd.icachetag@chip/cpu, FITrate=ICACHETAG_FIT,
	FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.icachetag@chip/cpu,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_tp_trip@chip/cpu;

event upset.cpu.amd.icachetag@chip/cpu,
	engine=serd.cpu.amd.icachetag@chip/cpu;

prop upset.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (1)->
    ereport.cpu.amd.ic_tp_trip@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (0)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

/* 								#ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */

#define ICACHESTAG_FIT		1000

event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.icachestag@chip/cpu, FITrate=ICACHESTAG_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.icachestag@chip/cpu (1)->
    ereport.cpu.amd.ic.stag_par@chip/cpu;

/* 								#ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define ICACHEL1TLB_FIT		1000
#define ICACHEL1TLB_SB_COUNT	2
#define ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event fault.cpu.amd.l1itlb@chip/cpu, FITrate=ICACHEL1TLB_FIT,
	FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l1itlb@chip/cpu,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

event upset.cpu.amd.l1itlb@chip/cpu,
	engine=serd.cpu.amd.l1itlb@chip/cpu;

prop upset.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l1tlb_trip@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

/* 								#ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define ICACHEL2TLB_FIT		1000
#define ICACHEL2TLB_SB_COUNT	2
#define ICACHEL2TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
event ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event fault.cpu.amd.l2itlb@chip/cpu, FITrate=ICACHEL2TLB_FIT,
	FRU=chip, ASRU=chip/cpu;

engine serd.cpu.amd.l2itlb@chip/cpu,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

event upset.cpu.amd.l2itlb@chip/cpu,
	engine=serd.cpu.amd.l2itlb@chip/cpu;

prop upset.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (1)->
    ereport.cpu.amd.ic_l2tlb_trip@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

/*								#DCD#
 * dcache data errors
 */

#define DCACHEDATA_FIT		1000
#define DCACHEDATA_SB_COUNT	2
#define DCACHEDATA_SB_TIME	168h

event fault.cpu.amd.dcachedata@chip/cpu, FITrate=DCACHEDATA_FIT,
	FRU=chip, ASRU=chip/cpu;
event error.cpu.amd.dcachedata_sb@chip/cpu;
event error.cpu.amd.dcachedata_mb@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (1)->
    error.cpu.amd.dcachedata_sb@chip/cpu,
    error.cpu.amd.dcachedata_mb@chip/cpu;

/* 								#DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic.
 */

event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
event ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb@chip/cpu,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

engine serd.cpu.amd.dc_sb_uc@chip/cpu,
    N=0, T=1hr, method=persistent,
    trip=ereport.cpu.amd.dc_sb_trip@chip/cpu;

event upset.cpu.amd.dc_sb@chip/cpu,
	engine=serd.cpu.amd.dc_sb@chip/cpu;

event upset.cpu.amd.dc_sb_uc@chip/cpu,
	engine=serd.cpu.amd.dc_sb_uc@chip/cpu;

prop upset.cpu.amd.dc_sb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu;

prop upset.cpu.amd.dc_sb_uc@chip/cpu (1)->
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

prop error.cpu.amd.dcachedata_sb@chip/cpu (1)->
    ereport.cpu.amd.dc_sb_trip@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

/* 								#DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop error.cpu.amd.dcachedata_mb@chip/cpu (1)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu;

/* 								#DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */

#define DCACHETAG_FIT		1000

event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachetag@chip/cpu, FITrate=DCACHETAG_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachetag@chip/cpu (1)->
    ereport.cpu.amd.dc.tag_par@chip/cpu;

/* 								#DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */

#define DCACHESTAG_FIT		1000

event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};

event fault.cpu.amd.dcachestag@chip/cpu, FITrate=DCACHESTAG_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.dcachestag@chip/cpu (1)->
    ereport.cpu.amd.dc.stag_par@chip/cpu;

/* 								#DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */

#define L1DTLB_FIT		1000

event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l1dtlb@chip/cpu, FITrate=L1DTLB_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;

/* 								#DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */

#define L2DTLB_FIT		1000

event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};

event fault.cpu.amd.l2dtlb@chip/cpu, FITrate=L2DTLB_FIT,
	FRU=chip, ASRU=chip/cpu;

prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;

/*								#DPATH_SB#
 * Datapath errors between NB/MC and core.
 */

#define	CPU_DP_FIT		1000

event fault.cpu.amd.datapath@chip/cpu, FITrate=CPU_DP_FIT, FRU=chip,
	ASRU=chip/cpu;
event error.cpu.amd.datapath_sb@chip/cpu;
event error.cpu.amd.datapath_mb@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (1)->
    error.cpu.amd.datapath_sb@chip/cpu,
    error.cpu.amd.datapath_mb@chip/cpu;

/*
 * A single bit fault in the datapath between the NB and requesting core
 * can cause:
 *
 *  - inf_sys_ecc1 : reported by ic on access from a local cpu
 *  - inf_sys_ecc1 : reported by dc on access from a local cpu
 *  - s_ecc1 : reported by bu on access from a local cpu (hw prefetch etc)
 */

#define	CPU_DP_COUNT	3
#define	CPU_DP_TIME	12h

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
event upset.cpu.dp_sb@chip/cpu, engine=serd.cpu.dp_sb@chip/cpu;
event ereport.cpu.amd.dp_sb_trip@chip/cpu;

engine serd.cpu.dp_sb@chip/cpu, N=CPU_DP_COUNT, T=CPU_DP_TIME,
    method=persistent, trip=ereport.cpu.amd.dp_sb_trip@chip/cpu;

prop upset.cpu.dp_sb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu;

prop error.cpu.amd.datapath_sb@chip/cpu (1)->
    ereport.cpu.amd.dp_sb_trip@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu;

/*								#DPATH_MB#
 * A multi-bit fault in the datapath between the NB and requesting core
 * can cause:
 *
 *  - inf_sys_eccm : reported by ic on access from a local cpu
 *  - inf_sys_eccm : reported by dc on access from a local cpu
 *  - s_eccm : reported by bu on access from a local cpu (hw prefetch etc)
 */

event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu;
event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu;
event ereport.cpu.amd.bu.s_eccm@chip/cpu;

prop error.cpu.amd.datapath_mb@chip/cpu (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu;

prop fault.cpu.amd.datapath@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu;

/*
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 *	- the corresponding detector is not enabled, typically because
 *	  detection/handling of the event is taking place elsewhere
 *	  (nb.ma, nb.ta, ls.rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *	- the event is associated with a sync flood, so even if the detector
 *	  is enabled we will never handle the event and generate an ereport;
 *	  and even if the ereport did arrive we could perform no useful
 *	  diagnosis, e.g., the NB can be configured for sync flood on
 *	  nb.mem_eccm, but we don't choose to discard that ereport here
 *	  since we could have made a useful diagnosis from it had it been
 *	  delivered (nb.ht_sync, nb.ht_crc)
 *	- events that will be accompanied by an immediate panic and
 *	  delivery of the ereport during subsequent reboot, but from
 *	  which no useful diagnosis can be made (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in
 * order to avoid "no subscription" complaints during test harness runs.
 */

event ereport.cpu.amd.nb.ma@cpu;
event ereport.cpu.amd.nb.ta@cpu;
event ereport.cpu.amd.ls.s_rde@cpu;
event ereport.cpu.amd.ic.rdde@cpu;
event ereport.cpu.amd.bu.s_rde@cpu;
event ereport.cpu.amd.nb.gart_walk@cpu;
event ereport.cpu.amd.nb.ht_sync@cpu;
event ereport.cpu.amd.nb.ht_crc@cpu;
event ereport.cpu.amd.nb.rmw@cpu;
event ereport.cpu.amd.nb.wdog@cpu;
event ereport.cpu.amd.unknown@cpu;

event upset.null_diag@cpu;

prop upset.null_diag@cpu (1)->
    ereport.cpu.amd.nb.ma@cpu,
    ereport.cpu.amd.nb.ta@cpu,
    ereport.cpu.amd.ls.s_rde@cpu,
    ereport.cpu.amd.ic.rdde@cpu,
    ereport.cpu.amd.bu.s_rde@cpu,
    ereport.cpu.amd.nb.gart_walk@cpu,
    ereport.cpu.amd.nb.ht_sync@cpu,
    ereport.cpu.amd.nb.ht_crc@cpu,
    ereport.cpu.amd.nb.rmw@cpu,
    ereport.cpu.amd.nb.wdog@cpu,
    ereport.cpu.amd.unknown@cpu;