/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#pragma dictionary "AMD"

/*
 * Eversholt rules for the AMD Opteron CPU/Memory
 */

#define	MAX(x, y) ((x) >= (y) ? (x) : (y))
#define	MIN(x, y) ((x) <= (y) ? (x) : (y))

/*
 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
 * we diagnose for page faults, to record the physical address of the faulting
 * page.
 */
#define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))

#define	SET_OFFSET (setpayloadprop("asru-offset", \
	payloadprop("resource[0].hc-specific.offset")))
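
/*
 * Both macros are referenced from the page-fault propagations later in this
 * file (as "... && SET_ADDR && SET_OFFSET"), so the faulting page's physical
 * address and offset are carried in the diagnosed fault.memory.page_* event
 * for use by the page-retirement response agent.
 */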

/*
 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
 */
#define	RESOURCE_EXISTS	(payloadprop_defined("resource"))

/*
 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
 * ereports) exists and one of its members matches the path for the
 * rank node.  Our memory propagations are of the form
 *
 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/cpu"
 *
 * since cpus detect memory errors;  in eversholt such a propagation, where
 * the lhs path and rhs path do not match, expands to the cross-product of
 * all dimms, ranks and cpus on the same chip (since chip appears in the
 * path on both sides).  We use CONTAINS_RANK to constrain the propagation
 * such that it only happens if the payload resource matches the rank.
 */
#define	CONTAINS_RANK (payloadprop_contains("resource", \
	asru(chip/memory-controller/dimm/rank)) \
	|| payloadprop_contains("resource", \
	asru(chip/memory-controller/dimm)))
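
/*
 * For example (illustrative configuration only): on a chip with two dimms
 * of two ranks each and two cpus, the unconstrained propagation above would
 * expand to all 4 rank x 2 cpu combinations; the CONTAINS_RANK payload check
 * keeps only the rank (or dimm) actually named in the ereport "resource"
 * member.
 */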

/*
 * The following will tell us whether a syndrome that is known to be
 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
 * correctable ChipKill syndrome the number of bits set in the lowest
 * nibble indicates how many bits were in error.
 */

#define	CBITMASK(synd) ((synd) & 0xf)

#define	CKSINGLE(synd)							\
	((synd) == 0 ||							\
	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 ||		\
	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))

#define	SINGLE_BIT_CE							\
	(payloadprop("syndrome-type") == "E" ||				\
	(payloadprop("syndrome-type") == "C" &&				\
	CKSINGLE(payloadprop("syndrome"))))

#define	MULTI_BIT_CE							\
	(payloadprop("syndrome-type") == "C" &&				\
	!CKSINGLE(payloadprop("syndrome")))
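
/*
 * Worked example (hypothetical syndrome values): a mem_ce with
 * syndrome-type "C" and a syndrome whose low nibble is 0x4 has exactly one
 * bit set in that nibble, so one bit was in error and SINGLE_BIT_CE holds;
 * a low nibble of 0x3 has two bits set, so MULTI_BIT_CE holds instead.
 * Syndrome-type "E" correctables (normal 64/8 ECC) are treated as
 * single-bit by SINGLE_BIT_CE.
 */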

/*								#PAGE#
 *								#DIMM_SCU#
 * A single bit fault in a memory rank can cause:
 *
 *  - mem_ce : reported by nb
 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
 *    ic do not record a syndrome; these errors will not be triggered in
 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
 *    ECC mode (the NB corrects all ECC errors in that mode)
 *
 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
 * trips we diagnose a fault.memory.page_sb so that the response agent can
 * retire the page that caused the trip.  If the total number of pages
 * faulted in this way on a single rank exceeds a threshold we will
 * diagnose a fault.memory.dimm_sb against the containing dimm.
 *
 * Multi-bit ChipKill-correctable errors are treated identically to
 * single-bit errors, but via separate SERD engines to allow distinct
 * parameters if desired.
 *
 * Uncorrectable errors produce an immediate page fault and corresponding
 * fault.memory.dimm_ue.
 *
 * Page faults are essentially internal - action is only required when
 * they are accompanied by a dimm fault.  As such we include message=0
 * on page faults.
 */

event ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.nb.mem_ce@chip/cpu{within(5s)};

/*
 * Single-bit correctable errors feed into per-rank
 * SERD engines which diagnose fault.memory.page_sb if they trip.
 *
 * Multi-bit correctable (via ChipKill) errors feed
 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
 * if they trip.
 *
 * The number of fault.memory.page_sb and fault.memory.page_ck faults
 * diagnosed is counted in stat engines for each type.  These are used in
 * deciding whether to declare a dimm faulty after repeated page faults.
 */

#define PAGE_SB_COUNT		2
#define PAGE_SB_TIME		72h
#define	PAGE_CK_COUNT		2
#define	PAGE_CK_TIME		72h
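
/*
 * For each engine below, N and T are its SERD parameters: the engine trips
 * only when enough qualifying ereports (governed by N) are recorded within
 * the rolling interval T - here, repeated correctable errors against the
 * same rank inside a 72 hour window - rather than on a first isolated event.
 */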

engine stat.sbpgflt@chip/memory-controller/dimm/rank;
engine stat.ckpgflt@chip/memory-controller/dimm/rank;
engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0,
    count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;
event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0,
    count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0,
    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;
event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;
event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;

/*
 * The fraction of pages on a single rank that must be diagnosed as faulty
 * with single-correctable-unit faults before we will fault the rank.
 * Once we have faulted the rank we will continue to diagnose any further page
 * faults on the rank up to some maximum multiple of the threshold at which
 * we faulted the dimm.  This allows us to potentially contain some fairly
 * far-reaching but still limited-extent fault (such as a partial column
 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
 * reached, causes retirement requests to have no effect other than to fill
 * the fault manager cache and logs).
 *
 * This fraction is specified in basis points, where 100 basis points are
 * equivalent to 1 percent.  It is applied on a per-rank basis.
 *
 * The system imposes an absolute maximum on the number of pages it will
 * retire;  the current value is 10 basis points, or 0.1% of 'physmem'.  Note
 * that 'physmem' is reduced from installed memory pages by an amount
 * reflecting permanent kernel memory allocations.  This system page retire
 * limit bounds the maximum real response to page faults across all ranks
 * that fault manager response agents can effect, but it should not be confused
 * with any diagnosis threshold (i.e., the number of faulty pages we are
 * prepared to tolerate from a single rank before faulting the rank is
 * distinct from the total number of pages we are prepared to retire from use
 * in response to that and other faults).  It is, however, desirable to
 * arrange that the maximum number of pages we are prepared to fault from
 * any one rank is less than the system-wide quota.
 */
#define	PAGE_RETIRE_LIMIT_BPS	5		/* or 0.05%; ~ 131 pages/GB */

/*
 * A macro to manipulate the above fraction.  Given a size in bytes convert
 * this to pages (4K pagesize) and calculate the number of those pages
 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
 */
#define	_BPS_PGCNT(totalbytes) \
	((((totalbytes) / 4096) * PAGE_RETIRE_LIMIT_BPS) / 10000)

/*
 * The threshold number of single-correctable-unit page faults on a rank at
 * which we fault the rank.  We insist that this be at least 128 and
 * never more than 512.
 */
#define	RANK_THRESH MIN(512, MAX(128, \
	_BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size"))))
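
/*
 * For illustration, assuming the 4K page size used above: a 1GB rank has
 * 262144 pages, and 5 basis points of that is 131 pages, so RANK_THRESH
 * works out to MIN(512, MAX(128, 131)) = 131 and RANK_PGFLT_MAX to 262.
 * Very small ranks are clamped up to 128 and very large ranks down to 512.
 */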

/*
 * The maximum number of single-correctable-unit page faults we will diagnose
 * on a single rank (must be greater than RANK_THRESH).  We set
 * this at twice the rank fault threshold.
 */
#define	RANK_PGFLT_MAX (2 * RANK_THRESH)

#define	SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))
#define	CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))

/*
 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
 * page faults (diagnosed from repeated single-bit or multi-bit ChipKill
 * errors) from any one rank on that DIMM reaches a threshold.  A "correctable
 * unit" is a single bit in normal 64/8 ECC mode, or a single symbol in
 * ChipKill 128/16 mode (i.e., a nibble-aligned nibble for the code used on
 * Opteron).
 *
 * We do not stop diagnosing further single-bit page faults once we have
 * declared a single-bit DIMM fault - we continue diagnosing them and
 * response agents can continue to retire those pages up to the system-imposed
 * retirement limit.
 *
 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
 * fault.memory.dimm_ck.  Which one is diagnosed depends on whether we
 * have reached the threshold for a majority of single-bit page faults or
 * multi-bit page faults.
 *
 * Implementation: we maintain SERD engines parallel to the page_sb and
 * page_ck engines, which trip in unison.  On trip a distinct ereport is
 * generated, which we diagnose to a fault if the threshold has been reached.
 */
prop fault.memory.page_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SINGLE_BIT_CE &&
      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu,
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

prop fault.memory.page_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && !SINGLE_BIT_CE &&
      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu,
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
      SB_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu,
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
      CK_PGFLTS > RANK_THRESH / 2 } (1)->
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

/*
 * If the address is not valid then no resource member will be included
 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare, and
 * we will discard such ereports.  We will also discard all inf_sys_ecc1
 * events detected at the ic, since they record no syndrome and therefore
 * no resource information.  An alternative may be to SERD these discards
 * on a per-MC basis and trip if we see too many such events.
 */
event upset.memory.discard1@chip/cpu;
prop upset.memory.discard1@chip/cpu
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_ecc1@chip/cpu,
    ereport.cpu.amd.bu.s_ecc1@chip/cpu,
    ereport.cpu.amd.nb.mem_ce@chip/cpu;

/* 								#DIMM_UE#
 *								#PAGE_UE#
 * An uncorrectable multi-bit fault in a memory dimm can cause:
 *
 *  - mem_ue	   : reported by nb for an access from a remote cpu
 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
 *  - s_eccm	   : reported by bu
 *
 * Since on production systems we force HT Sync Flood on uncorrectable
 * memory errors (if not already set as such by the BIOS, as it should be)
 * we won't actually receive these ereports since the system will be reset.
 */

event ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.s_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.nb.mem_ue@chip/cpu{within(5s)};

event fault.memory.dimm_ue@chip/memory-controller/dimm/rank;
event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0,
    response=0;

prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu,
    ereport.cpu.amd.nb.mem_ue@chip/cpu;

prop fault.memory.page_ue@chip/memory-controller/dimm/rank
    { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu,
    ereport.cpu.amd.nb.mem_ue@chip/cpu;

event upset.memory.discard3@chip/cpu;
prop upset.memory.discard3@chip/cpu
    { !RESOURCE_EXISTS } (1)->
    ereport.cpu.amd.ic.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_sys_eccm@chip/cpu,
    ereport.cpu.amd.bu.s_eccm@chip/cpu,
    ereport.cpu.amd.nb.mem_ue@chip/cpu;

/*								#CSTESTFAIL#
 * If the BIOS fails a chip-select during POST, or perhaps after a
 * sync flood from an uncorrectable error, then on revision F and G it
 * should mark that chip-select as TestFail in the CS Base register.
 * When the memory-controller driver discovers all the MC configuration
 * it notes such failed chip-selects and creates topology nodes for the
 * chip-select and associated dimms and ranks, and produces an ereport for each
 * failed chip-select with detector set to the memory-controller node
 * and resource indicating the failed chip-select.
 */

event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)};
event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank;
event error.memory.cs_testfail@chip/memory-controller/chip-select;

#define	CONTAINS_CS (payloadprop_contains("resource", \
	asru(chip/memory-controller/chip-select)))

prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)->
    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
    { CONTAINS_CS };

#define CSMATCH(s) \
	(confprop_defined(chip/memory-controller/chip-select, s) && \
	confprop(chip/memory-controller/chip-select, s) == \
	confprop(chip/memory-controller/dimm/rank, "csname"))

prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (1)->
    error.memory.cs_testfail@chip/memory-controller/chip-select
    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname") };

/*								#ADDRPAR#
 * DRAM Command/Address Parity Errors.
 *
 *  - dramaddr_par : reported by the nb; the NB status register includes
 *    a bit indicating which dram controller channel (A or B) experienced
 *    the error.
 */

event ereport.cpu.amd.nb.dramaddr_par@chip/cpu{within(5s)};
event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel, response=0;

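/*
 * The constraint below decodes that channel bit: (IA32_MCi_STATUS >> 32) &
 * 0x200 tests bit 41 of the NB status value carried in the ereport payload.
 * If set, the error is attributed to dram-channel instance 1 (presumably
 * channel B), otherwise to instance 0 (channel A), via the match against
 * the iterator y.
 */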
prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
    ereport.cpu.amd.nb.dramaddr_par@chip/cpu {
    ((payloadprop("IA32_MCi_STATUS") >> 32 & 0x200) ? 1 : 0) == y };

/* 								#L2D_SINGLE#
 * A single bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_ecc1 : reported by ic on this cpu
 *  - inf_l2_ecc1 : reported by dc on this cpu
 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
 */

#define L2CACHEDATA_SB_COUNT	3
#define L2CACHEDATA_SB_TIME	12h

event ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_ecc1@chip/cpu{within(5s)};
engine serd.cpu.amd.l2d_sb@chip/cpu,
    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME;
event fault.cpu.amd.l2cachedata@chip/cpu, engine=serd.cpu.amd.l2d_sb@chip/cpu;

prop fault.cpu.amd.l2cachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2d_ecc1@chip/cpu;

/* 								#L2D_MULTI#
 * A multi-bit data array fault in an l2 cache can cause:
 *
 *  - inf_l2_eccm : reported by ic on this cpu
 *  - inf_l2_eccm : reported by dc on this cpu
 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
 */

event ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2d_eccm@chip/cpu{within(5s)};

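/*
 * Multi-bit errors feed the same SERD engine as the single-bit errors, but
 * setserdincrement() bumps the engine by one more than its N threshold, so
 * a single multi-bit event is enough to trip it and diagnose
 * fault.cpu.amd.l2cachedata immediately.  The same pattern is used for the
 * l2 cache tag and D cache data cases below.
 */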
prop fault.cpu.amd.l2cachedata@chip/cpu
    { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.ic.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.dc.inf_l2_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2d_eccm@chip/cpu;

/* 								#L2T_SINGLE#
 * A single bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

#define L2CACHETAG_SB_COUNT	3
#define L2CACHETAG_SB_TIME	12h

event ereport.cpu.amd.bu.l2t_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.bu.l2t_par@chip/cpu{within(5s)};
engine serd.cpu.amd.l2t_sb@chip/cpu,
    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;
event fault.cpu.amd.l2cachetag@chip/cpu, engine=serd.cpu.amd.l2t_sb@chip/cpu;

prop fault.cpu.amd.l2cachetag@chip/cpu (0)->
    ereport.cpu.amd.bu.l2t_ecc1@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* 								#L2T_MULTI#
 * A multi-bit tag array fault in an l2 cache can cause:
 *
 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
 */

event ereport.cpu.amd.bu.l2t_eccm@chip/cpu{within(5s)};

prop fault.cpu.amd.l2cachetag@chip/cpu
    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.bu.l2t_eccm@chip/cpu,
    ereport.cpu.amd.bu.l2t_par@chip/cpu;

/* 								#ICD_PAR#
 * A data array parity fault in an I cache can cause:
 *
 *  - data_par : reported by ic on this cpu
 */

#define ICACHEDATA_SB_COUNT	2
#define ICACHEDATA_SB_TIME	168h

event ereport.cpu.amd.ic.data_par@chip/cpu{within(5s)};
engine serd.cpu.amd.icachedata@chip/cpu,
    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;
event fault.cpu.amd.icachedata@chip/cpu,
    engine=serd.cpu.amd.icachedata@chip/cpu;

prop fault.cpu.amd.icachedata@chip/cpu (0)->
    ereport.cpu.amd.ic.data_par@chip/cpu;

/* 								#ICT_PAR#
 * A tag array parity fault in an I cache can cause:
 *
 *  - tag_par : reported by ic on this cpu
 */

#define ICACHETAG_SB_COUNT	2
#define ICACHETAG_SB_TIME	168h

event ereport.cpu.amd.ic.tag_par@chip/cpu{within(5s)};
engine serd.cpu.amd.icachetag@chip/cpu,
    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;
event fault.cpu.amd.icachetag@chip/cpu, engine=serd.cpu.amd.icachetag@chip/cpu;

prop fault.cpu.amd.icachetag@chip/cpu (0)->
    ereport.cpu.amd.ic.tag_par@chip/cpu;

/* 								#ICT_SNOOP#
 * A snoop tag array parity fault in an I cache can cause:
 *
 *  - stag_par : reported by ic on this cpu
 */

event ereport.cpu.amd.ic.stag_par@chip/cpu{within(5s)};
event fault.cpu.amd.icachestag@chip/cpu;

prop fault.cpu.amd.icachestag@chip/cpu (1)->
    ereport.cpu.amd.ic.stag_par@chip/cpu;

/* 								#ICTLB_1#
 * An l1tlb parity fault in an I cache can cause:
 *
 *  - l1tlb_par : reported by ic on this cpu
 */

#define ICACHEL1TLB_SB_COUNT	2
#define ICACHEL1TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l1tlb_par@chip/cpu{within(5s)};
engine serd.cpu.amd.l1itlb@chip/cpu,
    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;
event fault.cpu.amd.l1itlb@chip/cpu, engine=serd.cpu.amd.l1itlb@chip/cpu;

prop fault.cpu.amd.l1itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l1tlb_par@chip/cpu;

/* 								#ICTLB_2#
 * An l2tlb parity fault in an I cache can cause:
 *
 *  - l2tlb_par : reported by ic on this cpu
 */

#define ICACHEL2TLB_SB_COUNT	2
#define ICACHEL2TLB_SB_TIME	168h

event ereport.cpu.amd.ic.l2tlb_par@chip/cpu{within(5s)};
engine serd.cpu.amd.l2itlb@chip/cpu,
    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME;
event fault.cpu.amd.l2itlb@chip/cpu, engine=serd.cpu.amd.l2itlb@chip/cpu;

prop fault.cpu.amd.l2itlb@chip/cpu (0)->
    ereport.cpu.amd.ic.l2tlb_par@chip/cpu;

/* 								#DCD_SINGLE#
 * A single bit data array fault in a D cache can cause:
 *
 *  - data_ecc1 : reported by dc on this cpu by scrubber
 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
 *
 * Make data_ecc1_uc fault immediately as it may have caused a panic, so
 * it is handled by the multi-bit case in the following section.
 */

#define DCACHEDATA_SB_COUNT	2
#define DCACHEDATA_SB_TIME	168h

event ereport.cpu.amd.dc.data_ecc1@chip/cpu{within(5s)};
event ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu{within(5s)};
engine serd.cpu.amd.dc_sb@chip/cpu,
    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME;
event fault.cpu.amd.dcachedata@chip/cpu, engine=serd.cpu.amd.dc_sb@chip/cpu;

prop fault.cpu.amd.dcachedata@chip/cpu (0)->
    ereport.cpu.amd.dc.data_ecc1@chip/cpu;

/* 								#DCD_MULTI#
 * A multi-bit data array fault in a D cache can cause:
 *
 *  - data_eccm : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.data_eccm@chip/cpu{within(5s)};

prop fault.cpu.amd.dcachedata@chip/cpu
    { setserdincrement(DCACHEDATA_SB_COUNT + 1) } (0)->
    ereport.cpu.amd.dc.data_eccm@chip/cpu,
    ereport.cpu.amd.dc.data_ecc1_uc@chip/cpu;

/* 								#DCT_PAR#
 * A tag array parity fault in a D cache can cause:
 *
 *  - tag_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.tag_par@chip/cpu{within(5s)};
event fault.cpu.amd.dcachetag@chip/cpu;

prop fault.cpu.amd.dcachetag@chip/cpu (1)->
    ereport.cpu.amd.dc.tag_par@chip/cpu;

/* 								#DCT_SNOOP#
 * A snoop tag array parity fault in a D cache can cause:
 *
 *  - stag_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.stag_par@chip/cpu{within(5s)};
event fault.cpu.amd.dcachestag@chip/cpu;

prop fault.cpu.amd.dcachestag@chip/cpu (1)->
    ereport.cpu.amd.dc.stag_par@chip/cpu;

/* 								#DCTLB_1#
 * An l1tlb parity fault in a D cache can cause:
 *
 *  - l1tlb_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.l1tlb_par@chip/cpu{within(5s)};
event fault.cpu.amd.l1dtlb@chip/cpu;

prop fault.cpu.amd.l1dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l1tlb_par@chip/cpu;

/* 								#DCTLB_2#
 * An l2tlb parity fault in a D cache can cause:
 *
 *  - l2tlb_par : reported by dc on this cpu
 */

event ereport.cpu.amd.dc.l2tlb_par@chip/cpu{within(5s)};
event fault.cpu.amd.l2dtlb@chip/cpu;

prop fault.cpu.amd.l2dtlb@chip/cpu (1)->
    ereport.cpu.amd.dc.l2tlb_par@chip/cpu;

/*								#MISC#
 * Ereports that should not normally happen and which we will discard
 * without diagnosis if they do.  These fall into a few categories:
 *
 *	- the corresponding detector is not enabled, typically because
 *	  detection/handling of the event is taking place elsewhere
 *	  (nb.ma, nb.ta, ls.s_rde, ic.rdde, bu.s_rde, nb.gart_walk)
 *	- the event is associated with a sync flood so even if the detector is
 *	  enabled we will never handle the event and generate an ereport *and*
 *	  even if the ereport did arrive we could perform no useful diagnosis;
 *	  e.g., the NB can be configured for sync flood on nb.mem_eccm
 *	  but we don't choose to discard that ereport here since we could have
 *	  made a useful diagnosis from it had it been delivered
 *	  (nb.ht_sync, nb.ht_crc)
 *	- events that will be accompanied by an immediate panic and
 *	  delivery of the ereport during subsequent reboot but from
 *	  which no useful diagnosis can be made (nb.rmw, nb.wdog)
 *
 * Ereports for all of these can be generated by error simulation and
 * injection.  We will perform a null diagnosis of all these ereports in order
 * to avoid "no subscription" complaints during test harness runs.
 */

event ereport.cpu.amd.nb.ma@cpu{within(5s)};
event ereport.cpu.amd.nb.ta@cpu{within(5s)};
event ereport.cpu.amd.ls.s_rde@cpu{within(5s)};
event ereport.cpu.amd.ic.rdde@cpu{within(5s)};
event ereport.cpu.amd.bu.s_rde@cpu{within(5s)};
event ereport.cpu.amd.nb.gart_walk@cpu{within(5s)};
event ereport.cpu.amd.nb.ht_sync@cpu{within(5s)};
event ereport.cpu.amd.nb.ht_crc@cpu{within(5s)};
event ereport.cpu.amd.nb.rmw@cpu{within(5s)};
event ereport.cpu.amd.nb.wdog@cpu{within(5s)};
event ereport.cpu.amd.unknown@cpu{within(5s)};

event upset.null_diag@cpu;

prop upset.null_diag@cpu (1)->
    ereport.cpu.amd.nb.ma@cpu,
    ereport.cpu.amd.nb.ta@cpu,
    ereport.cpu.amd.ls.s_rde@cpu,
    ereport.cpu.amd.ic.rdde@cpu,
    ereport.cpu.amd.bu.s_rde@cpu,
    ereport.cpu.amd.nb.gart_walk@cpu,
    ereport.cpu.amd.nb.ht_sync@cpu,
    ereport.cpu.amd.nb.ht_crc@cpu,
    ereport.cpu.amd.nb.rmw@cpu,
    ereport.cpu.amd.nb.wdog@cpu,
    ereport.cpu.amd.unknown@cpu;