1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma dictionary "AMD"
28
29/*
30 * Eversholt rules for the AMD Opteron CPU/Memory
31 */
32
/*
 * Two-argument clamp helpers.  Each evaluates to its first argument when
 * the arguments are equal.
 */
#define	MAX(x, y) ((x) < (y) ? (y) : (x))
#define	MIN(x, y) ((y) < (x) ? (y) : (x))
35
36/*
37 * SET_ADDR and SET_OFFSET are used to set a payload value in the fault that
38 * we diagnose for page faults, to record the physical address of the faulting
39 * page.
40 */
41#define	SET_ADDR (setpayloadprop("asru-physaddr", payloadprop("IA32_MCi_ADDR")))
42
43#define	SET_OFFSET (setpayloadprop("asru-offset", \
44	payloadprop("resource[0].hc-specific.offset")))
45
46/*
47 * RESOURCE_EXISTS is true if a member with name "resource" exists in the
48 * payload - regardless of type (e.g., nvlist or nvlist array) or value.
49 */
50#define	RESOURCE_EXISTS	(payloadprop_defined("resource"))
51
52/*
53 * CONTAINS_RANK is true if the "resource" nvlist array (as used in memory
54 * ereports) exists and one if its members matches the path for the
55 * rank node.  Our memory propogation are of the form
56 *
57 * "prop foo@chip/memory-controller/dimm/rank -> blah@chip/core/strand"
58 *
59 * since cpus detect memory errors;  in eversholt such a propogation, where
60 * the lhs path and rhs path do not match, expands to the cross-product of
61 * all dimms, ranks and cpus on the same chip (since chip appears in the
62 * path on both sides).  We use CONTAINS_RANK to constrain the propogation
63 * such that it only happens if the payload resource matches the rank.
64 */
65#define	CONTAINS_RANK (payloadprop_contains("resource", \
66	asru(chip/memory-controller/dimm/rank)) \
67	|| payloadprop_contains("resource", \
68	asru(chip/memory-controller/dimm)))
69
70/*
71 * The following will tell us whether a syndrome that is known to be
72 * correctable (from a mem_ce ereport) is single-bit or multi-bit.  For a
73 * correctable ChipKill syndrome the number of bits set in the lowest
74 * nibble indicates how many bits were in error.
75 */
76
77#define	CBITMASK(synd) ((synd) & 0xf)
78
79#define	CKSINGLE(synd)							\
80	((synd) == 0 ||							\
81	(CBITMASK(synd) == 0x1 || CBITMASK(synd) == 0x2 ||		\
82	CBITMASK(synd) == 0x4 || CBITMASK(synd) == 0x8))
83
84#define	SINGLE_BIT_CE							\
85	(payloadprop("syndrome-type") == "E" ||				\
86	(payloadprop("syndrome-type") == "C" &&				\
87	CKSINGLE(payloadprop("syndrome"))))
88
89#define	MULTI_BIT_CE							\
90	(payloadprop("syndrome-type") == "C" &&				\
91	!CKSINGLE(payloadprop("syndrome")))
92
93/*								#PAGE#
94 *								#DIMM_SCU#
95 * A single bit fault in a memory rank can cause:
96 *
97 *  - mem_ce : reported by nb
98 *  - inf_sys_ecc1: reported by ic or dc; inf_sys_ecc1 errors detected at the
99 *    ic do not record a syndrome; these errors will not be triggered in
100 *    ChipKill ECC mode (the NB corrects all ECC errors in that mode)
101 *  - s_ecc1: reported by bu; this error will not be triggered in ChipKill
102 *    ECC mode (the NB corrects all ECC in that mode)
103 *
104 * Single-bit errors are fed into a per-rank SERD engine; if a SERD engine
105 * trips we diagnose a fault.memory.page so that the response agent can
106 * retire the page that caused the trip.  If the total number of pages
107 * faulted in this way on a single rank exceeds a threshold we will
108 * diagnose a fault.memory.dimm_sb against the containing dimm.
109 *
110 * Multibit ChipKill-correctable errors are treated identically to
111 * single-bit errors, but via separate serd engines to allow distinct
112 * parameters if desired.
113 *
114 * Uncorrectable errors produce an immediate page fault and corresponding
115 * fault.memory.dimm_ue.
116 *
117 * Page faults are essentially internal - action is only required when
118 * they are accompanied by a dimm fault.  As such we include message=0
119 * on page faults.
120 */
121
122event ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand{within(5s)};
123event ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand{within(5s)};
124event ereport.cpu.amd.bu.s_ecc1@chip/core/strand{within(5s)};
125event ereport.cpu.amd.nb.mem_ce@chip/core/strand{within(5s)};
126
127/*
128 * Single-bit correctable errors feed into per-rank
129 * SERD engines which diagnose fault.memory.page_sb if they trip.
130 *
131 * Multi-bit correctable (via ChipKill) errors feed
132 * into additional per-rank SERD engines which diagnose fault.memory.page_ck
133 * if they trip.
134 *
135 * The number of fault.memory.page and fault.memory.page_ck diagnosed is
136 * counted in stat engines for each type.  These are used in deciding
137 * whether to declare a dimm faulty after repeated page faults.
138 */
139
140#define PAGE_SB_COUNT		2
141#define PAGE_SB_TIME		72h
142#define	PAGE_CK_COUNT		2
143#define	PAGE_CK_TIME		72h
144
145engine stat.sbpgflt@chip/memory-controller/dimm/rank;
146engine stat.ckpgflt@chip/memory-controller/dimm/rank;
147engine serd.memory.page_sb@chip/memory-controller/dimm/rank,
148    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
149engine serd.memory.page_ck@chip/memory-controller/dimm/rank,
150    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
151engine serd.memory.dimm_sb@chip/memory-controller/dimm/rank,
152    N=PAGE_SB_COUNT, T=PAGE_SB_TIME;
153engine serd.memory.dimm_ck@chip/memory-controller/dimm/rank,
154    N=PAGE_CK_COUNT, T=PAGE_CK_TIME;
155event fault.memory.page_sb@chip/memory-controller/dimm/rank, message=0,
156    count=stat.sbpgflt@chip/memory-controller/dimm/rank, response=0,
157    engine=serd.memory.page_sb@chip/memory-controller/dimm/rank;
158event fault.memory.page_ck@chip/memory-controller/dimm/rank, message=0,
159    count=stat.ckpgflt@chip/memory-controller/dimm/rank, response=0,
160    engine=serd.memory.page_ck@chip/memory-controller/dimm/rank;
161event fault.memory.dimm_sb@chip/memory-controller/dimm/rank,
162    engine=serd.memory.dimm_sb@chip/memory-controller/dimm/rank;
163event fault.memory.dimm_ck@chip/memory-controller/dimm/rank,
164    engine=serd.memory.dimm_ck@chip/memory-controller/dimm/rank;
165
166/*
167 * The fraction of pages on a single rank that must be diagnosed as faulty
168 * with single correctable unit faults before we will fault the rank.
169 * Once we have faulted the rank we will continue to diagnose any further page
170 * faults on the rank up to some maximum multiple of the threshold at which
171 * we faulted the dimm.  This allows us to potentially contain some fairly
172 * far-reaching but still limited-extent fault (such as a partial column
173 * failure) without getting carried away and allowing a single faulty rank to
 * use up the entire system-imposed page retirement limit (which, once
175 * reached, causes retirement request to have no effect other than to fill
176 * the fault manager cache and logs).
177 *
178 * This fraction is specified in basis points, where 100 basis points are
179 * equivalent to 1 percent.  It is applied on a per-rank basis.
180 *
181 * The system imposes an absolute maximum on the number of pages it will
182 * retire;  the current value is 10 basis points, or 0.1% of 'physmem'.  Note
183 * that 'physmem' is reduced from installed memory pages by an amount
184 * reflecting permanent kernel memory allocations.  This system page retire
185 * limit bounds the maximum real response to page faults across all ranks
186 * that fault manager response agents can effect, but it should not be confused
187 * with any diagnosis threshold (i.e., the number of faulty pages we are
188 * prepared to tolerate from a single rank before faulting the rank is
189 * distinct from the total number of pages we are prepared to retire from use
190 * in response to that and other faults).  It is, however, desirable to
191 * arrange that the maximum number of pages we are prepared to fault from
192 * any one rank is less than the system-wide quota.
193 */
#define	PAGE_RETIRE_LIMIT_BPS	5		/* or 0.05%; ~ 131 pages/GB */
195
196/*
197 * A macro to manipulate the above fraction.  Given a size in bytes convert
198 * this to pages (4K pagesize) and calculate the number of those pages
199 * indicated by PAGE_RETIRE_LIMIT_BPS basis points.
200 */
201#define	_BPS_PGCNT(totalbytes) \
202	((((totalbytes) / 4096 ) * PAGE_RETIRE_LIMIT_BPS) / 10000)
203
204/*
205 * The single-correctable-unit threshold at which number of faulted pages
206 * on a rank we we fault the rank.  We insist that this be at least 128 and
207 * never more than 512.
208 */
209#define	RANK_THRESH MIN(512, MAX(128, \
210	_BPS_PGCNT(confprop(chip/memory-controller/dimm/rank, "size"))))
211
212/*
213 * The maximum number of single-correctable-unit page faults we will diagnose
214 * on a single rank (must be greater than RANK_THRESH).  We set
215 * this at twice the rank fault threshold.
216 */
217#define	RANK_PGFLT_MAX (2 * RANK_THRESH)
218
219#define	SB_PGFLTS (count(stat.sbpgflt@chip/memory-controller/dimm/rank))
220#define	CK_PGFLTS (count(stat.ckpgflt@chip/memory-controller/dimm/rank))
221
222/*
223 * "Single-correctable-unit" DIMM faults are diagnosed when the total number of
224 * page faults (diagnosed from repeated single-bit or multibit-chipkills)
225 * from any one rank on that DIMM reaches a threshold.  A "correctable unit"
226 * is a single bit in normal 64/8 ECC mode, or a single symbol in ChipKill
227 * 128/16 mode (i.e., nibble-aligned nibble for the code used on Opteron).
228 *
229 * We do not stop diagnosing further single-bit page faults once we have
230 * declared a single-bit DIMM fault - we continue diagnosing them and
231 * response agents can continue to retire those pages up to the system-imposed
232 * retirement limit.
233 *
234 * Two distinct fault types may be diagnosed - fault.memory.dimm_sb and
235 * fault.memory.dimm_ck.  Which one is diagnosed depends on whether we
236 * have reached the threshold for a majority of single-bit page faults or
237 * multibit page faults.
238 *
239 * Implementation: we maintain parallel SERD engines to the page_sb and
240 * page_ck engines, which trip in unison.  On trip it generates a distinct
241 * ereport which we diagnose to a fault if the threshold has been reached.
242 */
243prop fault.memory.page_sb@chip/memory-controller/dimm/rank
244    { CONTAINS_RANK && SINGLE_BIT_CE &&
245      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
246    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
247    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
248    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
249
250prop fault.memory.page_ck@chip/memory-controller/dimm/rank
251    { CONTAINS_RANK && !SINGLE_BIT_CE &&
252      SB_PGFLTS + CK_PGFLTS < RANK_PGFLT_MAX && SET_ADDR && SET_OFFSET } (1)->
253    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
254    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
255    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
256
257prop fault.memory.dimm_sb@chip/memory-controller/dimm/rank
258    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
259      SB_PGFLTS > RANK_THRESH / 2 } (1)->
260    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core<>/strand<>,
261    ereport.cpu.amd.bu.s_ecc1@chip/core<>/strand<>,
262    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
263
264prop fault.memory.dimm_ck@chip/memory-controller/dimm/rank
265    { CONTAINS_RANK && SB_PGFLTS + CK_PGFLTS > RANK_THRESH &&
266      CK_PGFLTS > RANK_THRESH / 2 } (1)->
267    ereport.cpu.amd.nb.mem_ce@chip/core<>/strand<>;
268
269/*
270 * If the address is not valid then no resource member will be included
271 * in a nb.mem_ce or nb.mem_ue ereport.  These cases should be rare.
272 * We will also discard all inf_sys_ecc1 events detected at the ic since they
273 * have no syndrome and therefore no resource information.
274 * We will discard such ereports.  An alternative may be to SERD them
275 * on a per MC basis and trip if we see too many such events.
276 */
277event upset.memory.discard1@chip/core/strand;
278prop upset.memory.discard1@chip/core/strand
279    { !RESOURCE_EXISTS } (1)->
280    ereport.cpu.amd.ic.inf_sys_ecc1@chip/core/strand,
281    ereport.cpu.amd.dc.inf_sys_ecc1@chip/core/strand,
282    ereport.cpu.amd.bu.s_ecc1@chip/core/strand,
283    ereport.cpu.amd.nb.mem_ce@chip/core/strand;
284
285/* 								#DIMM_UE#
286 *								#PAGE_UE#
287 * An uncorrectable multi-bit fault in a memory dimm can cause:
288 *
289 *  - mem_ue    	   : reported by nb for an access from a remote cpu
290 *  - inf_sys_eccm : reported by ic or dc; the ic does not report a syndrome
291 *  - s_eccm	   : reported by bu
292 *
293 * Since on production systems we force HT Sync Flood on uncorrectable
294 * memory errors (if not already set as such by the BIOS, as it should be)
295 * we won't actually receive these ereports since the system will be reset.
296 */
297
298event ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand{within(5s)};
299event ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand{within(5s)};
300event ereport.cpu.amd.bu.s_eccm@chip/core/strand{within(5s)};
301event ereport.cpu.amd.nb.mem_ue@chip/core/strand{within(5s)};
302
303event fault.memory.dimm_ue@chip/memory-controller/dimm/rank;
304event fault.memory.page_ue@chip/memory-controller/dimm/rank, message=0,
305    response=0;
306
307prop fault.memory.dimm_ue@chip/memory-controller/dimm/rank
308    { CONTAINS_RANK } (1)->
309    ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
310    ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
311    ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
312    ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;
313
314prop fault.memory.page_ue@chip/memory-controller/dimm/rank
315    { CONTAINS_RANK && SET_ADDR && SET_OFFSET } (1)->
316    ereport.cpu.amd.ic.inf_sys_eccm@chip/core<>/strand<>,
317    ereport.cpu.amd.dc.inf_sys_eccm@chip/core<>/strand<>,
318    ereport.cpu.amd.bu.s_eccm@chip/core<>/strand<>,
319    ereport.cpu.amd.nb.mem_ue@chip/core<>/strand<>;
320
321event upset.memory.discard3@chip/core/strand;
322prop upset.memory.discard3@chip/core/strand
323    { !RESOURCE_EXISTS } (1)->
324    ereport.cpu.amd.ic.inf_sys_eccm@chip/core/strand,
325    ereport.cpu.amd.dc.inf_sys_eccm@chip/core/strand,
326    ereport.cpu.amd.bu.s_eccm@chip/core/strand,
327    ereport.cpu.amd.nb.mem_ue@chip/core/strand;
328
329/*								#CSTESTFAIL#
330 * If the BIOS fails a chip-select during POST, or perhaps after a
331 * sync flood from an uncorrectable error, then on revision F and G it
332 * should mark that chip-select as TestFail in the CS Base register.
333 * When the memory-controller driver discovers all the MC configuration
334 * it notes such failed chip-selects and creates topology nodes for the
335 * chip-select and associated dimms and ranks, and produces an ereport for each
336 * failed chip-select with detector set to the memory-controller node
337 * and resource indicating the failed chip-select.
338 */
339
340event ereport.cpu.amd.mc.cs_testfail@chip/memory-controller{within(5s)};
341event fault.memory.dimm_testfail@chip/memory-controller/dimm/rank;
342event error.memory.cs_testfail@chip/memory-controller/chip-select;
343
344#define	CONTAINS_CS (payloadprop_contains("resource", \
345	asru(chip/memory-controller/chip-select)))
346
347prop error.memory.cs_testfail@chip/memory-controller/chip-select (1)->
348    ereport.cpu.amd.mc.cs_testfail@chip/memory-controller
349    { CONTAINS_CS };
350
351#define CSMATCH(s) \
352	(confprop_defined(chip/memory-controller/chip-select, s) && \
353	confprop(chip/memory-controller/chip-select, s) == \
354	confprop(chip/memory-controller/dimm/rank, "csname"))
355
356prop fault.memory.dimm_testfail@chip/memory-controller/dimm/rank (0)->
357    error.memory.cs_testfail@chip/memory-controller/chip-select
358    { CSMATCH("dimm1-csname") || CSMATCH("dimm2-csname")};
359
360/*								#ADDRPAR#
361 * DRAM Command/Address Parity Errors.
362 *
363 *  - dramaddr_par : reported by the nb; the NB status register includes
364 *    a bit indicating which dram controller channel (A or B) experienced
365 *    the error.
366 */
367
368event ereport.cpu.amd.nb.dramaddr_par@chip/core/strand{within(5s)};
369event fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel, response=0;
370
371prop fault.cpu.amd.dramchannel@chip/memory-controller/dram-channel[y] (0)->
372    ereport.cpu.amd.nb.dramaddr_par@chip/core/strand {
373    ((payloadprop("IA32_MCi_STATUS") >> 32 & 0x200) ? 1 : 0) == y };
374
375/* 								#L2D_SINGLE#
376 * A single bit data array fault in an l2 cache can cause:
377 *
378 *  - inf_l2_ecc1 : reported by ic on this cpu
379 *  - inf_l2_ecc1 : reported by dc on this cpu
380 *  - l2d_ecc1 : reported by bu on copyback or on snoop from another cpu
381 */
382
383#define L2CACHEDATA_SB_COUNT	3
384#define L2CACHEDATA_SB_TIME	12h
385
386event ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand{within(5s)};
387event ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand{within(5s)};
388event ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand{within(5s)};
389engine serd.cpu.amd.l2d_sb@chip/core/strand,
390    N=L2CACHEDATA_SB_COUNT, T=L2CACHEDATA_SB_TIME;
391event fault.cpu.amd.l2cachedata@chip/core/strand, engine=serd.cpu.amd.l2d_sb@chip/core/strand;
392
393prop fault.cpu.amd.l2cachedata@chip/core/strand (0)->
394    ereport.cpu.amd.ic.inf_l2_ecc1@chip/core/strand,
395    ereport.cpu.amd.dc.inf_l2_ecc1@chip/core/strand,
396    ereport.cpu.amd.bu.l2d_ecc1@chip/core/strand;
397
398/* 								#L2D_MULTI#
399 * A multi-bit data array fault in an l2 cache can cause:
400 *
401 *  - inf_l2_eccm : reported by ic on this cpu
402 *  - inf_l2_eccm : reported by dc on this cpu
403 *  - l2d_eccm : reported by bu on copyback or on snoop from another cpu
404 */
405
406event ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand{within(5s)};
407event ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand{within(5s)};
408event ereport.cpu.amd.bu.l2d_eccm@chip/core/strand{within(5s)};
409
410prop fault.cpu.amd.l2cachedata@chip/core/strand
411    { setserdincrement(L2CACHEDATA_SB_COUNT + 1) } (0)->
412    ereport.cpu.amd.ic.inf_l2_eccm@chip/core/strand,
413    ereport.cpu.amd.dc.inf_l2_eccm@chip/core/strand,
414    ereport.cpu.amd.bu.l2d_eccm@chip/core/strand;
415
416/* 								#L2T_SINGLE#
417 * A single bit tag array fault in an l2 cache can cause:
418 *
419 *  - l2t_ecc1 : reported by bu on this cpu when detected during snoop
420 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
421 */
422
423#define L2CACHETAG_SB_COUNT	3
424#define L2CACHETAG_SB_TIME	12h
425
426event ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand{within(5s)};
427event ereport.cpu.amd.bu.l2t_par@chip/core/strand{within(5s)};
428engine serd.cpu.amd.l2t_sb@chip/core/strand,
429    N=L2CACHETAG_SB_COUNT, T=L2CACHETAG_SB_TIME;
430event fault.cpu.amd.l2cachetag@chip/core/strand, engine=serd.cpu.amd.l2t_sb@chip/core/strand;
431
432prop fault.cpu.amd.l2cachetag@chip/core/strand (0)->
433    ereport.cpu.amd.bu.l2t_ecc1@chip/core/strand;
434
435/* 								#L2T_MULTI#
436 * A multi-bit tag array fault in an l2 cache can cause:
437 *
438 *  - l2t_eccm : reported by bu on this cpu when detected during snoop
439 *  - l2t_par : reported by bu on this cpu when detected other than during snoop
440 */
441
442event ereport.cpu.amd.bu.l2t_eccm@chip/core/strand{within(5s)};
443
444prop fault.cpu.amd.l2cachetag@chip/core/strand
445    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
446    ereport.cpu.amd.bu.l2t_eccm@chip/core/strand,
447    ereport.cpu.amd.bu.l2t_par@chip/core/strand;
448
449/* 								#ICD_PAR#
450 * A data array parity fault in an I cache can cause:
451 *
452 *  - data_par : reported by ic on this cpu
453 */
454
455#define ICACHEDATA_SB_COUNT	2
456#define ICACHEDATA_SB_TIME	168h
457
458event ereport.cpu.amd.ic.data_par@chip/core/strand{within(5s)};
459engine serd.cpu.amd.icachedata@chip/core/strand,
460    N=ICACHEDATA_SB_COUNT, T=ICACHEDATA_SB_TIME;
461event fault.cpu.amd.icachedata@chip/core/strand,
462    engine=serd.cpu.amd.icachedata@chip/core/strand;
463
464prop fault.cpu.amd.icachedata@chip/core/strand (0)->
465    ereport.cpu.amd.ic.data_par@chip/core/strand;
466
467/* 								#ICT_PAR#
468 * A tag array parity fault in an I cache can cause:
469 *
470 *  - tag_par : reported by ic on this cpu
471 */
472
473#define ICACHETAG_SB_COUNT	2
474#define ICACHETAG_SB_TIME	168h
475
476event ereport.cpu.amd.ic.tag_par@chip/core/strand{within(5s)};
477engine serd.cpu.amd.icachetag@chip/core/strand,
478    N=ICACHETAG_SB_COUNT, T=ICACHETAG_SB_TIME;
479event fault.cpu.amd.icachetag@chip/core/strand, engine=serd.cpu.amd.icachetag@chip/core/strand;
480
481prop fault.cpu.amd.icachetag@chip/core/strand (0)->
482    ereport.cpu.amd.ic.tag_par@chip/core/strand;
483
484/* 								#ICT_SNOOP#
485 * A snoop tag array parity fault in an I cache can cause:
486 *
487 *  - stag_par : reported by ic on this cpu
488 */
489
490event ereport.cpu.amd.ic.stag_par@chip/core/strand{within(5s)};
491event fault.cpu.amd.icachestag@chip/core/strand;
492
493prop fault.cpu.amd.icachestag@chip/core/strand (1)->
494    ereport.cpu.amd.ic.stag_par@chip/core/strand;
495
496/* 								#ICTLB_1#
497 * An l1tlb parity fault in an I cache can cause:
498 *
499 *  - l1tlb_par : reported by ic on this cpu
500 */
501
502#define ICACHEL1TLB_SB_COUNT	2
503#define ICACHEL1TLB_SB_TIME	168h
504
505event ereport.cpu.amd.ic.l1tlb_par@chip/core/strand{within(5s)};
506engine serd.cpu.amd.l1itlb@chip/core/strand,
507    N=ICACHEL1TLB_SB_COUNT, T=ICACHEL1TLB_SB_TIME;
508event fault.cpu.amd.l1itlb@chip/core/strand, engine=serd.cpu.amd.l1itlb@chip/core/strand;
509
510prop fault.cpu.amd.l1itlb@chip/core/strand (0)->
511    ereport.cpu.amd.ic.l1tlb_par@chip/core/strand;
512
513/* 								#ICTLB_2#
514 * An l2tlb parity fault in an I cache can cause:
515 *
516 *  - l2tlb_par : reported by ic on this cpu
517 */
518
519#define ICACHEL2TLB_SB_COUNT	2
520#define ICACHEL2TLB_SB_TIME	168h
521
522event ereport.cpu.amd.ic.l2tlb_par@chip/core/strand{within(5s)};
523engine serd.cpu.amd.l2itlb@chip/core/strand,
524    N=ICACHEL2TLB_SB_COUNT, T=ICACHEL2TLB_SB_TIME;
525event fault.cpu.amd.l2itlb@chip/core/strand, engine=serd.cpu.amd.l2itlb@chip/core/strand;
526
527prop fault.cpu.amd.l2itlb@chip/core/strand (0)->
528    ereport.cpu.amd.ic.l2tlb_par@chip/core/strand;
529
530/* 								#DCD_SINGLE#
531 * A single bit data array fault in an D cache can cause:
532 *
533 *  - data_ecc1 : reported by dc on this cpu by scrubber
534 *  - data_ecc1_uc : reported by dc on this cpu other than by scrubber
535 *
536 * Make data_ecc1_uc fault immediately as it may have caused a panic, so
537 * it is handled by the multi-bit case in the following section.
538 */
539
540#define DCACHEDATA_SB_COUNT	2
541#define DCACHEDATA_SB_TIME	168h
542
543event ereport.cpu.amd.dc.data_ecc1@chip/core/strand{within(5s)};
544event ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand{within(5s)};
545engine serd.cpu.amd.dc_sb@chip/core/strand,
546    N=DCACHEDATA_SB_COUNT, T=DCACHEDATA_SB_TIME;
547event fault.cpu.amd.dcachedata@chip/core/strand, engine=serd.cpu.amd.dc_sb@chip/core/strand;
548
549prop fault.cpu.amd.dcachedata@chip/core/strand (0)->
550    ereport.cpu.amd.dc.data_ecc1@chip/core/strand;
551
552/* 								#DCD_MULTI#
553 * A multi-bit data array fault in an D cache can cause:
554 *
555 *  - data_eccm : reported by dc on this cpu
556 */
557
558event ereport.cpu.amd.dc.data_eccm@chip/core/strand{within(5s)};
559
560prop fault.cpu.amd.dcachedata@chip/core/strand
561    { setserdincrement(L2CACHETAG_SB_COUNT + 1) } (0)->
562    ereport.cpu.amd.dc.data_eccm@chip/core/strand,
563    ereport.cpu.amd.dc.data_ecc1_uc@chip/core/strand;
564
565/* 								#DCT_PAR#
566 * A tag array parity fault in an D cache can cause:
567 *
568 *  - tag_par : reported by dc on this cpu
569 */
570
571event ereport.cpu.amd.dc.tag_par@chip/core/strand{within(5s)};
572event fault.cpu.amd.dcachetag@chip/core/strand;
573
574prop fault.cpu.amd.dcachetag@chip/core/strand (1)->
575    ereport.cpu.amd.dc.tag_par@chip/core/strand;
576
577/* 								#DCT_SNOOP#
578 * A snoop tag array parity fault in an D cache can cause:
579 *
580 *  - stag_par : reported by dc on this cpu
581 */
582
583event ereport.cpu.amd.dc.stag_par@chip/core/strand{within(5s)};
584event fault.cpu.amd.dcachestag@chip/core/strand;
585
586prop fault.cpu.amd.dcachestag@chip/core/strand (1)->
587    ereport.cpu.amd.dc.stag_par@chip/core/strand;
588
589/* 								#DCTLB_1#
590 * An l1tlb parity fault in an D cache can cause:
591 *
592 *  - l1tlb_par : reported by dc on this cpu
593 */
594
595event ereport.cpu.amd.dc.l1tlb_par@chip/core/strand{within(5s)};
596event fault.cpu.amd.l1dtlb@chip/core/strand;
597
598prop fault.cpu.amd.l1dtlb@chip/core/strand (1)->
599    ereport.cpu.amd.dc.l1tlb_par@chip/core/strand;
600
601/* 								#DCTLB_2#
602 * An l2tlb parity fault in an D cache can cause:
603 *
604 *  - l2tlb_par : reported by dc on this cpu
605 */
606
607event ereport.cpu.amd.dc.l2tlb_par@chip/core/strand{within(5s)};
608event fault.cpu.amd.l2dtlb@chip/core/strand;
609
610prop fault.cpu.amd.l2dtlb@chip/core/strand (1)->
611    ereport.cpu.amd.dc.l2tlb_par@chip/core/strand;
612
613/*								#MISC#
614 * Ereports that should not normally happen and which we will discard
615 * without diagnosis if they do.  These fall into a few categories:
616 *
617 *	- the corresponding detector is not enabled, typically because
618 *	  detection/handling of the event is taking place elsewhere
619 *	  (nb.ma, nb.ta, ls.rde, ic.rdde, bu.s_rde, nb.gart_walk)
620 *	- the event is associated with a sync flood so even if the detector is
621 *	  enabled we will never handle the event and generate an ereport *and*
622 *	  even if the ereport did arrive we could perform no useful diagnosis
623 *	  e.g., the NB can be configured for sync flood on nb.mem_eccm
624 *	  but we don't choose to discard that ereport here since we could have
625 *	  made a useful diagnosis from it had it been delivered
626 *	  (nb.ht_sync, nb.ht_crc)
627 *	- events that will be accompanied by an immediate panic and
628 *	  delivery of the ereport during subsequent reboot but from
629 *	  which no useful diagnosis can be made. (nb.rmw, nb.wdog)
630 *
631 * Ereports for all of these can be generated by error simulation and
632 * injection.  We will perform a null diagnosos of all these ereports in order
633 * to avoid "no subscription" complaints during test harness runs.
634 */
635
636event ereport.cpu.amd.nb.ma@strand{within(5s)};
637event ereport.cpu.amd.nb.ta@strand{within(5s)};
638event ereport.cpu.amd.ls.s_rde@strand{within(5s)};
639event ereport.cpu.amd.ic.rdde@strand{within(5s)};
640event ereport.cpu.amd.bu.s_rde@strand{within(5s)};
641event ereport.cpu.amd.nb.gart_walk@strand{within(5s)};
642event ereport.cpu.amd.nb.ht_sync@strand{within(5s)};
643event ereport.cpu.amd.nb.ht_crc@strand{within(5s)};
644event ereport.cpu.amd.nb.rmw@strand{within(5s)};
645event ereport.cpu.amd.nb.wdog@strand{within(5s)};
646event ereport.cpu.amd.unknown@strand{within(5s)};
647
648event upset.null_diag@strand;
649
650prop upset.null_diag@strand (1)->
651    ereport.cpu.amd.nb.ma@strand,
652    ereport.cpu.amd.nb.ta@strand,
653    ereport.cpu.amd.ls.s_rde@strand,
654    ereport.cpu.amd.ic.rdde@strand,
655    ereport.cpu.amd.bu.s_rde@strand,
656    ereport.cpu.amd.nb.gart_walk@strand,
657    ereport.cpu.amd.nb.ht_sync@strand,
658    ereport.cpu.amd.nb.ht_crc@strand,
659    ereport.cpu.amd.nb.rmw@strand,
660    ereport.cpu.amd.nb.wdog@strand,
661    ereport.cpu.amd.unknown@strand;
662