1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25/*
26 * Ereport-handling routines for memory errors
27 */
28
29#include <cmd_mem.h>
30#include <cmd_dimm.h>
31#include <cmd_bank.h>
32#include <cmd_page.h>
33#include <cmd_cpu.h>
34#ifdef sun4u
35#include <cmd_dp.h>
36#include <cmd_dp_page.h>
37#endif
38#include <cmd.h>
39
40#include <strings.h>
41#include <string.h>
42#include <errno.h>
43#include <limits.h>
44#include <fm/fmd_api.h>
45#include <sys/fm/protocol.h>
46#include <sys/async.h>
47#include <sys/errclassify.h>
48#include <assert.h>
49
50#ifdef sun4v
51#include <cmd_hc_sun4v.h>
52#endif /* sun4v */
53
/*
 * One row of a name-to-disposition lookup table used by
 * cmd_mem_name2type().  The tables built from this type are terminated
 * by a row whose name is NULL.
 */
struct ce_name2type {
	const char *name;	/* disposition description string */
	ce_dispact_t type;	/* disposition value that string maps to */
};
58
59ce_dispact_t
60cmd_mem_name2type(const char *name, int minorvers)
61{
62	static const struct ce_name2type old[] = {
63		{ ERR_TYPE_DESC_INTERMITTENT,	CE_DISP_INTERMITTENT },
64		{ ERR_TYPE_DESC_PERSISTENT,	CE_DISP_PERS },
65		{ ERR_TYPE_DESC_STICKY,		CE_DISP_STICKY },
66		{ ERR_TYPE_DESC_UNKNOWN,	CE_DISP_UNKNOWN },
67		{ NULL }
68	};
69	static const struct ce_name2type new[] = {
70		{ CE_DISP_DESC_U,		CE_DISP_UNKNOWN },
71		{ CE_DISP_DESC_I,		CE_DISP_INTERMITTENT },
72		{ CE_DISP_DESC_PP,		CE_DISP_POSS_PERS },
73		{ CE_DISP_DESC_P,		CE_DISP_PERS },
74		{ CE_DISP_DESC_L,		CE_DISP_LEAKY },
75		{ CE_DISP_DESC_PS,		CE_DISP_POSS_STICKY },
76		{ CE_DISP_DESC_S,		CE_DISP_STICKY },
77		{ NULL }
78	};
79	const struct ce_name2type *names = (minorvers == 0) ? &old[0] : &new[0];
80	const struct ce_name2type *tp;
81
82	for (tp = names; tp->name != NULL; tp++)
83		if (strcasecmp(name, tp->name) == 0)
84			return (tp->type);
85
86	return (CE_DISP_UNKNOWN);
87}
88
89/*
90 * check if a dimm has n CEs with the same symbol-in-error
91 */
92static int
93upos_thresh_check(cmd_dimm_t *dimm, uint16_t upos, uint32_t threshold)
94{
95	int i;
96	cmd_mq_t *ip, *next;
97	int count = 0;
98
99	for (i = 0; i < CMD_MAX_CKWDS; i++) {
100		for (ip = cmd_list_next(&dimm->mq_root[i]); ip != NULL;
101		    ip = next) {
102			next = cmd_list_next(ip);
103			if (ip->mq_unit_position == upos) {
104				count++;
105				if (count >= threshold)
106					return (1);
107			}
108		}
109	}
110	return (0);
111}
112
113/*
114 * check if smaller number of retired pages > 1/16 of larger
115 * number of retired pages
116 */
117static int
118check_bad_rw_retired_pages(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2)
119{
120	uint_t sret, lret;
121	double ratio;
122	uint_t d1_nretired, d2_nretired;
123
124	sret = lret = 0;
125
126	d1_nretired = d1->dimm_nretired;
127	d2_nretired = d2->dimm_nretired;
128
129	if (d1->dimm_bank != NULL)
130		d1_nretired += d1->dimm_bank->bank_nretired;
131
132	if (d2->dimm_bank != NULL)
133		d2_nretired += d2->dimm_bank->bank_nretired;
134
135	if (d2_nretired < d1_nretired) {
136		sret = d2_nretired;
137		lret = d1_nretired;
138	} else if (d2_nretired > d1_nretired) {
139		sret = d1_nretired;
140		lret = d2_nretired;
141	} else
142		return (0);
143
144	ratio = lret * CMD_PAGE_RATIO;
145
146	if (sret > ratio) {
147		fmd_hdl_debug(hdl, "sret=%d lret=%d ratio=%.3f\n",
148		    sret, lret, ratio);
149		return (1);
150	}
151	return (0);
152}
153
154/*
155 * check bad rw between two DIMMs
156 * the check succeeds if
157 * - each DIMM has 4 CEs with the same symbol-in-error.
158 * - the smaller number of retired pages > 1/16 larger number of retired pages
159 */
160static int
161check_bad_rw_between_dimms(fmd_hdl_t *hdl, cmd_dimm_t *d1, cmd_dimm_t *d2,
162    uint16_t *rupos)
163{
164	int i;
165	cmd_mq_t *ip, *next;
166	uint16_t upos;
167
168	for (i = 0; i < CMD_MAX_CKWDS; i++) {
169		for (ip = cmd_list_next(&d1->mq_root[i]); ip != NULL;
170		    ip = next) {
171			next = cmd_list_next(ip);
172			upos = ip->mq_unit_position;
173			if (upos_thresh_check(d1, upos, cmd.cmd_nupos)) {
174				if (upos_thresh_check(d2, upos,
175				    cmd.cmd_nupos)) {
176					if (check_bad_rw_retired_pages(hdl,
177					    d1, d2)) {
178						*rupos = upos;
179						return (1);
180					}
181				}
182			}
183		}
184	}
185
186	return (0);
187}
188
189static void
190bad_reader_writer_check(fmd_hdl_t *hdl, cmd_dimm_t *ce_dimm, nvlist_t *det)
191{
192	cmd_dimm_t *d, *next;
193	uint16_t upos;
194
195	for (d = cmd_list_next(&cmd.cmd_dimms); d != NULL; d = next) {
196		next = cmd_list_next(d);
197		if (d == ce_dimm)
198			continue;
199		if (!cmd_same_datapath_dimms(ce_dimm, d))
200			continue;
201		if (check_bad_rw_between_dimms(hdl, ce_dimm, d, &upos)) {
202			cmd_gen_datapath_fault(hdl, ce_dimm, d, upos, det);
203			cmd_dimm_save_symbol_error(ce_dimm, upos);
204			fmd_hdl_debug(hdl,
205			    "check_bad_rw_dimms succeeded: %s %s",
206			    ce_dimm->dimm_unum, d->dimm_unum);
207			return;
208		}
209	}
210}
211
212/*
213 * rule 5a checking. The check succeeds if
214 * - nretired >= 512
215 * - nretired >= 128 and (addr_hi - addr_low) / (nretired - 1) > 512KB
216 */
217static void
218ce_thresh_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
219{
220	nvlist_t *flt;
221	fmd_case_t *cp;
222	uint_t nret;
223	uint64_t delta_addr = 0;
224
225	if (dimm->dimm_flags & CMD_MEM_F_FAULTING)
226		/* We've already complained about this DIMM */
227		return;
228
229	nret = dimm->dimm_nretired;
230	if (dimm->dimm_bank != NULL)
231		nret += dimm->dimm_bank->bank_nretired;
232
233	if (nret < cmd.cmd_low_ce_thresh)
234		return;
235
236	if (dimm->dimm_phys_addr_hi >= dimm->dimm_phys_addr_low)
237		delta_addr =
238		    (dimm->dimm_phys_addr_hi - dimm->dimm_phys_addr_low) /
239		    (nret - 1);
240
241	if (nret >= cmd.cmd_hi_ce_thresh || delta_addr > CMD_MQ_512KB) {
242
243		dimm->dimm_flags |= CMD_MEM_F_FAULTING;
244		cmd_dimm_dirty(hdl, dimm);
245
246		cp = fmd_case_open(hdl, NULL);
247		flt = cmd_dimm_create_fault(hdl, dimm,
248		    "fault.memory.dimm-page-retires-excessive", CMD_FLTMAXCONF);
249		fmd_case_add_suspect(hdl, cp, flt);
250		fmd_case_solve(hdl, cp);
251		fmd_hdl_debug(hdl, "ce_thresh_check succeeded nretired %d\n",
252		    nret);
253
254	}
255}
256
257/*
258 * rule 5b checking. The check succeeds if
259 * more than 120 non-intermittent CEs are reported against one symbol
260 * position of one afar in 72 hours.
261 */
262static void
263mq_5b_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
264{
265	nvlist_t *flt;
266	fmd_case_t *cp;
267	cmd_mq_t *ip, *next;
268	int cw;
269
270	for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
271		for (ip = cmd_list_next(&dimm->mq_root[cw]);
272		    ip != NULL; ip = next) {
273			next = cmd_list_next(ip);
274			if (ip->mq_dupce_count >= cmd.cmd_dupce) {
275				cp = fmd_case_open(hdl, NULL);
276				flt = cmd_dimm_create_fault(hdl, dimm,
277				    "fault.memory.dimm-page-retires-excessive",
278				    CMD_FLTMAXCONF);
279				dimm->dimm_flags |= CMD_MEM_F_FAULTING;
280				cmd_dimm_dirty(hdl, dimm);
281				fmd_case_add_suspect(hdl, cp, flt);
282				fmd_case_solve(hdl, cp);
283				fmd_hdl_debug(hdl,
284				    "mq_5b_check succeeded: duplicate CE=%d",
285				    ip->mq_dupce_count);
286				return;
287			}
288		}
289	}
290}
291
292/*
293 * delete the expired duplicate CE time stamps
294 */
295void
296mq_prune_dup(fmd_hdl_t *hdl, cmd_mq_t *ip, uint64_t now)
297{
298	tstamp_t *tsp, *next;
299
300	for (tsp = cmd_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
301	    tsp = next) {
302		next = cmd_list_next(tsp);
303		if (tsp->tstamp < now - CMD_MQ_TIMELIM) {
304			cmd_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
305			fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
306			ip->mq_dupce_count--;
307		}
308	}
309}
310
311void
312mq_update(fmd_hdl_t *hdl, fmd_event_t *ep, cmd_mq_t *ip, uint64_t now,
313    uint32_t cpuid)
314{
315	tstamp_t *tsp;
316
317	ip->mq_tstamp = now;
318	ip->mq_cpuid = cpuid;
319	ip->mq_ep = ep;
320
321	if (fmd_serd_exists(hdl, ip->mq_serdnm))
322		fmd_serd_destroy(hdl, ip->mq_serdnm);
323	fmd_serd_create(hdl, ip->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT);
324	(void) fmd_serd_record(hdl, ip->mq_serdnm, ep);
325
326	tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
327	tsp->tstamp = now;
328	cmd_list_append(&ip->mq_dupce_tstamp, tsp);
329	ip->mq_dupce_count++;
330}
331
332/* Create a fresh index block for MQSC CE correlation. */
333cmd_mq_t *
334mq_create(fmd_hdl_t *hdl, fmd_event_t *ep,
335    uint64_t afar, uint16_t upos, uint64_t now, uint32_t cpuid)
336{
337	cmd_mq_t *cp;
338	tstamp_t *tsp;
339	uint16_t ckwd = (afar & 0x30) >> 4;
340
341	cp = fmd_hdl_zalloc(hdl, sizeof (cmd_mq_t), FMD_SLEEP);
342	cp->mq_tstamp = now;
343	cp->mq_ckwd = ckwd;
344	cp->mq_phys_addr = afar;
345	cp->mq_unit_position = upos;
346	cp->mq_ep = ep;
347	cp->mq_serdnm =
348	    cmd_mq_serdnm_create(hdl, "mq", afar, ckwd, upos);
349
350	tsp = fmd_hdl_zalloc(hdl, sizeof (tstamp_t), FMD_SLEEP);
351	tsp->tstamp = now;
352	cmd_list_append(&cp->mq_dupce_tstamp, tsp);
353	cp->mq_dupce_count = 1;
354	cp->mq_cpuid = cpuid;
355
356	/*
357	 * Create SERD to keep this event from being removed
358	 * by fmd which may not know there is an event pointer
359	 * saved here. This SERD is *never* meant to fire.
360	 * NOTE: wouldn't need to do this if there were an fmd
361	 * api to 'hold' an event.
362	 */
363	if (fmd_serd_exists(hdl, cp->mq_serdnm)) {
364		/* clean up dup */
365		fmd_serd_destroy(hdl, cp->mq_serdnm);
366	}
367	fmd_serd_create(hdl, cp->mq_serdnm, CMD_MQ_SERDN, CMD_MQ_SERDT);
368	(void) fmd_serd_record(hdl, cp->mq_serdnm, ep);
369
370	return (cp);
371}
372
373/* Destroy MQSC tracking block as well as event tracking SERD. */
374
375cmd_mq_t *
376mq_destroy(fmd_hdl_t *hdl, cmd_list_t *lp, cmd_mq_t *ip)
377{
378	cmd_mq_t *jp = cmd_list_next(ip);
379	tstamp_t *tsp, *next;
380
381	if (ip->mq_serdnm != NULL) {
382		if (fmd_serd_exists(hdl, ip->mq_serdnm))
383			fmd_serd_destroy(hdl, ip->mq_serdnm);
384		fmd_hdl_strfree(hdl, ip->mq_serdnm);
385		ip->mq_serdnm = NULL;
386	}
387
388	for (tsp = cmd_list_next(&ip->mq_dupce_tstamp); tsp != NULL;
389	    tsp = next) {
390		next = cmd_list_next(tsp);
391		cmd_list_delete(&ip->mq_dupce_tstamp, &tsp->ts_l);
392		fmd_hdl_free(hdl, tsp, sizeof (tstamp_t));
393	}
394
395	cmd_list_delete(lp, &ip->mq_l);
396	fmd_hdl_free(hdl, ip, sizeof (cmd_mq_t));
397
398	return (jp);
399}
400
401/*
402 * Add an index block for a new CE, sorted
403 * a) by ascending unit position
404 * b) order of arrival (~= time order)
405 */
406
407void
408mq_add(fmd_hdl_t *hdl, cmd_dimm_t *dimm, fmd_event_t *ep,
409    uint64_t afar, uint16_t synd, uint64_t now, uint32_t cpuid)
410{
411	cmd_mq_t *ip, *jp;
412	int cw, unit_position;
413
414	cw = (afar & 0x30) >> 4;		/* 0:3 */
415	if ((unit_position = cmd_synd2upos(synd)) < 0)
416		return;				/* not a CE */
417
418	for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
419		if (ip->mq_unit_position > unit_position) {
420			/* list is in unit position order */
421			break;
422		} else if (ip->mq_unit_position == unit_position &&
423		    ip->mq_phys_addr == afar) {
424			/*
425			 * Found a duplicate cw, unit_position, and afar.
426			 * update the mq_t with the new information
427			 */
428			mq_update(hdl, ep, ip, now, cpuid);
429			return;
430		} else {
431			ip = cmd_list_next(ip);
432		}
433	}
434
435	jp = mq_create(hdl, ep, afar, unit_position, now, cpuid);
436	if (ip == NULL)
437		cmd_list_append(&dimm->mq_root[cw], jp);
438	else
439		cmd_list_insert_before(&dimm->mq_root[cw], ip, jp);
440}
441
442/*
443 * Prune the MQSC index lists (one for each checkword), by deleting
444 * outdated index blocks from each list.
445 */
446
447void
448mq_prune(fmd_hdl_t *hdl, cmd_dimm_t *dimm, uint64_t now)
449{
450	cmd_mq_t *ip;
451	int cw;
452
453	for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
454		for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL; ) {
455			if (ip->mq_tstamp < now - CMD_MQ_TIMELIM) {
456				/*
457				 * This event has timed out - delete the
458				 * mq block as well as serd for the event.
459				 */
460				ip = mq_destroy(hdl, &dimm->mq_root[cw], ip);
461			} else {
462				/* tstamp < now - ce_t */
463				mq_prune_dup(hdl, ip, now);
464				ip = cmd_list_next(ip);
465			}
466		} /* per checkword */
467	} /* cw = 0...3 */
468}
469
470/*
471 * Check the MQSC index lists (one for each checkword) by making a
472 * complete pass through each list, checking if the criteria for
473 * Rule 4A has been met.  Rule 4A checking is done for each checkword.
474 *
475 * Rule 4A: fault a DIMM  "whenever Solaris reports two or more CEs from
476 * two or more different physical addresses on each of two or more different
477 * bit positions from the same DIMM within 72 hours of each other, and all
478 * the addresses are in the same relative checkword (that is, the AFARs
479 * are all the same modulo 64).  [Note: This means at least 4 CEs; two
480 * from one bit position, with unique addresses, and two from another,
481 * also with unique addresses, and the lower 6 bits of all the addresses
482 * are the same."
483 */
484
485void
486mq_check(fmd_hdl_t *hdl, cmd_dimm_t *dimm)
487{
488	int upos_pairs, curr_upos, cw, i, j;
489	nvlist_t *flt;
490	typedef struct upos_pair {
491		int upos;
492		cmd_mq_t *mq1;
493		cmd_mq_t *mq2;
494	} upos_pair_t;
495	upos_pair_t upos_array[8]; /* max per cw = 2, * 4 cw's */
496	cmd_mq_t *ip;
497
498	/*
499	 * Each upos_array[] member represents a pair of CEs for the same
500	 * unit position (symbol) which on a sun4u is a bit, and on sun4v
501	 * is a (4 bit) nibble.
502	 * MQSC rule 4 requires pairs of CEs from the same symbol (same DIMM
503	 * for rule 4A, and same DRAM for rule 4B) for a violation - this
504	 * is why CE pairs are tracked.
505	 */
506	upos_pairs = 0;
507	upos_array[0].mq1 = NULL;
508
509	/* Loop through all checkwords */
510	for (cw = 0; cw < CMD_MAX_CKWDS; cw++) {
511		i = upos_pairs;
512		curr_upos = -1;
513
514		/*
515		 * mq_root[] is an array of cumulative lists of CEs
516		 * indexed by checkword where the list is in unit position
517		 * order. Loop through checking for duplicate unit position
518		 * entries (filled in at mq_create()).
519		 * The upos_array[] is filled in each time a duplicate
520		 * unit position is found; the first time through the loop
521		 * of a unit position sets curr_upos but does not fill in
522		 * upos_array[] until the second symbol is found.
523		 */
524		for (ip = cmd_list_next(&dimm->mq_root[cw]); ip != NULL;
525		    ip = cmd_list_next(ip)) {
526			if (curr_upos != ip->mq_unit_position) {
527				/* Set initial current position */
528				curr_upos = ip->mq_unit_position;
529			} else if (i > upos_pairs &&
530			    curr_upos == upos_array[i-1].upos) {
531				/*
532				 * Only keep track of CE pairs; skip
533				 * triples, quads, etc...
534				 */
535				continue;
536			} else if (upos_array[i].mq1 == NULL) {
537				/*
538				 * Have a pair, add to upos_array[].
539				 */
540				upos_array[i].upos = curr_upos;
541				upos_array[i].mq1 = cmd_list_prev(ip);
542				upos_array[i].mq2 = ip;
543				upos_array[++i].mq1 = NULL;
544			}
545		}
546
547		if (i - upos_pairs >= 2) {
548			/* Rule 4A Violation. */
549			flt = cmd_dimm_create_fault(hdl,
550			    dimm, "fault.memory.dimm-ue-imminent",
551			    CMD_FLTMAXCONF);
552			for (j = upos_pairs; j < i; j++) {
553				fmd_case_add_ereport(hdl,
554				    dimm->dimm_case.cc_cp,
555				    upos_array[j].mq1->mq_ep);
556				fmd_case_add_ereport(hdl,
557				    dimm->dimm_case.cc_cp,
558				    upos_array[j].mq2->mq_ep);
559			}
560			dimm->dimm_flags |= CMD_MEM_F_FAULTING;
561			cmd_dimm_dirty(hdl, dimm);
562			fmd_case_add_suspect(hdl, dimm->dimm_case.cc_cp, flt);
563			fmd_case_solve(hdl, dimm->dimm_case.cc_cp);
564			return;
565		}
566		upos_pairs = i;
567		assert(upos_pairs < 8);
568	}
569}
570
571/*ARGSUSED*/
572cmd_evdisp_t
573cmd_ce_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
574    const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd,
575    uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru)
576{
577	cmd_dimm_t *dimm;
578	cmd_page_t *page;
579	const char *uuid;
580	uint64_t *now;
581	uint_t nelem;
582	uint32_t cpuid;
583	nvlist_t *det;
584	uint64_t addr;
585	int skip_error = 0;
586
587	if (afar_status != AFLT_STAT_VALID ||
588	    synd_status != AFLT_STAT_VALID)
589		return (CMD_EVD_UNUSED);
590
591	if ((page = cmd_page_lookup(afar)) != NULL &&
592	    page->page_case.cc_cp != NULL &&
593	    fmd_case_solved(hdl, page->page_case.cc_cp))
594		return (CMD_EVD_REDUND);
595
596#ifdef sun4u
597	if (cmd_dp_error(hdl) || cmd_dp_fault(hdl, afar)) {
598		CMD_STAT_BUMP(dp_ignored_ce);
599		return (CMD_EVD_UNUSED);
600	}
601#endif /* sun4u */
602
603	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
604		CMD_STAT_BUMP(bad_mem_asru);
605		return (CMD_EVD_BAD);
606	}
607
608	if ((dimm = cmd_dimm_lookup(hdl, asru)) == NULL &&
609	    (dimm = cmd_dimm_create(hdl, asru)) == NULL)
610		return (CMD_EVD_UNUSED);
611
612	if (dimm->dimm_case.cc_cp == NULL) {
613		dimm->dimm_case.cc_cp = cmd_case_create(hdl,
614		    &dimm->dimm_header, CMD_PTR_DIMM_CASE, &uuid);
615	}
616
617	if (nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR, &det) != 0)
618		return (CMD_EVD_BAD);
619
620	/*
621	 * Add to MQSC correlation lists all CEs which pass validity
622	 * checks above.
623	 * Add mq_t when there is no bad r/w or dimm fault.
624	 * Always prune the expired mq_t.
625	 */
626	skip_error = cmd_dimm_check_symbol_error(dimm, synd);
627
628	if (nvlist_lookup_uint64_array(nvl,
629	    "__tod", &now, &nelem) == 0) {
630
631		if (!skip_error || !(dimm->dimm_flags & CMD_MEM_F_FAULTING)) {
632			if (nvlist_lookup_uint32(det, FM_FMRI_CPU_ID, &cpuid)
633			    != 0)
634				cpuid = ULONG_MAX;
635
636			mq_add(hdl, dimm, ep, afar, synd, *now, cpuid);
637		}
638
639		mq_prune(hdl, dimm, *now);
640
641		if (!skip_error)
642			bad_reader_writer_check(hdl, dimm, det);
643
644		if (!(dimm->dimm_flags & CMD_MEM_F_FAULTING)) {
645			mq_check(hdl, dimm);
646			mq_5b_check(hdl, dimm);
647		}
648	}
649
650	switch (type) {
651	case CE_DISP_UNKNOWN:
652		CMD_STAT_BUMP(ce_unknown);
653		return (CMD_EVD_UNUSED);
654	case CE_DISP_INTERMITTENT:
655		CMD_STAT_BUMP(ce_interm);
656		return (CMD_EVD_UNUSED);
657	case CE_DISP_POSS_PERS:
658		CMD_STAT_BUMP(ce_ppersis);
659		break;
660	case CE_DISP_PERS:
661		CMD_STAT_BUMP(ce_persis);
662		break;
663	case CE_DISP_LEAKY:
664		CMD_STAT_BUMP(ce_leaky);
665		break;
666	case CE_DISP_POSS_STICKY:
667	{
668		uchar_t ptnrinfo = CE_XDIAG_PTNRINFO(disp);
669
670		if (CE_XDIAG_TESTVALID(ptnrinfo)) {
671			int ce1 = CE_XDIAG_CE1SEEN(ptnrinfo);
672			int ce2 = CE_XDIAG_CE2SEEN(ptnrinfo);
673
674			if (ce1 && ce2) {
675				/* Should have been CE_DISP_STICKY */
676				return (CMD_EVD_BAD);
677			} else if (ce1) {
678				/* Partner could see and could fix CE */
679				CMD_STAT_BUMP(ce_psticky_ptnrclrd);
680			} else {
681				/* Partner could not see ce1 (ignore ce2) */
682				CMD_STAT_BUMP(ce_psticky_ptnrnoerr);
683			}
684		} else {
685			CMD_STAT_BUMP(ce_psticky_noptnr);
686		}
687		return (CMD_EVD_UNUSED);
688	}
689	case CE_DISP_STICKY:
690		CMD_STAT_BUMP(ce_sticky);
691		break;
692	default:
693		return (CMD_EVD_BAD);
694	}
695
696	if (cmd_dimm_check_symbol_error(dimm, synd))
697		return (CMD_EVD_REDUND);
698
699	if (page == NULL)
700		page = cmd_page_create(hdl, asru, afar);
701
702	if (page->page_case.cc_cp == NULL) {
703		page->page_case.cc_cp = cmd_case_create(hdl,
704		    &page->page_header, CMD_PTR_PAGE_CASE, &uuid);
705	}
706
707	switch (type) {
708	case CE_DISP_POSS_PERS:
709	case CE_DISP_PERS:
710		fmd_hdl_debug(hdl, "adding %sPersistent event to CE serd "
711		    "engine\n", type == CE_DISP_POSS_PERS ? "Possible-" : "");
712
713		if (page->page_case.cc_serdnm == NULL) {
714			page->page_case.cc_serdnm = cmd_page_serdnm_create(hdl,
715			    "page", page->page_physbase);
716
717			fmd_serd_create(hdl, page->page_case.cc_serdnm,
718			    fmd_prop_get_int32(hdl, "ce_n"),
719			    fmd_prop_get_int64(hdl, "ce_t"));
720		}
721
722		if (fmd_serd_record(hdl, page->page_case.cc_serdnm, ep) ==
723		    FMD_B_FALSE)
724				return (CMD_EVD_OK); /* engine hasn't fired */
725
726		fmd_hdl_debug(hdl, "ce page serd fired\n");
727		fmd_case_add_serd(hdl, page->page_case.cc_cp,
728		    page->page_case.cc_serdnm);
729		fmd_serd_reset(hdl, page->page_case.cc_serdnm);
730		break;	/* to retire */
731
732	case CE_DISP_LEAKY:
733	case CE_DISP_STICKY:
734		fmd_case_add_ereport(hdl, page->page_case.cc_cp, ep);
735		break;	/* to retire */
736	}
737
738	if (page->page_flags & CMD_MEM_F_FAULTING ||
739	    fmd_nvl_fmri_unusable(hdl, page->page_asru_nvl))
740		return (CMD_EVD_OK);
741
742	/*
743	 * convert a unhashed address to hashed address
744	 */
745	cmd_to_hashed_addr(&addr, afar, class);
746
747	if (afar > dimm->dimm_phys_addr_hi)
748		dimm->dimm_phys_addr_hi = addr;
749
750	if (afar < dimm->dimm_phys_addr_low)
751		dimm->dimm_phys_addr_low = addr;
752
753	dimm->dimm_nretired++;
754	dimm->dimm_retstat.fmds_value.ui64++;
755	cmd_dimm_dirty(hdl, dimm);
756
757	cmd_page_fault(hdl, asru, cmd_dimm_fru(dimm), ep, afar);
758	ce_thresh_check(hdl, dimm);
759
760	return (CMD_EVD_OK);
761}
762
763/*
764 * Solve a bank case with suspect "fault.memory.bank".  The caller must
765 * have populated bank->bank_case.cc_cp and is also responsible for adding
766 * associated ereport(s) to that case.
767 */
768void
769cmd_bank_fault(fmd_hdl_t *hdl, cmd_bank_t *bank)
770{
771	fmd_case_t *cp = bank->bank_case.cc_cp;
772	nvlist_t *flt;
773
774	if (bank->bank_flags & CMD_MEM_F_FAULTING)
775		return; /* Only complain once per bank */
776
777	bank->bank_flags |= CMD_MEM_F_FAULTING;
778	cmd_bank_dirty(hdl, bank);
779
780#ifdef	sun4u
781	flt = cmd_bank_create_fault(hdl, bank, "fault.memory.bank",
782	    CMD_FLTMAXCONF);
783	fmd_case_add_suspect(hdl, cp, flt);
784#else /* sun4v */
785	{
786		cmd_bank_memb_t *d;
787
788		/* create separate fault for each dimm in bank */
789
790		for (d = cmd_list_next(&bank->bank_dimms);
791		    d != NULL; d = cmd_list_next(d)) {
792			flt = cmd_dimm_create_fault(hdl, d->bm_dimm,
793			    "fault.memory.bank", CMD_FLTMAXCONF);
794			fmd_case_add_suspect(hdl, cp, flt);
795		}
796	}
797#endif /* sun4u */
798	fmd_case_solve(hdl, cp);
799}
800
801/*ARGSUSED*/
802cmd_evdisp_t
803cmd_ue_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
804    const char *class, uint64_t afar, uint8_t afar_status, uint16_t synd,
805    uint8_t synd_status, ce_dispact_t type, uint64_t disp, nvlist_t *asru)
806{
807	cmd_page_t *page;
808	cmd_bank_t *bank;
809	cmd_cpu_t *cpu;
810
811#ifdef sun4u
812	/*
813	 * Note: Currently all sun4u processors using this code share
814	 * L2 and L3 cache at CMD_CPU_LEVEL_CORE.
815	 */
816	cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
817	    CMD_CPU_LEVEL_CORE);
818#else /* sun4v */
819	cpu = cmd_cpu_lookup_from_detector(hdl, nvl, class,
820	    CMD_CPU_LEVEL_THREAD);
821#endif /* sun4u */
822
823	if (cpu == NULL) {
824		fmd_hdl_debug(hdl, "cmd_ue_common: cpu not found\n");
825		return (CMD_EVD_UNUSED);
826	}
827
828	/*
829	 * The following code applies only to sun4u, because sun4u does
830	 * not poison data in L2 cache resulting from the fetch of a
831	 * memory UE.
832	 */
833
834#ifdef sun4u
835	if (afar_status != AFLT_STAT_VALID) {
836		/*
837		 * Had this report's AFAR been valid, it would have
838		 * contributed an address to the UE cache.  We don't
839		 * know what the AFAR would have been, and thus we can't
840		 * add anything to the cache.  If a xxU is caused by
841		 * this UE, we won't be able to detect it, and will thus
842		 * erroneously offline the CPU.  To prevent this
843		 * situation, we need to assume that all xxUs generated
844		 * through the next E$ flush are attributable to the UE.
845		 */
846		cmd_cpu_uec_set_allmatch(hdl, cpu);
847	} else {
848		cmd_cpu_uec_add(hdl, cpu, afar);
849	}
850#endif /* sun4u */
851
852	if (synd_status != AFLT_STAT_VALID) {
853		fmd_hdl_debug(hdl, "cmd_ue_common: syndrome not valid\n");
854		return (CMD_EVD_UNUSED);
855	}
856
857	if (cmd_mem_synd_check(hdl, afar, afar_status, synd, synd_status,
858	    cpu) == CMD_EVD_UNUSED)
859		return (CMD_EVD_UNUSED);
860
861	if (afar_status != AFLT_STAT_VALID)
862		return (CMD_EVD_UNUSED);
863
864	if ((page = cmd_page_lookup(afar)) != NULL &&
865	    page->page_case.cc_cp != NULL &&
866	    fmd_case_solved(hdl, page->page_case.cc_cp))
867		return (CMD_EVD_REDUND);
868
869	if (fmd_nvl_fmri_expand(hdl, asru) < 0) {
870		CMD_STAT_BUMP(bad_mem_asru);
871		return (CMD_EVD_BAD);
872	}
873
874	if ((bank = cmd_bank_lookup(hdl, asru)) == NULL &&
875	    (bank = cmd_bank_create(hdl, asru)) == NULL)
876		return (CMD_EVD_UNUSED);
877
878#ifdef sun4v
879	{
880		nvlist_t *fmri;
881		char **snarray;
882		unsigned int i, n;
883
884		/*
885		 * 1: locate the array of serial numbers inside the bank asru.
886		 * 2: for each serial #, lookup its mem: FMRI in libtopo
887		 * 3: ensure that each DIMM's FMRI is on bank's dimmlist
888		 */
889
890		if (nvlist_lookup_string_array(asru,
891		    FM_FMRI_MEM_SERIAL_ID, &snarray, &n) != 0)
892			fmd_hdl_abort(hdl, "Cannot locate serial #s for bank");
893
894		for (i = 0; i < n; i++) {
895			fmri = cmd_find_dimm_by_sn(hdl, FM_FMRI_SCHEME_MEM,
896			    snarray[i]);
897			/*
898			 * If dimm structure doesn't already exist for
899			 * each dimm, create and link to bank.
900			 */
901			if (cmd_dimm_lookup(hdl, fmri) == NULL)
902				(void) cmd_dimm_create(hdl, fmri);
903			nvlist_free(fmri);
904		}
905	}
906#endif /* sun4v */
907
908	if (bank->bank_case.cc_cp == NULL) {
909		const char *uuid;
910		bank->bank_case.cc_cp = cmd_case_create(hdl, &bank->bank_header,
911		    CMD_PTR_BANK_CASE, &uuid);
912	}
913
914#ifdef sun4u
915	if (cmd_dp_error(hdl)) {
916		CMD_STAT_BUMP(dp_deferred_ue);
917		cmd_dp_page_defer(hdl, asru, ep, afar);
918		return (CMD_EVD_OK);
919	} else if (cmd_dp_fault(hdl, afar)) {
920		CMD_STAT_BUMP(dp_ignored_ue);
921		return (CMD_EVD_UNUSED);
922	}
923#endif /* sun4u */
924
925	fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep);
926
927	bank->bank_nretired++;
928	bank->bank_retstat.fmds_value.ui64++;
929	cmd_bank_dirty(hdl, bank);
930
931	cmd_page_fault(hdl, bank->bank_asru_nvl, cmd_bank_fru(bank), ep, afar);
932	cmd_bank_fault(hdl, bank);
933
934	return (CMD_EVD_OK);
935}
936
/*
 * Close entry point for a DIMM: forwards arg unchanged to
 * cmd_dimm_destroy().
 * NOTE(review): arg is presumably the cmd_dimm_t being closed - confirm
 * against the caller.
 */
void
cmd_dimm_close(fmd_hdl_t *hdl, void *arg)
{
	cmd_dimm_destroy(hdl, arg);
}
942
/*
 * Close entry point for a bank: forwards arg unchanged to
 * cmd_bank_destroy().
 * NOTE(review): arg is presumably the cmd_bank_t being closed - confirm
 * against the caller.
 */
void
cmd_bank_close(fmd_hdl_t *hdl, void *arg)
{
	cmd_bank_destroy(hdl, arg);
}
948