1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28/*
29 * Ereport-handling routines for Datapath errors
30 * - receive datapath ereports and open datapath case
31 * - solve datapath case when datapath fault ereports are received
32 * - maintain state of datapath error flag
33 * - close datapath case when timeout occurs (w/o fault)
34 */
35
36
37#include <strings.h>
38#include <string.h>
39#include <errno.h>
40#include <fm/fmd_api.h>
41#include <sys/fm/protocol.h>
42#include <sys/async.h>
43#include <sys/time.h>
44#include <cmd.h>
45#include <cmd_state.h>
46#include <cmd_dp.h>
47#include <cmd_dp_page.h>
48#include <cmd_page.h>
49#include <libnvpair.h>
50#include <sys/plat_datapath.h>
51
52/*
53 * Member Name     Data Type          Comments
54 * -----------     ---------          -----------
55 * version         uint8              0
56 * class           string             "asic"
57 * ENA             uint64             ENA Format 1
58 * detector        fmri               aggregated ID data for SC-DE
59 *
60 * Datapath ereport subclasses and data payloads:
61 * There will be two types of ereports (error and fault) which will be
62 * identified by the "type" member.
63 *
64 * ereport.asic.*.cds.cds-dp
65 * ereport.asic.*.dx.dx-dp
66 * ereport.asic.*.sdi.sdi-dp
67 * ereport.asic.*.cp.cp-dp
68 * ereport.asic.*.rp.rp-dp		// serengeti doesn't use "cp" term
69 *
70 * Member Name     Data Type          Comments
71 * -----------     ---------          -----------
72 * erptype         uint16            derived from message type: error or
73 *                                   fault
74 * t-value         uint32            SC's datapath SERD timeout threshold
 * dp-list-sz      uint32            number of dp-list array elements
76 * dp-list         array of uint16   Safari IDs of affected cpus
77 */
78
79static char *dperrtype[] = {
80	DP_ERROR_CDS,		/* Starcat types */
81	DP_ERROR_DX,
82	DP_ERROR_EX,
83	DP_ERROR_CP,
84	DP_ERROR_CDS,		/* Serengeti types */
85	DP_ERROR_DX,
86	DP_ERROR_RP
87};
88
89/*
90 * Construct the ASRU(s)/FRU(s) associated with a data path fault,
91 * construct the fault(s), and add the suspect(s) to the case
92 *
93 */
94void
95cmd_dp_add_suspects(fmd_hdl_t *hdl, cmd_dp_t *dp)
96{
97	const char	*funcname = "cmd_dp_add_suspects()";
98	char		class[DP_MAX_CLASS];
99	char		frustr[3][DP_MAX_FRU];
100	int		cpuid, numfru, sgpos, xcpos, i, err;
101	nvlist_t	*asru, *fru = NULL, *flt, *hcel;
102
103	/* build ASRU, fault event class */
104	asru = cmd_dp_setasru(hdl, dp);
105	(void) snprintf(class, DP_MAX_CLASS, "fault.asic.%s.%s",
106	    dperrtype[dp->dp_err], FM_ERROR_DATAPATH);
107
108	cpuid = dp->dp_cpuid_list[0];
109
110	/* extract fru position */
111	sgpos = ((cpuid & 0x1f) / 4);
112	xcpos = ((cpuid >> 5) & 0x1f);
113
114	/* build FRU(s) for the particular error */
115	numfru = 0;
116	switch (dp->dp_err) {
117	case SC_DP_CDS_TYPE:
118	case SC_DP_DX_TYPE:
119		/* check for slot 1 (maxcat) */
120		if ((cpuid >> 3) & 0x1)
121			(void) snprintf(frustr[0], DP_MAX_FRU, "IO%d", xcpos);
122		else
123			(void) snprintf(frustr[0], DP_MAX_FRU, "SB%d", xcpos);
124
125		numfru = 1;
126		break;
127
128	case SC_DP_EX_TYPE:
129		/* check for slot 1 (maxcat) */
130		if ((cpuid >> 3) & 0x1)
131			(void) snprintf(frustr[0], DP_MAX_FRU, "IO%d", xcpos);
132		else
133			(void) snprintf(frustr[0], DP_MAX_FRU, "SB%d", xcpos);
134
135		(void) snprintf(frustr[1], DP_MAX_FRU, "EX%d", xcpos);
136		numfru = 2;
137		break;
138
139	case SC_DP_CP_TYPE:
140		/* no way to know which CP half, be generic */
141		(void) snprintf(frustr[0], DP_MAX_FRU, "EX%d", xcpos);
142		(void) snprintf(frustr[1], DP_MAX_FRU, "CP");
143		(void) snprintf(frustr[2], DP_MAX_FRU, "CS");
144		numfru = 3;
145		break;
146
147	case SG_DP_CDS_TYPE:
148	case SG_DP_DX_TYPE:
149		(void) snprintf(frustr[0], DP_MAX_FRU, "/N0/SB%d", sgpos);
150		numfru = 1;
151		break;
152
153	case SG_DP_RP_TYPE:
154		/* no way to know which RP, be generic */
155		(void) snprintf(frustr[0], DP_MAX_FRU, "/N0/SB%d", sgpos);
156		(void) snprintf(frustr[1], DP_MAX_FRU, "RP");
157		numfru = 2;
158		break;
159
160	default:
161		fmd_hdl_debug(hdl, "%s: invalid DP error type %d", funcname,
162		    dp->dp_err);
163		nvlist_free(asru);
164		return;
165	}
166
167	/* For each FRU, build an FMRI, create fault, add as suspect */
168	for (i = 0; i < numfru; i++) {
169		/* build a FRU FMRI */
170		if (nvlist_alloc(&hcel, NV_UNIQUE_NAME, 0) != 0) {
171			nvlist_free(asru);
172			return;
173		}
174		err = nvlist_add_string(hcel, FM_FMRI_HC_NAME,
175		    FM_FMRI_LEGACY_HC);
176		err |= nvlist_add_string(hcel, FM_FMRI_HC_ID, frustr[i]);
177		if (err != 0) {
178			nvlist_free(hcel);
179			nvlist_free(asru);
180			return;
181		}
182
183		/* put it in an HC scheme */
184		if (nvlist_alloc(&fru, NV_UNIQUE_NAME, 0) != 0) {
185			nvlist_free(hcel);
186			nvlist_free(asru);
187			return;
188		}
189		err = nvlist_add_uint8(fru, FM_VERSION, FM_HC_SCHEME_VERSION);
190		err |= nvlist_add_string(fru, FM_FMRI_SCHEME,
191		    FM_FMRI_SCHEME_HC);
192		err |= nvlist_add_string(fru, FM_FMRI_HC_ROOT, "");
193		err |= nvlist_add_uint32(fru, FM_FMRI_HC_LIST_SZ, 1);
194		err |= nvlist_add_nvlist_array(fru, FM_FMRI_HC_LIST, &hcel, 1);
195		if (err != 0) {
196			nvlist_free(fru);
197			nvlist_free(hcel);
198			nvlist_free(asru);
199			return;
200		}
201
202		/* create the fault, add to case. */
203		flt = cmd_nvl_create_fault(hdl, class, 100/numfru,
204		    asru, fru, NULL);
205		fmd_case_add_suspect(hdl, dp->dp_case, flt);
206
207		/* free up memory */
208		nvlist_free(fru);
209		nvlist_free(hcel);
210	}
211
212	/* free up ASRU */
213	nvlist_free(asru);
214}
215
216/*ARGSUSED*/
217cmd_evdisp_t
218cmd_dp_common(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
219	cmd_errcl_t clcode, uint8_t dperr)
220{
221	const char	*funcname = "cmd_dp_common()";
222	const char	*uuidp;
223	cmd_dp_t	*dpt, *ept;
224	int 		err, i, fltflg;
225	uint16_t	*cpuid_list;
226	uint64_t	*serid_list;
227	uint32_t	ncpuids;
228
229	/* extract common ereport contents */
230	dpt = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_t), FMD_SLEEP);
231	dpt->dp_nodetype = CMD_NT_DP;
232	dpt->dp_version = CMD_DP_VERSION;
233	dpt->dp_err = dperr;
234	err = nvlist_lookup_pairs(nvl, 0,
235	    DP_EREPORT_TYPE, DATA_TYPE_UINT16, &dpt->dp_erpt_type,
236	    DP_TVALUE, DATA_TYPE_UINT32, &dpt->dp_t_value,
237	    DP_LIST_SIZE, DATA_TYPE_UINT32, &ncpuids, NULL);
238	if (err != 0) {
239		fmd_hdl_debug(hdl, "%s: unable to verify ereport contents "
240		    "(erptype, ena, t_value, dp_list_sz)", funcname);
241		fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
242		return (CMD_EVD_UNUSED);
243	}
244
245	/* extract cpuid list from ereport */
246	err = nvlist_lookup_uint16_array(nvl, DP_LIST, &cpuid_list,
247	    &ncpuids);
248	err |= nvlist_lookup_uint64_array(nvl, SN_LIST, &serid_list,
249	    &ncpuids);
250	if (err != 0) {
251		fmd_hdl_debug(hdl, "%s: unable to verify ereport contents "
252		    "(dp_list, sn_list)", funcname);
253		fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
254		return (CMD_EVD_UNUSED);
255	}
256
257	for (i = 0; i < ncpuids; i++) {
258		dpt->dp_cpuid_list[i] = cpuid_list[i];
259		dpt->dp_serid_list[i] = serid_list[i];
260	}
261
262	dpt->dp_ncpus = ncpuids;
263
264	switch (dpt->dp_erpt_type) {
265
266	case DP_ERROR:
267
268		/*
269		 * Scan existing faults on cmd.cmd_datapaths. If each
270		 * cpuid in the current datapath event already has an
271		 * associated DP fault, this is an uninteresting event.
272		 */
273		fltflg = 0;
274		for (i = 0; i < ncpuids; i++)
275			if (cmd_dp_lookup_fault(hdl, cpuid_list[i]) != NULL)
276				fltflg++;
277		if (fltflg == ncpuids) {
278			fmd_hdl_debug(hdl, "%s: datapath fault(s) already "
279			    "experienced, event uninteresting\n", funcname);
280			fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
281			return (CMD_EVD_UNUSED);
282		}
283
284		/*
285		 * Check for an existing datapath error, and if found
286		 * add this event to the existing case
287		 */
288		ept = cmd_dp_lookup_error(dpt);
289		if (ept != NULL && !fmd_case_closed(hdl, ept->dp_case)) {
290			fmd_hdl_debug(hdl, "%s: found existing datapath error, "
291			    "adding event to case\n", funcname);
292			fmd_case_add_ereport(hdl, ept->dp_case, ep);
293			/* check for t-value change */
294			if (dpt->dp_t_value != ept->dp_t_value) {
295				fmd_event_t *ep;
296
297				fmd_timer_remove(hdl, ept->dp_id);
298				ep = fmd_case_getprincipal(hdl, ept->dp_case);
299				ept->dp_id = fmd_timer_install(hdl,
300				    (void *)CMD_TIMERTYPE_DP, ep,
301				    (hrtime_t)NANOSEC *
302				    (dpt->dp_t_value + 120));
303			}
304			fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
305			return (CMD_EVD_OK);
306		}
307
308		/*
309		 * Didn't find an existing datapath error. Create a new
310		 * case, add the event. Also, stash the datapath event on the
311		 * cmd.cmd_datapaths list
312		 */
313		fmd_hdl_debug(hdl, "%s: new datapath error, create case and "
314		    "add to cmd.cmd_datapaths\n", funcname);
315		++cmd.cmd_dp_flag;
316
317		cmd_bufname(dpt->dp_bufname, sizeof (dpt->dp_bufname),
318		    "dp_err_%d_%s", dpt->dp_cpuid_list[0],
319		    dperrtype[dpt->dp_err]);
320
321		dp_buf_write(hdl, dpt);
322
323		dpt->dp_case = cmd_case_create(hdl, &dpt->dp_header,
324		    CMD_PTR_DP_CASE, &uuidp);
325		fmd_case_setprincipal(hdl, dpt->dp_case, ep);
326		dpt->dp_id = fmd_timer_install(hdl, (void *)CMD_TIMERTYPE_DP,
327		    ep, (hrtime_t)NANOSEC * (dpt->dp_t_value + 120));
328		cmd_list_append(&cmd.cmd_datapaths, dpt);
329		break;
330
331	case DP_FAULT:
332		++cmd.cmd_dp_flag;
333		dpt->dp_erpt_type = DP_FAULT;
334		dpt->dp_id = 0;
335
336		cmd_bufname(dpt->dp_bufname, sizeof (dpt->dp_bufname),
337		    "dp_flt_%d_%s", dpt->dp_cpuid_list[0],
338		    dperrtype[dpt->dp_err]);
339
340		dp_buf_write(hdl, dpt);
341
342		/*
343		 * Check for an existing DP_ERROR on cmd.cmd_datapaths, and
344		 * if found, remove the DP_ERROR and close the case before
345		 * creating the DP_FAULT case.
346		 */
347		ept = cmd_dp_lookup_error(dpt);
348		if (ept != NULL && !fmd_case_closed(hdl, ept->dp_case)) {
349			fmd_hdl_debug(hdl, "%s: existing datapath error "
350			    "overtaken by datapath fault\n", funcname);
351			fmd_timer_remove(hdl, ept->dp_id);
352			cmd_dp_destroy(hdl, ept);
353		}
354
355		dpt->dp_case = cmd_case_create(hdl, &dpt->dp_header,
356		    CMD_PTR_DP_CASE, &uuidp);
357		fmd_case_setprincipal(hdl, dpt->dp_case, ep);
358
359		/* Add suspect(s) and solve the case. */
360		cmd_dp_add_suspects(hdl, dpt);
361		fmd_case_solve(hdl, dpt->dp_case);
362
363		/* add it to cmd.cmd_datapaths */
364		cmd_list_append(&cmd.cmd_datapaths, dpt);
365
366		--cmd.cmd_dp_flag;
367		if (cmd.cmd_dp_flag == 0)
368			cmd_dp_page_replay(hdl);
369
370		break;
371
372	default:
373		fmd_hdl_debug(hdl, "%s: unknown ereport type", funcname);
374		fmd_hdl_free(hdl, dpt, sizeof (cmd_dp_t));
375		return (CMD_EVD_UNUSED);
376	}
377
378	return (CMD_EVD_OK);
379}
380
381cmd_evdisp_t
382cmd_dp_cds(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
383    cmd_errcl_t clcode)
384{
385	if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
386		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
387		    SC_DP_CDS_TYPE));
388	} else
389		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
390		    SG_DP_CDS_TYPE));
391}
392
393cmd_evdisp_t
394cmd_dp_dx(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
395    cmd_errcl_t clcode)
396{
397	if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
398		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
399		    SC_DP_DX_TYPE));
400
401	} else
402		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
403		    SG_DP_DX_TYPE));
404}
405
406cmd_evdisp_t
407cmd_dp_ex(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
408    cmd_errcl_t clcode)
409{
410	return (cmd_dp_common(hdl, ep, nvl, class, clcode,
411	    SC_DP_EX_TYPE));
412}
413
414cmd_evdisp_t
415cmd_dp_cp(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class,
416    cmd_errcl_t clcode)
417{
418	if (fmd_nvl_class_match(hdl, nvl, "ereport.asic.starcat.*")) {
419		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
420		    SC_DP_CP_TYPE));
421	} else
422		return (cmd_dp_common(hdl, ep, nvl, class, clcode,
423		    SG_DP_RP_TYPE));
424}
425