1f6e214c7SGavin Maltby /*
2f6e214c7SGavin Maltby * CDDL HEADER START
3f6e214c7SGavin Maltby *
4f6e214c7SGavin Maltby * The contents of this file are subject to the terms of the
5f6e214c7SGavin Maltby * Common Development and Distribution License (the "License").
6f6e214c7SGavin Maltby * You may not use this file except in compliance with the License.
7f6e214c7SGavin Maltby *
8f6e214c7SGavin Maltby * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9f6e214c7SGavin Maltby * or http://www.opensolaris.org/os/licensing.
10f6e214c7SGavin Maltby * See the License for the specific language governing permissions
11f6e214c7SGavin Maltby * and limitations under the License.
12f6e214c7SGavin Maltby *
13f6e214c7SGavin Maltby * When distributing Covered Code, include this CDDL HEADER in each
14f6e214c7SGavin Maltby * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15f6e214c7SGavin Maltby * If applicable, add the following below this CDDL HEADER, with the
16f6e214c7SGavin Maltby * fields enclosed by brackets "[]" replaced with your own identifying
17f6e214c7SGavin Maltby * information: Portions Copyright [yyyy] [name of copyright owner]
18f6e214c7SGavin Maltby *
19f6e214c7SGavin Maltby * CDDL HEADER END
20f6e214c7SGavin Maltby */
21f6e214c7SGavin Maltby
22f6e214c7SGavin Maltby /*
23f6e214c7SGavin Maltby * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24f6e214c7SGavin Maltby */
25f6e214c7SGavin Maltby
26f6e214c7SGavin Maltby /*
27f6e214c7SGavin Maltby * Panic software-diagnosis subsidiary
28f6e214c7SGavin Maltby *
 * We model a system panic as a defect diagnosis in FMA. When a system
 * panics, savecore publishes events which we subscribe to here.
31f6e214c7SGavin Maltby *
32f6e214c7SGavin Maltby * Our driving events are all raised by savecore, run either from
33f6e214c7SGavin Maltby * startup of the dumpadm service or interactively at the command line.
34f6e214c7SGavin Maltby * The following describes the logic for the handling of these events.
35f6e214c7SGavin Maltby *
36f6e214c7SGavin Maltby * On reboot after panic we will run savecore as part of the dumpadm
37f6e214c7SGavin Maltby * service startup; we run savecore even if savecore is otherwise
38f6e214c7SGavin Maltby * disabled (ie dumpadm -n in effect) - we run savecore -c to check for
39f6e214c7SGavin Maltby * a valid dump and raise the initial event.
40f6e214c7SGavin Maltby *
41f6e214c7SGavin Maltby * If savecore (or savecore -c) observes a valid dump pending on the
42f6e214c7SGavin Maltby * device, it raises a "dump_pending_on_device" event provided this
43f6e214c7SGavin Maltby * was not an FMA-initiated panic (for those we will replay ereports
44f6e214c7SGavin Maltby * from the dump device as usual and make a diagnosis from those; we do
45f6e214c7SGavin Maltby * not need to open a case for the panic). We subscribe to the
46f6e214c7SGavin Maltby * "dump_pending_on_device" event and use that to open a case; we
47f6e214c7SGavin Maltby * open a case requesting the same case uuid as the panic dump image
48f6e214c7SGavin Maltby * has for the OS instance uuid - if that fails because of a duplicate
49f6e214c7SGavin Maltby * uuid then we have already opened a case for this panic so no need
50f6e214c7SGavin Maltby * to open another.
51f6e214c7SGavin Maltby *
52f6e214c7SGavin Maltby * Included in the "dump_pending_on_device" event is an indication of
53f6e214c7SGavin Maltby * whether or not dumpadm is enabled. If not (dumpadm -n in effect)
54f6e214c7SGavin Maltby * then we do not expect any further events regarding this panic
55f6e214c7SGavin Maltby * until such time as the admin runs savecore manually (if ever).
56f6e214c7SGavin Maltby * So in this case we solve the case immediately after open. If/when
57f6e214c7SGavin Maltby * subsequent events arrive when savecore is run manually, we will toss
58f6e214c7SGavin Maltby * them.
59f6e214c7SGavin Maltby *
60f6e214c7SGavin Maltby * If dumpadm is enabled then savecore, run from dumpadm service startup,
61f6e214c7SGavin Maltby * will attempt to process the dump - either to copy it off the dump
62f6e214c7SGavin Maltby * device (if saving compressed) or to uncompress it off the dump device.
63f6e214c7SGavin Maltby * If this succeeds savecore raises a "dump_available" event which
64f6e214c7SGavin Maltby * includes information on the directory it was saved in, the instance
65f6e214c7SGavin Maltby * number, image uuid, compressed form or not, and whether the dump
66f6e214c7SGavin Maltby * was complete (as per the dumphdr). If the savecore fails for
67f6e214c7SGavin Maltby * some reason then it exits and raises a "savecore_failure" event.
68f6e214c7SGavin Maltby * These two events are raised even for FMA-initiated panics.
69f6e214c7SGavin Maltby *
70f6e214c7SGavin Maltby * We subscribe to both the "dump_available" and "savecore_failed" events,
71f6e214c7SGavin Maltby * and in the handling thereof we will close the case opened earlier (if
 * this is not an FMA-initiated panic).  On receipt of the initial
 * "dump_pending_on_device" event we also arm a timer for +10 minutes if
 * dumpadm is enabled - if no "dump_available" or "savecore_failed" arrives
 * before the waiting period described below has elapsed we will solve the
 * case on timeout.
76f6e214c7SGavin Maltby *
77f6e214c7SGavin Maltby * When the timer fires we check whether the initial event for each panic
78f6e214c7SGavin Maltby * case was received more than 30 minutes ago; if it was we solve the case
79f6e214c7SGavin Maltby * with what we have. If we're still within the waiting period we rearm
80f6e214c7SGavin Maltby * for a further 10 minutes. The timer is shared by all cases that we
81f6e214c7SGavin Maltby * create, which is why the fire interval is shorter than the maximum time
82f6e214c7SGavin Maltby * we are prepared to wait.
83f6e214c7SGavin Maltby */
84f6e214c7SGavin Maltby
85f6e214c7SGavin Maltby #include <strings.h>
86f6e214c7SGavin Maltby #include <sys/panic.h>
87f6e214c7SGavin Maltby #include <alloca.h>
88f6e214c7SGavin Maltby #include <zone.h>
89f6e214c7SGavin Maltby
90f6e214c7SGavin Maltby #include "../../common/sw.h"
91f6e214c7SGavin Maltby #include "panic.h"
92f6e214c7SGavin Maltby
93f6e214c7SGavin Maltby #define MAX_STRING_LEN 160
94f6e214c7SGavin Maltby
95f6e214c7SGavin Maltby static id_t myid;
96f6e214c7SGavin Maltby
97f6e214c7SGavin Maltby static id_t mytimerid;
98f6e214c7SGavin Maltby
/*
 * Our serialization structure type.  An instance of this header is
 * immediately followed in memory by the packed attribute nvlist of the
 * event that opened the case; scd_nvlbufsz gives that buffer's size.
 */
#define	SWDE_PANIC_CASEDATA_VERS	1

typedef struct swde_panic_casedata {
	uint32_t scd_vers;		/* must be first member */
	uint64_t scd_receive_time;	/* when we first knew of this panic */
	size_t scd_nvlbufsz;		/* size of following buffer */
	/* packed attr nvlist follows */
} swde_panic_casedata_t;
110f6e214c7SGavin Maltby
111f6e214c7SGavin Maltby static struct {
112f6e214c7SGavin Maltby fmd_stat_t swde_panic_diagnosed;
113f6e214c7SGavin Maltby fmd_stat_t swde_panic_badclass;
114f6e214c7SGavin Maltby fmd_stat_t swde_panic_noattr;
115f6e214c7SGavin Maltby fmd_stat_t swde_panic_unexpected_fm_panic;
116f6e214c7SGavin Maltby fmd_stat_t swde_panic_badattr;
117f6e214c7SGavin Maltby fmd_stat_t swde_panic_badfmri;
118f6e214c7SGavin Maltby fmd_stat_t swde_panic_noinstance;
119f6e214c7SGavin Maltby fmd_stat_t swde_panic_nouuid;
120f6e214c7SGavin Maltby fmd_stat_t swde_panic_dupuuid;
121f6e214c7SGavin Maltby fmd_stat_t swde_panic_nocase;
122f6e214c7SGavin Maltby fmd_stat_t swde_panic_notime;
123f6e214c7SGavin Maltby fmd_stat_t swde_panic_nopanicstr;
124f6e214c7SGavin Maltby fmd_stat_t swde_panic_nodumpdir;
125f6e214c7SGavin Maltby fmd_stat_t swde_panic_nostack;
126f6e214c7SGavin Maltby fmd_stat_t swde_panic_incomplete;
127f6e214c7SGavin Maltby fmd_stat_t swde_panic_failed;
128f6e214c7SGavin Maltby fmd_stat_t swde_panic_basecasedata;
129f6e214c7SGavin Maltby fmd_stat_t swde_panic_failsrlz;
130f6e214c7SGavin Maltby } swde_panic_stats = {
131f6e214c7SGavin Maltby { "swde_panic_diagnosed", FMD_TYPE_UINT64,
132f6e214c7SGavin Maltby "panic defects published" },
133f6e214c7SGavin Maltby { "swde_panic_badclass", FMD_TYPE_UINT64,
134f6e214c7SGavin Maltby "incorrect event class received" },
135f6e214c7SGavin Maltby { "swde_panic_noattr", FMD_TYPE_UINT64,
136f6e214c7SGavin Maltby "malformed event - missing attr nvlist" },
137f6e214c7SGavin Maltby { "swde_panic_unexpected_fm_panic", FMD_TYPE_UINT64,
138f6e214c7SGavin Maltby "dump available for an fm_panic()" },
139f6e214c7SGavin Maltby { "swde_panic_badattr", FMD_TYPE_UINT64,
140f6e214c7SGavin Maltby "malformed event - invalid attr list" },
141f6e214c7SGavin Maltby { "swde_panic_badfmri", FMD_TYPE_UINT64,
142f6e214c7SGavin Maltby "malformed event - fmri2str fails" },
143f6e214c7SGavin Maltby { "swde_panic_noinstance", FMD_TYPE_UINT64,
144f6e214c7SGavin Maltby "malformed event - no instance number" },
145f6e214c7SGavin Maltby { "swde_panic_nouuid", FMD_TYPE_UINT64,
146f6e214c7SGavin Maltby "malformed event - missing uuid" },
147f6e214c7SGavin Maltby { "swde_panic_dupuuid", FMD_TYPE_UINT64,
148f6e214c7SGavin Maltby "duplicate events received" },
149f6e214c7SGavin Maltby { "swde_panic_nocase", FMD_TYPE_UINT64,
150f6e214c7SGavin Maltby "case missing for uuid" },
151f6e214c7SGavin Maltby { "swde_panic_notime", FMD_TYPE_UINT64,
152f6e214c7SGavin Maltby "missing crash dump time" },
153f6e214c7SGavin Maltby { "swde_panic_nopanicstr", FMD_TYPE_UINT64,
154f6e214c7SGavin Maltby "missing panic string" },
155f6e214c7SGavin Maltby { "swde_panic_nodumpdir", FMD_TYPE_UINT64,
156f6e214c7SGavin Maltby "missing crashdump save directory" },
157f6e214c7SGavin Maltby { "swde_panic_nostack", FMD_TYPE_UINT64,
158f6e214c7SGavin Maltby "missing panic stack" },
159f6e214c7SGavin Maltby { "swde_panic_incomplete", FMD_TYPE_UINT64,
160f6e214c7SGavin Maltby "missing panic incomplete" },
161f6e214c7SGavin Maltby { "swde_panic_failed", FMD_TYPE_UINT64,
162f6e214c7SGavin Maltby "missing panic failed" },
163f6e214c7SGavin Maltby { "swde_panic_badcasedata", FMD_TYPE_UINT64,
164f6e214c7SGavin Maltby "bad case data during timeout" },
165f6e214c7SGavin Maltby { "swde_panic_failsrlz", FMD_TYPE_UINT64,
166f6e214c7SGavin Maltby "failures to serialize case data" },
167f6e214c7SGavin Maltby };
168f6e214c7SGavin Maltby
/*
 * Increment the named member of swde_panic_stats.  The expansion is
 * parenthesized so that it behaves as a single expression wherever the
 * macro is used.
 */
#define	BUMPSTAT(stat)	(swde_panic_stats.stat.fmds_value.ui64++)
170f6e214c7SGavin Maltby
171f6e214c7SGavin Maltby static nvlist_t *
panic_sw_fmri(fmd_hdl_t * hdl,char * object)172f6e214c7SGavin Maltby panic_sw_fmri(fmd_hdl_t *hdl, char *object)
173f6e214c7SGavin Maltby {
174f6e214c7SGavin Maltby nvlist_t *fmri;
175f6e214c7SGavin Maltby nvlist_t *sw_obj;
176f6e214c7SGavin Maltby int err = 0;
177f6e214c7SGavin Maltby
178f6e214c7SGavin Maltby fmri = fmd_nvl_alloc(hdl, FMD_SLEEP);
179f6e214c7SGavin Maltby err |= nvlist_add_uint8(fmri, FM_VERSION, FM_SW_SCHEME_VERSION);
180f6e214c7SGavin Maltby err |= nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_SW);
181f6e214c7SGavin Maltby
182f6e214c7SGavin Maltby sw_obj = fmd_nvl_alloc(hdl, FMD_SLEEP);
183f6e214c7SGavin Maltby err |= nvlist_add_string(sw_obj, FM_FMRI_SW_OBJ_PATH, object);
184f6e214c7SGavin Maltby err |= nvlist_add_nvlist(fmri, FM_FMRI_SW_OBJ, sw_obj);
185*aab83bb8SJosef 'Jeff' Sipek nvlist_free(sw_obj);
186f6e214c7SGavin Maltby if (!err)
187f6e214c7SGavin Maltby return (fmri);
188f6e214c7SGavin Maltby else
189f6e214c7SGavin Maltby return (0);
190f6e214c7SGavin Maltby }
191f6e214c7SGavin Maltby
/*
 * Name formats of the files making up a saved dump; each takes the dump
 * instance number.  The uncompressed form has two files, the compressed
 * form has one (the unused slot is NULL).
 */
static const char *dumpfiles[2] = {
	"unix.%lld",
	"vmcore.%lld"
};
static const char *dumpfiles_comp[2] = {
	"vmdump.%lld",
	NULL
};
194f6e214c7SGavin Maltby
195f6e214c7SGavin Maltby static void
swde_panic_solve(fmd_hdl_t * hdl,fmd_case_t * cp,nvlist_t * attr,fmd_event_t * ep,boolean_t savecore_success)196f6e214c7SGavin Maltby swde_panic_solve(fmd_hdl_t *hdl, fmd_case_t *cp,
197f6e214c7SGavin Maltby nvlist_t *attr, fmd_event_t *ep, boolean_t savecore_success)
198f6e214c7SGavin Maltby {
199f6e214c7SGavin Maltby char *dumpdir, *path, *uuid;
200f6e214c7SGavin Maltby nvlist_t *defect, *rsrc;
201f6e214c7SGavin Maltby nvpair_t *nvp;
202f6e214c7SGavin Maltby int i;
203f6e214c7SGavin Maltby
204f6e214c7SGavin Maltby /*
205f6e214c7SGavin Maltby * Attribute members to include in event-specific defect
206f6e214c7SGavin Maltby * payload. Some attributes will not be present for some
207f6e214c7SGavin Maltby * cases - e.g., if we timed out and solved the case without
208f6e214c7SGavin Maltby * a "dump_available" report.
209f6e214c7SGavin Maltby */
210f6e214c7SGavin Maltby const char *toadd[] = {
211f6e214c7SGavin Maltby "os-instance-uuid", /* same as case uuid */
212f6e214c7SGavin Maltby "panicstr", /* for initial classification work */
213f6e214c7SGavin Maltby "panicstack", /* for initial classification work */
214f6e214c7SGavin Maltby "crashtime", /* in epoch time */
215f6e214c7SGavin Maltby "panic-time", /* Formatted crash time */
216f6e214c7SGavin Maltby };
217f6e214c7SGavin Maltby
218f6e214c7SGavin Maltby if (ep != NULL)
219f6e214c7SGavin Maltby fmd_case_add_ereport(hdl, cp, ep);
220f6e214c7SGavin Maltby /*
221f6e214c7SGavin Maltby * As a temporary solution we create and fmri in the sw scheme
222f6e214c7SGavin Maltby * in panic_sw_fmri. This should become a generic fmri constructor
223f6e214c7SGavin Maltby *
224f6e214c7SGavin Maltby * We need to user a resource FMRI which will have a sufficiently
225f6e214c7SGavin Maltby * unique string representation such that fmd will not see
226f6e214c7SGavin Maltby * repeated panic diagnoses (all using the same defect class)
227f6e214c7SGavin Maltby * as duplicates and discard later cases. We can't actually diagnose
228f6e214c7SGavin Maltby * the panic to anything specific (e.g., a path to a module and
229f6e214c7SGavin Maltby * function/line etc therein). We could pick on a generic
230f6e214c7SGavin Maltby * representative such as /kernel/genunix but that could lead
231f6e214c7SGavin Maltby * to misunderstanding. So we choose a path based on <dumpdir>
232f6e214c7SGavin Maltby * and the OS instance UUID - "<dumpdir>/.<os-instance-uuid>".
233f6e214c7SGavin Maltby * There's no file at that path (*) but no matter. We can't use
234f6e214c7SGavin Maltby * <dumpdir>/vmdump.N or similar because if savecore is disabled
235f6e214c7SGavin Maltby * or failed we don't have any file or instance number.
236f6e214c7SGavin Maltby *
237f6e214c7SGavin Maltby * (*) Some day it would seem tidier to keep all files to do
238f6e214c7SGavin Maltby * with a single crash (unix/vmcore/vmdump, analysis output etc)
239f6e214c7SGavin Maltby * in a distinct directory, and <dumpdir>/.<uuid> seems like a good
240f6e214c7SGavin Maltby * choice. For compatability we'd symlink into it. So that is
241f6e214c7SGavin Maltby * another reason for this choice - some day it may exist!
242f6e214c7SGavin Maltby */
243f6e214c7SGavin Maltby (void) nvlist_lookup_string(attr, "dumpdir", &dumpdir);
244f6e214c7SGavin Maltby (void) nvlist_lookup_string(attr, "os-instance-uuid", &uuid);
245f6e214c7SGavin Maltby path = alloca(strlen(dumpdir) + 1 + 1 + 36 + 1);
246f6e214c7SGavin Maltby /* LINTED: E_SEC_SPRINTF_UNBOUNDED_COPY */
247f6e214c7SGavin Maltby (void) sprintf(path, "%s/.%s", dumpdir, uuid);
248f6e214c7SGavin Maltby rsrc = panic_sw_fmri(hdl, path);
249f6e214c7SGavin Maltby
250f6e214c7SGavin Maltby defect = fmd_nvl_create_defect(hdl, SW_SUNOS_PANIC_DEFECT,
251f6e214c7SGavin Maltby 100, rsrc, NULL, rsrc);
252f6e214c7SGavin Maltby nvlist_free(rsrc);
253f6e214c7SGavin Maltby
254f6e214c7SGavin Maltby (void) nvlist_add_boolean_value(defect, "savecore-succcess",
255f6e214c7SGavin Maltby savecore_success);
256f6e214c7SGavin Maltby
257f6e214c7SGavin Maltby if (savecore_success) {
258f6e214c7SGavin Maltby boolean_t compressed;
259f6e214c7SGavin Maltby int64_t instance;
260f6e214c7SGavin Maltby const char **pathfmts;
261f6e214c7SGavin Maltby char buf[2][32];
262f6e214c7SGavin Maltby int files = 0;
263f6e214c7SGavin Maltby char *arr[2];
264f6e214c7SGavin Maltby int i;
265f6e214c7SGavin Maltby
266f6e214c7SGavin Maltby (void) nvlist_lookup_int64(attr, "instance", &instance);
267f6e214c7SGavin Maltby (void) nvlist_lookup_boolean_value(attr, "compressed",
268f6e214c7SGavin Maltby &compressed);
269f6e214c7SGavin Maltby
270f6e214c7SGavin Maltby pathfmts = compressed ? &dumpfiles_comp[0] : &dumpfiles[0];
271f6e214c7SGavin Maltby
272f6e214c7SGavin Maltby for (i = 0; i < 2; i++) {
273f6e214c7SGavin Maltby if (pathfmts[i] == NULL) {
274f6e214c7SGavin Maltby arr[i] = NULL;
275f6e214c7SGavin Maltby continue;
276f6e214c7SGavin Maltby }
277f6e214c7SGavin Maltby
278f6e214c7SGavin Maltby (void) snprintf(buf[i], 32, pathfmts[i], instance);
279f6e214c7SGavin Maltby arr[i] = buf[i];
280f6e214c7SGavin Maltby files++;
281f6e214c7SGavin Maltby }
282f6e214c7SGavin Maltby
283f6e214c7SGavin Maltby (void) nvlist_add_string(defect, "dump-dir", dumpdir);
284f6e214c7SGavin Maltby (void) nvlist_add_string_array(defect, "dump-files", arr,
285f6e214c7SGavin Maltby files);
286f6e214c7SGavin Maltby } else {
287f6e214c7SGavin Maltby char *rsn;
288f6e214c7SGavin Maltby
289f6e214c7SGavin Maltby if (nvlist_lookup_string(attr, "failure-reason", &rsn) == 0)
290f6e214c7SGavin Maltby (void) nvlist_add_string(defect, "failure-reason", rsn);
291f6e214c7SGavin Maltby }
292f6e214c7SGavin Maltby
293f6e214c7SGavin Maltby /*
294f6e214c7SGavin Maltby * Not all attributes will necessarily be available - eg if
295f6e214c7SGavin Maltby * dumpadm was not enabled there'll be no instance and dumpdir.
296f6e214c7SGavin Maltby */
297f6e214c7SGavin Maltby for (i = 0; i < sizeof (toadd) / sizeof (toadd[0]); i++) {
298f6e214c7SGavin Maltby if (nvlist_lookup_nvpair(attr, toadd[i], &nvp) == 0)
299f6e214c7SGavin Maltby (void) nvlist_add_nvpair(defect, nvp);
300f6e214c7SGavin Maltby }
301f6e214c7SGavin Maltby
302f6e214c7SGavin Maltby fmd_case_add_suspect(hdl, cp, defect);
303f6e214c7SGavin Maltby fmd_case_solve(hdl, cp);
304f6e214c7SGavin Maltby
305f6e214c7SGavin Maltby /*
306f6e214c7SGavin Maltby * Close the case. Do no free casedata - framework does that for us
307f6e214c7SGavin Maltby * on closure callback.
308f6e214c7SGavin Maltby */
309f6e214c7SGavin Maltby fmd_case_close(hdl, cp);
310f6e214c7SGavin Maltby BUMPSTAT(swde_panic_diagnosed);
311f6e214c7SGavin Maltby }
312f6e214c7SGavin Maltby
313f6e214c7SGavin Maltby /*ARGSUSED*/
314f6e214c7SGavin Maltby static void
swde_panic_timeout(fmd_hdl_t * hdl,id_t timerid,void * data)315f6e214c7SGavin Maltby swde_panic_timeout(fmd_hdl_t *hdl, id_t timerid, void *data)
316f6e214c7SGavin Maltby {
317f6e214c7SGavin Maltby fmd_case_t *cp = swde_case_first(hdl, myid);
318f6e214c7SGavin Maltby swde_panic_casedata_t *cdp;
319f6e214c7SGavin Maltby time_t now = time(NULL);
320f6e214c7SGavin Maltby nvlist_t *attr;
321f6e214c7SGavin Maltby int remain = 0;
322f6e214c7SGavin Maltby uint32_t vers;
323f6e214c7SGavin Maltby
324f6e214c7SGavin Maltby while (cp != NULL) {
325f6e214c7SGavin Maltby cdp = swde_case_data(hdl, cp, &vers);
326f6e214c7SGavin Maltby if (vers != SWDE_PANIC_CASEDATA_VERS)
327f6e214c7SGavin Maltby fmd_hdl_abort(hdl, "case data version confused\n");
328f6e214c7SGavin Maltby
329f6e214c7SGavin Maltby if (now > cdp->scd_receive_time + 30 * 60) {
330f6e214c7SGavin Maltby if (nvlist_unpack((char *)cdp + sizeof (*cdp),
331f6e214c7SGavin Maltby cdp->scd_nvlbufsz, &attr, 0) == 0) {
332f6e214c7SGavin Maltby swde_panic_solve(hdl, cp, attr, NULL, B_FALSE);
333f6e214c7SGavin Maltby nvlist_free(attr);
334f6e214c7SGavin Maltby } else {
335f6e214c7SGavin Maltby BUMPSTAT(swde_panic_basecasedata);
336f6e214c7SGavin Maltby fmd_case_close(hdl, cp);
337f6e214c7SGavin Maltby }
338f6e214c7SGavin Maltby } else {
339f6e214c7SGavin Maltby remain++;
340f6e214c7SGavin Maltby }
341f6e214c7SGavin Maltby
342f6e214c7SGavin Maltby
343f6e214c7SGavin Maltby cp = swde_case_next(hdl, cp);
344f6e214c7SGavin Maltby }
345f6e214c7SGavin Maltby
346f6e214c7SGavin Maltby if (remain) {
347f6e214c7SGavin Maltby mytimerid = sw_timer_install(hdl, myid, NULL, NULL,
348f6e214c7SGavin Maltby 10ULL * NANOSEC * 60);
349f6e214c7SGavin Maltby }
350f6e214c7SGavin Maltby }
351f6e214c7SGavin Maltby
352f6e214c7SGavin Maltby /*
353f6e214c7SGavin Maltby * Our verify entry point is called for each of our open cases during
354f6e214c7SGavin Maltby * module load. We must return 0 for the case to be closed by our caller,
355f6e214c7SGavin Maltby * or 1 to keep it (or if we have already closed it during this call).
356f6e214c7SGavin Maltby */
357f6e214c7SGavin Maltby static int
swde_panic_vrfy(fmd_hdl_t * hdl,fmd_case_t * cp)358f6e214c7SGavin Maltby swde_panic_vrfy(fmd_hdl_t *hdl, fmd_case_t *cp)
359f6e214c7SGavin Maltby {
360f6e214c7SGavin Maltby swde_panic_casedata_t *cdp;
361f6e214c7SGavin Maltby time_t now = time(NULL);
362f6e214c7SGavin Maltby nvlist_t *attr;
363f6e214c7SGavin Maltby uint32_t vers;
364f6e214c7SGavin Maltby
365f6e214c7SGavin Maltby cdp = swde_case_data(hdl, cp, &vers);
366f6e214c7SGavin Maltby
367f6e214c7SGavin Maltby if (vers != SWDE_PANIC_CASEDATA_VERS)
368f6e214c7SGavin Maltby return (0); /* case will be closed */
369f6e214c7SGavin Maltby
370f6e214c7SGavin Maltby if (now > cdp->scd_receive_time + 30 * 60) {
371f6e214c7SGavin Maltby if (nvlist_unpack((char *)cdp + sizeof (*cdp),
372f6e214c7SGavin Maltby cdp->scd_nvlbufsz, &attr, 0) == 0) {
373f6e214c7SGavin Maltby swde_panic_solve(hdl, cp, attr, NULL, B_FALSE);
374f6e214c7SGavin Maltby nvlist_free(attr);
375f6e214c7SGavin Maltby return (1); /* case already closed */
376f6e214c7SGavin Maltby } else {
377f6e214c7SGavin Maltby return (0); /* close case */
378f6e214c7SGavin Maltby }
379f6e214c7SGavin Maltby }
380f6e214c7SGavin Maltby
381f6e214c7SGavin Maltby if (mytimerid != 0)
382f6e214c7SGavin Maltby mytimerid = sw_timer_install(hdl, myid,
383f6e214c7SGavin Maltby NULL, NULL, 10ULL * NANOSEC * 60);
384f6e214c7SGavin Maltby
385f6e214c7SGavin Maltby return (1); /* retain case */
386f6e214c7SGavin Maltby }
387f6e214c7SGavin Maltby
388f6e214c7SGavin Maltby /*
389f6e214c7SGavin Maltby * Handler for ireport.os.sunos.panic.dump_pending_on_device.
390f6e214c7SGavin Maltby *
391f6e214c7SGavin Maltby * A future RFE should try adding a means of avoiding diagnosing repeated
392f6e214c7SGavin Maltby * defects on panic loops, which would just add to the mayhem and potentially
393f6e214c7SGavin Maltby * log lots of calls through ASR. Panics with similar enough panic
394f6e214c7SGavin Maltby * strings and/or stacks should not diagnose to new defects with some
395f6e214c7SGavin Maltby * period of time, for example.
396f6e214c7SGavin Maltby */
397f6e214c7SGavin Maltby
398f6e214c7SGavin Maltby /*ARGSUSED*/
399f6e214c7SGavin Maltby void
swde_panic_detected(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,void * arg)400f6e214c7SGavin Maltby swde_panic_detected(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
401f6e214c7SGavin Maltby const char *class, void *arg)
402f6e214c7SGavin Maltby {
403f6e214c7SGavin Maltby boolean_t fm_panic, expect_savecore;
404f6e214c7SGavin Maltby swde_panic_casedata_t *cdp;
405f6e214c7SGavin Maltby nvlist_t *attr;
406f6e214c7SGavin Maltby fmd_case_t *cp;
407f6e214c7SGavin Maltby char *fmribuf;
408f6e214c7SGavin Maltby char *uuid;
409f6e214c7SGavin Maltby size_t sz;
410f6e214c7SGavin Maltby
411f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "swde_panic_detected\n");
412f6e214c7SGavin Maltby
413f6e214c7SGavin Maltby if (nvlist_lookup_nvlist(nvl, FM_IREPORT_ATTRIBUTES, &attr) != 0) {
414f6e214c7SGavin Maltby BUMPSTAT(swde_panic_noattr);
415f6e214c7SGavin Maltby return;
416f6e214c7SGavin Maltby }
417f6e214c7SGavin Maltby
418f6e214c7SGavin Maltby if (nvlist_lookup_string(attr, "os-instance-uuid", &uuid) != 0) {
419f6e214c7SGavin Maltby BUMPSTAT(swde_panic_nouuid);
420f6e214c7SGavin Maltby return;
421f6e214c7SGavin Maltby }
422f6e214c7SGavin Maltby
423f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "swde_panic_detected: OS instance %s\n", uuid);
424f6e214c7SGavin Maltby
425f6e214c7SGavin Maltby if (nvlist_lookup_boolean_value(attr, "fm-panic", &fm_panic) != 0 ||
426f6e214c7SGavin Maltby fm_panic == B_TRUE) {
427f6e214c7SGavin Maltby BUMPSTAT(swde_panic_unexpected_fm_panic);
428f6e214c7SGavin Maltby return;
429f6e214c7SGavin Maltby }
430f6e214c7SGavin Maltby
431f6e214c7SGavin Maltby /*
432f6e214c7SGavin Maltby * Prepare serialization data to be associated with a new
433f6e214c7SGavin Maltby * case. Our serialization data consists of a swde_panic_casedata_t
434f6e214c7SGavin Maltby * structure followed by a packed nvlist of the attributes of
435f6e214c7SGavin Maltby * the initial event.
436f6e214c7SGavin Maltby */
437f6e214c7SGavin Maltby if (nvlist_size(attr, &sz, NV_ENCODE_NATIVE) != 0) {
438f6e214c7SGavin Maltby BUMPSTAT(swde_panic_failsrlz);
439f6e214c7SGavin Maltby return;
440f6e214c7SGavin Maltby }
441f6e214c7SGavin Maltby
442f6e214c7SGavin Maltby cdp = fmd_hdl_zalloc(hdl, sizeof (*cdp) + sz, FMD_SLEEP);
443f6e214c7SGavin Maltby fmribuf = (char *)cdp + sizeof (*cdp);
444f6e214c7SGavin Maltby cdp->scd_vers = SWDE_PANIC_CASEDATA_VERS;
445f6e214c7SGavin Maltby cdp->scd_receive_time = time(NULL);
446f6e214c7SGavin Maltby cdp->scd_nvlbufsz = sz;
447f6e214c7SGavin Maltby
448f6e214c7SGavin Maltby /*
449f6e214c7SGavin Maltby * Open a case with UUID matching the the panicking kernel, add this
450f6e214c7SGavin Maltby * event to the case.
451f6e214c7SGavin Maltby */
452f6e214c7SGavin Maltby if ((cp = swde_case_open(hdl, myid, uuid, SWDE_PANIC_CASEDATA_VERS,
453f6e214c7SGavin Maltby cdp, sizeof (*cdp) + sz)) == NULL) {
454f6e214c7SGavin Maltby BUMPSTAT(swde_panic_dupuuid);
455f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "swde_case_open returned NULL - dup?\n");
456f6e214c7SGavin Maltby fmd_hdl_free(hdl, cdp, sizeof (*cdp) + sz);
457f6e214c7SGavin Maltby return;
458f6e214c7SGavin Maltby }
459f6e214c7SGavin Maltby
460f6e214c7SGavin Maltby fmd_case_setprincipal(hdl, cp, ep);
461f6e214c7SGavin Maltby
462f6e214c7SGavin Maltby if (nvlist_lookup_boolean_value(attr, "will-attempt-savecore",
463f6e214c7SGavin Maltby &expect_savecore) != 0 || expect_savecore == B_FALSE) {
464f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "savecore not being attempted - "
465f6e214c7SGavin Maltby "solve now\n");
466f6e214c7SGavin Maltby swde_panic_solve(hdl, cp, attr, ep, B_FALSE);
467f6e214c7SGavin Maltby return;
468f6e214c7SGavin Maltby }
469f6e214c7SGavin Maltby
470f6e214c7SGavin Maltby /*
471f6e214c7SGavin Maltby * We expect to see either a "dump_available" or a "savecore_failed"
472f6e214c7SGavin Maltby * event before too long. In case that never shows up, for whatever
473f6e214c7SGavin Maltby * reason, we want to be able to solve the case anyway.
474f6e214c7SGavin Maltby */
475f6e214c7SGavin Maltby fmd_case_add_ereport(hdl, cp, ep);
476f6e214c7SGavin Maltby (void) nvlist_pack(attr, &fmribuf, &sz, NV_ENCODE_NATIVE, 0);
477f6e214c7SGavin Maltby swde_case_data_write(hdl, cp);
478f6e214c7SGavin Maltby
479f6e214c7SGavin Maltby if (mytimerid == 0) {
480f6e214c7SGavin Maltby mytimerid = sw_timer_install(hdl, myid, NULL, ep,
481f6e214c7SGavin Maltby 10ULL * NANOSEC * 60);
482f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "armed timer\n");
483f6e214c7SGavin Maltby } else {
484f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "timer already armed\n");
485f6e214c7SGavin Maltby }
486f6e214c7SGavin Maltby }
487f6e214c7SGavin Maltby
488f6e214c7SGavin Maltby /*
489f6e214c7SGavin Maltby * savecore has now run and saved a crash dump to the filesystem. It is
490f6e214c7SGavin Maltby * either a compressed dump (vmdump.n) or uncompressed {unix.n, vmcore.n}
491f6e214c7SGavin Maltby * Savecore has raised an ireport to say the dump is there.
492f6e214c7SGavin Maltby */
493f6e214c7SGavin Maltby
494f6e214c7SGavin Maltby /*ARGSUSED*/
495f6e214c7SGavin Maltby void
swde_panic_savecore_done(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class,void * arg)496f6e214c7SGavin Maltby swde_panic_savecore_done(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
497f6e214c7SGavin Maltby const char *class, void *arg)
498f6e214c7SGavin Maltby {
499f6e214c7SGavin Maltby boolean_t savecore_success = (arg != NULL);
500f6e214c7SGavin Maltby boolean_t fm_panic;
501f6e214c7SGavin Maltby nvlist_t *attr;
502f6e214c7SGavin Maltby fmd_case_t *cp;
503f6e214c7SGavin Maltby char *uuid;
504f6e214c7SGavin Maltby
505f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "savecore_done (%s)\n", savecore_success ?
506f6e214c7SGavin Maltby "success" : "fail");
507f6e214c7SGavin Maltby
508f6e214c7SGavin Maltby if (nvlist_lookup_nvlist(nvl, FM_IREPORT_ATTRIBUTES, &attr) != 0) {
509f6e214c7SGavin Maltby BUMPSTAT(swde_panic_noattr);
510f6e214c7SGavin Maltby return;
511f6e214c7SGavin Maltby }
512f6e214c7SGavin Maltby
513f6e214c7SGavin Maltby if (nvlist_lookup_boolean_value(attr, "fm-panic", &fm_panic) != 0 ||
514f6e214c7SGavin Maltby fm_panic == B_TRUE) {
515f6e214c7SGavin Maltby return; /* not expected, but just in case */
516f6e214c7SGavin Maltby }
517f6e214c7SGavin Maltby
518f6e214c7SGavin Maltby if (nvlist_lookup_string(attr, "os-instance-uuid", &uuid) != 0) {
519f6e214c7SGavin Maltby BUMPSTAT(swde_panic_nouuid);
520f6e214c7SGavin Maltby return;
521f6e214c7SGavin Maltby }
522f6e214c7SGavin Maltby
523f6e214c7SGavin Maltby /*
524f6e214c7SGavin Maltby * Find the case related to the panicking kernel; our cases have
525f6e214c7SGavin Maltby * the same uuid as the crashed OS image.
526f6e214c7SGavin Maltby */
527f6e214c7SGavin Maltby cp = fmd_case_uulookup(hdl, uuid);
528f6e214c7SGavin Maltby if (!cp) {
529f6e214c7SGavin Maltby /* Unable to find the case. */
530f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "savecore_done: can't find case for "
531f6e214c7SGavin Maltby "image %s\n", uuid);
532f6e214c7SGavin Maltby BUMPSTAT(swde_panic_nocase);
533f6e214c7SGavin Maltby return;
534f6e214c7SGavin Maltby }
535f6e214c7SGavin Maltby
536f6e214c7SGavin Maltby fmd_hdl_debug(hdl, "savecore_done: solving case %s\n", uuid);
537f6e214c7SGavin Maltby swde_panic_solve(hdl, cp, attr, ep, savecore_success);
538f6e214c7SGavin Maltby }
539f6e214c7SGavin Maltby
540f6e214c7SGavin Maltby const struct sw_disp swde_panic_disp[] = {
541f6e214c7SGavin Maltby { SW_SUNOS_PANIC_DETECTED, swde_panic_detected, NULL },
542f6e214c7SGavin Maltby { SW_SUNOS_PANIC_AVAIL, swde_panic_savecore_done, (void *)1 },
543f6e214c7SGavin Maltby { SW_SUNOS_PANIC_FAILURE, swde_panic_savecore_done, NULL },
544f6e214c7SGavin Maltby /*
545f6e214c7SGavin Maltby * Something has to subscribe to every fault
546f6e214c7SGavin Maltby * or defect diagnosed in fmd. We do that here, but throw it away.
547f6e214c7SGavin Maltby */
548f6e214c7SGavin Maltby { SW_SUNOS_PANIC_DEFECT, NULL, NULL },
549f6e214c7SGavin Maltby { NULL, NULL, NULL }
550f6e214c7SGavin Maltby };
551f6e214c7SGavin Maltby
552f6e214c7SGavin Maltby /*ARGSUSED*/
553f6e214c7SGavin Maltby int
swde_panic_init(fmd_hdl_t * hdl,id_t id,const struct sw_disp ** dpp,int * nelemp)554f6e214c7SGavin Maltby swde_panic_init(fmd_hdl_t *hdl, id_t id, const struct sw_disp **dpp,
555f6e214c7SGavin Maltby int *nelemp)
556f6e214c7SGavin Maltby {
557f6e214c7SGavin Maltby myid = id;
558f6e214c7SGavin Maltby
559f6e214c7SGavin Maltby if (getzoneid() != GLOBAL_ZONEID)
560f6e214c7SGavin Maltby return (SW_SUB_INIT_FAIL_VOLUNTARY);
561f6e214c7SGavin Maltby
562f6e214c7SGavin Maltby (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC,
563f6e214c7SGavin Maltby sizeof (swde_panic_stats) / sizeof (fmd_stat_t),
564f6e214c7SGavin Maltby (fmd_stat_t *)&swde_panic_stats);
565f6e214c7SGavin Maltby
566f6e214c7SGavin Maltby fmd_hdl_subscribe(hdl, SW_SUNOS_PANIC_DETECTED);
567f6e214c7SGavin Maltby fmd_hdl_subscribe(hdl, SW_SUNOS_PANIC_FAILURE);
568f6e214c7SGavin Maltby fmd_hdl_subscribe(hdl, SW_SUNOS_PANIC_AVAIL);
569f6e214c7SGavin Maltby
570f6e214c7SGavin Maltby *dpp = &swde_panic_disp[0];
571f6e214c7SGavin Maltby *nelemp = sizeof (swde_panic_disp) / sizeof (swde_panic_disp[0]);
572f6e214c7SGavin Maltby return (SW_SUB_INIT_SUCCESS);
573f6e214c7SGavin Maltby }
574f6e214c7SGavin Maltby
575f6e214c7SGavin Maltby void
swde_panic_fini(fmd_hdl_t * hdl)576f6e214c7SGavin Maltby swde_panic_fini(fmd_hdl_t *hdl)
577f6e214c7SGavin Maltby {
578f6e214c7SGavin Maltby if (mytimerid)
579f6e214c7SGavin Maltby sw_timer_remove(hdl, myid, mytimerid);
580f6e214c7SGavin Maltby }
581f6e214c7SGavin Maltby
582f6e214c7SGavin Maltby const struct sw_subinfo panic_diag_info = {
583f6e214c7SGavin Maltby "panic diagnosis", /* swsub_name */
584f6e214c7SGavin Maltby SW_CASE_PANIC, /* swsub_casetype */
585f6e214c7SGavin Maltby swde_panic_init, /* swsub_init */
586f6e214c7SGavin Maltby swde_panic_fini, /* swsub_fini */
587f6e214c7SGavin Maltby swde_panic_timeout, /* swsub_timeout */
588f6e214c7SGavin Maltby NULL, /* swsub_case_close */
589f6e214c7SGavin Maltby swde_panic_vrfy, /* swsub_case_vrfy */
590f6e214c7SGavin Maltby };
591