1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2012 Milan Jurik. All rights reserved.
25  * Copyright (c) 2018, Joyent, Inc.
26  *
27  * fme.c -- fault management exercise module
28  *
29  * this module provides the simulated fault management exercise.
30  */
31 
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <strings.h>
36 #include <ctype.h>
37 #include <alloca.h>
38 #include <libnvpair.h>
39 #include <sys/fm/protocol.h>
40 #include <fm/fmd_api.h>
41 #include <fm/libtopo.h>
42 #include "alloc.h"
43 #include "out.h"
44 #include "stats.h"
45 #include "stable.h"
46 #include "literals.h"
47 #include "lut.h"
48 #include "tree.h"
49 #include "ptree.h"
50 #include "itree.h"
51 #include "ipath.h"
52 #include "fme.h"
53 #include "evnv.h"
54 #include "eval.h"
55 #include "config.h"
56 #include "platform.h"
57 #include "esclex.h"
58 
59 struct lut *Istats;
60 struct lut *SerdEngines;
61 nvlist_t *Action_nvl;
62 
63 /* imported from eft.c... */
64 extern hrtime_t Hesitate;
65 extern char *Serd_Override;
66 extern nv_alloc_t Eft_nv_hdl;
67 extern int Max_fme;
68 extern fmd_hdl_t *Hdl;
69 
70 static int Istat_need_save;
71 static int Serd_need_save;
72 void istat_save(void);
73 void serd_save(void);
74 
75 /* fme under construction is global so we can free it on module abort */
76 static struct fme *Nfmep;
77 
78 static int Undiag_reason = UD_VAL_UNKNOWN;
79 
80 static int Nextid = 0;
81 
82 static int Open_fme_count = 0;	/* Count of open FMEs */
83 
84 /* list of fault management exercises underway */
85 static struct fme {
86 	struct fme *next;		/* next exercise */
87 	unsigned long long ull;		/* time when fme was created */
88 	int id;				/* FME id */
89 	struct config *config;		/* cooked configuration data */
90 	struct lut *eventtree;		/* propagation tree for this FME */
91 	/*
92 	 * The initial error report that created this FME is kept in
93 	 * two forms.  e0 points to the instance tree node and is used
94 	 * by fme_eval() as the starting point for the inference
95 	 * algorithm.  e0r is the event handle FMD passed to us when
96 	 * the ereport first arrived and is used when setting timers,
97 	 * which are always relative to the time of this initial
98 	 * report.
99 	 */
100 	struct event *e0;
101 	fmd_event_t *e0r;
102 
103 	id_t    timer;			/* for setting an fmd time-out */
104 
105 	struct event *ecurrent;		/* ereport under consideration */
106 	struct event *suspects;		/* current suspect list */
107 	struct event *psuspects;	/* previous suspect list */
108 	int nsuspects;			/* count of suspects */
109 	int posted_suspects;		/* true if we've posted a diagnosis */
110 	int uniqobs;			/* number of unique events observed */
111 	int peek;			/* just peeking, don't track suspects */
112 	int overflow;			/* true if overflow FME */
113 	enum fme_state {
114 		FME_NOTHING = 5000,	/* not evaluated yet */
115 		FME_WAIT,		/* need to wait for more info */
116 		FME_CREDIBLE,		/* suspect list is credible */
117 		FME_DISPROVED,		/* no valid suspects found */
118 		FME_DEFERRED		/* don't know yet (k-count not met) */
119 	} state;
120 
121 	unsigned long long pull;	/* time passed since created */
122 	unsigned long long wull;	/* wait until this time for re-eval */
123 	struct event *observations;	/* observation list */
124 	struct lut *globals;		/* values of global variables */
125 	/* fmd interfacing */
126 	fmd_hdl_t *hdl;			/* handle for talking with fmd */
127 	fmd_case_t *fmcase;		/* what fmd 'case' we associate with */
128 	/* stats */
129 	struct stats *Rcount;
130 	struct stats *Hcallcount;
131 	struct stats *Rcallcount;
132 	struct stats *Ccallcount;
133 	struct stats *Ecallcount;
134 	struct stats *Tcallcount;
135 	struct stats *Marrowcount;
136 	struct stats *diags;
137 } *FMElist, *EFMElist, *ClosedFMEs;
138 
139 static struct case_list {
140 	fmd_case_t *fmcase;
141 	struct case_list *next;
142 } *Undiagablecaselist;
143 
144 static void fme_eval(struct fme *fmep, fmd_event_t *ffep);
145 static enum fme_state hypothesise(struct fme *fmep, struct event *ep,
146 	unsigned long long at_latest_by, unsigned long long *pdelay);
147 static struct node *eventprop_lookup(struct event *ep, const char *propname);
148 static struct node *pathstring2epnamenp(char *path);
149 static void publish_undiagnosable(fmd_hdl_t *hdl, fmd_event_t *ffep,
150 	fmd_case_t *fmcase, nvlist_t *detector, char *arg);
151 static char *undiag_2reason_str(int ud, char *arg);
152 static const char *undiag_2defect_str(int ud);
153 static void restore_suspects(struct fme *fmep);
154 static void save_suspects(struct fme *fmep);
155 static void destroy_fme(struct fme *f);
156 static void fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
157     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl);
158 static void istat_counter_reset_cb(struct istat_entry *entp,
159     struct stats *statp, const struct ipath *ipp);
160 static void istat_counter_topo_chg_cb(struct istat_entry *entp,
161     struct stats *statp, void *unused);
162 static void serd_reset_cb(struct serd_entry *entp, void *unused,
163     const struct ipath *ipp);
164 static void serd_topo_chg_cb(struct serd_entry *entp, void *unused,
165     void *unused2);
166 static void destroy_fme_bufs(struct fme *fp);
167 
168 static struct fme *
alloc_fme(void)169 alloc_fme(void)
170 {
171 	struct fme *fmep;
172 
173 	fmep = MALLOC(sizeof (*fmep));
174 	bzero(fmep, sizeof (*fmep));
175 	return (fmep);
176 }
177 
178 /*
179  * fme_ready -- called when all initialization of the FME (except for
180  *	stats) has completed successfully.  Adds the fme to global lists
181  *	and establishes its stats.
182  */
183 static struct fme *
fme_ready(struct fme * fmep)184 fme_ready(struct fme *fmep)
185 {
186 	char nbuf[100];
187 
188 	Nfmep = NULL;	/* don't need to free this on module abort now */
189 
190 	if (EFMElist) {
191 		EFMElist->next = fmep;
192 		EFMElist = fmep;
193 	} else
194 		FMElist = EFMElist = fmep;
195 
196 	(void) sprintf(nbuf, "fme%d.Rcount", fmep->id);
197 	fmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
198 	(void) sprintf(nbuf, "fme%d.Hcall", fmep->id);
199 	fmep->Hcallcount = stats_new_counter(nbuf, "calls to hypothesise()", 1);
200 	(void) sprintf(nbuf, "fme%d.Rcall", fmep->id);
201 	fmep->Rcallcount = stats_new_counter(nbuf,
202 	    "calls to requirements_test()", 1);
203 	(void) sprintf(nbuf, "fme%d.Ccall", fmep->id);
204 	fmep->Ccallcount = stats_new_counter(nbuf, "calls to causes_test()", 1);
205 	(void) sprintf(nbuf, "fme%d.Ecall", fmep->id);
206 	fmep->Ecallcount =
207 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
208 	(void) sprintf(nbuf, "fme%d.Tcall", fmep->id);
209 	fmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
210 	(void) sprintf(nbuf, "fme%d.Marrow", fmep->id);
211 	fmep->Marrowcount = stats_new_counter(nbuf,
212 	    "arrows marked by mark_arrows()", 1);
213 	(void) sprintf(nbuf, "fme%d.diags", fmep->id);
214 	fmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
215 
216 	out(O_ALTFP|O_VERB2, "newfme: config snapshot contains...");
217 	config_print(O_ALTFP|O_VERB2, fmep->config);
218 
219 	return (fmep);
220 }
221 
222 extern void ipath_dummy_lut(struct arrow *);
223 extern struct lut *itree_create_dummy(const char *, const struct ipath *);
224 
225 /* ARGSUSED */
226 static void
set_needed_arrows(struct event * ep,struct event * ep2,struct fme * fmep)227 set_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
228 {
229 	struct bubble *bp;
230 	struct arrowlist *ap;
231 
232 	for (bp = itree_next_bubble(ep, NULL); bp;
233 	    bp = itree_next_bubble(ep, bp)) {
234 		if (bp->t != B_FROM)
235 			continue;
236 		for (ap = itree_next_arrow(bp, NULL); ap;
237 		    ap = itree_next_arrow(bp, ap)) {
238 			ap->arrowp->pnode->u.arrow.needed = 1;
239 			ipath_dummy_lut(ap->arrowp);
240 		}
241 	}
242 }
243 
244 /* ARGSUSED */
245 static void
unset_needed_arrows(struct event * ep,struct event * ep2,struct fme * fmep)246 unset_needed_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
247 {
248 	struct bubble *bp;
249 	struct arrowlist *ap;
250 
251 	for (bp = itree_next_bubble(ep, NULL); bp;
252 	    bp = itree_next_bubble(ep, bp)) {
253 		if (bp->t != B_FROM)
254 			continue;
255 		for (ap = itree_next_arrow(bp, NULL); ap;
256 		    ap = itree_next_arrow(bp, ap))
257 			ap->arrowp->pnode->u.arrow.needed = 0;
258 	}
259 }
260 
261 static void globals_destructor(void *left, void *right, void *arg);
262 static void clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep);
263 
264 static boolean_t
prune_propagations(const char * e0class,const struct ipath * e0ipp)265 prune_propagations(const char *e0class, const struct ipath *e0ipp)
266 {
267 	char nbuf[100];
268 	unsigned long long my_delay = TIMEVAL_EVENTUALLY;
269 	extern struct lut *Usednames;
270 
271 	Nfmep = alloc_fme();
272 	Nfmep->id = Nextid;
273 	Nfmep->state = FME_NOTHING;
274 	Nfmep->eventtree = itree_create_dummy(e0class, e0ipp);
275 	if ((Nfmep->e0 =
276 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
277 		itree_free(Nfmep->eventtree);
278 		FREE(Nfmep);
279 		Nfmep = NULL;
280 		return (B_FALSE);
281 	}
282 	Nfmep->ecurrent = Nfmep->observations = Nfmep->e0;
283 	Nfmep->e0->count++;
284 
285 	(void) sprintf(nbuf, "fme%d.Rcount", Nfmep->id);
286 	Nfmep->Rcount = stats_new_counter(nbuf, "ereports received", 0);
287 	(void) sprintf(nbuf, "fme%d.Hcall", Nfmep->id);
288 	Nfmep->Hcallcount =
289 	    stats_new_counter(nbuf, "calls to hypothesise()", 1);
290 	(void) sprintf(nbuf, "fme%d.Rcall", Nfmep->id);
291 	Nfmep->Rcallcount = stats_new_counter(nbuf,
292 	    "calls to requirements_test()", 1);
293 	(void) sprintf(nbuf, "fme%d.Ccall", Nfmep->id);
294 	Nfmep->Ccallcount =
295 	    stats_new_counter(nbuf, "calls to causes_test()", 1);
296 	(void) sprintf(nbuf, "fme%d.Ecall", Nfmep->id);
297 	Nfmep->Ecallcount =
298 	    stats_new_counter(nbuf, "calls to effects_test()", 1);
299 	(void) sprintf(nbuf, "fme%d.Tcall", Nfmep->id);
300 	Nfmep->Tcallcount = stats_new_counter(nbuf, "calls to triggered()", 1);
301 	(void) sprintf(nbuf, "fme%d.Marrow", Nfmep->id);
302 	Nfmep->Marrowcount = stats_new_counter(nbuf,
303 	    "arrows marked by mark_arrows()", 1);
304 	(void) sprintf(nbuf, "fme%d.diags", Nfmep->id);
305 	Nfmep->diags = stats_new_counter(nbuf, "suspect lists diagnosed", 0);
306 
307 	Nfmep->peek = 1;
308 	lut_walk(Nfmep->eventtree, (lut_cb)unset_needed_arrows, (void *)Nfmep);
309 	lut_free(Usednames, NULL, NULL);
310 	Usednames = NULL;
311 	lut_walk(Nfmep->eventtree, (lut_cb)clear_arrows, (void *)Nfmep);
312 	(void) hypothesise(Nfmep, Nfmep->e0, Nfmep->ull, &my_delay);
313 	itree_prune(Nfmep->eventtree);
314 	lut_walk(Nfmep->eventtree, (lut_cb)set_needed_arrows, (void *)Nfmep);
315 
316 	stats_delete(Nfmep->Rcount);
317 	stats_delete(Nfmep->Hcallcount);
318 	stats_delete(Nfmep->Rcallcount);
319 	stats_delete(Nfmep->Ccallcount);
320 	stats_delete(Nfmep->Ecallcount);
321 	stats_delete(Nfmep->Tcallcount);
322 	stats_delete(Nfmep->Marrowcount);
323 	stats_delete(Nfmep->diags);
324 	itree_free(Nfmep->eventtree);
325 	lut_free(Nfmep->globals, globals_destructor, NULL);
326 	FREE(Nfmep);
327 	return (B_TRUE);
328 }
329 
330 static struct fme *
newfme(const char * e0class,const struct ipath * e0ipp,fmd_hdl_t * hdl,fmd_case_t * fmcase,fmd_event_t * ffep,nvlist_t * nvl)331 newfme(const char *e0class, const struct ipath *e0ipp, fmd_hdl_t *hdl,
332     fmd_case_t *fmcase, fmd_event_t *ffep, nvlist_t *nvl)
333 {
334 	struct cfgdata *cfgdata;
335 	int init_size;
336 	extern int alloc_total();
337 	nvlist_t *detector = NULL;
338 	char *pathstr;
339 	char *arg;
340 
341 	/*
342 	 * First check if e0ipp is actually in the topology so we can give a
343 	 * more useful error message.
344 	 */
345 	ipathlastcomp(e0ipp);
346 	pathstr = ipath2str(NULL, e0ipp);
347 	cfgdata = config_snapshot();
348 	platform_unit_translate(0, cfgdata->cooked, TOPO_PROP_RESOURCE,
349 	    &detector, pathstr);
350 	FREE(pathstr);
351 	structconfig_free(cfgdata->cooked);
352 	config_free(cfgdata);
353 	if (detector == NULL) {
354 		/* See if class permits silent discard on unknown component. */
355 		if (lut_lookup(Ereportenames_discard, (void *)e0class, NULL)) {
356 			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
357 			    "to component path, but silent discard allowed.",
358 			    e0class);
359 			fmd_case_close(hdl, fmcase);
360 		} else {
361 			Undiag_reason = UD_VAL_BADEVENTPATH;
362 			(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
363 			    &detector);
364 			arg = ipath2str(e0class, e0ipp);
365 			publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
366 			FREE(arg);
367 		}
368 		return (NULL);
369 	}
370 
371 	/*
372 	 * Next run a quick first pass of the rules with a dummy config. This
373 	 * allows us to prune those rules which can't possibly cause this
374 	 * ereport.
375 	 */
376 	if (!prune_propagations(e0class, e0ipp)) {
377 		/*
378 		 * The fault class must have been in the rules or we would
379 		 * not have registered for it (and got a "nosub"), and the
380 		 * pathname must be in the topology or we would have failed the
381 		 * previous test. So to get here means the combination of
382 		 * class and pathname in the ereport must be invalid.
383 		 */
384 		Undiag_reason = UD_VAL_BADEVENTCLASS;
385 		arg = ipath2str(e0class, e0ipp);
386 		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
387 		nvlist_free(detector);
388 		FREE(arg);
389 		return (NULL);
390 	}
391 
392 	/*
393 	 * Now go ahead and create the real fme using the pruned rules.
394 	 */
395 	init_size = alloc_total();
396 	out(O_ALTFP|O_STAMP, "start config_snapshot using %d bytes", init_size);
397 	nvlist_free(detector);
398 	pathstr = ipath2str(NULL, e0ipp);
399 	cfgdata = config_snapshot();
400 	platform_unit_translate(0, cfgdata->cooked, TOPO_PROP_RESOURCE,
401 	    &detector, pathstr);
402 	FREE(pathstr);
403 	platform_save_config(hdl, fmcase);
404 	out(O_ALTFP|O_STAMP, "config_snapshot added %d bytes",
405 	    alloc_total() - init_size);
406 
407 	Nfmep = alloc_fme();
408 
409 	Nfmep->id = Nextid++;
410 	Nfmep->config = cfgdata->cooked;
411 	config_free(cfgdata);
412 	Nfmep->posted_suspects = 0;
413 	Nfmep->uniqobs = 0;
414 	Nfmep->state = FME_NOTHING;
415 	Nfmep->pull = 0ULL;
416 	Nfmep->overflow = 0;
417 
418 	Nfmep->fmcase = fmcase;
419 	Nfmep->hdl = hdl;
420 
421 	if ((Nfmep->eventtree = itree_create(Nfmep->config)) == NULL) {
422 		Undiag_reason = UD_VAL_INSTFAIL;
423 		arg = ipath2str(e0class, e0ipp);
424 		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
425 		nvlist_free(detector);
426 		FREE(arg);
427 		structconfig_free(Nfmep->config);
428 		destroy_fme_bufs(Nfmep);
429 		FREE(Nfmep);
430 		Nfmep = NULL;
431 		return (NULL);
432 	}
433 
434 	itree_ptree(O_ALTFP|O_VERB2, Nfmep->eventtree);
435 
436 	if ((Nfmep->e0 =
437 	    itree_lookup(Nfmep->eventtree, e0class, e0ipp)) == NULL) {
438 		Undiag_reason = UD_VAL_BADEVENTI;
439 		arg = ipath2str(e0class, e0ipp);
440 		publish_undiagnosable(hdl, ffep, fmcase, detector, arg);
441 		nvlist_free(detector);
442 		FREE(arg);
443 		itree_free(Nfmep->eventtree);
444 		structconfig_free(Nfmep->config);
445 		destroy_fme_bufs(Nfmep);
446 		FREE(Nfmep);
447 		Nfmep = NULL;
448 		return (NULL);
449 	}
450 
451 	nvlist_free(detector);
452 	return (fme_ready(Nfmep));
453 }
454 
455 void
fme_fini(void)456 fme_fini(void)
457 {
458 	struct fme *sfp, *fp;
459 	struct case_list *ucasep, *nextcasep;
460 
461 	ucasep = Undiagablecaselist;
462 	while (ucasep != NULL) {
463 		nextcasep = ucasep->next;
464 		FREE(ucasep);
465 		ucasep = nextcasep;
466 	}
467 	Undiagablecaselist = NULL;
468 
469 	/* clean up closed fmes */
470 	fp = ClosedFMEs;
471 	while (fp != NULL) {
472 		sfp = fp->next;
473 		destroy_fme(fp);
474 		fp = sfp;
475 	}
476 	ClosedFMEs = NULL;
477 
478 	fp = FMElist;
479 	while (fp != NULL) {
480 		sfp = fp->next;
481 		destroy_fme(fp);
482 		fp = sfp;
483 	}
484 	FMElist = EFMElist = NULL;
485 
486 	/* if we were in the middle of creating an fme, free it now */
487 	if (Nfmep) {
488 		destroy_fme(Nfmep);
489 		Nfmep = NULL;
490 	}
491 }
492 
493 /*
494  * Allocated space for a buffer name.  20 bytes allows for
495  * a ridiculous 9,999,999 unique observations.
496  */
497 #define	OBBUFNMSZ 20
498 
499 /*
500  *  serialize_observation
501  *
502  *  Create a recoverable version of the current observation
503  *  (f->ecurrent).  We keep a serialized version of each unique
504  *  observation in order that we may resume correctly the fme in the
505  *  correct state if eft or fmd crashes and we're restarted.
506  */
507 static void
serialize_observation(struct fme * fp,const char * cls,const struct ipath * ipp)508 serialize_observation(struct fme *fp, const char *cls, const struct ipath *ipp)
509 {
510 	size_t pkdlen;
511 	char tmpbuf[OBBUFNMSZ];
512 	char *pkd = NULL;
513 	char *estr;
514 
515 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", fp->uniqobs);
516 	estr = ipath2str(cls, ipp);
517 	fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, strlen(estr) + 1);
518 	fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)estr,
519 	    strlen(estr) + 1);
520 	FREE(estr);
521 
522 	if (fp->ecurrent != NULL && fp->ecurrent->nvp != NULL) {
523 		(void) snprintf(tmpbuf,
524 		    OBBUFNMSZ, "observed%d.nvp", fp->uniqobs);
525 		if (nvlist_xpack(fp->ecurrent->nvp,
526 		    &pkd, &pkdlen, NV_ENCODE_XDR, &Eft_nv_hdl) != 0)
527 			out(O_DIE|O_SYS, "pack of observed nvl failed");
528 		fmd_buf_create(fp->hdl, fp->fmcase, tmpbuf, pkdlen);
529 		fmd_buf_write(fp->hdl, fp->fmcase, tmpbuf, (void *)pkd, pkdlen);
530 		FREE(pkd);
531 	}
532 
533 	fp->uniqobs++;
534 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
535 	    sizeof (fp->uniqobs));
536 }
537 
538 /*
539  *  init_fme_bufs -- We keep several bits of state about an fme for
540  *	use if eft or fmd crashes and we're restarted.
541  */
542 static void
init_fme_bufs(struct fme * fp)543 init_fme_bufs(struct fme *fp)
544 {
545 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_PULL, sizeof (fp->pull));
546 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_PULL, (void *)&fp->pull,
547 	    sizeof (fp->pull));
548 
549 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_ID, sizeof (fp->id));
550 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_ID, (void *)&fp->id,
551 	    sizeof (fp->id));
552 
553 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_NOBS, sizeof (fp->uniqobs));
554 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_NOBS, (void *)&fp->uniqobs,
555 	    sizeof (fp->uniqobs));
556 
557 	fmd_buf_create(fp->hdl, fp->fmcase, WOBUF_POSTD,
558 	    sizeof (fp->posted_suspects));
559 	fmd_buf_write(fp->hdl, fp->fmcase, WOBUF_POSTD,
560 	    (void *)&fp->posted_suspects, sizeof (fp->posted_suspects));
561 }
562 
563 static void
destroy_fme_bufs(struct fme * fp)564 destroy_fme_bufs(struct fme *fp)
565 {
566 	char tmpbuf[OBBUFNMSZ];
567 	int o;
568 
569 	platform_restore_config(fp->hdl, fp->fmcase);
570 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFGLEN);
571 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_CFG);
572 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_PULL);
573 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_ID);
574 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_POSTD);
575 	fmd_buf_destroy(fp->hdl, fp->fmcase, WOBUF_NOBS);
576 
577 	for (o = 0; o < fp->uniqobs; o++) {
578 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", o);
579 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
580 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", o);
581 		fmd_buf_destroy(fp->hdl, fp->fmcase, tmpbuf);
582 	}
583 }
584 
585 /*
586  * reconstitute_observations -- convert a case's serialized observations
587  *	back into struct events.  Returns zero if all observations are
588  *	successfully reconstituted.
589  */
590 static int
reconstitute_observations(struct fme * fmep)591 reconstitute_observations(struct fme *fmep)
592 {
593 	struct event *ep;
594 	struct node *epnamenp = NULL;
595 	size_t pkdlen;
596 	char *pkd = NULL;
597 	char *tmpbuf = alloca(OBBUFNMSZ);
598 	char *sepptr;
599 	char *estr;
600 	int ocnt;
601 	int elen;
602 
603 	for (ocnt = 0; ocnt < fmep->uniqobs; ocnt++) {
604 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d", ocnt);
605 		elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
606 		if (elen == 0) {
607 			out(O_ALTFP,
608 			    "reconstitute_observation: no %s buffer found.",
609 			    tmpbuf);
610 			Undiag_reason = UD_VAL_MISSINGOBS;
611 			break;
612 		}
613 
614 		estr = MALLOC(elen);
615 		fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
616 		sepptr = strchr(estr, '@');
617 		if (sepptr == NULL) {
618 			out(O_ALTFP,
619 			    "reconstitute_observation: %s: "
620 			    "missing @ separator in %s.",
621 			    tmpbuf, estr);
622 			Undiag_reason = UD_VAL_MISSINGPATH;
623 			FREE(estr);
624 			break;
625 		}
626 
627 		*sepptr = '\0';
628 		if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
629 			out(O_ALTFP,
630 			    "reconstitute_observation: %s: "
631 			    "trouble converting path string \"%s\" "
632 			    "to internal representation.",
633 			    tmpbuf, sepptr + 1);
634 			Undiag_reason = UD_VAL_MISSINGPATH;
635 			FREE(estr);
636 			break;
637 		}
638 
639 		/* construct the event */
640 		ep = itree_lookup(fmep->eventtree,
641 		    stable(estr), ipath(epnamenp));
642 		if (ep == NULL) {
643 			out(O_ALTFP,
644 			    "reconstitute_observation: %s: "
645 			    "lookup of  \"%s\" in itree failed.",
646 			    tmpbuf, ipath2str(estr, ipath(epnamenp)));
647 			Undiag_reason = UD_VAL_BADOBS;
648 			tree_free(epnamenp);
649 			FREE(estr);
650 			break;
651 		}
652 		tree_free(epnamenp);
653 
654 		/*
655 		 * We may or may not have a saved nvlist for the observation
656 		 */
657 		(void) snprintf(tmpbuf, OBBUFNMSZ, "observed%d.nvp", ocnt);
658 		pkdlen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
659 		if (pkdlen != 0) {
660 			pkd = MALLOC(pkdlen);
661 			fmd_buf_read(fmep->hdl,
662 			    fmep->fmcase, tmpbuf, pkd, pkdlen);
663 			ASSERT(ep->nvp == NULL);
664 			if (nvlist_xunpack(pkd,
665 			    pkdlen, &ep->nvp, &Eft_nv_hdl) != 0)
666 				out(O_DIE|O_SYS, "pack of observed nvl failed");
667 			FREE(pkd);
668 		}
669 
670 		if (ocnt == 0)
671 			fmep->e0 = ep;
672 
673 		FREE(estr);
674 		fmep->ecurrent = ep;
675 		ep->count++;
676 
677 		/* link it into list of observations seen */
678 		ep->observations = fmep->observations;
679 		fmep->observations = ep;
680 	}
681 
682 	if (ocnt == fmep->uniqobs) {
683 		(void) fme_ready(fmep);
684 		return (0);
685 	}
686 
687 	return (1);
688 }
689 
690 /*
691  * restart_fme -- called during eft initialization.  Reconstitutes
692  *	an in-progress fme.
693  */
694 void
fme_restart(fmd_hdl_t * hdl,fmd_case_t * inprogress)695 fme_restart(fmd_hdl_t *hdl, fmd_case_t *inprogress)
696 {
697 	nvlist_t *defect;
698 	struct case_list *bad;
699 	struct fme *fmep;
700 	struct cfgdata *cfgdata;
701 	size_t rawsz;
702 	struct event *ep;
703 	char *tmpbuf = alloca(OBBUFNMSZ);
704 	char *sepptr;
705 	char *estr;
706 	int elen;
707 	struct node *epnamenp = NULL;
708 	int init_size;
709 	extern int alloc_total();
710 	char *reason;
711 
712 	/*
713 	 * ignore solved or closed cases
714 	 */
715 	if (fmd_case_solved(hdl, inprogress) ||
716 	    fmd_case_closed(hdl, inprogress))
717 		return;
718 
719 	fmep = alloc_fme();
720 	fmep->fmcase = inprogress;
721 	fmep->hdl = hdl;
722 
723 	if (fmd_buf_size(hdl, inprogress, WOBUF_POSTD) == 0) {
724 		out(O_ALTFP, "restart_fme: no saved posted status");
725 		Undiag_reason = UD_VAL_MISSINGINFO;
726 		goto badcase;
727 	} else {
728 		fmd_buf_read(hdl, inprogress, WOBUF_POSTD,
729 		    (void *)&fmep->posted_suspects,
730 		    sizeof (fmep->posted_suspects));
731 	}
732 
733 	if (fmd_buf_size(hdl, inprogress, WOBUF_ID) == 0) {
734 		out(O_ALTFP, "restart_fme: no saved id");
735 		Undiag_reason = UD_VAL_MISSINGINFO;
736 		goto badcase;
737 	} else {
738 		fmd_buf_read(hdl, inprogress, WOBUF_ID, (void *)&fmep->id,
739 		    sizeof (fmep->id));
740 	}
741 	if (Nextid <= fmep->id)
742 		Nextid = fmep->id + 1;
743 
744 	out(O_ALTFP, "Replay FME %d", fmep->id);
745 
746 	if (fmd_buf_size(hdl, inprogress, WOBUF_CFGLEN) != sizeof (size_t)) {
747 		out(O_ALTFP, "restart_fme: No config data");
748 		Undiag_reason = UD_VAL_MISSINGINFO;
749 		goto badcase;
750 	}
751 	fmd_buf_read(hdl, inprogress, WOBUF_CFGLEN, (void *)&rawsz,
752 	    sizeof (size_t));
753 
754 	if ((fmep->e0r = fmd_case_getprincipal(hdl, inprogress)) == NULL) {
755 		out(O_ALTFP, "restart_fme: No event zero");
756 		Undiag_reason = UD_VAL_MISSINGZERO;
757 		goto badcase;
758 	}
759 
760 	if (fmd_buf_size(hdl, inprogress, WOBUF_PULL) == 0) {
761 		out(O_ALTFP, "restart_fme: no saved wait time");
762 		Undiag_reason = UD_VAL_MISSINGINFO;
763 		goto badcase;
764 	} else {
765 		fmd_buf_read(hdl, inprogress, WOBUF_PULL, (void *)&fmep->pull,
766 		    sizeof (fmep->pull));
767 	}
768 
769 	if (fmd_buf_size(hdl, inprogress, WOBUF_NOBS) == 0) {
770 		out(O_ALTFP, "restart_fme: no count of observations");
771 		Undiag_reason = UD_VAL_MISSINGINFO;
772 		goto badcase;
773 	} else {
774 		fmd_buf_read(hdl, inprogress, WOBUF_NOBS,
775 		    (void *)&fmep->uniqobs, sizeof (fmep->uniqobs));
776 	}
777 
778 	(void) snprintf(tmpbuf, OBBUFNMSZ, "observed0");
779 	elen = fmd_buf_size(fmep->hdl, fmep->fmcase, tmpbuf);
780 	if (elen == 0) {
781 		out(O_ALTFP, "reconstitute_observation: no %s buffer found.",
782 		    tmpbuf);
783 		Undiag_reason = UD_VAL_MISSINGOBS;
784 		goto badcase;
785 	}
786 	estr = MALLOC(elen);
787 	fmd_buf_read(fmep->hdl, fmep->fmcase, tmpbuf, estr, elen);
788 	sepptr = strchr(estr, '@');
789 	if (sepptr == NULL) {
790 		out(O_ALTFP, "reconstitute_observation: %s: "
791 		    "missing @ separator in %s.",
792 		    tmpbuf, estr);
793 		Undiag_reason = UD_VAL_MISSINGPATH;
794 		FREE(estr);
795 		goto badcase;
796 	}
797 	*sepptr = '\0';
798 	if ((epnamenp = pathstring2epnamenp(sepptr + 1)) == NULL) {
799 		out(O_ALTFP, "reconstitute_observation: %s: "
800 		    "trouble converting path string \"%s\" "
801 		    "to internal representation.", tmpbuf, sepptr + 1);
802 		Undiag_reason = UD_VAL_MISSINGPATH;
803 		FREE(estr);
804 		goto badcase;
805 	}
806 	(void) prune_propagations(stable(estr), ipath(epnamenp));
807 	tree_free(epnamenp);
808 	FREE(estr);
809 
810 	init_size = alloc_total();
811 	out(O_ALTFP|O_STAMP, "start config_restore using %d bytes", init_size);
812 	cfgdata = MALLOC(sizeof (struct cfgdata));
813 	cfgdata->cooked = NULL;
814 	cfgdata->devcache = NULL;
815 	cfgdata->devidcache = NULL;
816 	cfgdata->tpcache = NULL;
817 	cfgdata->cpucache = NULL;
818 	cfgdata->raw_refcnt = 1;
819 
820 	if (rawsz > 0) {
821 		if (fmd_buf_size(hdl, inprogress, WOBUF_CFG) != rawsz) {
822 			out(O_ALTFP, "restart_fme: Config data size mismatch");
823 			Undiag_reason = UD_VAL_CFGMISMATCH;
824 			goto badcase;
825 		}
826 		cfgdata->begin = MALLOC(rawsz);
827 		cfgdata->end = cfgdata->nextfree = cfgdata->begin + rawsz;
828 		fmd_buf_read(hdl,
829 		    inprogress, WOBUF_CFG, cfgdata->begin, rawsz);
830 	} else {
831 		cfgdata->begin = cfgdata->end = cfgdata->nextfree = NULL;
832 	}
833 
834 	config_cook(cfgdata);
835 	fmep->config = cfgdata->cooked;
836 	config_free(cfgdata);
837 	out(O_ALTFP|O_STAMP, "config_restore added %d bytes",
838 	    alloc_total() - init_size);
839 
840 	if ((fmep->eventtree = itree_create(fmep->config)) == NULL) {
841 		/* case not properly saved or irretrievable */
842 		out(O_ALTFP, "restart_fme: NULL instance tree");
843 		Undiag_reason = UD_VAL_INSTFAIL;
844 		goto badcase;
845 	}
846 
847 	itree_ptree(O_ALTFP|O_VERB2, fmep->eventtree);
848 
849 	if (reconstitute_observations(fmep) != 0)
850 		goto badcase;
851 
852 	out(O_ALTFP|O_NONL, "FME %d replay observations: ", fmep->id);
853 	for (ep = fmep->observations; ep; ep = ep->observations) {
854 		out(O_ALTFP|O_NONL, " ");
855 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
856 	}
857 	out(O_ALTFP, NULL);
858 
859 	Open_fme_count++;
860 
861 	/* give the diagnosis algorithm a shot at the new FME state */
862 	fme_eval(fmep, fmep->e0r);
863 	return;
864 
865 badcase:
866 	if (fmep->eventtree != NULL)
867 		itree_free(fmep->eventtree);
868 	if (fmep->config)
869 		structconfig_free(fmep->config);
870 	destroy_fme_bufs(fmep);
871 	FREE(fmep);
872 
873 	/*
874 	 * Since we're unable to restart the case, add it to the undiagable
875 	 * list and solve and close it as appropriate.
876 	 */
877 	bad = MALLOC(sizeof (struct case_list));
878 	bad->next = NULL;
879 
880 	if (Undiagablecaselist != NULL)
881 		bad->next = Undiagablecaselist;
882 	Undiagablecaselist = bad;
883 	bad->fmcase = inprogress;
884 
885 	out(O_ALTFP|O_NONL, "[case %s (unable to restart), ",
886 	    fmd_case_uuid(hdl, bad->fmcase));
887 
888 	if (fmd_case_solved(hdl, bad->fmcase)) {
889 		out(O_ALTFP|O_NONL, "already solved, ");
890 	} else {
891 		out(O_ALTFP|O_NONL, "solving, ");
892 		defect = fmd_nvl_create_fault(hdl,
893 		    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
894 		reason = undiag_2reason_str(Undiag_reason, NULL);
895 		(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
896 		FREE(reason);
897 		fmd_case_add_suspect(hdl, bad->fmcase, defect);
898 		fmd_case_solve(hdl, bad->fmcase);
899 		Undiag_reason = UD_VAL_UNKNOWN;
900 	}
901 
902 	if (fmd_case_closed(hdl, bad->fmcase)) {
903 		out(O_ALTFP, "already closed ]");
904 	} else {
905 		out(O_ALTFP, "closing ]");
906 		fmd_case_close(hdl, bad->fmcase);
907 	}
908 }
909 
910 /*ARGSUSED*/
911 static void
globals_destructor(void * left,void * right,void * arg)912 globals_destructor(void *left, void *right, void *arg)
913 {
914 	struct evalue *evp = (struct evalue *)right;
915 	if (evp->t == NODEPTR)
916 		tree_free((struct node *)(uintptr_t)evp->v);
917 	evp->v = (uintptr_t)NULL;
918 	FREE(evp);
919 }
920 
921 void
destroy_fme(struct fme * f)922 destroy_fme(struct fme *f)
923 {
924 	stats_delete(f->Rcount);
925 	stats_delete(f->Hcallcount);
926 	stats_delete(f->Rcallcount);
927 	stats_delete(f->Ccallcount);
928 	stats_delete(f->Ecallcount);
929 	stats_delete(f->Tcallcount);
930 	stats_delete(f->Marrowcount);
931 	stats_delete(f->diags);
932 
933 	if (f->eventtree != NULL)
934 		itree_free(f->eventtree);
935 	if (f->config)
936 		structconfig_free(f->config);
937 	lut_free(f->globals, globals_destructor, NULL);
938 	FREE(f);
939 }
940 
941 static const char *
fme_state2str(enum fme_state s)942 fme_state2str(enum fme_state s)
943 {
944 	switch (s) {
945 	case FME_NOTHING:	return ("NOTHING");
946 	case FME_WAIT:		return ("WAIT");
947 	case FME_CREDIBLE:	return ("CREDIBLE");
948 	case FME_DISPROVED:	return ("DISPROVED");
949 	case FME_DEFERRED:	return ("DEFERRED");
950 	default:		return ("UNKNOWN");
951 	}
952 }
953 
954 static int
is_problem(enum nametype t)955 is_problem(enum nametype t)
956 {
957 	return (t == N_FAULT || t == N_DEFECT || t == N_UPSET);
958 }
959 
960 static int
is_defect(enum nametype t)961 is_defect(enum nametype t)
962 {
963 	return (t == N_DEFECT);
964 }
965 
966 static int
is_upset(enum nametype t)967 is_upset(enum nametype t)
968 {
969 	return (t == N_UPSET);
970 }
971 
972 static void
fme_print(int flags,struct fme * fmep)973 fme_print(int flags, struct fme *fmep)
974 {
975 	struct event *ep;
976 
977 	out(flags, "Fault Management Exercise %d", fmep->id);
978 	out(flags, "\t       State: %s", fme_state2str(fmep->state));
979 	out(flags|O_NONL, "\t  Start time: ");
980 	ptree_timeval(flags|O_NONL, &fmep->ull);
981 	out(flags, NULL);
982 	if (fmep->wull) {
983 		out(flags|O_NONL, "\t   Wait time: ");
984 		ptree_timeval(flags|O_NONL, &fmep->wull);
985 		out(flags, NULL);
986 	}
987 	out(flags|O_NONL, "\t          E0: ");
988 	if (fmep->e0)
989 		itree_pevent_brief(flags|O_NONL, fmep->e0);
990 	else
991 		out(flags|O_NONL, "NULL");
992 	out(flags, NULL);
993 	out(flags|O_NONL, "\tObservations:");
994 	for (ep = fmep->observations; ep; ep = ep->observations) {
995 		out(flags|O_NONL, " ");
996 		itree_pevent_brief(flags|O_NONL, ep);
997 	}
998 	out(flags, NULL);
999 	out(flags|O_NONL, "\tSuspect list:");
1000 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1001 		out(flags|O_NONL, " ");
1002 		itree_pevent_brief(flags|O_NONL, ep);
1003 	}
1004 	out(flags, NULL);
1005 	if (fmep->eventtree != NULL) {
1006 		out(flags|O_VERB2, "\t        Tree:");
1007 		itree_ptree(flags|O_VERB2, fmep->eventtree);
1008 	}
1009 }
1010 
1011 static struct node *
pathstring2epnamenp(char * path)1012 pathstring2epnamenp(char *path)
1013 {
1014 	char *sep = "/";
1015 	struct node *ret;
1016 	char *ptr;
1017 
1018 	if ((ptr = strtok(path, sep)) == NULL)
1019 		out(O_DIE, "pathstring2epnamenp: invalid empty class");
1020 
1021 	ret = tree_iname(stable(ptr), NULL, 0);
1022 
1023 	while ((ptr = strtok(NULL, sep)) != NULL)
1024 		ret = tree_name_append(ret,
1025 		    tree_iname(stable(ptr), NULL, 0));
1026 
1027 	return (ret);
1028 }
1029 
1030 /*
1031  * for a given upset sp, increment the corresponding SERD engine.  if the
1032  * SERD engine trips, return the ename and ipp of the resulting ereport.
1033  * returns true if engine tripped and *enamep and *ippp were filled in.
1034  */
1035 static int
serd_eval(struct fme * fmep,fmd_hdl_t * hdl,fmd_event_t * ffep,fmd_case_t * fmcase,struct event * sp,const char ** enamep,const struct ipath ** ippp)1036 serd_eval(struct fme *fmep, fmd_hdl_t *hdl, fmd_event_t *ffep,
1037     fmd_case_t *fmcase, struct event *sp, const char **enamep,
1038     const struct ipath **ippp)
1039 {
1040 	struct node *serdinst;
1041 	char *serdname;
1042 	char *serdresource;
1043 	char *serdclass;
1044 	struct node *nid;
1045 	struct serd_entry *newentp;
1046 	int i, serdn = -1, serdincrement = 1, len = 0;
1047 	char *serdsuffix = NULL, *serdt = NULL;
1048 	struct evalue *ep;
1049 
1050 	ASSERT(sp->t == N_UPSET);
1051 	ASSERT(ffep != NULL);
1052 
1053 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
1054 	    (void *)"n", (lut_cmp)strcmp)) != NULL) {
1055 		ASSERT(ep->t == UINT64);
1056 		serdn = (int)ep->v;
1057 	}
1058 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
1059 	    (void *)"t", (lut_cmp)strcmp)) != NULL) {
1060 		ASSERT(ep->t == STRING);
1061 		serdt = (char *)(uintptr_t)ep->v;
1062 	}
1063 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
1064 	    (void *)"suffix", (lut_cmp)strcmp)) != NULL) {
1065 		ASSERT(ep->t == STRING);
1066 		serdsuffix = (char *)(uintptr_t)ep->v;
1067 	}
1068 	if ((ep = (struct evalue *)lut_lookup(sp->serdprops,
1069 	    (void *)"increment", (lut_cmp)strcmp)) != NULL) {
1070 		ASSERT(ep->t == UINT64);
1071 		serdincrement = (int)ep->v;
1072 	}
1073 
1074 	/*
1075 	 * obtain instanced SERD engine from the upset sp.  from this
1076 	 * derive serdname, the string used to identify the SERD engine.
1077 	 */
1078 	serdinst = eventprop_lookup(sp, L_engine);
1079 
1080 	if (serdinst == NULL)
1081 		return (-1);
1082 
1083 	len = strlen(serdinst->u.stmt.np->u.event.ename->u.name.s) + 1;
1084 	if (serdsuffix != NULL)
1085 		len += strlen(serdsuffix);
1086 	serdclass = MALLOC(len);
1087 	if (serdsuffix != NULL)
1088 		(void) snprintf(serdclass, len, "%s%s",
1089 		    serdinst->u.stmt.np->u.event.ename->u.name.s, serdsuffix);
1090 	else
1091 		(void) snprintf(serdclass, len, "%s",
1092 		    serdinst->u.stmt.np->u.event.ename->u.name.s);
1093 	serdresource = ipath2str(NULL,
1094 	    ipath(serdinst->u.stmt.np->u.event.epname));
1095 	len += strlen(serdresource) + 1;
1096 	serdname = MALLOC(len);
1097 	(void) snprintf(serdname, len, "%s@%s", serdclass, serdresource);
1098 	FREE(serdresource);
1099 
1100 	/* handle serd engine "id" property, if there is one */
1101 	if ((nid =
1102 	    lut_lookup(serdinst->u.stmt.lutp, (void *)L_id, NULL)) != NULL) {
1103 		struct evalue *gval;
1104 		char suffixbuf[200];
1105 		char *suffix;
1106 		char *nserdname;
1107 		size_t nname;
1108 
1109 		out(O_ALTFP|O_NONL, "serd \"%s\" id: ", serdname);
1110 		ptree_name_iter(O_ALTFP|O_NONL, nid);
1111 
1112 		ASSERTinfo(nid->t == T_GLOBID, ptree_nodetype2str(nid->t));
1113 
1114 		if ((gval = lut_lookup(fmep->globals,
1115 		    (void *)nid->u.globid.s, NULL)) == NULL) {
1116 			out(O_ALTFP, " undefined");
1117 		} else if (gval->t == UINT64) {
1118 			out(O_ALTFP, " %llu", gval->v);
1119 			(void) sprintf(suffixbuf, "%llu", gval->v);
1120 			suffix = suffixbuf;
1121 		} else {
1122 			out(O_ALTFP, " \"%s\"", (char *)(uintptr_t)gval->v);
1123 			suffix = (char *)(uintptr_t)gval->v;
1124 		}
1125 
1126 		nname = strlen(serdname) + strlen(suffix) + 2;
1127 		nserdname = MALLOC(nname);
1128 		(void) snprintf(nserdname, nname, "%s:%s", serdname, suffix);
1129 		FREE(serdname);
1130 		serdname = nserdname;
1131 	}
1132 
1133 	/*
1134 	 * if the engine is empty, and we have an override for n/t then
1135 	 * destroy and recreate it.
1136 	 */
1137 	if ((serdn != -1 || serdt != NULL) && fmd_serd_exists(hdl, serdname) &&
1138 	    fmd_serd_empty(hdl, serdname))
1139 		fmd_serd_destroy(hdl, serdname);
1140 
1141 	if (!fmd_serd_exists(hdl, serdname)) {
1142 		struct node *nN, *nT;
1143 		const char *s;
1144 		struct node *nodep;
1145 		struct config *cp;
1146 		char *path;
1147 		uint_t nval;
1148 		hrtime_t tval;
1149 		int i;
1150 		char *ptr;
1151 		int got_n_override = 0, got_t_override = 0;
1152 
1153 		/* no SERD engine yet, so create it */
1154 		nodep = serdinst->u.stmt.np->u.event.epname;
1155 		path = ipath2str(NULL, ipath(nodep));
1156 		cp = config_lookup(fmep->config, path, 0);
1157 		FREE((void *)path);
1158 
1159 		/*
1160 		 * We allow serd paramaters to be overridden, either from
1161 		 * eft.conf file values (if Serd_Override is set) or from
1162 		 * driver properties (for "serd.io.device" engines).
1163 		 */
1164 		if (Serd_Override != NULL) {
1165 			char *save_ptr, *ptr1, *ptr2, *ptr3;
1166 			ptr3 = save_ptr = STRDUP(Serd_Override);
1167 			while (*ptr3 != '\0') {
1168 				ptr1 = strchr(ptr3, ',');
1169 				*ptr1 = '\0';
1170 				if (strcmp(ptr3, serdclass) == 0) {
1171 					ptr2 =  strchr(ptr1 + 1, ',');
1172 					*ptr2 = '\0';
1173 					nval = atoi(ptr1 + 1);
1174 					out(O_ALTFP, "serd override %s_n %d",
1175 					    serdclass, nval);
1176 					ptr3 =  strchr(ptr2 + 1, ' ');
1177 					if (ptr3)
1178 						*ptr3 = '\0';
1179 					ptr = STRDUP(ptr2 + 1);
1180 					out(O_ALTFP, "serd override %s_t %s",
1181 					    serdclass, ptr);
1182 					got_n_override = 1;
1183 					got_t_override = 1;
1184 					break;
1185 				} else {
1186 					ptr2 =  strchr(ptr1 + 1, ',');
1187 					ptr3 =  strchr(ptr2 + 1, ' ');
1188 					if (ptr3 == NULL)
1189 						break;
1190 				}
1191 				ptr3++;
1192 			}
1193 			FREE(save_ptr);
1194 		}
1195 
1196 		if (cp && got_n_override == 0) {
1197 			/*
1198 			 * convert serd engine class into property name
1199 			 */
1200 			char *prop_name = MALLOC(strlen(serdclass) + 3);
1201 			for (i = 0; i < strlen(serdclass); i++) {
1202 				if (serdclass[i] == '.')
1203 					prop_name[i] = '_';
1204 				else
1205 					prop_name[i] = serdclass[i];
1206 			}
1207 			prop_name[i++] = '_';
1208 			prop_name[i++] = 'n';
1209 			prop_name[i] = '\0';
1210 			if (s = config_getprop(cp, prop_name)) {
1211 				nval = atoi(s);
1212 				out(O_ALTFP, "serd override %s_n %s",
1213 				    serdclass, s);
1214 				got_n_override = 1;
1215 			}
1216 			prop_name[i - 1] = 't';
1217 			if (s = config_getprop(cp, prop_name)) {
1218 				ptr = STRDUP(s);
1219 				out(O_ALTFP, "serd override %s_t %s",
1220 				    serdclass, s);
1221 				got_t_override = 1;
1222 			}
1223 			FREE(prop_name);
1224 		}
1225 
1226 		if (serdn != -1 && got_n_override == 0) {
1227 			nval = serdn;
1228 			out(O_ALTFP, "serd override %s_n %d", serdclass, serdn);
1229 			got_n_override = 1;
1230 		}
1231 		if (serdt != NULL && got_t_override == 0) {
1232 			ptr = STRDUP(serdt);
1233 			out(O_ALTFP, "serd override %s_t %s", serdclass, serdt);
1234 			got_t_override = 1;
1235 		}
1236 
1237 		if (!got_n_override) {
1238 			nN = lut_lookup(serdinst->u.stmt.lutp, (void *)L_N,
1239 			    NULL);
1240 			ASSERT(nN->t == T_NUM);
1241 			nval = (uint_t)nN->u.ull;
1242 		}
1243 		if (!got_t_override) {
1244 			nT = lut_lookup(serdinst->u.stmt.lutp, (void *)L_T,
1245 			    NULL);
1246 			ASSERT(nT->t == T_TIMEVAL);
1247 			tval = (hrtime_t)nT->u.ull;
1248 		} else {
1249 			const unsigned long long *ullp;
1250 			const char *suffix;
1251 			int len;
1252 
1253 			len = strspn(ptr, "0123456789");
1254 			suffix = stable(&ptr[len]);
1255 			ullp = (unsigned long long *)lut_lookup(Timesuffixlut,
1256 			    (void *)suffix, NULL);
1257 			ptr[len] = '\0';
1258 			tval = strtoull(ptr, NULL, 0) * (ullp ? *ullp : 1ll);
1259 			FREE(ptr);
1260 		}
1261 		fmd_serd_create(hdl, serdname, nval, tval);
1262 	}
1263 
1264 	newentp = MALLOC(sizeof (*newentp));
1265 	newentp->ename = stable(serdclass);
1266 	FREE(serdclass);
1267 	newentp->ipath = ipath(serdinst->u.stmt.np->u.event.epname);
1268 	newentp->hdl = hdl;
1269 	if (lut_lookup(SerdEngines, newentp, (lut_cmp)serd_cmp) == NULL) {
1270 		SerdEngines = lut_add(SerdEngines, (void *)newentp,
1271 		    (void *)newentp, (lut_cmp)serd_cmp);
1272 		Serd_need_save = 1;
1273 		serd_save();
1274 	} else {
1275 		FREE(newentp);
1276 	}
1277 
1278 
1279 	/*
1280 	 * increment SERD engine.  if engine fires, reset serd
1281 	 * engine and return trip_strcode if required.
1282 	 */
1283 	for (i = 0; i < serdincrement; i++) {
1284 		if (fmd_serd_record(hdl, serdname, ffep)) {
1285 			fmd_case_add_serd(hdl, fmcase, serdname);
1286 			fmd_serd_reset(hdl, serdname);
1287 
1288 			if (ippp) {
1289 				struct node *tripinst =
1290 				    lut_lookup(serdinst->u.stmt.lutp,
1291 				    (void *)L_trip, NULL);
1292 				ASSERT(tripinst != NULL);
1293 				*enamep = tripinst->u.event.ename->u.name.s;
1294 				*ippp = ipath(tripinst->u.event.epname);
1295 				out(O_ALTFP|O_NONL,
1296 				    "[engine fired: %s, sending: ", serdname);
1297 				ipath_print(O_ALTFP|O_NONL, *enamep, *ippp);
1298 				out(O_ALTFP, "]");
1299 			} else {
1300 				out(O_ALTFP, "[engine fired: %s, no trip]",
1301 				    serdname);
1302 			}
1303 			FREE(serdname);
1304 			return (1);
1305 		}
1306 	}
1307 
1308 	FREE(serdname);
1309 	return (0);
1310 }
1311 
1312 /*
1313  * search a suspect list for upsets.  feed each upset to serd_eval() and
1314  * build up tripped[], an array of ereports produced by the firing of
1315  * any SERD engines.  then feed each ereport back into
1316  * fme_receive_report().
1317  *
1318  * returns ntrip, the number of these ereports produced.
1319  */
1320 static int
upsets_eval(struct fme * fmep,fmd_event_t * ffep)1321 upsets_eval(struct fme *fmep, fmd_event_t *ffep)
1322 {
1323 	/* we build an array of tripped ereports that we send ourselves */
1324 	struct {
1325 		const char *ename;
1326 		const struct ipath *ipp;
1327 	} *tripped;
1328 	struct event *sp;
1329 	int ntrip, nupset, i;
1330 
1331 	/*
1332 	 * count the number of upsets to determine the upper limit on
1333 	 * expected trip ereport strings.  remember that one upset can
1334 	 * lead to at most one ereport.
1335 	 */
1336 	nupset = 0;
1337 	for (sp = fmep->suspects; sp; sp = sp->suspects) {
1338 		if (sp->t == N_UPSET)
1339 			nupset++;
1340 	}
1341 
1342 	if (nupset == 0)
1343 		return (0);
1344 
1345 	/*
1346 	 * get to this point if we have upsets and expect some trip
1347 	 * ereports
1348 	 */
1349 	tripped = alloca(sizeof (*tripped) * nupset);
1350 	bzero((void *)tripped, sizeof (*tripped) * nupset);
1351 
1352 	ntrip = 0;
1353 	for (sp = fmep->suspects; sp; sp = sp->suspects)
1354 		if (sp->t == N_UPSET &&
1355 		    serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, sp,
1356 		    &tripped[ntrip].ename, &tripped[ntrip].ipp) == 1)
1357 			ntrip++;
1358 
1359 	for (i = 0; i < ntrip; i++) {
1360 		struct event *ep, *nep;
1361 		struct fme *nfmep;
1362 		fmd_case_t *fmcase;
1363 		const struct ipath *ipp;
1364 		const char *eventstring;
1365 		int prev_verbose;
1366 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1367 		enum fme_state state;
1368 
1369 		/*
1370 		 * First try and evaluate a case with the trip ereport plus
1371 		 * all the other ereports that cause the trip. If that fails
1372 		 * to evaluate then try again with just this ereport on its own.
1373 		 */
1374 		out(O_ALTFP|O_NONL, "fme_receive_report_serd: ");
1375 		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
1376 		out(O_ALTFP|O_STAMP, NULL);
1377 		ep = fmep->e0;
1378 		eventstring = ep->enode->u.event.ename->u.name.s;
1379 		ipp = ep->ipp;
1380 
1381 		/*
1382 		 * create a duplicate fme and case
1383 		 */
1384 		fmcase = fmd_case_open(fmep->hdl, NULL);
1385 		out(O_ALTFP|O_NONL, "duplicate fme for event [");
1386 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1387 		out(O_ALTFP, " ]");
1388 
1389 		if ((nfmep = newfme(eventstring, ipp, fmep->hdl,
1390 		    fmcase, ffep, ep->nvp)) == NULL) {
1391 			out(O_ALTFP|O_NONL, "[");
1392 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1393 			out(O_ALTFP, " CANNOT DIAGNOSE]");
1394 			continue;
1395 		}
1396 
1397 		Open_fme_count++;
1398 		nfmep->pull = fmep->pull;
1399 		init_fme_bufs(nfmep);
1400 		out(O_ALTFP|O_NONL, "[");
1401 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1402 		out(O_ALTFP, " created FME%d, case %s]", nfmep->id,
1403 		    fmd_case_uuid(nfmep->hdl, nfmep->fmcase));
1404 		if (ffep) {
1405 			fmd_case_setprincipal(nfmep->hdl, nfmep->fmcase, ffep);
1406 			fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase, ffep);
1407 			nfmep->e0r = ffep;
1408 		}
1409 
1410 		/*
1411 		 * add the original ereports
1412 		 */
1413 		for (ep = fmep->observations; ep; ep = ep->observations) {
1414 			eventstring = ep->enode->u.event.ename->u.name.s;
1415 			ipp = ep->ipp;
1416 			out(O_ALTFP|O_NONL, "adding event [");
1417 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1418 			out(O_ALTFP, " ]");
1419 			nep = itree_lookup(nfmep->eventtree, eventstring, ipp);
1420 			if (nep->count++ == 0) {
1421 				nep->observations = nfmep->observations;
1422 				nfmep->observations = nep;
1423 				serialize_observation(nfmep, eventstring, ipp);
1424 				nep->nvp = evnv_dupnvl(ep->nvp);
1425 			}
1426 			if (ep->ffep && ep->ffep != ffep)
1427 				fmd_case_add_ereport(nfmep->hdl, nfmep->fmcase,
1428 				    ep->ffep);
1429 			stats_counter_bump(nfmep->Rcount);
1430 		}
1431 
1432 		/*
1433 		 * add the serd trigger ereport
1434 		 */
1435 		if ((ep = itree_lookup(nfmep->eventtree, tripped[i].ename,
1436 		    tripped[i].ipp)) == NULL) {
1437 			/*
1438 			 * The trigger ereport is not in the instance tree. It
1439 			 * was presumably removed by prune_propagations() as
1440 			 * this combination of events is not present in the
1441 			 * rules.
1442 			 */
1443 			out(O_ALTFP, "upsets_eval: e0 not in instance tree");
1444 			Undiag_reason = UD_VAL_BADEVENTI;
1445 			goto retry_lone_ereport;
1446 		}
1447 		out(O_ALTFP|O_NONL, "adding event [");
1448 		ipath_print(O_ALTFP|O_NONL, tripped[i].ename, tripped[i].ipp);
1449 		out(O_ALTFP, " ]");
1450 		nfmep->ecurrent = ep;
1451 		ep->nvp = NULL;
1452 		ep->count = 1;
1453 		ep->observations = nfmep->observations;
1454 		nfmep->observations = ep;
1455 
1456 		/*
1457 		 * just peek first.
1458 		 */
1459 		nfmep->peek = 1;
1460 		prev_verbose = Verbose;
1461 		if (Debug == 0)
1462 			Verbose = 0;
1463 		lut_walk(nfmep->eventtree, (lut_cb)clear_arrows, (void *)nfmep);
1464 		state = hypothesise(nfmep, nfmep->e0, nfmep->ull, &my_delay);
1465 		nfmep->peek = 0;
1466 		Verbose = prev_verbose;
1467 		if (state == FME_DISPROVED) {
1468 			out(O_ALTFP, "upsets_eval: hypothesis disproved");
1469 			Undiag_reason = UD_VAL_UNSOLVD;
1470 retry_lone_ereport:
1471 			/*
1472 			 * However the trigger ereport on its own might be
1473 			 * diagnosable, so check for that. Undo the new fme
1474 			 * and case we just created and call fme_receive_report.
1475 			 */
1476 			out(O_ALTFP|O_NONL, "[");
1477 			ipath_print(O_ALTFP|O_NONL, tripped[i].ename,
1478 			    tripped[i].ipp);
1479 			out(O_ALTFP, " retrying with just trigger ereport]");
1480 			itree_free(nfmep->eventtree);
1481 			nfmep->eventtree = NULL;
1482 			structconfig_free(nfmep->config);
1483 			nfmep->config = NULL;
1484 			destroy_fme_bufs(nfmep);
1485 			fmd_case_close(nfmep->hdl, nfmep->fmcase);
1486 			fme_receive_report(fmep->hdl, ffep,
1487 			    tripped[i].ename, tripped[i].ipp, NULL);
1488 			continue;
1489 		}
1490 
1491 		/*
1492 		 * and evaluate
1493 		 */
1494 		serialize_observation(nfmep, tripped[i].ename, tripped[i].ipp);
1495 		fme_eval(nfmep, ffep);
1496 	}
1497 
1498 	return (ntrip);
1499 }
1500 
1501 /*
1502  * fme_receive_external_report -- call when an external ereport comes in
1503  *
1504  * this routine just converts the relevant information from the ereport
1505  * into a format used internally and passes it on to fme_receive_report().
1506  */
1507 void
fme_receive_external_report(fmd_hdl_t * hdl,fmd_event_t * ffep,nvlist_t * nvl,const char * class)1508 fme_receive_external_report(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1509     const char *class)
1510 {
1511 	struct node		*epnamenp;
1512 	fmd_case_t		*fmcase;
1513 	const struct ipath	*ipp;
1514 	nvlist_t		*detector = NULL;
1515 
1516 	class = stable(class);
1517 
1518 	/* Get the component path from the ereport */
1519 	epnamenp = platform_getpath(nvl);
1520 
1521 	/* See if we ended up without a path. */
1522 	if (epnamenp == NULL) {
1523 		/* See if class permits silent discard on unknown component. */
1524 		if (lut_lookup(Ereportenames_discard, (void *)class, NULL)) {
1525 			out(O_ALTFP|O_VERB2, "Unable to map \"%s\" ereport "
1526 			    "to component path, but silent discard allowed.",
1527 			    class);
1528 		} else {
1529 			/*
1530 			 * XFILE: Failure to find a component is bad unless
1531 			 * 'discard_if_config_unknown=1' was specified in the
1532 			 * ereport definition. Indicate undiagnosable.
1533 			 */
1534 			Undiag_reason = UD_VAL_NOPATH;
1535 			fmcase = fmd_case_open(hdl, NULL);
1536 
1537 			/*
1538 			 * We don't have a component path here (which means that
1539 			 * the detector was not in hc-scheme and couldn't be
1540 			 * converted to hc-scheme. Report the raw detector as
1541 			 * the suspect resource if there is one.
1542 			 */
1543 			(void) nvlist_lookup_nvlist(nvl, FM_EREPORT_DETECTOR,
1544 			    &detector);
1545 			publish_undiagnosable(hdl, ffep, fmcase, detector,
1546 			    (char *)class);
1547 		}
1548 		return;
1549 	}
1550 
1551 	ipp = ipath(epnamenp);
1552 	tree_free(epnamenp);
1553 	fme_receive_report(hdl, ffep, class, ipp, nvl);
1554 }
1555 
1556 /*ARGSUSED*/
1557 void
fme_receive_repair_list(fmd_hdl_t * hdl,fmd_event_t * ffep,nvlist_t * nvl,const char * eventstring)1558 fme_receive_repair_list(fmd_hdl_t *hdl, fmd_event_t *ffep, nvlist_t *nvl,
1559     const char *eventstring)
1560 {
1561 	char *uuid;
1562 	nvlist_t **nva;
1563 	uint_t nvc;
1564 	const struct ipath *ipp;
1565 
1566 	if (nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) != 0 ||
1567 	    nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
1568 	    &nva, &nvc) != 0) {
1569 		out(O_ALTFP, "No uuid or fault list for list.repaired event");
1570 		return;
1571 	}
1572 
1573 	out(O_ALTFP, "Processing list.repaired from case %s", uuid);
1574 
1575 	while (nvc-- != 0) {
1576 		/*
1577 		 * Reset any istat or serd engine associated with this path.
1578 		 */
1579 		char *path;
1580 
1581 		if ((ipp = platform_fault2ipath(*nva++)) == NULL)
1582 			continue;
1583 
1584 		path = ipath2str(NULL, ipp);
1585 		out(O_ALTFP, "fme_receive_repair_list: resetting state for %s",
1586 		    path);
1587 		FREE(path);
1588 
1589 		lut_walk(Istats, (lut_cb)istat_counter_reset_cb, (void *)ipp);
1590 		istat_save();
1591 
1592 		lut_walk(SerdEngines, (lut_cb)serd_reset_cb, (void *)ipp);
1593 		serd_save();
1594 	}
1595 }
1596 
1597 /*ARGSUSED*/
1598 void
fme_receive_topology_change(void)1599 fme_receive_topology_change(void)
1600 {
1601 	lut_walk(Istats, (lut_cb)istat_counter_topo_chg_cb, NULL);
1602 	istat_save();
1603 
1604 	lut_walk(SerdEngines, (lut_cb)serd_topo_chg_cb, NULL);
1605 	serd_save();
1606 }
1607 
1608 static int mark_arrows(struct fme *fmep, struct event *ep, int mark,
1609     unsigned long long at_latest_by, unsigned long long *pdelay, int keep);
1610 
1611 /* ARGSUSED */
1612 static void
clear_arrows(struct event * ep,struct event * ep2,struct fme * fmep)1613 clear_arrows(struct event *ep, struct event *ep2, struct fme *fmep)
1614 {
1615 	struct bubble *bp;
1616 	struct arrowlist *ap;
1617 
1618 	ep->cached_state = 0;
1619 	ep->keep_in_tree = 0;
1620 	for (bp = itree_next_bubble(ep, NULL); bp;
1621 	    bp = itree_next_bubble(ep, bp)) {
1622 		if (bp->t != B_FROM)
1623 			continue;
1624 		bp->mark = 0;
1625 		for (ap = itree_next_arrow(bp, NULL); ap;
1626 		    ap = itree_next_arrow(bp, ap))
1627 			ap->arrowp->mark = 0;
1628 	}
1629 }
1630 
1631 static void
fme_receive_report(fmd_hdl_t * hdl,fmd_event_t * ffep,const char * eventstring,const struct ipath * ipp,nvlist_t * nvl)1632 fme_receive_report(fmd_hdl_t *hdl, fmd_event_t *ffep,
1633     const char *eventstring, const struct ipath *ipp, nvlist_t *nvl)
1634 {
1635 	struct event *ep;
1636 	struct fme *fmep = NULL;
1637 	struct fme *ofmep = NULL;
1638 	struct fme *cfmep, *svfmep;
1639 	int matched = 0;
1640 	nvlist_t *defect;
1641 	fmd_case_t *fmcase;
1642 	char *reason;
1643 
1644 	out(O_ALTFP|O_NONL, "fme_receive_report: ");
1645 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1646 	out(O_ALTFP|O_STAMP, NULL);
1647 
1648 	/* decide which FME it goes to */
1649 	for (fmep = FMElist; fmep; fmep = fmep->next) {
1650 		int prev_verbose;
1651 		unsigned long long my_delay = TIMEVAL_EVENTUALLY;
1652 		enum fme_state state;
1653 		nvlist_t *pre_peek_nvp = NULL;
1654 
1655 		if (fmep->overflow) {
1656 			if (!(fmd_case_closed(fmep->hdl, fmep->fmcase)))
1657 				ofmep = fmep;
1658 
1659 			continue;
1660 		}
1661 
1662 		/*
1663 		 * ignore solved or closed cases
1664 		 */
1665 		if (fmep->posted_suspects ||
1666 		    fmd_case_solved(fmep->hdl, fmep->fmcase) ||
1667 		    fmd_case_closed(fmep->hdl, fmep->fmcase))
1668 			continue;
1669 
1670 		/* look up event in event tree for this FME */
1671 		if ((ep = itree_lookup(fmep->eventtree,
1672 		    eventstring, ipp)) == NULL)
1673 			continue;
1674 
1675 		/* note observation */
1676 		fmep->ecurrent = ep;
1677 		if (ep->count++ == 0) {
1678 			/* link it into list of observations seen */
1679 			ep->observations = fmep->observations;
1680 			fmep->observations = ep;
1681 			ep->nvp = evnv_dupnvl(nvl);
1682 		} else {
1683 			/* use new payload values for peek */
1684 			pre_peek_nvp = ep->nvp;
1685 			ep->nvp = evnv_dupnvl(nvl);
1686 		}
1687 
1688 		/* tell hypothesise() not to mess with suspect list */
1689 		fmep->peek = 1;
1690 
1691 		/* don't want this to be verbose (unless Debug is set) */
1692 		prev_verbose = Verbose;
1693 		if (Debug == 0)
1694 			Verbose = 0;
1695 
1696 		lut_walk(fmep->eventtree, (lut_cb)clear_arrows, (void *)fmep);
1697 		state = hypothesise(fmep, fmep->e0, fmep->ull, &my_delay);
1698 
1699 		fmep->peek = 0;
1700 
1701 		/* put verbose flag back */
1702 		Verbose = prev_verbose;
1703 
1704 		if (state != FME_DISPROVED) {
1705 			/* found an FME that explains the ereport */
1706 			matched++;
1707 			out(O_ALTFP|O_NONL, "[");
1708 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1709 			out(O_ALTFP, " explained by FME%d]", fmep->id);
1710 
1711 			nvlist_free(pre_peek_nvp);
1712 
1713 			if (ep->count == 1)
1714 				serialize_observation(fmep, eventstring, ipp);
1715 
1716 			if (ffep) {
1717 				fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1718 				ep->ffep = ffep;
1719 			}
1720 
1721 			stats_counter_bump(fmep->Rcount);
1722 
1723 			/* re-eval FME */
1724 			fme_eval(fmep, ffep);
1725 		} else {
1726 
1727 			/* not a match, undo noting of observation */
1728 			fmep->ecurrent = NULL;
1729 			if (--ep->count == 0) {
1730 				/* unlink it from observations */
1731 				fmep->observations = ep->observations;
1732 				ep->observations = NULL;
1733 				nvlist_free(ep->nvp);
1734 				ep->nvp = NULL;
1735 			} else {
1736 				nvlist_free(ep->nvp);
1737 				ep->nvp = pre_peek_nvp;
1738 			}
1739 		}
1740 	}
1741 
1742 	if (matched)
1743 		return;	/* explained by at least one existing FME */
1744 
1745 	/* clean up closed fmes */
1746 	cfmep = ClosedFMEs;
1747 	while (cfmep != NULL) {
1748 		svfmep = cfmep->next;
1749 		destroy_fme(cfmep);
1750 		cfmep = svfmep;
1751 	}
1752 	ClosedFMEs = NULL;
1753 
1754 	if (ofmep) {
1755 		out(O_ALTFP|O_NONL, "[");
1756 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1757 		out(O_ALTFP, " ADDING TO OVERFLOW FME]");
1758 		if (ffep)
1759 			fmd_case_add_ereport(hdl, ofmep->fmcase, ffep);
1760 
1761 		return;
1762 
1763 	} else if (Max_fme && (Open_fme_count >= Max_fme)) {
1764 		out(O_ALTFP|O_NONL, "[");
1765 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1766 		out(O_ALTFP, " MAX OPEN FME REACHED]");
1767 
1768 		fmcase = fmd_case_open(hdl, NULL);
1769 
1770 		/* Create overflow fme */
1771 		if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep,
1772 		    nvl)) == NULL) {
1773 			out(O_ALTFP|O_NONL, "[");
1774 			ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1775 			out(O_ALTFP, " CANNOT OPEN OVERFLOW FME]");
1776 			return;
1777 		}
1778 
1779 		Open_fme_count++;
1780 
1781 		init_fme_bufs(fmep);
1782 		fmep->overflow = B_TRUE;
1783 
1784 		if (ffep)
1785 			fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1786 
1787 		Undiag_reason = UD_VAL_MAXFME;
1788 		defect = fmd_nvl_create_fault(hdl,
1789 		    undiag_2defect_str(Undiag_reason), 100, NULL, NULL, NULL);
1790 		reason = undiag_2reason_str(Undiag_reason, NULL);
1791 		(void) nvlist_add_string(defect, UNDIAG_REASON, reason);
1792 		FREE(reason);
1793 		fmd_case_add_suspect(hdl, fmep->fmcase, defect);
1794 		fmd_case_solve(hdl, fmep->fmcase);
1795 		Undiag_reason = UD_VAL_UNKNOWN;
1796 		return;
1797 	}
1798 
1799 	/* open a case */
1800 	fmcase = fmd_case_open(hdl, NULL);
1801 
1802 	/* start a new FME */
1803 	if ((fmep = newfme(eventstring, ipp, hdl, fmcase, ffep, nvl)) == NULL) {
1804 		out(O_ALTFP|O_NONL, "[");
1805 		ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1806 		out(O_ALTFP, " CANNOT DIAGNOSE]");
1807 		return;
1808 	}
1809 
1810 	Open_fme_count++;
1811 
1812 	init_fme_bufs(fmep);
1813 
1814 	out(O_ALTFP|O_NONL, "[");
1815 	ipath_print(O_ALTFP|O_NONL, eventstring, ipp);
1816 	out(O_ALTFP, " created FME%d, case %s]", fmep->id,
1817 	    fmd_case_uuid(hdl, fmep->fmcase));
1818 
1819 	ep = fmep->e0;
1820 	ASSERT(ep != NULL);
1821 
1822 	/* note observation */
1823 	fmep->ecurrent = ep;
1824 	if (ep->count++ == 0) {
1825 		/* link it into list of observations seen */
1826 		ep->observations = fmep->observations;
1827 		fmep->observations = ep;
1828 		ep->nvp = evnv_dupnvl(nvl);
1829 		serialize_observation(fmep, eventstring, ipp);
1830 	} else {
1831 		/* new payload overrides any previous */
1832 		nvlist_free(ep->nvp);
1833 		ep->nvp = evnv_dupnvl(nvl);
1834 	}
1835 
1836 	stats_counter_bump(fmep->Rcount);
1837 
1838 	if (ffep) {
1839 		fmd_case_add_ereport(hdl, fmep->fmcase, ffep);
1840 		fmd_case_setprincipal(hdl, fmep->fmcase, ffep);
1841 		fmep->e0r = ffep;
1842 		ep->ffep = ffep;
1843 	}
1844 
1845 	/* give the diagnosis algorithm a shot at the new FME state */
1846 	fme_eval(fmep, ffep);
1847 }
1848 
1849 void
fme_status(int flags)1850 fme_status(int flags)
1851 {
1852 	struct fme *fmep;
1853 
1854 	if (FMElist == NULL) {
1855 		out(flags, "No fault management exercises underway.");
1856 		return;
1857 	}
1858 
1859 	for (fmep = FMElist; fmep; fmep = fmep->next)
1860 		fme_print(flags, fmep);
1861 }
1862 
1863 /*
1864  * "indent" routines used mostly for nicely formatted debug output, but also
1865  * for sanity checking for infinite recursion bugs.
1866  */
1867 
1868 #define	MAX_INDENT 1024
1869 static const char *indent_s[MAX_INDENT];
1870 static int current_indent;
1871 
1872 static void
indent_push(const char * s)1873 indent_push(const char *s)
1874 {
1875 	if (current_indent < MAX_INDENT)
1876 		indent_s[current_indent++] = s;
1877 	else
1878 		out(O_DIE, "unexpected recursion depth (%d)", current_indent);
1879 }
1880 
1881 static void
indent_set(const char * s)1882 indent_set(const char *s)
1883 {
1884 	current_indent = 0;
1885 	indent_push(s);
1886 }
1887 
1888 static void
indent_pop(void)1889 indent_pop(void)
1890 {
1891 	if (current_indent > 0)
1892 		current_indent--;
1893 	else
1894 		out(O_DIE, "recursion underflow");
1895 }
1896 
1897 static void
indent(void)1898 indent(void)
1899 {
1900 	int i;
1901 	if (!Verbose)
1902 		return;
1903 	for (i = 0; i < current_indent; i++)
1904 		out(O_ALTFP|O_VERB|O_NONL, indent_s[i]);
1905 }
1906 
1907 #define	SLNEW		1
1908 #define	SLCHANGED	2
1909 #define	SLWAIT		3
1910 #define	SLDISPROVED	4
1911 
1912 static void
print_suspects(int circumstance,struct fme * fmep)1913 print_suspects(int circumstance, struct fme *fmep)
1914 {
1915 	struct event *ep;
1916 
1917 	out(O_ALTFP|O_NONL, "[");
1918 	if (circumstance == SLCHANGED) {
1919 		out(O_ALTFP|O_NONL, "FME%d diagnosis changed. state: %s, "
1920 		    "suspect list:", fmep->id, fme_state2str(fmep->state));
1921 	} else if (circumstance == SLWAIT) {
1922 		out(O_ALTFP|O_NONL, "FME%d set wait timer %ld ", fmep->id,
1923 		    fmep->timer);
1924 		ptree_timeval(O_ALTFP|O_NONL, &fmep->wull);
1925 	} else if (circumstance == SLDISPROVED) {
1926 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS UNKNOWN", fmep->id);
1927 	} else {
1928 		out(O_ALTFP|O_NONL, "FME%d DIAGNOSIS PRODUCED:", fmep->id);
1929 	}
1930 
1931 	if (circumstance == SLWAIT || circumstance == SLDISPROVED) {
1932 		out(O_ALTFP, "]");
1933 		return;
1934 	}
1935 
1936 	for (ep = fmep->suspects; ep; ep = ep->suspects) {
1937 		out(O_ALTFP|O_NONL, " ");
1938 		itree_pevent_brief(O_ALTFP|O_NONL, ep);
1939 	}
1940 	out(O_ALTFP, "]");
1941 }
1942 
1943 static struct node *
eventprop_lookup(struct event * ep,const char * propname)1944 eventprop_lookup(struct event *ep, const char *propname)
1945 {
1946 	return (lut_lookup(ep->props, (void *)propname, NULL));
1947 }
1948 
1949 #define	MAXDIGITIDX	23
1950 static char numbuf[MAXDIGITIDX + 1];
1951 
1952 static int
node2uint(struct node * n,uint_t * valp)1953 node2uint(struct node *n, uint_t *valp)
1954 {
1955 	struct evalue value;
1956 	struct lut *globals = NULL;
1957 
1958 	if (n == NULL)
1959 		return (1);
1960 
1961 	/*
1962 	 * check value.v since we are being asked to convert an unsigned
1963 	 * long long int to an unsigned int
1964 	 */
1965 	if (! eval_expr(n, NULL, NULL, &globals, NULL, NULL, 0, &value) ||
1966 	    value.t != UINT64 || value.v > (1ULL << 32))
1967 		return (1);
1968 
1969 	*valp = (uint_t)value.v;
1970 
1971 	return (0);
1972 }
1973 
1974 static nvlist_t *
node2fmri(struct node * n)1975 node2fmri(struct node *n)
1976 {
1977 	nvlist_t **pa, *f, *p;
1978 	struct node *nc;
1979 	uint_t depth = 0;
1980 	char *numstr, *nullbyte;
1981 	char *failure;
1982 	int err, i;
1983 
1984 	/* XXX do we need to be able to handle a non-T_NAME node? */
1985 	if (n == NULL || n->t != T_NAME)
1986 		return (NULL);
1987 
1988 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
1989 		if (nc->u.name.child == NULL || nc->u.name.child->t != T_NUM)
1990 			break;
1991 		depth++;
1992 	}
1993 
1994 	if (nc != NULL) {
1995 		/* We bailed early, something went wrong */
1996 		return (NULL);
1997 	}
1998 
1999 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
2000 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
2001 	pa = alloca(depth * sizeof (nvlist_t *));
2002 	for (i = 0; i < depth; i++)
2003 		pa[i] = NULL;
2004 
2005 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
2006 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
2007 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
2008 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
2009 	if (err != 0) {
2010 		failure = "basic construction of FMRI failed";
2011 		goto boom;
2012 	}
2013 
2014 	numbuf[MAXDIGITIDX] = '\0';
2015 	nullbyte = &numbuf[MAXDIGITIDX];
2016 	i = 0;
2017 
2018 	for (nc = n; nc != NULL; nc = nc->u.name.next) {
2019 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
2020 		if (err != 0) {
2021 			failure = "alloc of an hc-pair failed";
2022 			goto boom;
2023 		}
2024 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, nc->u.name.s);
2025 		numstr = ulltostr(nc->u.name.child->u.ull, nullbyte);
2026 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
2027 		if (err != 0) {
2028 			failure = "construction of an hc-pair failed";
2029 			goto boom;
2030 		}
2031 		pa[i++] = p;
2032 	}
2033 
2034 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
2035 	if (err == 0) {
2036 		for (i = 0; i < depth; i++)
2037 			nvlist_free(pa[i]);
2038 		return (f);
2039 	}
2040 	failure = "addition of hc-pair array to FMRI failed";
2041 
2042 boom:
2043 	for (i = 0; i < depth; i++)
2044 		nvlist_free(pa[i]);
2045 	nvlist_free(f);
2046 	out(O_DIE, "%s", failure);
2047 	/*NOTREACHED*/
2048 	return (NULL);
2049 }
2050 
/*
 * One component of an ipath cache entry.  A cache entry is an array of
 * these, terminated by an element whose s member is NULL.
 */
struct ipath {
	/* component name (in stable) */
	const char *s;
	/* instance number */
	int i;
};
2056 
2057 static nvlist_t *
ipath2fmri(struct ipath * ipath)2058 ipath2fmri(struct ipath *ipath)
2059 {
2060 	nvlist_t **pa, *f, *p;
2061 	uint_t depth = 0;
2062 	char *numstr, *nullbyte;
2063 	char *failure;
2064 	int err, i;
2065 	struct ipath *ipp;
2066 
2067 	for (ipp = ipath; ipp->s != NULL; ipp++)
2068 		depth++;
2069 
2070 	if ((err = nvlist_xalloc(&f, NV_UNIQUE_NAME, &Eft_nv_hdl)) != 0)
2071 		out(O_DIE|O_SYS, "alloc of fmri nvl failed");
2072 	pa = alloca(depth * sizeof (nvlist_t *));
2073 	for (i = 0; i < depth; i++)
2074 		pa[i] = NULL;
2075 
2076 	err = nvlist_add_string(f, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC);
2077 	err |= nvlist_add_uint8(f, FM_VERSION, FM_HC_SCHEME_VERSION);
2078 	err |= nvlist_add_string(f, FM_FMRI_HC_ROOT, "");
2079 	err |= nvlist_add_uint32(f, FM_FMRI_HC_LIST_SZ, depth);
2080 	if (err != 0) {
2081 		failure = "basic construction of FMRI failed";
2082 		goto boom;
2083 	}
2084 
2085 	numbuf[MAXDIGITIDX] = '\0';
2086 	nullbyte = &numbuf[MAXDIGITIDX];
2087 	i = 0;
2088 
2089 	for (ipp = ipath; ipp->s != NULL; ipp++) {
2090 		err = nvlist_xalloc(&p, NV_UNIQUE_NAME, &Eft_nv_hdl);
2091 		if (err != 0) {
2092 			failure = "alloc of an hc-pair failed";
2093 			goto boom;
2094 		}
2095 		err = nvlist_add_string(p, FM_FMRI_HC_NAME, ipp->s);
2096 		numstr = ulltostr(ipp->i, nullbyte);
2097 		err |= nvlist_add_string(p, FM_FMRI_HC_ID, numstr);
2098 		if (err != 0) {
2099 			failure = "construction of an hc-pair failed";
2100 			goto boom;
2101 		}
2102 		pa[i++] = p;
2103 	}
2104 
2105 	err = nvlist_add_nvlist_array(f, FM_FMRI_HC_LIST, pa, depth);
2106 	if (err == 0) {
2107 		for (i = 0; i < depth; i++)
2108 			nvlist_free(pa[i]);
2109 		return (f);
2110 	}
2111 	failure = "addition of hc-pair array to FMRI failed";
2112 
2113 boom:
2114 	for (i = 0; i < depth; i++)
2115 		nvlist_free(pa[i]);
2116 	nvlist_free(f);
2117 	out(O_DIE, "%s", failure);
2118 	/*NOTREACHED*/
2119 	return (NULL);
2120 }
2121 
/*
 * percentof -- return part as a percentage of whole, rounded to nearest.
 *
 * The ratio is computed in tenths of a percent and then rounded half up.
 * The result is narrowed to uint8_t, so callers are expected to pass
 * part <= whole.  A whole of 0 yields 0 instead of dividing by zero.
 */
static uint8_t
percentof(uint_t part, uint_t whole)
{
	unsigned long long tenths;

	if (whole == 0)
		return (0);

	/* widen before scaling so that part * 1000 cannot wrap in uint_t */
	tenths = (unsigned long long)part * 1000 / whole;

	return ((uint8_t)(tenths / 10 + ((tenths % 10 >= 5) ? 1 : 0)));
}
2129 
2130 struct rsl {
2131 	struct event *suspect;
2132 	nvlist_t *asru;
2133 	nvlist_t *fru;
2134 	nvlist_t *rsrc;
2135 };
2136 
2137 static void publish_suspects(struct fme *fmep, struct rsl *srl);
2138 
2139 /*
2140  *  rslfree -- free internal members of struct rsl not expected to be
2141  *	freed elsewhere.
2142  */
2143 static void
rslfree(struct rsl * freeme)2144 rslfree(struct rsl *freeme)
2145 {
2146 	nvlist_free(freeme->asru);
2147 	nvlist_free(freeme->fru);
2148 	if (freeme->rsrc != freeme->asru)
2149 		nvlist_free(freeme->rsrc);
2150 }
2151 
2152 /*
2153  *  rslcmp -- compare two rsl structures.  Use the following
2154  *	comparisons to establish cardinality:
2155  *
2156  *	1. Name of the suspect's class. (simple strcmp)
2157  *	2. Name of the suspect's ASRU. (trickier, since nvlist)
2158  *
2159  */
2160 static int
rslcmp(const void * a,const void * b)2161 rslcmp(const void *a, const void *b)
2162 {
2163 	struct rsl *r1 = (struct rsl *)a;
2164 	struct rsl *r2 = (struct rsl *)b;
2165 	int rv;
2166 
2167 	rv = strcmp(r1->suspect->enode->u.event.ename->u.name.s,
2168 	    r2->suspect->enode->u.event.ename->u.name.s);
2169 	if (rv != 0)
2170 		return (rv);
2171 
2172 	if (r1->rsrc == NULL && r2->rsrc == NULL)
2173 		return (0);
2174 	if (r1->rsrc == NULL)
2175 		return (-1);
2176 	if (r2->rsrc == NULL)
2177 		return (1);
2178 	return (evnv_cmpnvl(r1->rsrc, r2->rsrc, 0));
2179 }
2180 
2181 /*
2182  * get_resources -- for a given suspect, determine what ASRU, FRU and
2183  *     RSRC nvlists should be advertised in the final suspect list.
2184  */
2185 void
get_resources(struct event * sp,struct rsl * rsrcs,struct config * croot)2186 get_resources(struct event *sp, struct rsl *rsrcs, struct config *croot)
2187 {
2188 	struct node *asrudef, *frudef;
2189 	const struct ipath *asrupath, *frupath;
2190 	nvlist_t *asru = NULL, *fru = NULL;
2191 	nvlist_t *rsrc = NULL;
2192 	char *pathstr;
2193 
2194 	/*
2195 	 * First find any ASRU and/or FRU defined in the
2196 	 * initial fault tree.
2197 	 */
2198 	asrudef = eventprop_lookup(sp, L_ASRU);
2199 	frudef = eventprop_lookup(sp, L_FRU);
2200 
2201 	/*
2202 	 * Create ipaths based on those definitions
2203 	 */
2204 	asrupath = ipath(asrudef);
2205 	frupath = ipath(frudef);
2206 
2207 	/*
2208 	 *  Allow for platform translations of the FMRIs
2209 	 */
2210 	pathstr = ipath2str(NULL, sp->ipp);
2211 	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_RESOURCE,
2212 	    &rsrc, pathstr);
2213 	FREE(pathstr);
2214 
2215 	pathstr = ipath2str(NULL, asrupath);
2216 	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_ASRU,
2217 	    &asru, pathstr);
2218 	FREE(pathstr);
2219 
2220 	pathstr = ipath2str(NULL, frupath);
2221 	platform_unit_translate(is_defect(sp->t), croot, TOPO_PROP_FRU,
2222 	    &fru, pathstr);
2223 	FREE(pathstr);
2224 
2225 	rsrcs->suspect = sp;
2226 	rsrcs->asru = asru;
2227 	rsrcs->fru = fru;
2228 	rsrcs->rsrc = rsrc;
2229 }
2230 
2231 /*
2232  * trim_suspects -- prior to publishing, we may need to remove some
2233  *    suspects from the list.  If we're auto-closing upsets, we don't
2234  *    want any of those in the published list.  If the ASRUs for multiple
2235  *    defects resolve to the same ASRU (driver) we only want to publish
2236  *    that as a single suspect.
2237  */
2238 static int
trim_suspects(struct fme * fmep,struct rsl * begin,struct rsl * begin2,fmd_event_t * ffep)2239 trim_suspects(struct fme *fmep, struct rsl *begin, struct rsl *begin2,
2240     fmd_event_t *ffep)
2241 {
2242 	struct event *ep;
2243 	struct rsl *rp = begin;
2244 	struct rsl *rp2 = begin2;
2245 	int mess_zero_count = 0;
2246 	int serd_rval;
2247 	uint_t messval;
2248 
2249 	/* remove any unwanted upsets and populate our array */
2250 	for (ep = fmep->psuspects; ep; ep = ep->psuspects) {
2251 		if (is_upset(ep->t))
2252 			continue;
2253 		serd_rval = serd_eval(fmep, fmep->hdl, ffep, fmep->fmcase, ep,
2254 		    NULL, NULL);
2255 		if (serd_rval == 0)
2256 			continue;
2257 		if (node2uint(eventprop_lookup(ep, L_message),
2258 		    &messval) == 0 && messval == 0) {
2259 			get_resources(ep, rp2, fmep->config);
2260 			rp2++;
2261 			mess_zero_count++;
2262 		} else {
2263 			get_resources(ep, rp, fmep->config);
2264 			rp++;
2265 			fmep->nsuspects++;
2266 		}
2267 	}
2268 	return (mess_zero_count);
2269 }
2270 
2271 /*
2272  * addpayloadprop -- add a payload prop to a problem
2273  */
2274 static void
addpayloadprop(const char * lhs,struct evalue * rhs,nvlist_t * fault)2275 addpayloadprop(const char *lhs, struct evalue *rhs, nvlist_t *fault)
2276 {
2277 	nvlist_t *rsrc, *hcs;
2278 
2279 	ASSERT(fault != NULL);
2280 	ASSERT(lhs != NULL);
2281 	ASSERT(rhs != NULL);
2282 
2283 	if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, &rsrc) != 0)
2284 		out(O_DIE, "cannot add payloadprop \"%s\" to fault", lhs);
2285 
2286 	if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0) {
2287 		out(O_ALTFP|O_VERB2, "addpayloadprop: create hc_specific");
2288 		if (nvlist_xalloc(&hcs, NV_UNIQUE_NAME, &Eft_nv_hdl) != 0)
2289 			out(O_DIE,
2290 			    "cannot add payloadprop \"%s\" to fault", lhs);
2291 		if (nvlist_add_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, hcs) != 0)
2292 			out(O_DIE,
2293 			    "cannot add payloadprop \"%s\" to fault", lhs);
2294 		nvlist_free(hcs);
2295 		if (nvlist_lookup_nvlist(rsrc, FM_FMRI_HC_SPECIFIC, &hcs) != 0)
2296 			out(O_DIE,
2297 			    "cannot add payloadprop \"%s\" to fault", lhs);
2298 	} else
2299 		out(O_ALTFP|O_VERB2, "addpayloadprop: reuse hc_specific");
2300 
2301 	if (rhs->t == UINT64) {
2302 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=%llu", lhs, rhs->v);
2303 
2304 		if (nvlist_add_uint64(hcs, lhs, rhs->v) != 0)
2305 			out(O_DIE,
2306 			    "cannot add payloadprop \"%s\" to fault", lhs);
2307 	} else {
2308 		out(O_ALTFP|O_VERB2, "addpayloadprop: %s=\"%s\"",
2309 		    lhs, (char *)(uintptr_t)rhs->v);
2310 
2311 		if (nvlist_add_string(hcs, lhs, (char *)(uintptr_t)rhs->v) != 0)
2312 			out(O_DIE,
2313 			    "cannot add payloadprop \"%s\" to fault", lhs);
2314 	}
2315 }
2316 
/*
 * Serialization state used by istat_save() and its lut_walk() callbacks:
 * Istatbuf is the buffer being filled, Istatbufptr the next write position
 * inside it, and Istatsz the buffer size as summed up by istataddsize().
 */
static char *Istatbuf;
static char *Istatbufptr;
static int Istatsz;
2320 
2321 /*
2322  * istataddsize -- calculate size of istat and add it to Istatsz
2323  */
2324 /*ARGSUSED2*/
2325 static void
istataddsize(const struct istat_entry * lhs,struct stats * rhs,void * arg)2326 istataddsize(const struct istat_entry *lhs, struct stats *rhs, void *arg)
2327 {
2328 	int val;
2329 
2330 	ASSERT(lhs != NULL);
2331 	ASSERT(rhs != NULL);
2332 
2333 	if ((val = stats_counter_value(rhs)) == 0)
2334 		return;	/* skip zero-valued stats */
2335 
2336 	/* count up the size of the stat name */
2337 	Istatsz += ipath2strlen(lhs->ename, lhs->ipath);
2338 	Istatsz++;	/* for the trailing NULL byte */
2339 
2340 	/* count up the size of the stat value */
2341 	Istatsz += snprintf(NULL, 0, "%d", val);
2342 	Istatsz++;	/* for the trailing NULL byte */
2343 }
2344 
2345 /*
2346  * istat2str -- serialize an istat, writing result to *Istatbufptr
2347  */
2348 /*ARGSUSED2*/
2349 static void
istat2str(const struct istat_entry * lhs,struct stats * rhs,void * arg)2350 istat2str(const struct istat_entry *lhs, struct stats *rhs, void *arg)
2351 {
2352 	char *str;
2353 	int len;
2354 	int val;
2355 
2356 	ASSERT(lhs != NULL);
2357 	ASSERT(rhs != NULL);
2358 
2359 	if ((val = stats_counter_value(rhs)) == 0)
2360 		return;	/* skip zero-valued stats */
2361 
2362 	/* serialize the stat name */
2363 	str = ipath2str(lhs->ename, lhs->ipath);
2364 	len = strlen(str);
2365 
2366 	ASSERT(Istatbufptr + len + 1 < &Istatbuf[Istatsz]);
2367 	(void) strlcpy(Istatbufptr, str, &Istatbuf[Istatsz] - Istatbufptr);
2368 	Istatbufptr += len;
2369 	FREE(str);
2370 	*Istatbufptr++ = '\0';
2371 
2372 	/* serialize the stat value */
2373 	Istatbufptr += snprintf(Istatbufptr, &Istatbuf[Istatsz] - Istatbufptr,
2374 	    "%d", val);
2375 	*Istatbufptr++ = '\0';
2376 
2377 	ASSERT(Istatbufptr <= &Istatbuf[Istatsz]);
2378 }
2379 
2380 void
istat_save()2381 istat_save()
2382 {
2383 	if (Istat_need_save == 0)
2384 		return;
2385 
2386 	/* figure out how big the serialzed info is */
2387 	Istatsz = 0;
2388 	lut_walk(Istats, (lut_cb)istataddsize, NULL);
2389 
2390 	if (Istatsz == 0) {
2391 		/* no stats to save */
2392 		fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
2393 		return;
2394 	}
2395 
2396 	/* create the serialized buffer */
2397 	Istatbufptr = Istatbuf = MALLOC(Istatsz);
2398 	lut_walk(Istats, (lut_cb)istat2str, NULL);
2399 
2400 	/* clear out current saved stats */
2401 	fmd_buf_destroy(Hdl, NULL, WOBUF_ISTATS);
2402 
2403 	/* write out the new version */
2404 	fmd_buf_write(Hdl, NULL, WOBUF_ISTATS, Istatbuf, Istatsz);
2405 	FREE(Istatbuf);
2406 
2407 	Istat_need_save = 0;
2408 }
2409 
2410 int
istat_cmp(struct istat_entry * ent1,struct istat_entry * ent2)2411 istat_cmp(struct istat_entry *ent1, struct istat_entry *ent2)
2412 {
2413 	if (ent1->ename != ent2->ename)
2414 		return (ent2->ename - ent1->ename);
2415 	if (ent1->ipath != ent2->ipath)
2416 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2417 
2418 	return (0);
2419 }
2420 
2421 /*
2422  * istat-verify -- verify the component associated with a stat still exists
2423  *
2424  * if the component no longer exists, this routine resets the stat and
2425  * returns 0.  if the component still exists, it returns 1.
2426  */
2427 static int
istat_verify(struct node * snp,struct istat_entry * entp)2428 istat_verify(struct node *snp, struct istat_entry *entp)
2429 {
2430 	struct stats *statp;
2431 	nvlist_t *fmri;
2432 
2433 	fmri = node2fmri(snp->u.event.epname);
2434 	if (platform_path_exists(fmri)) {
2435 		nvlist_free(fmri);
2436 		return (1);
2437 	}
2438 	nvlist_free(fmri);
2439 
2440 	/* component no longer in system.  zero out the associated stats */
2441 	if ((statp = (struct stats *)
2442 	    lut_lookup(Istats, entp, (lut_cmp)istat_cmp)) == NULL ||
2443 	    stats_counter_value(statp) == 0)
2444 		return (0);	/* stat is already reset */
2445 
2446 	Istat_need_save = 1;
2447 	stats_counter_reset(statp);
2448 	return (0);
2449 }
2450 
2451 static void
istat_bump(struct node * snp,int n)2452 istat_bump(struct node *snp, int n)
2453 {
2454 	struct stats *statp;
2455 	struct istat_entry ent;
2456 
2457 	ASSERT(snp != NULL);
2458 	ASSERTinfo(snp->t == T_EVENT, ptree_nodetype2str(snp->t));
2459 	ASSERT(snp->u.event.epname != NULL);
2460 
2461 	/* class name should be hoisted into a single stable entry */
2462 	ASSERT(snp->u.event.ename->u.name.next == NULL);
2463 	ent.ename = snp->u.event.ename->u.name.s;
2464 	ent.ipath = ipath(snp->u.event.epname);
2465 
2466 	if (!istat_verify(snp, &ent)) {
2467 		/* component no longer exists in system, nothing to do */
2468 		return;
2469 	}
2470 
2471 	if ((statp = (struct stats *)
2472 	    lut_lookup(Istats, &ent, (lut_cmp)istat_cmp)) == NULL) {
2473 		/* need to create the counter */
2474 		int cnt = 0;
2475 		struct node *np;
2476 		char *sname;
2477 		char *snamep;
2478 		struct istat_entry *newentp;
2479 
2480 		/* count up the size of the stat name */
2481 		np = snp->u.event.ename;
2482 		while (np != NULL) {
2483 			cnt += strlen(np->u.name.s);
2484 			cnt++;	/* for the '.' or '@' */
2485 			np = np->u.name.next;
2486 		}
2487 		np = snp->u.event.epname;
2488 		while (np != NULL) {
2489 			cnt += snprintf(NULL, 0, "%s%llu",
2490 			    np->u.name.s, np->u.name.child->u.ull);
2491 			cnt++;	/* for the '/' or trailing NULL byte */
2492 			np = np->u.name.next;
2493 		}
2494 
2495 		/* build the stat name */
2496 		snamep = sname = alloca(cnt);
2497 		np = snp->u.event.ename;
2498 		while (np != NULL) {
2499 			snamep += snprintf(snamep, &sname[cnt] - snamep,
2500 			    "%s", np->u.name.s);
2501 			np = np->u.name.next;
2502 			if (np)
2503 				*snamep++ = '.';
2504 		}
2505 		*snamep++ = '@';
2506 		np = snp->u.event.epname;
2507 		while (np != NULL) {
2508 			snamep += snprintf(snamep, &sname[cnt] - snamep,
2509 			    "%s%llu", np->u.name.s, np->u.name.child->u.ull);
2510 			np = np->u.name.next;
2511 			if (np)
2512 				*snamep++ = '/';
2513 		}
2514 		*snamep++ = '\0';
2515 
2516 		/* create the new stat & add it to our list */
2517 		newentp = MALLOC(sizeof (*newentp));
2518 		*newentp = ent;
2519 		statp = stats_new_counter(NULL, sname, 0);
2520 		Istats = lut_add(Istats, (void *)newentp, (void *)statp,
2521 		    (lut_cmp)istat_cmp);
2522 	}
2523 
2524 	/* if n is non-zero, set that value instead of bumping */
2525 	if (n) {
2526 		stats_counter_reset(statp);
2527 		stats_counter_add(statp, n);
2528 	} else
2529 		stats_counter_bump(statp);
2530 	Istat_need_save = 1;
2531 
2532 	ipath_print(O_ALTFP|O_VERB2, ent.ename, ent.ipath);
2533 	out(O_ALTFP|O_VERB2, " %s to value %d", n ? "set" : "incremented",
2534 	    stats_counter_value(statp));
2535 }
2536 
/*
 * istat_destructor -- lut_free() callback for Istats: free the
 *	istat_entry key (left) and delete the stats value (right).
 */
/*ARGSUSED*/
static void
istat_destructor(void *left, void *right, void *arg)
{
	FREE((struct istat_entry *)left);
	stats_delete((struct stats *)right);
}
2546 
2547 /*
2548  * Callback used in a walk of the Istats to reset matching stat counters.
2549  */
2550 static void
istat_counter_reset_cb(struct istat_entry * entp,struct stats * statp,const struct ipath * ipp)2551 istat_counter_reset_cb(struct istat_entry *entp, struct stats *statp,
2552     const struct ipath *ipp)
2553 {
2554 	char *path;
2555 
2556 	if (entp->ipath == ipp) {
2557 		path = ipath2str(entp->ename, ipp);
2558 		out(O_ALTFP, "istat_counter_reset_cb: resetting %s", path);
2559 		FREE(path);
2560 		stats_counter_reset(statp);
2561 		Istat_need_save = 1;
2562 	}
2563 }
2564 
2565 /*ARGSUSED*/
2566 static void
istat_counter_topo_chg_cb(struct istat_entry * entp,struct stats * statp,void * unused)2567 istat_counter_topo_chg_cb(struct istat_entry *entp, struct stats *statp,
2568     void *unused)
2569 {
2570 	char *path;
2571 	nvlist_t *fmri;
2572 
2573 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2574 	if (!platform_path_exists(fmri)) {
2575 		path = ipath2str(entp->ename, entp->ipath);
2576 		out(O_ALTFP, "istat_counter_topo_chg_cb: not present %s", path);
2577 		FREE(path);
2578 		stats_counter_reset(statp);
2579 		Istat_need_save = 1;
2580 	}
2581 	nvlist_free(fmri);
2582 }
2583 
2584 void
istat_fini(void)2585 istat_fini(void)
2586 {
2587 	lut_free(Istats, istat_destructor, NULL);
2588 }
2589 
/*
 * Serialization state used by serd_save() and its lut_walk() callbacks:
 * Serdbuf is the buffer being filled, Serdbufptr the next write position
 * inside it, and Serdsz the buffer size as summed up by serdaddsize().
 */
static char *Serdbuf;
static char *Serdbufptr;
static int Serdsz;
2593 
2594 /*
2595  * serdaddsize -- calculate size of serd and add it to Serdsz
2596  */
2597 /*ARGSUSED*/
2598 static void
serdaddsize(const struct serd_entry * lhs,struct stats * rhs,void * arg)2599 serdaddsize(const struct serd_entry *lhs, struct stats *rhs, void *arg)
2600 {
2601 	ASSERT(lhs != NULL);
2602 
2603 	/* count up the size of the stat name */
2604 	Serdsz += ipath2strlen(lhs->ename, lhs->ipath);
2605 	Serdsz++;	/* for the trailing NULL byte */
2606 }
2607 
2608 /*
2609  * serd2str -- serialize a serd engine, writing result to *Serdbufptr
2610  */
2611 /*ARGSUSED*/
2612 static void
serd2str(const struct serd_entry * lhs,struct stats * rhs,void * arg)2613 serd2str(const struct serd_entry *lhs, struct stats *rhs, void *arg)
2614 {
2615 	char *str;
2616 	int len;
2617 
2618 	ASSERT(lhs != NULL);
2619 
2620 	/* serialize the serd engine name */
2621 	str = ipath2str(lhs->ename, lhs->ipath);
2622 	len = strlen(str);
2623 
2624 	ASSERT(Serdbufptr + len + 1 <= &Serdbuf[Serdsz]);
2625 	(void) strlcpy(Serdbufptr, str, &Serdbuf[Serdsz] - Serdbufptr);
2626 	Serdbufptr += len;
2627 	FREE(str);
2628 	*Serdbufptr++ = '\0';
2629 	ASSERT(Serdbufptr <= &Serdbuf[Serdsz]);
2630 }
2631 
2632 void
serd_save()2633 serd_save()
2634 {
2635 	if (Serd_need_save == 0)
2636 		return;
2637 
2638 	/* figure out how big the serialzed info is */
2639 	Serdsz = 0;
2640 	lut_walk(SerdEngines, (lut_cb)serdaddsize, NULL);
2641 
2642 	if (Serdsz == 0) {
2643 		/* no serd engines to save */
2644 		fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
2645 		return;
2646 	}
2647 
2648 	/* create the serialized buffer */
2649 	Serdbufptr = Serdbuf = MALLOC(Serdsz);
2650 	lut_walk(SerdEngines, (lut_cb)serd2str, NULL);
2651 
2652 	/* clear out current saved stats */
2653 	fmd_buf_destroy(Hdl, NULL, WOBUF_SERDS);
2654 
2655 	/* write out the new version */
2656 	fmd_buf_write(Hdl, NULL, WOBUF_SERDS, Serdbuf, Serdsz);
2657 	FREE(Serdbuf);
2658 	Serd_need_save = 0;
2659 }
2660 
2661 int
serd_cmp(struct serd_entry * ent1,struct serd_entry * ent2)2662 serd_cmp(struct serd_entry *ent1, struct serd_entry *ent2)
2663 {
2664 	if (ent1->ename != ent2->ename)
2665 		return (ent2->ename - ent1->ename);
2666 	if (ent1->ipath != ent2->ipath)
2667 		return ((char *)ent2->ipath - (char *)ent1->ipath);
2668 
2669 	return (0);
2670 }
2671 
2672 void
fme_serd_load(fmd_hdl_t * hdl)2673 fme_serd_load(fmd_hdl_t *hdl)
2674 {
2675 	int sz;
2676 	char *sbuf;
2677 	char *sepptr;
2678 	char *ptr;
2679 	struct serd_entry *newentp;
2680 	struct node *epname;
2681 	nvlist_t *fmri;
2682 	char *namestring;
2683 
2684 	if ((sz = fmd_buf_size(hdl, NULL, WOBUF_SERDS)) == 0)
2685 		return;
2686 	sbuf = alloca(sz);
2687 	fmd_buf_read(hdl, NULL, WOBUF_SERDS, sbuf, sz);
2688 	ptr = sbuf;
2689 	while (ptr < &sbuf[sz]) {
2690 		sepptr = strchr(ptr, '@');
2691 		*sepptr = '\0';
2692 		namestring = ptr;
2693 		sepptr++;
2694 		ptr = sepptr;
2695 		ptr += strlen(ptr);
2696 		ptr++;	/* move past the '\0' separating paths */
2697 		epname = pathstring2epnamenp(sepptr);
2698 		fmri = node2fmri(epname);
2699 		if (platform_path_exists(fmri)) {
2700 			newentp = MALLOC(sizeof (*newentp));
2701 			newentp->hdl = hdl;
2702 			newentp->ipath = ipath(epname);
2703 			newentp->ename = stable(namestring);
2704 			SerdEngines = lut_add(SerdEngines, (void *)newentp,
2705 			    (void *)newentp, (lut_cmp)serd_cmp);
2706 		} else
2707 			Serd_need_save = 1;
2708 		tree_free(epname);
2709 		nvlist_free(fmri);
2710 	}
2711 	/* save it back again in case some of the paths no longer exist */
2712 	serd_save();
2713 }
2714 
/*
 * serd_destructor -- lut_free() callback for SerdEngines: free the
 *	serd_entry key (left).  The value (right) is not touched here.
 */
/*ARGSUSED*/
static void
serd_destructor(void *left, void *right, void *arg)
{
	FREE((struct serd_entry *)left);
}
2722 
2723 /*
2724  * Callback used in a walk of the SerdEngines to reset matching serd engines.
2725  */
2726 /*ARGSUSED*/
2727 static void
serd_reset_cb(struct serd_entry * entp,void * unused,const struct ipath * ipp)2728 serd_reset_cb(struct serd_entry *entp, void *unused, const struct ipath *ipp)
2729 {
2730 	char *path;
2731 
2732 	if (entp->ipath == ipp) {
2733 		path = ipath2str(entp->ename, ipp);
2734 		out(O_ALTFP, "serd_reset_cb: resetting %s", path);
2735 		fmd_serd_reset(entp->hdl, path);
2736 		FREE(path);
2737 		Serd_need_save = 1;
2738 	}
2739 }
2740 
2741 /*ARGSUSED*/
2742 static void
serd_topo_chg_cb(struct serd_entry * entp,void * unused,void * unused2)2743 serd_topo_chg_cb(struct serd_entry *entp, void *unused, void *unused2)
2744 {
2745 	char *path;
2746 	nvlist_t *fmri;
2747 
2748 	fmri = ipath2fmri((struct ipath *)(entp->ipath));
2749 	if (!platform_path_exists(fmri)) {
2750 		path = ipath2str(entp->ename, entp->ipath);
2751 		out(O_ALTFP, "serd_topo_chg_cb: not present %s", path);
2752 		fmd_serd_reset(entp->hdl, path);
2753 		FREE(path);
2754 		Serd_need_save = 1;
2755 	}
2756 	nvlist_free(fmri);
2757 }
2758 
2759 void
serd_fini(void)2760 serd_fini(void)
2761 {
2762 	lut_free(SerdEngines, serd_destructor, NULL);
2763 }
2764 
2765 static void
publish_suspects(struct fme * fmep,struct rsl * srl)2766 publish_suspects(struct fme *fmep, struct rsl *srl)
2767 {
2768 	struct rsl *rp;
2769 	nvlist_t *fault;
2770 	uint8_t cert;
2771 	uint_t *frs;
2772 	uint_t frsum, fr;
2773 	uint_t messval;
2774 	uint_t retireval;
2775 	uint_t responseval;
2776 	struct node *snp;
2777 	int frcnt, fridx;
2778 	boolean_t allfaulty = B_TRUE;
2779 	struct rsl *erl = srl + fmep->nsuspects - 1;
2780 
2781 	/*
2782 	 * sort the array
2783 	 */
2784 	qsort(srl, fmep->nsuspects, sizeof (struct rsl), rslcmp);
2785 
2786 	/* sum the fitrates */
2787 	frs = alloca(fmep->nsuspects * sizeof (uint_t));
2788 	fridx = frcnt = frsum = 0;
2789 
2790 	for (rp = srl; rp <= erl; rp++) {
2791 		struct node *n;
2792 
2793 		n = eventprop_lookup(rp->suspect, L_FITrate);
2794 		if (node2uint(n, &fr) != 0) {
2795 			out(O_DEBUG|O_NONL, "event ");
2796 			ipath_print(O_DEBUG|O_NONL,
2797 			    rp->suspect->enode->u.event.ename->u.name.s,
2798 			    rp->suspect->ipp);
2799 			out(O_VERB, " has no FITrate (using 1)");
2800 			fr = 1;
2801 		} else if (fr == 0) {
2802 			out(O_DEBUG|O_NONL, "event ");
2803 			ipath_print(O_DEBUG|O_NONL,
2804 			    rp->suspect->enode->u.event.ename->u.name.s,
2805 			    rp->suspect->ipp);
2806 			out(O_VERB, " has zero FITrate (using 1)");
2807 			fr = 1;
2808 		}
2809 
2810 		frs[fridx++] = fr;
2811 		frsum += fr;
2812 		frcnt++;
2813 	}
2814 
2815 	/* Add them in reverse order of our sort, as fmd reverses order */
2816 	for (rp = erl; rp >= srl; rp--) {
2817 		cert = percentof(frs[--fridx], frsum);
2818 		fault = fmd_nvl_create_fault(fmep->hdl,
2819 		    rp->suspect->enode->u.event.ename->u.name.s,
2820 		    cert,
2821 		    rp->asru,
2822 		    rp->fru,
2823 		    rp->rsrc);
2824 		if (fault == NULL)
2825 			out(O_DIE, "fault creation failed");
2826 		/* if "message" property exists, add it to the fault */
2827 		if (node2uint(eventprop_lookup(rp->suspect, L_message),
2828 		    &messval) == 0) {
2829 
2830 			out(O_ALTFP,
2831 			    "[FME%d, %s adds message=%d to suspect list]",
2832 			    fmep->id,
2833 			    rp->suspect->enode->u.event.ename->u.name.s,
2834 			    messval);
2835 			if (nvlist_add_boolean_value(fault,
2836 			    FM_SUSPECT_MESSAGE,
2837 			    (messval) ? B_TRUE : B_FALSE) != 0) {
2838 				out(O_DIE, "cannot add no-message to fault");
2839 			}
2840 		}
2841 
2842 		/* if "retire" property exists, add it to the fault */
2843 		if (node2uint(eventprop_lookup(rp->suspect, L_retire),
2844 		    &retireval) == 0) {
2845 
2846 			out(O_ALTFP,
2847 			    "[FME%d, %s adds retire=%d to suspect list]",
2848 			    fmep->id,
2849 			    rp->suspect->enode->u.event.ename->u.name.s,
2850 			    retireval);
2851 			if (nvlist_add_boolean_value(fault,
2852 			    FM_SUSPECT_RETIRE,
2853 			    (retireval) ? B_TRUE : B_FALSE) != 0) {
2854 				out(O_DIE, "cannot add no-retire to fault");
2855 			}
2856 		}
2857 
2858 		/* if "response" property exists, add it to the fault */
2859 		if (node2uint(eventprop_lookup(rp->suspect, L_response),
2860 		    &responseval) == 0) {
2861 
2862 			out(O_ALTFP,
2863 			    "[FME%d, %s adds response=%d to suspect list]",
2864 			    fmep->id,
2865 			    rp->suspect->enode->u.event.ename->u.name.s,
2866 			    responseval);
2867 			if (nvlist_add_boolean_value(fault,
2868 			    FM_SUSPECT_RESPONSE,
2869 			    (responseval) ? B_TRUE : B_FALSE) != 0) {
2870 				out(O_DIE, "cannot add no-response to fault");
2871 			}
2872 		}
2873 
2874 		/* add any payload properties */
2875 		lut_walk(rp->suspect->payloadprops,
2876 		    (lut_cb)addpayloadprop, (void *)fault);
2877 		rslfree(rp);
2878 
2879 		/*
2880 		 * If "action" property exists, evaluate it;  this must be done
2881 		 * before the allfaulty check below since some actions may
2882 		 * modify the asru to be used in fmd_nvl_fmri_has_fault.  This
2883 		 * needs to be restructured if any new actions are introduced
2884 		 * that have effects that we do not want to be visible if
2885 		 * we decide not to publish in the dupclose check below.
2886 		 */
2887 		if ((snp = eventprop_lookup(rp->suspect, L_action)) != NULL) {
2888 			struct evalue evalue;
2889 
2890 			out(O_ALTFP|O_NONL,
2891 			    "[FME%d, %s action ", fmep->id,
2892 			    rp->suspect->enode->u.event.ename->u.name.s);
2893 			ptree_name_iter(O_ALTFP|O_NONL, snp);
2894 			out(O_ALTFP, "]");
2895 			Action_nvl = fault;
2896 			(void) eval_expr(snp, NULL, NULL, NULL, NULL,
2897 			    NULL, 0, &evalue);
2898 		}
2899 
2900 		fmd_case_add_suspect(fmep->hdl, fmep->fmcase, fault);
2901 
2902 		/*
2903 		 * check if the asru is already marked as "faulty".
2904 		 */
2905 		if (allfaulty) {
2906 			nvlist_t *asru;
2907 
2908 			out(O_ALTFP|O_VERB, "FME%d dup check ", fmep->id);
2909 			itree_pevent_brief(O_ALTFP|O_VERB|O_NONL, rp->suspect);
2910 			out(O_ALTFP|O_VERB|O_NONL, " ");
2911