1724365f7Ssethg /*
2724365f7Ssethg * CDDL HEADER START
3724365f7Ssethg *
4724365f7Ssethg * The contents of this file are subject to the terms of the
5724365f7Ssethg * Common Development and Distribution License (the "License").
6724365f7Ssethg * You may not use this file except in compliance with the License.
7724365f7Ssethg *
8724365f7Ssethg * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9724365f7Ssethg * or http://www.opensolaris.org/os/licensing.
10724365f7Ssethg * See the License for the specific language governing permissions
11724365f7Ssethg * and limitations under the License.
12724365f7Ssethg *
13724365f7Ssethg * When distributing Covered Code, include this CDDL HEADER in each
14724365f7Ssethg * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15724365f7Ssethg * If applicable, add the following below this CDDL HEADER, with the
16724365f7Ssethg * fields enclosed by brackets "[]" replaced with your own identifying
17724365f7Ssethg * information: Portions Copyright [yyyy] [name of copyright owner]
18724365f7Ssethg *
19724365f7Ssethg * CDDL HEADER END
20724365f7Ssethg */
21724365f7Ssethg
22724365f7Ssethg /*
23cbf75e67SStephen Hanson * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24724365f7Ssethg * Use is subject to license terms.
25*0244979bSAlek Pinchuk * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26724365f7Ssethg */
27724365f7Ssethg
28724365f7Ssethg /*
29184cd04cScth * Disk Monitor
30724365f7Ssethg */
31724365f7Ssethg #include <sys/types.h>
32724365f7Ssethg #include <sys/stat.h>
33724365f7Ssethg #include <fcntl.h>
34724365f7Ssethg #include <time.h>
35724365f7Ssethg #include <stdio.h>
36724365f7Ssethg #include <stdlib.h>
37724365f7Ssethg #include <strings.h>
38724365f7Ssethg #include <stdarg.h>
39724365f7Ssethg #include <errno.h>
40724365f7Ssethg #include <signal.h>
41724365f7Ssethg #include <unistd.h>
42724365f7Ssethg #include <pthread.h>
43724365f7Ssethg #include <libnvpair.h>
44724365f7Ssethg #include <fm/fmd_api.h>
45724365f7Ssethg #include <fm/fmd_fmri.h>
46724365f7Ssethg #include <sys/fm/protocol.h>
4724db4641Seschrock #include <sys/fm/io/disk.h>
48724365f7Ssethg #include <fm/libtopo.h>
49724365f7Ssethg
50184cd04cScth #include "disk_monitor.h"
51724365f7Ssethg #include "hotplug_mgr.h"
52724365f7Ssethg #include "schg_mgr.h"
53724365f7Ssethg #include "topo_gather.h"
549113a79cSeschrock #include "dm_platform.h"
55724365f7Ssethg
/* Name string for this fmd module. */
#define	THIS_FMD_MODULE_NAME	"disk-monitor"
57724365f7Ssethg
/*
 * Bit flags recording which managers diskmon_init() has started.  The
 * failure path of diskmon_init() tests these bits to decide which managers
 * need to be cleaned up.
 */
enum disk_init_state {
	INIT_STATE_NONE			= 0x0,
	STATE_CHANGE_MGR_INITTED	= 0x2,
	HOTPLUG_MGR_INITTED		= 0x4
};

static enum disk_init_state g_init_state = INIT_STATE_NONE;
63724365f7Ssethg
/* Kind of fault-management list: suspect list or repaired list. */
typedef enum {
	LT_SUSPECT = 0,
	LT_REPAIRED = 1
} fm_list_type_t;
68724365f7Ssethg
69724365f7Ssethg /*
70724365f7Ssethg * Global verbosity flag -- controls chattiness of debug messages and
71724365f7Ssethg * warnings. Its value is determined by the fmd property "log-level"
72724365f7Ssethg * settable in the DE's .conf file.
73724365f7Ssethg */
74724365f7Ssethg log_class_t g_verbose = 0;
75724365f7Ssethg cfgdata_t *config_data = NULL;
76724365f7Ssethg fmd_hdl_t *g_fm_hdl = NULL;
77724365f7Ssethg
78724365f7Ssethg static const fmd_prop_t fmd_props[];
79724365f7Ssethg
80724365f7Ssethg static void
diskmon_teardown_all(void)81724365f7Ssethg diskmon_teardown_all(void)
82724365f7Ssethg {
83724365f7Ssethg cleanup_hotplug_manager();
84724365f7Ssethg cleanup_state_change_manager(config_data);
85724365f7Ssethg config_fini();
86724365f7Ssethg }
87724365f7Ssethg
88724365f7Ssethg static int
count_disks(diskmon_t * disklistp)89724365f7Ssethg count_disks(diskmon_t *disklistp)
90724365f7Ssethg {
91724365f7Ssethg int i = 0;
92724365f7Ssethg
93724365f7Ssethg while (disklistp != NULL) {
94724365f7Ssethg i++;
95724365f7Ssethg disklistp = disklistp->next;
96724365f7Ssethg }
97724365f7Ssethg
98724365f7Ssethg return (i);
99724365f7Ssethg }
100724365f7Ssethg
101724365f7Ssethg static int
diskmon_init(void)102724365f7Ssethg diskmon_init(void)
103724365f7Ssethg {
1047a0b67e3Ssethg /*
1057a0b67e3Ssethg * Block the generation of state change events (generated by the
1067a0b67e3Ssethg * hotplug manager thread) here; they will be unblocked after the
1077a0b67e3Ssethg * state change manager thread is ready to accept state changes
1087a0b67e3Ssethg * (shortly after it starts).
1097a0b67e3Ssethg */
1107a0b67e3Ssethg block_state_change_events();
1117a0b67e3Ssethg
1129113a79cSeschrock if (dm_platform_init() != 0)
113724365f7Ssethg goto cleanup;
114724365f7Ssethg
115724365f7Ssethg if (init_hotplug_manager() != 0)
116724365f7Ssethg goto cleanup;
117724365f7Ssethg else
118724365f7Ssethg g_init_state |= HOTPLUG_MGR_INITTED;
119724365f7Ssethg
120724365f7Ssethg if (init_state_change_manager(config_data) != 0)
121724365f7Ssethg goto cleanup;
122724365f7Ssethg else
123724365f7Ssethg g_init_state |= STATE_CHANGE_MGR_INITTED;
124724365f7Ssethg
125724365f7Ssethg return (E_SUCCESS);
126724365f7Ssethg
127724365f7Ssethg cleanup:
128724365f7Ssethg
129724365f7Ssethg unblock_state_change_events();
130724365f7Ssethg
131724365f7Ssethg /*
132724365f7Ssethg * The cleanup order here does matter, due to dependencies between the
133724365f7Ssethg * managers.
134724365f7Ssethg */
135724365f7Ssethg if (g_init_state & HOTPLUG_MGR_INITTED)
136724365f7Ssethg cleanup_hotplug_manager();
137724365f7Ssethg if (g_init_state & STATE_CHANGE_MGR_INITTED)
138724365f7Ssethg cleanup_state_change_manager(config_data);
1399113a79cSeschrock dm_platform_fini();
140724365f7Ssethg
141724365f7Ssethg return (E_ERROR);
142724365f7Ssethg }
143724365f7Ssethg
144724365f7Ssethg static void
dm_fault_execute_actions(fmd_hdl_t * hdl,diskmon_t * diskp,nvlist_t * nvl)14524db4641Seschrock dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
146724365f7Ssethg {
147724365f7Ssethg const char *action_prop = NULL;
148724365f7Ssethg const char *action_string;
149724365f7Ssethg
150724365f7Ssethg /*
151724365f7Ssethg * The predictive failure action is the activation of the fault
152724365f7Ssethg * indicator.
153724365f7Ssethg */
15424db4641Seschrock if (fmd_nvl_class_match(hdl, nvl,
15524db4641Seschrock DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
156724365f7Ssethg action_prop = DISK_PROP_OTEMPACTION;
157724365f7Ssethg
15824db4641Seschrock if (fmd_nvl_class_match(hdl, nvl,
15924db4641Seschrock DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
160724365f7Ssethg action_prop = DISK_PROP_STFAILACTION;
161724365f7Ssethg
162*0244979bSAlek Pinchuk if (fmd_nvl_class_match(hdl, nvl,
163*0244979bSAlek Pinchuk DISK_ERROR_CLASS "." FM_FAULT_SSM_WEAROUT))
164*0244979bSAlek Pinchuk action_prop = DISK_PROP_SSMWEAROUTACTION;
165*0244979bSAlek Pinchuk
166724365f7Ssethg dm_fault_indicator_set(diskp, INDICATOR_ON);
167724365f7Ssethg
168724365f7Ssethg if (action_prop != NULL &&
169724365f7Ssethg (action_string = dm_prop_lookup(diskp->props, action_prop))
170724365f7Ssethg != NULL) {
171724365f7Ssethg
1729113a79cSeschrock if (dm_platform_indicator_execute(action_string) != 0) {
173724365f7Ssethg log_warn("Fault action `%s' did not successfully "
174724365f7Ssethg "complete.\n", action_string);
175724365f7Ssethg }
176724365f7Ssethg }
177724365f7Ssethg }
178724365f7Ssethg
179724365f7Ssethg static void
diskmon_agent_repair(fmd_hdl_t * hdl,nvlist_t * nvl,int repair)18025c6ff4bSstephh diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
181724365f7Ssethg {
182724365f7Ssethg char *uuid = NULL;
183724365f7Ssethg nvlist_t **nva;
184724365f7Ssethg uint_t nvc;
185724365f7Ssethg diskmon_t *diskp;
186724365f7Ssethg nvlist_t *fmri;
187724365f7Ssethg nvlist_t *fltnvl;
188724365f7Ssethg int err = 0;
189724365f7Ssethg
1907a0b67e3Ssethg err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
1917a0b67e3Ssethg err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
1927a0b67e3Ssethg &nva, &nvc);
1937a0b67e3Ssethg if (err != 0)
1947a0b67e3Ssethg return;
195724365f7Ssethg
1967a0b67e3Ssethg while (nvc-- != 0) {
197724365f7Ssethg
1987a0b67e3Ssethg fltnvl = *nva++;
199724365f7Ssethg
2007a0b67e3Ssethg if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri)
2017a0b67e3Ssethg != 0)
2027a0b67e3Ssethg continue;
203724365f7Ssethg
2047a0b67e3Ssethg if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
2057a0b67e3Ssethg continue;
206724365f7Ssethg
2077a0b67e3Ssethg log_msg(MM_MAIN, "Disk %s repaired!\n",
2087a0b67e3Ssethg diskp->location);
209724365f7Ssethg
2107a0b67e3Ssethg dm_fault_indicator_set(diskp, INDICATOR_OFF);
211724365f7Ssethg
2127a0b67e3Ssethg dm_state_change(diskp, HPS_REPAIRED);
2137a0b67e3Ssethg }
214724365f7Ssethg
21525c6ff4bSstephh if (repair)
21625c6ff4bSstephh fmd_case_uuresolved(hdl, uuid);
21725c6ff4bSstephh
2187a0b67e3Ssethg }
219724365f7Ssethg
2207a0b67e3Ssethg static void
diskmon_agent_suspect(fmd_hdl_t * hdl,nvlist_t * nvl)2217a0b67e3Ssethg diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
2227a0b67e3Ssethg {
2237a0b67e3Ssethg char *uuid = NULL;
2247a0b67e3Ssethg nvlist_t **nva;
2257a0b67e3Ssethg uint_t nvc;
2267a0b67e3Ssethg diskmon_t *diskp;
2277a0b67e3Ssethg nvlist_t *fmri;
2287a0b67e3Ssethg nvlist_t *fltnvl;
2297a0b67e3Ssethg int err = 0;
230724365f7Ssethg
2317a0b67e3Ssethg err |= nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
2327a0b67e3Ssethg err |= nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
2337a0b67e3Ssethg &nva, &nvc);
2347a0b67e3Ssethg if (err != 0)
2357a0b67e3Ssethg return;
236724365f7Ssethg
2377a0b67e3Ssethg while (nvc-- != 0 && !fmd_case_uuclosed(hdl, uuid)) {
238724365f7Ssethg
2397a0b67e3Ssethg fltnvl = *nva++;
240724365f7Ssethg
2417a0b67e3Ssethg if (nvlist_lookup_nvlist(fltnvl, FM_FAULT_RESOURCE, &fmri) != 0)
2427a0b67e3Ssethg continue;
243724365f7Ssethg
2447a0b67e3Ssethg if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
2457a0b67e3Ssethg continue;
246724365f7Ssethg
2477a0b67e3Ssethg /* Execute the actions associated with this fault */
24824db4641Seschrock dm_fault_execute_actions(hdl, diskp, fltnvl);
249724365f7Ssethg
2507a0b67e3Ssethg /*
25124db4641Seschrock * Send a state change event to the state change manager
2527a0b67e3Ssethg */
25324db4641Seschrock dm_state_change(diskp, HPS_FAULTED);
2547a0b67e3Ssethg }
255724365f7Ssethg
2567a0b67e3Ssethg if (!fmd_case_uuclosed(hdl, uuid)) {
2577a0b67e3Ssethg /* Case is closed */
2587a0b67e3Ssethg fmd_case_uuclose(hdl, uuid);
259724365f7Ssethg }
260724365f7Ssethg }
261724365f7Ssethg
26224db4641Seschrock /*ARGSUSED*/
263724365f7Ssethg static void
diskmon_recv(fmd_hdl_t * hdl,fmd_event_t * ep,nvlist_t * nvl,const char * class)264724365f7Ssethg diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
265724365f7Ssethg {
266724365f7Ssethg diskmon_t *diskp;
267724365f7Ssethg nvlist_t *fmri;
268724365f7Ssethg
269724365f7Ssethg if (g_verbose & MM_MAIN)
270724365f7Ssethg nvlist_print(stderr, nvl);
271724365f7Ssethg
272724365f7Ssethg /*
273724365f7Ssethg * Act on the fault suspect list or repaired list (embedded agent
274724365f7Ssethg * action).
275724365f7Ssethg */
27625c6ff4bSstephh if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
27725c6ff4bSstephh
27825c6ff4bSstephh diskmon_agent_repair(hdl, nvl, 1);
27925c6ff4bSstephh return;
28025c6ff4bSstephh
28125c6ff4bSstephh } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
282724365f7Ssethg
28325c6ff4bSstephh diskmon_agent_repair(hdl, nvl, 0);
284724365f7Ssethg return;
285724365f7Ssethg
28625c6ff4bSstephh } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
287724365f7Ssethg
2887a0b67e3Ssethg diskmon_agent_suspect(hdl, nvl);
289724365f7Ssethg return;
290cbf75e67SStephen Hanson } else if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS)) {
291cbf75e67SStephen Hanson return;
292724365f7Ssethg }
293724365f7Ssethg
294724365f7Ssethg /*
295724365f7Ssethg * If we get any replayed faults, set the diskmon's faulted
296724365f7Ssethg * flag for the appropriate fault, then change the diskmon's state
297724365f7Ssethg * to faulted.
298724365f7Ssethg */
29924db4641Seschrock if (fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*")) {
300724365f7Ssethg
301724365f7Ssethg if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE,
302724365f7Ssethg &fmri) != 0)
303724365f7Ssethg return;
304724365f7Ssethg
305724365f7Ssethg if ((diskp = dm_fmri_to_diskmon(hdl, fmri)) == NULL)
306724365f7Ssethg return;
307724365f7Ssethg
308724365f7Ssethg /* Execute the actions associated with this fault */
30924db4641Seschrock dm_fault_execute_actions(hdl, diskp, nvl);
310724365f7Ssethg
311724365f7Ssethg /*
312724365f7Ssethg * If the fault wasn't generated by this module, send a
313724365f7Ssethg * state change event to the state change manager
314724365f7Ssethg */
315724365f7Ssethg dm_state_change(diskp, HPS_FAULTED);
316724365f7Ssethg return;
317724365f7Ssethg }
318724365f7Ssethg }
319724365f7Ssethg
320724365f7Ssethg static const fmd_hdl_ops_t fmd_ops = {
321724365f7Ssethg diskmon_recv, /* fmdo_recv */
322724365f7Ssethg NULL, /* fmdo_timeout */
323724365f7Ssethg NULL, /* fmdo_close */
324724365f7Ssethg NULL, /* fmdo_stats */
325724365f7Ssethg NULL, /* fmdo_gc */
326724365f7Ssethg };
327724365f7Ssethg
328724365f7Ssethg static const fmd_prop_t fmd_props[] = {
329724365f7Ssethg { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
330724365f7Ssethg { NULL, 0, NULL }
331724365f7Ssethg };
332724365f7Ssethg
333724365f7Ssethg static const fmd_hdl_info_t fmd_info = {
334184cd04cScth "Disk Monitor",
335184cd04cScth DISK_MONITOR_MODULE_VERSION,
336724365f7Ssethg &fmd_ops,
337724365f7Ssethg fmd_props
338724365f7Ssethg };
339724365f7Ssethg
340724365f7Ssethg void
_fmd_init(fmd_hdl_t * hdl)341724365f7Ssethg _fmd_init(fmd_hdl_t *hdl)
342724365f7Ssethg {
343724365f7Ssethg fmd_case_t *cp;
344724365f7Ssethg int disk_count;
345724365f7Ssethg
346724365f7Ssethg g_fm_hdl = hdl;
347724365f7Ssethg
348724365f7Ssethg if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) {
349724365f7Ssethg return;
350724365f7Ssethg }
351724365f7Ssethg
352724365f7Ssethg if (config_init()) {
353724365f7Ssethg log_err("Could not initialize configuration!\n");
354724365f7Ssethg fmd_hdl_unregister(hdl);
355724365f7Ssethg return;
356724365f7Ssethg }
357724365f7Ssethg
358724365f7Ssethg if (config_get(hdl, fmd_props)) {
359724365f7Ssethg config_fini();
360724365f7Ssethg log_err("Could not retrieve configuration from libtopo!\n");
361724365f7Ssethg fmd_hdl_unregister(hdl);
362724365f7Ssethg return;
363724365f7Ssethg }
364724365f7Ssethg
365724365f7Ssethg /*
366724365f7Ssethg * If there are no disks to monitor, bail out
367724365f7Ssethg */
368724365f7Ssethg if ((disk_count = count_disks(config_data->disk_list)) == 0) {
369724365f7Ssethg config_fini();
370724365f7Ssethg fmd_hdl_unregister(hdl);
371724365f7Ssethg return;
372724365f7Ssethg }
373724365f7Ssethg
374724365f7Ssethg if (diskmon_init() == E_ERROR) {
375724365f7Ssethg config_fini();
376724365f7Ssethg fmd_hdl_unregister(hdl);
377724365f7Ssethg return;
378724365f7Ssethg }
379724365f7Ssethg
380724365f7Ssethg log_msg(MM_MAIN, "Monitoring %d disks.\n", disk_count);
381724365f7Ssethg
382724365f7Ssethg /*
383724365f7Ssethg * Iterate over all active cases.
384724365f7Ssethg * Since we automatically solve all cases, these cases must have
385724365f7Ssethg * had the fault added, but the DE must have been interrupted
386724365f7Ssethg * before they were solved.
387724365f7Ssethg */
388724365f7Ssethg for (cp = fmd_case_next(hdl, NULL);
389724365f7Ssethg cp != NULL; cp = fmd_case_next(hdl, cp)) {
390724365f7Ssethg
391724365f7Ssethg if (!fmd_case_solved(hdl, cp))
392724365f7Ssethg fmd_case_solve(hdl, cp);
393724365f7Ssethg }
394724365f7Ssethg }
395724365f7Ssethg
39624db4641Seschrock /*ARGSUSED*/
397724365f7Ssethg void
_fmd_fini(fmd_hdl_t * hdl)398724365f7Ssethg _fmd_fini(fmd_hdl_t *hdl)
399724365f7Ssethg {
400724365f7Ssethg diskmon_teardown_all();
401724365f7Ssethg g_fm_hdl = NULL;
402724365f7Ssethg }
403