xref: /illumos-gate/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c (revision 4263d13f00c9691fa14620eff82abef795be0693)
13d7072f8Seschrock /*
23d7072f8Seschrock  * CDDL HEADER START
33d7072f8Seschrock  *
43d7072f8Seschrock  * The contents of this file are subject to the terms of the
53d7072f8Seschrock  * Common Development and Distribution License (the "License").
63d7072f8Seschrock  * You may not use this file except in compliance with the License.
73d7072f8Seschrock  *
83d7072f8Seschrock  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
93d7072f8Seschrock  * or http://www.opensolaris.org/os/licensing.
103d7072f8Seschrock  * See the License for the specific language governing permissions
113d7072f8Seschrock  * and limitations under the License.
123d7072f8Seschrock  *
133d7072f8Seschrock  * When distributing Covered Code, include this CDDL HEADER in each
143d7072f8Seschrock  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
153d7072f8Seschrock  * If applicable, add the following below this CDDL HEADER, with the
163d7072f8Seschrock  * fields enclosed by brackets "[]" replaced with your own identifying
173d7072f8Seschrock  * information: Portions Copyright [yyyy] [name of copyright owner]
183d7072f8Seschrock  *
193d7072f8Seschrock  * CDDL HEADER END
203d7072f8Seschrock  */
213d7072f8Seschrock /*
22b98131cfSEric Taylor  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23*4263d13fSGeorge Wilson  * Copyright (c) 2012 by Delphix. All rights reserved.
243d7072f8Seschrock  */
253d7072f8Seschrock 
263d7072f8Seschrock /*
273d7072f8Seschrock  * ZFS syseventd module.
283d7072f8Seschrock  *
293d7072f8Seschrock  * The purpose of this module is to identify when devices are added to the
303d7072f8Seschrock  * system, and appropriately online or replace the affected vdevs.
313d7072f8Seschrock  *
323d7072f8Seschrock  * When a device is added to the system:
333d7072f8Seschrock  *
343d7072f8Seschrock  * 	1. Search for any vdevs whose devid matches that of the newly added
353d7072f8Seschrock  *	   device.
363d7072f8Seschrock  *
373d7072f8Seschrock  * 	2. If no vdevs are found, then search for any vdevs whose devfs path
383d7072f8Seschrock  *	   matches that of the new device.
393d7072f8Seschrock  *
403d7072f8Seschrock  *	3. If no vdevs match by either method, then ignore the event.
413d7072f8Seschrock  *
423d7072f8Seschrock  * 	4. Attempt to online the device with a flag to indicate that it should
433d7072f8Seschrock  *	   be unspared when resilvering completes.  If this succeeds, then the
443d7072f8Seschrock  *	   same device was inserted and we should continue normally.
453d7072f8Seschrock  *
463d7072f8Seschrock  *	5. If the pool does not have the 'autoreplace' property set, attempt to
473d7072f8Seschrock  *	   online the device again without the unspare flag, which will
483d7072f8Seschrock  *	   generate a FMA fault.
493d7072f8Seschrock  *
503d7072f8Seschrock  *	6. If the pool has the 'autoreplace' property set, and the matching vdev
513d7072f8Seschrock  *	   is a whole disk, then label the new disk and attempt a 'zpool
523d7072f8Seschrock  *	   replace'.
533d7072f8Seschrock  *
543d7072f8Seschrock  * The module responds to EC_DEV_ADD events for both disks and lofi devices,
553d7072f8Seschrock  * with the latter used for testing.  The special ESC_ZFS_VDEV_CHECK event
563d7072f8Seschrock  * indicates that a device failed to open during pool load, but the autoreplace
573d7072f8Seschrock  * property was set.  In this case, we deferred the associated FMA fault until
583d7072f8Seschrock  * our module had a chance to process the autoreplace logic.  If the device
593d7072f8Seschrock  * could not be replaced, then the second online attempt will trigger the FMA
603d7072f8Seschrock  * fault that we skipped earlier.
613d7072f8Seschrock  */
623d7072f8Seschrock 
633d7072f8Seschrock #include <alloca.h>
643d7072f8Seschrock #include <devid.h>
653d7072f8Seschrock #include <fcntl.h>
663d7072f8Seschrock #include <libnvpair.h>
673d7072f8Seschrock #include <libsysevent.h>
683d7072f8Seschrock #include <libzfs.h>
693d7072f8Seschrock #include <limits.h>
703d7072f8Seschrock #include <stdlib.h>
713d7072f8Seschrock #include <string.h>
723d7072f8Seschrock #include <syslog.h>
733c112a2bSEric Taylor #include <sys/list.h>
743d7072f8Seschrock #include <sys/sunddi.h>
753d7072f8Seschrock #include <sys/sysevent/eventdefs.h>
763d7072f8Seschrock #include <sys/sysevent/dev.h>
773c112a2bSEric Taylor #include <thread_pool.h>
783d7072f8Seschrock #include <unistd.h>
79b98131cfSEric Taylor #include "syseventd.h"
803d7072f8Seschrock 
/*
 * Architecture-specific device naming suffixes.  PHYS_PATH is the minor-node
 * suffix that devid_iter() appends to a /devices path in order to open a
 * known minor node of the device.  RAW_SLICE is presumably the name of the
 * raw slice covering the whole disk; it is not referenced in the visible
 * portion of this file (TODO confirm whether it is still needed).
 */
#if defined(__i386) || defined(__amd64)
#define	PHYS_PATH	":q"
#define	RAW_SLICE	"p0"
#elif defined(__sparc)
#define	PHYS_PATH	":c"
#define	RAW_SLICE	"s2"
#else
#error Unknown architecture
#endif
903d7072f8Seschrock 
/*
 * Callback invoked by zfs_iter_vdev() for a matching vdev.  It receives the
 * pool handle, the vdev's config nvlist and the 'isdisk' flag of the search.
 */
typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);

/* libzfs handle shared by the whole module; opened in slm_init(). */
libzfs_handle_t *g_zfshdl;
/* List of unavailpool_t entries collected by zfs_enum_pools(). */
list_t g_pool_list;
/* Thread pool that runs zfs_enable_ds(); created by zfs_enum_pools(). */
tpool_t *g_tpool;
/* Set to B_TRUE by zfs_enum_pools() once it has walked all pools. */
boolean_t g_enumeration_done;
/* Id of the thread started by slm_init() to run zfs_enum_pools(). */
thread_t g_zfs_tid;

/*
 * Entry on g_pool_list: a pool whose top-level state was below
 * VDEV_STATE_DEGRADED when zfs_unavail_pool() examined it.
 */
typedef struct unavailpool {
	zpool_handle_t	*uap_zhp;	/* open handle, owned by this entry */
	list_node_t	uap_node;	/* linkage on g_pool_list */
} unavailpool_t;
1033c112a2bSEric Taylor 
1043c112a2bSEric Taylor int
1053c112a2bSEric Taylor zfs_toplevel_state(zpool_handle_t *zhp)
1063c112a2bSEric Taylor {
1073c112a2bSEric Taylor 	nvlist_t *nvroot;
1083c112a2bSEric Taylor 	vdev_stat_t *vs;
1093c112a2bSEric Taylor 	unsigned int c;
1103c112a2bSEric Taylor 
1113c112a2bSEric Taylor 	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
1123c112a2bSEric Taylor 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1133c112a2bSEric Taylor 	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
1143c112a2bSEric Taylor 	    (uint64_t **)&vs, &c) == 0);
1153c112a2bSEric Taylor 	return (vs->vs_state);
1163c112a2bSEric Taylor }
1173c112a2bSEric Taylor 
1183c112a2bSEric Taylor static int
1193c112a2bSEric Taylor zfs_unavail_pool(zpool_handle_t *zhp, void *data)
1203c112a2bSEric Taylor {
1213c112a2bSEric Taylor 	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
1223c112a2bSEric Taylor 		unavailpool_t *uap;
1233c112a2bSEric Taylor 		uap = malloc(sizeof (unavailpool_t));
1243c112a2bSEric Taylor 		uap->uap_zhp = zhp;
1253c112a2bSEric Taylor 		list_insert_tail((list_t *)data, uap);
1263c112a2bSEric Taylor 	} else {
1273c112a2bSEric Taylor 		zpool_close(zhp);
1283c112a2bSEric Taylor 	}
1293c112a2bSEric Taylor 	return (0);
1303c112a2bSEric Taylor }
1313d7072f8Seschrock 
1323d7072f8Seschrock /*
1333d7072f8Seschrock  * The device associated with the given vdev (either by devid or physical path)
1343d7072f8Seschrock  * has been added to the system.  If 'isdisk' is set, then we only attempt a
1353d7072f8Seschrock  * replacement if it's a whole disk.  This also implies that we should label the
1363d7072f8Seschrock  * disk first.
1373d7072f8Seschrock  *
1383d7072f8Seschrock  * First, we attempt to online the device (making sure to undo any spare
1393d7072f8Seschrock  * operation when finished).  If this succeeds, then we're done.  If it fails,
1403d7072f8Seschrock  * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
1413d7072f8Seschrock  * but that the label was not what we expected.  If the 'autoreplace' property
1423d7072f8Seschrock  * is not set, then we relabel the disk (if specified), and attempt a 'zpool
1433d7072f8Seschrock  * replace'.  If the online is successful, but the new state is something else
1443d7072f8Seschrock  * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
1453d7072f8Seschrock  * race, and we should avoid attempting to relabel the disk.
1463d7072f8Seschrock  */
1473d7072f8Seschrock static void
1483d7072f8Seschrock zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
1493d7072f8Seschrock {
1503d7072f8Seschrock 	char *path;
1513d7072f8Seschrock 	vdev_state_t newstate;
1523d7072f8Seschrock 	nvlist_t *nvroot, *newvd;
1533d7072f8Seschrock 	uint64_t wholedisk = 0ULL;
154bf82a41bSeschrock 	char *physpath = NULL;
1553d7072f8Seschrock 	char rawpath[PATH_MAX], fullpath[PATH_MAX];
1563d7072f8Seschrock 	size_t len;
1573d7072f8Seschrock 
1583d7072f8Seschrock 	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
1593d7072f8Seschrock 		return;
1603d7072f8Seschrock 
161bf82a41bSeschrock 	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
1623d7072f8Seschrock 	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
1633d7072f8Seschrock 
1643d7072f8Seschrock 	/*
1653d7072f8Seschrock 	 * We should have a way to online a device by guid.  With the current
1663d7072f8Seschrock 	 * interface, we are forced to chop off the 's0' for whole disks.
1673d7072f8Seschrock 	 */
1683d7072f8Seschrock 	(void) strlcpy(fullpath, path, sizeof (fullpath));
1693d7072f8Seschrock 	if (wholedisk)
1703d7072f8Seschrock 		fullpath[strlen(fullpath) - 2] = '\0';
1713d7072f8Seschrock 
1723d7072f8Seschrock 	/*
1733d7072f8Seschrock 	 * Attempt to online the device.  It would be nice to online this by
1743d7072f8Seschrock 	 * GUID, but the current interface only supports lookup by path.
1753d7072f8Seschrock 	 */
1763d7072f8Seschrock 	if (zpool_vdev_online(zhp, fullpath,
1773d7072f8Seschrock 	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
178bf82a41bSeschrock 	    (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED))
1793d7072f8Seschrock 		return;
1803d7072f8Seschrock 
1813d7072f8Seschrock 	/*
1823d7072f8Seschrock 	 * If the pool doesn't have the autoreplace property set, then attempt a
1833d7072f8Seschrock 	 * true online (without the unspare flag), which will trigger a FMA
1843d7072f8Seschrock 	 * fault.
1853d7072f8Seschrock 	 */
186990b4856Slling 	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
1873d7072f8Seschrock 	    (isdisk && !wholedisk)) {
1883d7072f8Seschrock 		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
1893d7072f8Seschrock 		    &newstate);
1903d7072f8Seschrock 		return;
1913d7072f8Seschrock 	}
1923d7072f8Seschrock 
1933d7072f8Seschrock 	if (isdisk) {
1943d7072f8Seschrock 		/*
1953d7072f8Seschrock 		 * If this is a request to label a whole disk, then attempt to
1963d7072f8Seschrock 		 * write out the label.  Before we can label the disk, we need
1973d7072f8Seschrock 		 * access to a raw node.  Ideally, we'd like to walk the devinfo
1983d7072f8Seschrock 		 * tree and find a raw node from the corresponding parent node.
1993d7072f8Seschrock 		 * This is overly complicated, and since we know how we labeled
2003d7072f8Seschrock 		 * this device in the first place, we know it's save to switch
2013d7072f8Seschrock 		 * from /dev/dsk to /dev/rdsk and append the backup slice.
202c5904d13Seschrock 		 *
203c5904d13Seschrock 		 * If any part of this process fails, then do a force online to
204c5904d13Seschrock 		 * trigger a ZFS fault for the device (and any hot spare
205c5904d13Seschrock 		 * replacement).
2063d7072f8Seschrock 		 */
207c5904d13Seschrock 		if (strncmp(path, "/dev/dsk/", 9) != 0) {
208c5904d13Seschrock 			(void) zpool_vdev_online(zhp, fullpath,
209c5904d13Seschrock 			    ZFS_ONLINE_FORCEFAULT, &newstate);
2103d7072f8Seschrock 			return;
211c5904d13Seschrock 		}
2123d7072f8Seschrock 
2133d7072f8Seschrock 		(void) strlcpy(rawpath, path + 9, sizeof (rawpath));
2143d7072f8Seschrock 		len = strlen(rawpath);
2153d7072f8Seschrock 		rawpath[len - 2] = '\0';
2163d7072f8Seschrock 
217c5904d13Seschrock 		if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0) {
218c5904d13Seschrock 			(void) zpool_vdev_online(zhp, fullpath,
219c5904d13Seschrock 			    ZFS_ONLINE_FORCEFAULT, &newstate);
2203d7072f8Seschrock 			return;
221c5904d13Seschrock 		}
2223d7072f8Seschrock 	}
2233d7072f8Seschrock 
2243d7072f8Seschrock 	/*
2253d7072f8Seschrock 	 * Cosntruct the root vdev to pass to zpool_vdev_attach().  While adding
2263d7072f8Seschrock 	 * the entire vdev structure is harmless, we construct a reduced set of
227bf82a41bSeschrock 	 * path/physpath/wholedisk to keep it simple.
2283d7072f8Seschrock 	 */
2293d7072f8Seschrock 	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
2303d7072f8Seschrock 		return;
2313d7072f8Seschrock 
2323d7072f8Seschrock 	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
2333d7072f8Seschrock 		nvlist_free(nvroot);
2343d7072f8Seschrock 		return;
2353d7072f8Seschrock 	}
2363d7072f8Seschrock 
2373d7072f8Seschrock 	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
2383d7072f8Seschrock 	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
239bf82a41bSeschrock 	    (physpath != NULL && nvlist_add_string(newvd,
240bf82a41bSeschrock 	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
2413d7072f8Seschrock 	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
2423d7072f8Seschrock 	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
2433d7072f8Seschrock 	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
2443d7072f8Seschrock 	    1) != 0) {
2453d7072f8Seschrock 		nvlist_free(newvd);
2463d7072f8Seschrock 		nvlist_free(nvroot);
2473d7072f8Seschrock 		return;
2483d7072f8Seschrock 	}
2493d7072f8Seschrock 
2503d7072f8Seschrock 	nvlist_free(newvd);
2513d7072f8Seschrock 
2523d7072f8Seschrock 	(void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
2533d7072f8Seschrock 
2543d7072f8Seschrock 	nvlist_free(nvroot);
2553d7072f8Seschrock 
2563d7072f8Seschrock }
2573d7072f8Seschrock 
/*
 * Utility functions to find a vdev matching given criteria.
 *
 * A dev_data_t describes one search.  When dd_vdev_guid is non-zero the
 * search is by vdev GUID; otherwise the string property named by dd_prop is
 * compared against dd_compare.  A non-zero dd_pool_guid restricts the search
 * to that pool.
 */
typedef struct dev_data {
	const char		*dd_compare;	/* string to match against */
	const char		*dd_prop;	/* vdev property to compare */
	zfs_process_func_t	dd_func;	/* called for each match */
	boolean_t		dd_found;	/* returned by *_iter() callers */
	boolean_t		dd_isdisk;	/* passed through to dd_func */
	uint64_t		dd_pool_guid;	/* 0 == search every pool */
	uint64_t		dd_vdev_guid;	/* 0 == match by dd_prop */
} dev_data_t;
2703d7072f8Seschrock 
2713d7072f8Seschrock static void
2723d7072f8Seschrock zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
2733d7072f8Seschrock {
2743d7072f8Seschrock 	dev_data_t *dp = data;
2753d7072f8Seschrock 	char *path;
2763d7072f8Seschrock 	uint_t c, children;
2773d7072f8Seschrock 	nvlist_t **child;
278b01c3b58Seschrock 	size_t len;
2793d7072f8Seschrock 	uint64_t guid;
2803d7072f8Seschrock 
2813d7072f8Seschrock 	/*
2823d7072f8Seschrock 	 * First iterate over any children.
2833d7072f8Seschrock 	 */
2843d7072f8Seschrock 	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
2853d7072f8Seschrock 	    &child, &children) == 0) {
2863d7072f8Seschrock 		for (c = 0; c < children; c++)
2873d7072f8Seschrock 			zfs_iter_vdev(zhp, child[c], data);
2883d7072f8Seschrock 		return;
2893d7072f8Seschrock 	}
2903d7072f8Seschrock 
2913d7072f8Seschrock 	if (dp->dd_vdev_guid != 0) {
2923d7072f8Seschrock 		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
2933d7072f8Seschrock 		    &guid) != 0 || guid != dp->dd_vdev_guid)
2943d7072f8Seschrock 			return;
2953d7072f8Seschrock 	} else {
296b01c3b58Seschrock 		len = strlen(dp->dd_compare);
297b01c3b58Seschrock 
2983d7072f8Seschrock 		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
2993d7072f8Seschrock 		    strncmp(dp->dd_compare, path, len) != 0)
3003d7072f8Seschrock 			return;
3013d7072f8Seschrock 
3023d7072f8Seschrock 		/*
3033d7072f8Seschrock 		 * Normally, we want to have an exact match for the comparison
3043d7072f8Seschrock 		 * string.  However, we allow substring matches in the following
3053d7072f8Seschrock 		 * cases:
3063d7072f8Seschrock 		 *
3073d7072f8Seschrock 		 * 	<path>:		This is a devpath, and the target is one
3083d7072f8Seschrock 		 * 			of its children.
3093d7072f8Seschrock 		 *
3103d7072f8Seschrock 		 * 	<path/>		This is a devid for a whole disk, and
3113d7072f8Seschrock 		 * 			the target is one of its children.
3123d7072f8Seschrock 		 */
3133d7072f8Seschrock 		if (path[len] != '\0' && path[len] != ':' &&
3143d7072f8Seschrock 		    path[len - 1] != '/')
3153d7072f8Seschrock 			return;
3163d7072f8Seschrock 	}
3173d7072f8Seschrock 
3183d7072f8Seschrock 	(dp->dd_func)(zhp, nvl, dp->dd_isdisk);
3193d7072f8Seschrock }
3203d7072f8Seschrock 
3213c112a2bSEric Taylor void
3223c112a2bSEric Taylor zfs_enable_ds(void *arg)
3233c112a2bSEric Taylor {
3243c112a2bSEric Taylor 	unavailpool_t *pool = (unavailpool_t *)arg;
3253c112a2bSEric Taylor 
3263c112a2bSEric Taylor 	(void) zpool_enable_datasets(pool->uap_zhp, NULL, 0);
3273c112a2bSEric Taylor 	zpool_close(pool->uap_zhp);
3283c112a2bSEric Taylor 	free(pool);
3293c112a2bSEric Taylor }
3303c112a2bSEric Taylor 
3313d7072f8Seschrock static int
3323d7072f8Seschrock zfs_iter_pool(zpool_handle_t *zhp, void *data)
3333d7072f8Seschrock {
3343d7072f8Seschrock 	nvlist_t *config, *nvl;
3353d7072f8Seschrock 	dev_data_t *dp = data;
3363d7072f8Seschrock 	uint64_t pool_guid;
3373c112a2bSEric Taylor 	unavailpool_t *pool;
3383d7072f8Seschrock 
3393d7072f8Seschrock 	if ((config = zpool_get_config(zhp, NULL)) != NULL) {
3403d7072f8Seschrock 		if (dp->dd_pool_guid == 0 ||
3413d7072f8Seschrock 		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3423d7072f8Seschrock 		    &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) {
3433d7072f8Seschrock 			(void) nvlist_lookup_nvlist(config,
3443d7072f8Seschrock 			    ZPOOL_CONFIG_VDEV_TREE, &nvl);
3453d7072f8Seschrock 			zfs_iter_vdev(zhp, nvl, data);
3463d7072f8Seschrock 		}
3473d7072f8Seschrock 	}
34837e3a0d8SEric Taylor 	if (g_enumeration_done)  {
34937e3a0d8SEric Taylor 		for (pool = list_head(&g_pool_list); pool != NULL;
35037e3a0d8SEric Taylor 		    pool = list_next(&g_pool_list, pool)) {
35137e3a0d8SEric Taylor 
35237e3a0d8SEric Taylor 			if (strcmp(zpool_get_name(zhp),
35337e3a0d8SEric Taylor 			    zpool_get_name(pool->uap_zhp)))
35437e3a0d8SEric Taylor 				continue;
35537e3a0d8SEric Taylor 			if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) {
35637e3a0d8SEric Taylor 				list_remove(&g_pool_list, pool);
35737e3a0d8SEric Taylor 				(void) tpool_dispatch(g_tpool, zfs_enable_ds,
35837e3a0d8SEric Taylor 				    pool);
35937e3a0d8SEric Taylor 				break;
36037e3a0d8SEric Taylor 			}
3613c112a2bSEric Taylor 		}
3623c112a2bSEric Taylor 	}
3633d7072f8Seschrock 
3643d7072f8Seschrock 	zpool_close(zhp);
3653d7072f8Seschrock 	return (0);
3663d7072f8Seschrock }
3673d7072f8Seschrock 
3683d7072f8Seschrock /*
3693d7072f8Seschrock  * Given a physical device path, iterate over all (pool, vdev) pairs which
3703d7072f8Seschrock  * correspond to the given path.
3713d7072f8Seschrock  */
3723d7072f8Seschrock static boolean_t
3733d7072f8Seschrock devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
3743d7072f8Seschrock {
3753d7072f8Seschrock 	dev_data_t data = { 0 };
3763d7072f8Seschrock 
3773d7072f8Seschrock 	data.dd_compare = devpath;
3783d7072f8Seschrock 	data.dd_func = func;
3793d7072f8Seschrock 	data.dd_prop = ZPOOL_CONFIG_PHYS_PATH;
3803d7072f8Seschrock 	data.dd_found = B_FALSE;
3813d7072f8Seschrock 	data.dd_isdisk = wholedisk;
3823d7072f8Seschrock 
3833d7072f8Seschrock 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
3843d7072f8Seschrock 
3853d7072f8Seschrock 	return (data.dd_found);
3863d7072f8Seschrock }
3873d7072f8Seschrock 
3883d7072f8Seschrock /*
3893d7072f8Seschrock  * Given a /devices path, lookup the corresponding devid for each minor node,
3903d7072f8Seschrock  * and find any vdevs with matching devids.  Doing this straight up would be
3913d7072f8Seschrock  * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of
3923d7072f8Seschrock  * the fact that each devid ends with "/<minornode>".  Once we find any valid
3933d7072f8Seschrock  * minor node, we chop off the portion after the last slash, and then search for
3943d7072f8Seschrock  * matching vdevs, which is O(vdevs in system).
3953d7072f8Seschrock  */
3963d7072f8Seschrock static boolean_t
3973d7072f8Seschrock devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk)
3983d7072f8Seschrock {
3993d7072f8Seschrock 	size_t len = strlen(devpath) + sizeof ("/devices") +
4003d7072f8Seschrock 	    sizeof (PHYS_PATH) - 1;
4013d7072f8Seschrock 	char *fullpath;
4023d7072f8Seschrock 	int fd;
4033d7072f8Seschrock 	ddi_devid_t devid;
4043d7072f8Seschrock 	char *devidstr, *fulldevid;
4053d7072f8Seschrock 	dev_data_t data = { 0 };
4063d7072f8Seschrock 
4073d7072f8Seschrock 	/*
4083d7072f8Seschrock 	 * Try to open a known minor node.
4093d7072f8Seschrock 	 */
4103d7072f8Seschrock 	fullpath = alloca(len);
4113d7072f8Seschrock 	(void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH);
4123d7072f8Seschrock 	if ((fd = open(fullpath, O_RDONLY)) < 0)
4133d7072f8Seschrock 		return (B_FALSE);
4143d7072f8Seschrock 
4153d7072f8Seschrock 	/*
4163d7072f8Seschrock 	 * Determine the devid as a string, with no trailing slash for the minor
4173d7072f8Seschrock 	 * node.
4183d7072f8Seschrock 	 */
4193d7072f8Seschrock 	if (devid_get(fd, &devid) != 0) {
4203d7072f8Seschrock 		(void) close(fd);
4213d7072f8Seschrock 		return (B_FALSE);
4223d7072f8Seschrock 	}
4233d7072f8Seschrock 	(void) close(fd);
4243d7072f8Seschrock 
4253d7072f8Seschrock 	if ((devidstr = devid_str_encode(devid, NULL)) == NULL) {
4263d7072f8Seschrock 		devid_free(devid);
4273d7072f8Seschrock 		return (B_FALSE);
4283d7072f8Seschrock 	}
4293d7072f8Seschrock 
4303d7072f8Seschrock 	len = strlen(devidstr) + 2;
4313d7072f8Seschrock 	fulldevid = alloca(len);
4323d7072f8Seschrock 	(void) snprintf(fulldevid, len, "%s/", devidstr);
4333d7072f8Seschrock 
4343d7072f8Seschrock 	data.dd_compare = fulldevid;
4353d7072f8Seschrock 	data.dd_func = func;
4363d7072f8Seschrock 	data.dd_prop = ZPOOL_CONFIG_DEVID;
4373d7072f8Seschrock 	data.dd_found = B_FALSE;
4383d7072f8Seschrock 	data.dd_isdisk = wholedisk;
4393d7072f8Seschrock 
4403d7072f8Seschrock 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
4413d7072f8Seschrock 
4423d7072f8Seschrock 	devid_str_free(devidstr);
44325085d90SEric Taylor 	devid_free(devid);
4443d7072f8Seschrock 
4453d7072f8Seschrock 	return (data.dd_found);
4463d7072f8Seschrock }
4473d7072f8Seschrock 
4483d7072f8Seschrock /*
4493d7072f8Seschrock  * This function is called when we receive a devfs add event.  This can be
4503d7072f8Seschrock  * either a disk event or a lofi event, and the behavior is slightly different
4513d7072f8Seschrock  * depending on which it is.
4523d7072f8Seschrock  */
4533d7072f8Seschrock static int
4543d7072f8Seschrock zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi)
4553d7072f8Seschrock {
4563d7072f8Seschrock 	char *devpath, *devname;
4573d7072f8Seschrock 	char path[PATH_MAX], realpath[PATH_MAX];
4583d7072f8Seschrock 	char *colon, *raw;
4593d7072f8Seschrock 	int ret;
4603d7072f8Seschrock 
4613d7072f8Seschrock 	/*
4623d7072f8Seschrock 	 * The main unit of operation is the physical device path.  For disks,
4633d7072f8Seschrock 	 * this is the device node, as all minor nodes are affected.  For lofi
4643d7072f8Seschrock 	 * devices, this includes the minor path.  Unfortunately, this isn't
4653d7072f8Seschrock 	 * represented in the DEV_PHYS_PATH for various reasons.
4663d7072f8Seschrock 	 */
4673d7072f8Seschrock 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0)
4683d7072f8Seschrock 		return (-1);
4693d7072f8Seschrock 
4703d7072f8Seschrock 	/*
4713d7072f8Seschrock 	 * If this is a lofi device, then also get the minor instance name.
4723d7072f8Seschrock 	 * Unfortunately, the current payload doesn't include an easy way to get
4733d7072f8Seschrock 	 * this information.  So we cheat by resolving the 'dev_name' (which
4743d7072f8Seschrock 	 * refers to the raw device) and taking the portion between ':(*),raw'.
4753d7072f8Seschrock 	 */
4763d7072f8Seschrock 	(void) strlcpy(realpath, devpath, sizeof (realpath));
4773d7072f8Seschrock 	if (is_lofi) {
4783d7072f8Seschrock 		if (nvlist_lookup_string(nvl, DEV_NAME,
4793d7072f8Seschrock 		    &devname) == 0 &&
4803d7072f8Seschrock 		    (ret = resolvepath(devname, path,
4813d7072f8Seschrock 		    sizeof (path))) > 0) {
4823d7072f8Seschrock 			path[ret] = '\0';
4833d7072f8Seschrock 			colon = strchr(path, ':');
4843d7072f8Seschrock 			if (colon != NULL)
4853d7072f8Seschrock 				raw = strstr(colon + 1, ",raw");
4863d7072f8Seschrock 			if (colon != NULL && raw != NULL) {
4873d7072f8Seschrock 				*raw = '\0';
4883d7072f8Seschrock 				(void) snprintf(realpath,
4893d7072f8Seschrock 				    sizeof (realpath), "%s%s",
4903d7072f8Seschrock 				    devpath, colon);
4913d7072f8Seschrock 				*raw = ',';
4923d7072f8Seschrock 			}
4933d7072f8Seschrock 		}
4943d7072f8Seschrock 	}
4953d7072f8Seschrock 
4963d7072f8Seschrock 	/*
4973d7072f8Seschrock 	 * Iterate over all vdevs with a matching devid, and then those with a
4983d7072f8Seschrock 	 * matching /devices path.  For disks, we only want to pay attention to
4993d7072f8Seschrock 	 * vdevs marked as whole disks.  For lofi, we don't care (because we're
5003d7072f8Seschrock 	 * matching an exact minor name).
5013d7072f8Seschrock 	 */
5023d7072f8Seschrock 	if (!devid_iter(realpath, zfs_process_add, !is_lofi))
5033d7072f8Seschrock 		(void) devpath_iter(realpath, zfs_process_add, !is_lofi);
5043d7072f8Seschrock 
5053d7072f8Seschrock 	return (0);
5063d7072f8Seschrock }
5073d7072f8Seschrock 
5083d7072f8Seschrock /*
5093d7072f8Seschrock  * Called when we receive a VDEV_CHECK event, which indicates a device could not
5103d7072f8Seschrock  * be opened during initial pool open, but the autoreplace property was set on
5113d7072f8Seschrock  * the pool.  In this case, we treat it as if it were an add event.
5123d7072f8Seschrock  */
5133d7072f8Seschrock static int
5143d7072f8Seschrock zfs_deliver_check(nvlist_t *nvl)
5153d7072f8Seschrock {
5163d7072f8Seschrock 	dev_data_t data = { 0 };
5173d7072f8Seschrock 
5183d7072f8Seschrock 	if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID,
5193d7072f8Seschrock 	    &data.dd_pool_guid) != 0 ||
5203d7072f8Seschrock 	    nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID,
5213d7072f8Seschrock 	    &data.dd_vdev_guid) != 0)
5223d7072f8Seschrock 		return (0);
5233d7072f8Seschrock 
5243d7072f8Seschrock 	data.dd_isdisk = B_TRUE;
5253d7072f8Seschrock 	data.dd_func = zfs_process_add;
5263d7072f8Seschrock 
5273d7072f8Seschrock 	(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);
5283d7072f8Seschrock 
5293d7072f8Seschrock 	return (0);
5303d7072f8Seschrock }
5313d7072f8Seschrock 
532b98131cfSEric Taylor #define	DEVICE_PREFIX	"/devices"
533b98131cfSEric Taylor 
534b98131cfSEric Taylor static int
535b98131cfSEric Taylor zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
536b98131cfSEric Taylor {
537b98131cfSEric Taylor 	char *devname = data;
538b98131cfSEric Taylor 	boolean_t avail_spare, l2cache;
539b98131cfSEric Taylor 	vdev_state_t newstate;
540b98131cfSEric Taylor 	nvlist_t *tgt;
541b98131cfSEric Taylor 
542b98131cfSEric Taylor 	syseventd_print(9, "zfsdle_vdev_online: searching for %s in pool %s\n",
543b98131cfSEric Taylor 	    devname, zpool_get_name(zhp));
544b98131cfSEric Taylor 
545b98131cfSEric Taylor 	if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
546b98131cfSEric Taylor 	    &avail_spare, &l2cache, NULL)) != NULL) {
547b98131cfSEric Taylor 		char *path, fullpath[MAXPATHLEN];
548b98131cfSEric Taylor 		uint64_t wholedisk = 0ULL;
549b98131cfSEric Taylor 
550b98131cfSEric Taylor 		verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
551b98131cfSEric Taylor 		    &path) == 0);
552b98131cfSEric Taylor 		verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
553b98131cfSEric Taylor 		    &wholedisk) == 0);
554b98131cfSEric Taylor 
555b98131cfSEric Taylor 		(void) strlcpy(fullpath, path, sizeof (fullpath));
556*4263d13fSGeorge Wilson 		if (wholedisk) {
557b98131cfSEric Taylor 			fullpath[strlen(fullpath) - 2] = '\0';
558b98131cfSEric Taylor 
559*4263d13fSGeorge Wilson 			/*
560*4263d13fSGeorge Wilson 			 * We need to reopen the pool associated with this
561*4263d13fSGeorge Wilson 			 * device so that the kernel can update the size
562*4263d13fSGeorge Wilson 			 * of the expanded device.
563*4263d13fSGeorge Wilson 			 */
564*4263d13fSGeorge Wilson 			(void) zpool_reopen(zhp);
565*4263d13fSGeorge Wilson 		}
566*4263d13fSGeorge Wilson 
567b98131cfSEric Taylor 		if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
568b98131cfSEric Taylor 			syseventd_print(9, "zfsdle_vdev_online: setting device"
569b98131cfSEric Taylor 			    " device %s to ONLINE state in pool %s.\n",
570b98131cfSEric Taylor 			    fullpath, zpool_get_name(zhp));
571b98131cfSEric Taylor 			if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL)
572b98131cfSEric Taylor 				(void) zpool_vdev_online(zhp, fullpath, 0,
573b98131cfSEric Taylor 				    &newstate);
574b98131cfSEric Taylor 		}
57525085d90SEric Taylor 		zpool_close(zhp);
576b98131cfSEric Taylor 		return (1);
577b98131cfSEric Taylor 	}
57825085d90SEric Taylor 	zpool_close(zhp);
579b98131cfSEric Taylor 	return (0);
580b98131cfSEric Taylor }
581b98131cfSEric Taylor 
582b98131cfSEric Taylor int
583b98131cfSEric Taylor zfs_deliver_dle(nvlist_t *nvl)
584b98131cfSEric Taylor {
585b98131cfSEric Taylor 	char *devname;
586b98131cfSEric Taylor 	if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) {
587b98131cfSEric Taylor 		syseventd_print(9, "zfs_deliver_event: no physpath\n");
588b98131cfSEric Taylor 		return (-1);
589b98131cfSEric Taylor 	}
590b98131cfSEric Taylor 	if (strncmp(devname, DEVICE_PREFIX, strlen(DEVICE_PREFIX)) != 0) {
591b98131cfSEric Taylor 		syseventd_print(9, "zfs_deliver_event: invalid "
592b98131cfSEric Taylor 		    "device '%s'", devname);
593b98131cfSEric Taylor 		return (-1);
594b98131cfSEric Taylor 	}
595b98131cfSEric Taylor 
596b98131cfSEric Taylor 	/*
597b98131cfSEric Taylor 	 * We try to find the device using the physical
598b98131cfSEric Taylor 	 * path that has been supplied. We need to strip off
599b98131cfSEric Taylor 	 * the /devices prefix before starting our search.
600b98131cfSEric Taylor 	 */
601b98131cfSEric Taylor 	devname += strlen(DEVICE_PREFIX);
602b98131cfSEric Taylor 	if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) {
603b98131cfSEric Taylor 		syseventd_print(9, "zfs_deliver_event: device '%s' not"
604b98131cfSEric Taylor 		    " found\n", devname);
605b98131cfSEric Taylor 		return (1);
606b98131cfSEric Taylor 	}
607b98131cfSEric Taylor 	return (0);
608b98131cfSEric Taylor }
609b98131cfSEric Taylor 
610b98131cfSEric Taylor 
6113d7072f8Seschrock /*ARGSUSED*/
6123d7072f8Seschrock static int
6133d7072f8Seschrock zfs_deliver_event(sysevent_t *ev, int unused)
6143d7072f8Seschrock {
6153d7072f8Seschrock 	const char *class = sysevent_get_class_name(ev);
6163d7072f8Seschrock 	const char *subclass = sysevent_get_subclass_name(ev);
6173d7072f8Seschrock 	nvlist_t *nvl;
6183d7072f8Seschrock 	int ret;
619b98131cfSEric Taylor 	boolean_t is_lofi, is_check, is_dle = B_FALSE;
6203d7072f8Seschrock 
6213d7072f8Seschrock 	if (strcmp(class, EC_DEV_ADD) == 0) {
6223d7072f8Seschrock 		/*
6233d7072f8Seschrock 		 * We're mainly interested in disk additions, but we also listen
6243d7072f8Seschrock 		 * for new lofi devices, to allow for simplified testing.
6253d7072f8Seschrock 		 */
6263d7072f8Seschrock 		if (strcmp(subclass, ESC_DISK) == 0)
6273d7072f8Seschrock 			is_lofi = B_FALSE;
6283d7072f8Seschrock 		else if (strcmp(subclass, ESC_LOFI) == 0)
6293d7072f8Seschrock 			is_lofi = B_TRUE;
6303d7072f8Seschrock 		else
6313d7072f8Seschrock 			return (0);
6323d7072f8Seschrock 
6333d7072f8Seschrock 		is_check = B_FALSE;
6343d7072f8Seschrock 	} else if (strcmp(class, EC_ZFS) == 0 &&
6353d7072f8Seschrock 	    strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) {
6363d7072f8Seschrock 		/*
6373d7072f8Seschrock 		 * This event signifies that a device failed to open during pool
6383d7072f8Seschrock 		 * load, but the 'autoreplace' property was set, so we should
6393d7072f8Seschrock 		 * pretend it's just been added.
6403d7072f8Seschrock 		 */
6413d7072f8Seschrock 		is_check = B_TRUE;
642b98131cfSEric Taylor 	} else if (strcmp(class, EC_DEV_STATUS) == 0 &&
643b98131cfSEric Taylor 	    strcmp(subclass, ESC_DEV_DLE) == 0) {
644b98131cfSEric Taylor 		is_dle = B_TRUE;
6453d7072f8Seschrock 	} else {
6463d7072f8Seschrock 		return (0);
6473d7072f8Seschrock 	}
6483d7072f8Seschrock 
6493d7072f8Seschrock 	if (sysevent_get_attr_list(ev, &nvl) != 0)
6503d7072f8Seschrock 		return (-1);
6513d7072f8Seschrock 
652b98131cfSEric Taylor 	if (is_dle)
653b98131cfSEric Taylor 		ret = zfs_deliver_dle(nvl);
654b98131cfSEric Taylor 	else if (is_check)
6553d7072f8Seschrock 		ret = zfs_deliver_check(nvl);
6563d7072f8Seschrock 	else
6573d7072f8Seschrock 		ret = zfs_deliver_add(nvl, is_lofi);
6583d7072f8Seschrock 
6593d7072f8Seschrock 	nvlist_free(nvl);
6603d7072f8Seschrock 	return (ret);
6613d7072f8Seschrock }
6623d7072f8Seschrock 
66337e3a0d8SEric Taylor /*ARGSUSED*/
66437e3a0d8SEric Taylor void *
66537e3a0d8SEric Taylor zfs_enum_pools(void *arg)
66637e3a0d8SEric Taylor {
66737e3a0d8SEric Taylor 	(void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list);
66837e3a0d8SEric Taylor 	if (!list_is_empty(&g_pool_list))
66937e3a0d8SEric Taylor 		g_tpool = tpool_create(1, sysconf(_SC_NPROCESSORS_ONLN),
67037e3a0d8SEric Taylor 		    0, NULL);
67137e3a0d8SEric Taylor 	g_enumeration_done = B_TRUE;
67237e3a0d8SEric Taylor 	return (NULL);
67337e3a0d8SEric Taylor }
67437e3a0d8SEric Taylor 
/*
 * Module operations vector returned to syseventd by slm_init().  It names the
 * syseventd interface version this module was built against and registers
 * zfs_deliver_event() as the event handler.
 * NOTE(review): the third field (10) is presumably the delivery retry limit;
 * confirm against the struct slm_mod_ops definition.
 */
static struct slm_mod_ops zfs_mod_ops = {
	SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event
};
6783d7072f8Seschrock 
6793d7072f8Seschrock struct slm_mod_ops *
6803d7072f8Seschrock slm_init()
6813d7072f8Seschrock {
6823d7072f8Seschrock 	if ((g_zfshdl = libzfs_init()) == NULL)
6833d7072f8Seschrock 		return (NULL);
68437e3a0d8SEric Taylor 	/*
68537e3a0d8SEric Taylor 	 * collect a list of unavailable pools (asynchronously,
68637e3a0d8SEric Taylor 	 * since this can take a while)
68737e3a0d8SEric Taylor 	 */
6883c112a2bSEric Taylor 	list_create(&g_pool_list, sizeof (struct unavailpool),
6893c112a2bSEric Taylor 	    offsetof(struct unavailpool, uap_node));
69037e3a0d8SEric Taylor 	if (thr_create(NULL, 0, zfs_enum_pools, NULL, 0, &g_zfs_tid) != 0)
69137e3a0d8SEric Taylor 		return (NULL);
6923d7072f8Seschrock 	return (&zfs_mod_ops);
6933d7072f8Seschrock }
6943d7072f8Seschrock 
6953d7072f8Seschrock void
6963d7072f8Seschrock slm_fini()
6973d7072f8Seschrock {
6983c112a2bSEric Taylor 	unavailpool_t *pool;
6993c112a2bSEric Taylor 
70037e3a0d8SEric Taylor 	if (g_tpool != NULL) {
7013c112a2bSEric Taylor 		tpool_wait(g_tpool);
7023c112a2bSEric Taylor 		tpool_destroy(g_tpool);
7033c112a2bSEric Taylor 	}
7043c112a2bSEric Taylor 	while ((pool = (list_head(&g_pool_list))) != NULL) {
7053c112a2bSEric Taylor 		list_remove(&g_pool_list, pool);
7063c112a2bSEric Taylor 		zpool_close(pool->uap_zhp);
7073c112a2bSEric Taylor 		free(pool);
7083c112a2bSEric Taylor 	}
70937e3a0d8SEric Taylor 	(void) thr_join(g_zfs_tid, NULL, NULL);
7103c112a2bSEric Taylor 	list_destroy(&g_pool_list);
71125085d90SEric Taylor 	libzfs_fini(g_zfshdl);
7123d7072f8Seschrock }
713