13d7072feschrock/*
23d7072feschrock * CDDL HEADER START
33d7072feschrock *
43d7072feschrock * The contents of this file are subject to the terms of the
53d7072feschrock * Common Development and Distribution License (the "License").
63d7072feschrock * You may not use this file except in compliance with the License.
73d7072feschrock *
83d7072feschrock * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
93d7072feschrock * or http://www.opensolaris.org/os/licensing.
103d7072feschrock * See the License for the specific language governing permissions
113d7072feschrock * and limitations under the License.
123d7072feschrock *
133d7072feschrock * When distributing Covered Code, include this CDDL HEADER in each
143d7072feschrock * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
153d7072feschrock * If applicable, add the following below this CDDL HEADER, with the
163d7072feschrock * fields enclosed by brackets "[]" replaced with your own identifying
173d7072feschrock * information: Portions Copyright [yyyy] [name of copyright owner]
183d7072feschrock *
193d7072feschrock * CDDL HEADER END
203d7072feschrock */
213d7072feschrock/*
22b98131cEric Taylor * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
234263d13George Wilson * Copyright (c) 2012 by Delphix. All rights reserved.
24cead1dfHans Rosenfeld * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
253d7072feschrock */
263d7072feschrock
273d7072feschrock/*
283d7072feschrock * ZFS syseventd module.
293d7072feschrock *
303d7072feschrock * The purpose of this module is to identify when devices are added to the
313d7072feschrock * system, and appropriately online or replace the affected vdevs.
323d7072feschrock *
333d7072feschrock * When a device is added to the system:
343d7072feschrock *
353d7072feschrock * 	1. Search for any vdevs whose devid matches that of the newly added
363d7072feschrock *	   device.
373d7072feschrock *
383d7072feschrock * 	2. If no vdevs are found, then search for any vdevs whose devfs path
393d7072feschrock *	   matches that of the new device.
403d7072feschrock *
413d7072feschrock *	3. If no vdevs match by either method, then ignore the event.
423d7072feschrock *
433d7072feschrock * 	4. Attempt to online the device with a flag to indicate that it should
443d7072feschrock *	   be unspared when resilvering completes.  If this succeeds, then the
453d7072feschrock *	   same device was inserted and we should continue normally.
463d7072feschrock *
473d7072feschrock *	5. If the pool does not have the 'autoreplace' property set, attempt to
483d7072feschrock *	   online the device again without the unspare flag, which will
493d7072feschrock *	   generate a FMA fault.
503d7072feschrock *
513d7072feschrock *	6. If the pool has the 'autoreplace' property set, and the matching vdev
523d7072feschrock *	   is a whole disk, then label the new disk and attempt a 'zpool
533d7072feschrock *	   replace'.
543d7072feschrock *
553d7072feschrock * The module responds to EC_DEV_ADD events for both disks and lofi devices,
563d7072feschrock * with the latter used for testing.  The special ESC_ZFS_VDEV_CHECK event
573d7072feschrock * indicates that a device failed to open during pool load, but the autoreplace
583d7072feschrock * property was set.  In this case, we deferred the associated FMA fault until
593d7072feschrock * our module had a chance to process the autoreplace logic.  If the device
603d7072feschrock * could not be replaced, then the second online attempt will trigger the FMA
613d7072feschrock * fault that we skipped earlier.
623d7072feschrock */
633d7072feschrock
643d7072feschrock#include <alloca.h>
653d7072feschrock#include <devid.h>
663d7072feschrock#include <fcntl.h>
673d7072feschrock#include <libnvpair.h>
683d7072feschrock#include <libsysevent.h>
693d7072feschrock#include <libzfs.h>
703d7072feschrock#include <limits.h>
713d7072feschrock#include <stdlib.h>
723d7072feschrock#include <string.h>
733d7072feschrock#include <syslog.h>
743c112a2Eric Taylor#include <sys/list.h>
753d7072feschrock#include <sys/sunddi.h>
763d7072feschrock#include <sys/sysevent/eventdefs.h>
773d7072feschrock#include <sys/sysevent/dev.h>
783c112a2Eric Taylor#include <thread_pool.h>
793d7072feschrock#include <unistd.h>
80b98131cEric Taylor#include "syseventd.h"
813d7072feschrock
/*
 * Architecture-specific suffixes.  x86 (32- and 64-bit) and SPARC use
 * different values; any other architecture is rejected at compile time.
 * NOTE(review): presumably PHYS_PATH is the minor-node suffix and RAW_SLICE
 * the slice name of the whole-disk node -- their use is outside this view,
 * confirm against the callers.
 */
#if defined(__i386) || defined(__amd64)
#define	PHYS_PATH	":q"
#define	RAW_SLICE	"p0"
#elif defined(__sparc)
#define	PHYS_PATH	":c"
#define	RAW_SLICE	"s2"
#else
#error Unknown architecture
#endif
913d7072feschrock
923d7072feschrocktypedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t);
933d7072feschrock
943d7072feschrocklibzfs_handle_t *g_zfshdl;
953c112a2Eric Taylorlist_t g_pool_list;
963c112a2Eric Taylortpool_t *g_tpool;
9737e3a0dEric Taylorboolean_t g_enumeration_done;
9837e3a0dEric Taylorthread_t g_zfs_tid;
993c112a2Eric Taylor
1003c112a2Eric Taylortypedef struct unavailpool {
1013c112a2Eric Taylor	zpool_handle_t	*uap_zhp;
1023c112a2Eric Taylor	list_node_t	uap_node;
1033c112a2Eric Taylor} unavailpool_t;
1043c112a2Eric Taylor
1053c112a2Eric Taylorint
1063c112a2Eric Taylorzfs_toplevel_state(zpool_handle_t *zhp)
1073c112a2Eric Taylor{
1083c112a2Eric Taylor	nvlist_t *nvroot;
1093c112a2Eric Taylor	vdev_stat_t *vs;
1103c112a2Eric Taylor	unsigned int c;
1113c112a2Eric Taylor
1123c112a2Eric Taylor	verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
1133c112a2Eric Taylor	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1143c112a2Eric Taylor	verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
1153c112a2Eric Taylor	    (uint64_t **)&vs, &c) == 0);
1163c112a2Eric Taylor	return (vs->vs_state);
1173c112a2Eric Taylor}
1183c112a2Eric Taylor
1193c112a2Eric Taylorstatic int
1203c112a2Eric Taylorzfs_unavail_pool(zpool_handle_t *zhp, void *data)
1213c112a2Eric Taylor{
1223c112a2Eric Taylor	if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) {
1233c112a2Eric Taylor		unavailpool_t *uap;
1243c112a2Eric Taylor		uap = malloc(sizeof (unavailpool_t));
1253c112a2Eric Taylor		uap->uap_zhp = zhp;
1263c112a2Eric Taylor		list_insert_tail((list_t *)data, uap);
1273c112a2Eric Taylor	} else {
1283c112a2Eric Taylor		zpool_close(zhp);
1293c112a2Eric Taylor	}
1303c112a2Eric Taylor	return (0);
1313c112a2Eric Taylor}
1323d7072feschrock
1333d7072feschrock/*
1343d7072feschrock * The device associated with the given vdev (either by devid or physical path)
1353d7072feschrock * has been added to the system.  If 'isdisk' is set, then we only attempt a
1363d7072feschrock * replacement if it's a whole disk.  This also implies that we should label the
1373d7072feschrock * disk first.
1383d7072feschrock *
1393d7072feschrock * First, we attempt to online the device (making sure to undo any spare
1403d7072feschrock * operation when finished).  If this succeeds, then we're done.  If it fails,
1413d7072feschrock * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened,
1423d7072feschrock * but that the label was not what we expected.  If the 'autoreplace' property
1433d7072feschrock * is not set, then we relabel the disk (if specified), and attempt a 'zpool
1443d7072feschrock * replace'.  If the online is successful, but the new state is something else
1453d7072feschrock * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of
1463d7072feschrock * race, and we should avoid attempting to relabel the disk.
1473d7072feschrock */
1483d7072feschrockstatic void
1493d7072feschrockzfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk)
1503d7072feschrock{
1513d7072feschrock	char *path;
1523d7072feschrock	vdev_state_t newstate;
1533d7072feschrock	nvlist_t *nvroot, *newvd;
1543d7072feschrock	uint64_t wholedisk = 0ULL;
155acd07c6Yuri Pankov	uint64_t offline = 0ULL;
156bf82a41eschrock	char *physpath = NULL;
1573d7072feschrock	char rawpath[PATH_MAX], fullpath[PATH_MAX];
1587855d95Toomas Soome	zpool_boot_label_t boot_type;
1597855d95Toomas Soome	uint64_t boot_size;
1603d7072feschrock	size_t len;
1613d7072feschrock
1623d7072feschrock	if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
1633d7072feschrock		return;
1643d7072feschrock
165bf82a41eschrock	(void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath);
1663d7072feschrock	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
167acd07c6Yuri Pankov	(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
1683d7072feschrock
1693d7072feschrock	/*
1703d7072feschrock	 * We should have a way to online a device by guid.  With the current
1713d7072feschrock	 * interface, we are forced to chop off the 's0' for whole disks.
1723d7072feschrock	 */
1733d7072feschrock	(void) strlcpy(fullpath, path, sizeof (fullpath));
1743d7072feschrock	if (wholedisk)
1753d7072feschrock		fullpath[strlen(fullpath) - 2] = '\0';
1763d7072feschrock
1773d7072feschrock	/*
1783d7072feschrock	 * Attempt to online the device.  It would be nice to online this by
1793d7072feschrock	 * GUID, but the current interface only supports lookup by path.
1803d7072feschrock	 */
181acd07c6Yuri Pankov	if (offline ||
182acd07c6Yuri Pankov	    (zpool_vdev_online(zhp, fullpath,
1833d7072feschrock	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
184acd07c6Yuri Pankov	    (newstate == VDEV_STATE_HEALTHY ||
185acd07c6Yuri Pankov	    newstate == VDEV_STATE_DEGRADED)))
1863d7072feschrock		return;
1873d7072feschrock
1883d7072feschrock	/*
1893d7072feschrock	 * If the pool doesn't have the autoreplace property set, then attempt a
1903d7072feschrock	 * true online (without the unspare flag), which will trigger a FMA
1913d7072feschrock	 * fault.
1923d7072feschrock	 */
193990b485lling	if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
1943d7072feschrock	    (isdisk && !wholedisk)) {
1953d7072feschrock		(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
1963d7072feschrock		    &newstate);
1973d7072feschrock		return;
1983d7072feschrock	}
1993d7072feschrock
2003d7072feschrock	if (isdisk) {
2013d7072feschrock		/*
2023d7072feschrock		 * If this is a request to label a whole disk, then attempt to
2033d7072feschrock		 * write out the label.  Before we can label the disk, we need
2043d7072feschrock		 * access to a raw node.  Ideally, we'd like to walk the devinfo
2053d7072feschrock		 * tree and find a raw node from the corresponding parent node.
2063d7072feschrock		 * This is overly complicated, and since we know how we labeled
2073d7072feschrock		 * this device in the first place, we know it's save to switch
2083d7072feschrock		 * from /dev/dsk to /dev/rdsk and append the backup slice.
209c5904d1eschrock		 *
210c5904d1eschrock		 * If any part of this process fails, then do a force online to
211c5904d1eschrock		 * trigger a ZFS fault for the device (and any hot spare
212c5904d1eschrock		 * replacement).
2133d7072feschrock		 */
2146401734Will Andrews		if (strncmp(path, ZFS_DISK_ROOTD,
2156401734Will Andrews		    strlen(ZFS_DISK_ROOTD)) != 0) {
216c5904d1eschrock			(void) zpool_vdev_online(zhp, fullpath,
217c5904d1eschrock			    ZFS_ONLINE_FORCEFAULT, &newstate);
2183d7072feschrock			return;
219c5904d1eschrock		}
2203d7072feschrock
2213d7072feschrock		(void) strlcpy(rawpath, path + 9, sizeof (rawpath));
2223d7072feschrock		len = strlen(rawpath);
2233d7072feschrock		rawpath[len - 2] = '\0';
2243d7072feschrock
2257855d95Toomas Soome		if (zpool_is_bootable(zhp))
2267855d95Toomas Soome			boot_type = ZPOOL_COPY_BOOT_LABEL;
2277855d95Toomas Soome		else
2287855d95Toomas Soome			boot_type = ZPOOL_NO_BOOT_LABEL;
2297855d95Toomas Soome
2307855d95Toomas Soome		boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL);
2317855d95Toomas Soome		if (zpool_label_disk(g_zfshdl, zhp, rawpath,
2327855d95Toomas Soome		    boot_type, boot_size, NULL) != 0) {
233c5904d1eschrock			(void) zpool_vdev_online(zhp, fullpath,
234c5904d1eschrock			    ZFS_ONLINE_FORCEFAULT, &newstate);
2353d7072feschrock			return;
236c5904d1eschrock		}
2373d7072feschrock	}
2383d7072feschrock
2393d7072feschrock	/*
2403d7072feschrock	 * Cosntruct the root vdev to pass to zpool_vdev_attach().  While adding
2413d7072feschrock	 * the entire vdev structure is harmless, we construct a reduced set of
242bf82a41eschrock	 * path/physpath/wholedisk to keep it simple.
2433d7072feschrock	 */
2443d7072feschrock	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
2453d7072feschrock		return;
2463d7072feschrock
2473d7072feschrock	if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
2483d7072feschrock		nvlist_free(nvroot);
2493d7072feschrock		return;
2503d7072feschrock	}
2513d7072feschrock
2523d7072feschrock	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 ||
2533d7072feschrock	    nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 ||
254bf82a41eschrock	    (physpath != NULL && nvlist_add_string(newvd,
255bf82a41eschrock	    ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) ||
2563d7072feschrock	    nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 ||
2573d7072feschrock	    nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 ||
2583d7072feschrock	    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd,
2593d7072feschrock	    1) != 0) {
2603d7072feschrock		nvlist_free(newvd);
2613d7072feschrock		nvlist_free(nvroot);
2623d7072feschrock		return;
2633d7072feschrock	}
2643d7072feschrock
2653d7072feschrock	nvlist_free(newvd);
2663d7072feschrock
2673d7072feschrock	(void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
2683d7072feschrock
2693d7072feschrock	nvlist_free(nvroot);
2703d7072feschrock
2713d7072feschrock}
2723d7072feschrock
2733d7072feschrock/*
2743d7072feschrock * Utility functions to find a vdev matching given criteria.
2753d7072feschrock */
2763d7072feschrocktypedef struct dev_data {
2773d7072feschrock	const char		*dd_compare;
2783d7072feschrock	const char		*dd_prop;
2793d7072feschrock	zfs_process_func_t	dd_func;
2803d7072feschrock	boolean_t		dd_found;
2813d7072feschrock	boolean_t		dd_isdisk;
2823d7072feschrock	uint64_t		dd_pool_guid;
2833d7072feschrock	uint64_t		dd_vdev_guid;
2843d7072feschrock} dev_data_t;
2853d7072feschrock
2863d7072feschrockstatic void
2873d7072feschrockzfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
2883d7072feschrock{
2893d7072feschrock	dev_data_t *dp = data;
2903d7072feschrock	char *path;
2913d7072feschrock	uint_t c, children;
2923d7072feschrock	nvlist_t **child;
293b01c3b5eschrock	size_t len;
2943d7072feschrock	uint64_t guid;
2953d7072feschrock
2963d7072feschrock	/*
2973d7072feschrock	 * First iterate over any children.
2983d7072feschrock	 */
2993d7072feschrock	if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN,
3003d7072feschrock	    &child, &children) == 0) {
3013d7072feschrock		for (c = 0; c < children; c++)
3023d7072feschrock			zfs_iter_vdev(zhp, child[c], data);
3033d7072feschrock		return;
3043d7072feschrock	}
3053d7072feschrock
3063d7072feschrock	if (dp->dd_vdev_guid != 0) {
3073d7072feschrock		if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID,
3083d7072feschrock		    &guid) != 0 || guid != dp->dd_vdev_guid)
3093d7072feschrock			return;
3101437283Hans Rosenfeld	} else if (dp->dd_compare != NULL) {
311b01c3b5eschrock		len = strlen(dp->dd_compare);
312b01c3b5eschrock
3133d7072feschrock		if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
3143d7072feschrock		    strncmp(dp->dd_compare, path, len) != 0)
3153d7072feschrock			return;
3163d7072feschrock
3173d7072feschrock		/*
3183d7072feschrock		 * Normally, we want to have an exact match for the comparison
3193d7072feschrock		 * string.  However, we allow substring matches in the following
3203d7072feschrock		 * cases:
3213d7072feschrock		 *
3223d7072feschrock		 * 	<path>:		This is a devpath, and the target is one
3233d7072feschrock		 * 			of its children.
3243d7072feschrock		 *
3253d7072feschrock		 * 	<path/>		This is a devid for a whole disk, and
3263d7072feschrock</