/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * ZFS syseventd module.
 *
 * The purpose of this module is to identify when devices are added to the
 * system, and appropriately online or replace the affected vdevs.
 *
 * When a device is added to the system:
 *
 *	1. Search for any vdevs whose devid matches that of the newly added
 *	   device.
 *
 *	2. If no vdevs are found, then search for any vdevs whose devfs path
 *	   matches that of the new device.
 *
 *	3. If no vdevs match by either method, then ignore the event.
 *
 *	4. Attempt to online the device with a flag to indicate that it should
 *	   be unspared when resilvering completes.  If this succeeds, then the
 *	   same device was inserted and we should continue normally.
 *
 *	5. If the pool does not have the 'autoreplace' property set, attempt to
 *	   online the device again without the unspare flag, which will
 *	   generate an FMA fault.
 *
 *	6. If the pool has the 'autoreplace' property set, and the matching vdev
 *	   is a whole disk, then label the new disk and attempt a 'zpool
 *	   replace'.
 *
 * The module responds to EC_DEV_ADD events for both disks and lofi devices,
 * with the latter used for testing.  The special ESC_ZFS_VDEV_CHECK event
 * indicates that a device failed to open during pool load, but the autoreplace
 * property was set.  In this case, we deferred the associated FMA fault until
 * our module had a chance to process the autoreplace logic.  If the device
 * could not be replaced, then the second online attempt will trigger the FMA
 * fault that we skipped earlier.
613d7072f8Seschrock */ 623d7072f8Seschrock 633d7072f8Seschrock #include <alloca.h> 643d7072f8Seschrock #include <devid.h> 653d7072f8Seschrock #include <fcntl.h> 663d7072f8Seschrock #include <libnvpair.h> 673d7072f8Seschrock #include <libsysevent.h> 683d7072f8Seschrock #include <libzfs.h> 693d7072f8Seschrock #include <limits.h> 703d7072f8Seschrock #include <stdlib.h> 713d7072f8Seschrock #include <string.h> 723d7072f8Seschrock #include <syslog.h> 733c112a2bSEric Taylor #include <sys/list.h> 743d7072f8Seschrock #include <sys/sunddi.h> 753d7072f8Seschrock #include <sys/sysevent/eventdefs.h> 763d7072f8Seschrock #include <sys/sysevent/dev.h> 773c112a2bSEric Taylor #include <thread_pool.h> 783d7072f8Seschrock #include <unistd.h> 79b98131cfSEric Taylor #include "syseventd.h" 803d7072f8Seschrock 813d7072f8Seschrock #if defined(__i386) || defined(__amd64) 823d7072f8Seschrock #define PHYS_PATH ":q" 833d7072f8Seschrock #define RAW_SLICE "p0" 843d7072f8Seschrock #elif defined(__sparc) 853d7072f8Seschrock #define PHYS_PATH ":c" 863d7072f8Seschrock #define RAW_SLICE "s2" 873d7072f8Seschrock #else 883d7072f8Seschrock #error Unknown architecture 893d7072f8Seschrock #endif 903d7072f8Seschrock 913d7072f8Seschrock typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); 923d7072f8Seschrock 933d7072f8Seschrock libzfs_handle_t *g_zfshdl; 943c112a2bSEric Taylor list_t g_pool_list; 953c112a2bSEric Taylor tpool_t *g_tpool; 9637e3a0d8SEric Taylor boolean_t g_enumeration_done; 9737e3a0d8SEric Taylor thread_t g_zfs_tid; 983c112a2bSEric Taylor 993c112a2bSEric Taylor typedef struct unavailpool { 1003c112a2bSEric Taylor zpool_handle_t *uap_zhp; 1013c112a2bSEric Taylor list_node_t uap_node; 1023c112a2bSEric Taylor } unavailpool_t; 1033c112a2bSEric Taylor 1043c112a2bSEric Taylor int 1053c112a2bSEric Taylor zfs_toplevel_state(zpool_handle_t *zhp) 1063c112a2bSEric Taylor { 1073c112a2bSEric Taylor nvlist_t *nvroot; 1083c112a2bSEric Taylor vdev_stat_t *vs; 1093c112a2bSEric Taylor 
unsigned int c; 1103c112a2bSEric Taylor 1113c112a2bSEric Taylor verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), 1123c112a2bSEric Taylor ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1133c112a2bSEric Taylor verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, 1143c112a2bSEric Taylor (uint64_t **)&vs, &c) == 0); 1153c112a2bSEric Taylor return (vs->vs_state); 1163c112a2bSEric Taylor } 1173c112a2bSEric Taylor 1183c112a2bSEric Taylor static int 1193c112a2bSEric Taylor zfs_unavail_pool(zpool_handle_t *zhp, void *data) 1203c112a2bSEric Taylor { 1213c112a2bSEric Taylor if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { 1223c112a2bSEric Taylor unavailpool_t *uap; 1233c112a2bSEric Taylor uap = malloc(sizeof (unavailpool_t)); 1243c112a2bSEric Taylor uap->uap_zhp = zhp; 1253c112a2bSEric Taylor list_insert_tail((list_t *)data, uap); 1263c112a2bSEric Taylor } else { 1273c112a2bSEric Taylor zpool_close(zhp); 1283c112a2bSEric Taylor } 1293c112a2bSEric Taylor return (0); 1303c112a2bSEric Taylor } 1313d7072f8Seschrock 1323d7072f8Seschrock /* 1333d7072f8Seschrock * The device associated with the given vdev (either by devid or physical path) 1343d7072f8Seschrock * has been added to the system. If 'isdisk' is set, then we only attempt a 1353d7072f8Seschrock * replacement if it's a whole disk. This also implies that we should label the 1363d7072f8Seschrock * disk first. 1373d7072f8Seschrock * 1383d7072f8Seschrock * First, we attempt to online the device (making sure to undo any spare 1393d7072f8Seschrock * operation when finished). If this succeeds, then we're done. If it fails, 1403d7072f8Seschrock * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, 1413d7072f8Seschrock * but that the label was not what we expected. If the 'autoreplace' property 1423d7072f8Seschrock * is not set, then we relabel the disk (if specified), and attempt a 'zpool 1433d7072f8Seschrock * replace'. 
If the online is successful, but the new state is something else 1443d7072f8Seschrock * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of 1453d7072f8Seschrock * race, and we should avoid attempting to relabel the disk. 1463d7072f8Seschrock */ 1473d7072f8Seschrock static void 1483d7072f8Seschrock zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk) 1493d7072f8Seschrock { 1503d7072f8Seschrock char *path; 1513d7072f8Seschrock vdev_state_t newstate; 1523d7072f8Seschrock nvlist_t *nvroot, *newvd; 1533d7072f8Seschrock uint64_t wholedisk = 0ULL; 154bf82a41bSeschrock char *physpath = NULL; 1553d7072f8Seschrock char rawpath[PATH_MAX], fullpath[PATH_MAX]; 1563d7072f8Seschrock size_t len; 1573d7072f8Seschrock 1583d7072f8Seschrock if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) 1593d7072f8Seschrock return; 1603d7072f8Seschrock 161bf82a41bSeschrock (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); 1623d7072f8Seschrock (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); 1633d7072f8Seschrock 1643d7072f8Seschrock /* 1653d7072f8Seschrock * We should have a way to online a device by guid. With the current 1663d7072f8Seschrock * interface, we are forced to chop off the 's0' for whole disks. 1673d7072f8Seschrock */ 1683d7072f8Seschrock (void) strlcpy(fullpath, path, sizeof (fullpath)); 1693d7072f8Seschrock if (wholedisk) 1703d7072f8Seschrock fullpath[strlen(fullpath) - 2] = '\0'; 1713d7072f8Seschrock 1723d7072f8Seschrock /* 1733d7072f8Seschrock * Attempt to online the device. It would be nice to online this by 1743d7072f8Seschrock * GUID, but the current interface only supports lookup by path. 
1753d7072f8Seschrock */ 1763d7072f8Seschrock if (zpool_vdev_online(zhp, fullpath, 1773d7072f8Seschrock ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && 178bf82a41bSeschrock (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED)) 1793d7072f8Seschrock return; 1803d7072f8Seschrock 1813d7072f8Seschrock /* 1823d7072f8Seschrock * If the pool doesn't have the autoreplace property set, then attempt a 1833d7072f8Seschrock * true online (without the unspare flag), which will trigger a FMA 1843d7072f8Seschrock * fault. 1853d7072f8Seschrock */ 186990b4856Slling if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || 1873d7072f8Seschrock (isdisk && !wholedisk)) { 1883d7072f8Seschrock (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, 1893d7072f8Seschrock &newstate); 1903d7072f8Seschrock return; 1913d7072f8Seschrock } 1923d7072f8Seschrock 1933d7072f8Seschrock if (isdisk) { 1943d7072f8Seschrock /* 1953d7072f8Seschrock * If this is a request to label a whole disk, then attempt to 1963d7072f8Seschrock * write out the label. Before we can label the disk, we need 1973d7072f8Seschrock * access to a raw node. Ideally, we'd like to walk the devinfo 1983d7072f8Seschrock * tree and find a raw node from the corresponding parent node. 1993d7072f8Seschrock * This is overly complicated, and since we know how we labeled 2003d7072f8Seschrock * this device in the first place, we know it's save to switch 2013d7072f8Seschrock * from /dev/dsk to /dev/rdsk and append the backup slice. 202c5904d13Seschrock * 203c5904d13Seschrock * If any part of this process fails, then do a force online to 204c5904d13Seschrock * trigger a ZFS fault for the device (and any hot spare 205c5904d13Seschrock * replacement). 
2063d7072f8Seschrock */ 207c5904d13Seschrock if (strncmp(path, "/dev/dsk/", 9) != 0) { 208c5904d13Seschrock (void) zpool_vdev_online(zhp, fullpath, 209c5904d13Seschrock ZFS_ONLINE_FORCEFAULT, &newstate); 2103d7072f8Seschrock return; 211c5904d13Seschrock } 2123d7072f8Seschrock 2133d7072f8Seschrock (void) strlcpy(rawpath, path + 9, sizeof (rawpath)); 2143d7072f8Seschrock len = strlen(rawpath); 2153d7072f8Seschrock rawpath[len - 2] = '\0'; 2163d7072f8Seschrock 217c5904d13Seschrock if (zpool_label_disk(g_zfshdl, zhp, rawpath) != 0) { 218c5904d13Seschrock (void) zpool_vdev_online(zhp, fullpath, 219c5904d13Seschrock ZFS_ONLINE_FORCEFAULT, &newstate); 2203d7072f8Seschrock return; 221c5904d13Seschrock } 2223d7072f8Seschrock } 2233d7072f8Seschrock 2243d7072f8Seschrock /* 2253d7072f8Seschrock * Cosntruct the root vdev to pass to zpool_vdev_attach(). While adding 2263d7072f8Seschrock * the entire vdev structure is harmless, we construct a reduced set of 227bf82a41bSeschrock * path/physpath/wholedisk to keep it simple. 
2283d7072f8Seschrock */ 2293d7072f8Seschrock if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) 2303d7072f8Seschrock return; 2313d7072f8Seschrock 2323d7072f8Seschrock if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 2333d7072f8Seschrock nvlist_free(nvroot); 2343d7072f8Seschrock return; 2353d7072f8Seschrock } 2363d7072f8Seschrock 2373d7072f8Seschrock if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || 2383d7072f8Seschrock nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || 239bf82a41bSeschrock (physpath != NULL && nvlist_add_string(newvd, 240bf82a41bSeschrock ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || 2413d7072f8Seschrock nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || 2423d7072f8Seschrock nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || 2433d7072f8Seschrock nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, 2443d7072f8Seschrock 1) != 0) { 2453d7072f8Seschrock nvlist_free(newvd); 2463d7072f8Seschrock nvlist_free(nvroot); 2473d7072f8Seschrock return; 2483d7072f8Seschrock } 2493d7072f8Seschrock 2503d7072f8Seschrock nvlist_free(newvd); 2513d7072f8Seschrock 2523d7072f8Seschrock (void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); 2533d7072f8Seschrock 2543d7072f8Seschrock nvlist_free(nvroot); 2553d7072f8Seschrock 2563d7072f8Seschrock } 2573d7072f8Seschrock 2583d7072f8Seschrock /* 2593d7072f8Seschrock * Utility functions to find a vdev matching given criteria. 
2603d7072f8Seschrock */ 2613d7072f8Seschrock typedef struct dev_data { 2623d7072f8Seschrock const char *dd_compare; 2633d7072f8Seschrock const char *dd_prop; 2643d7072f8Seschrock zfs_process_func_t dd_func; 2653d7072f8Seschrock boolean_t dd_found; 2663d7072f8Seschrock boolean_t dd_isdisk; 2673d7072f8Seschrock uint64_t dd_pool_guid; 2683d7072f8Seschrock uint64_t dd_vdev_guid; 2693d7072f8Seschrock } dev_data_t; 2703d7072f8Seschrock 2713d7072f8Seschrock static void 2723d7072f8Seschrock zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) 2733d7072f8Seschrock { 2743d7072f8Seschrock dev_data_t *dp = data; 2753d7072f8Seschrock char *path; 2763d7072f8Seschrock uint_t c, children; 2773d7072f8Seschrock nvlist_t **child; 278b01c3b58Seschrock size_t len; 2793d7072f8Seschrock uint64_t guid; 2803d7072f8Seschrock 2813d7072f8Seschrock /* 2823d7072f8Seschrock * First iterate over any children. 2833d7072f8Seschrock */ 2843d7072f8Seschrock if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, 2853d7072f8Seschrock &child, &children) == 0) { 2863d7072f8Seschrock for (c = 0; c < children; c++) 2873d7072f8Seschrock zfs_iter_vdev(zhp, child[c], data); 2883d7072f8Seschrock return; 2893d7072f8Seschrock } 2903d7072f8Seschrock 2913d7072f8Seschrock if (dp->dd_vdev_guid != 0) { 2923d7072f8Seschrock if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, 2933d7072f8Seschrock &guid) != 0 || guid != dp->dd_vdev_guid) 2943d7072f8Seschrock return; 2953d7072f8Seschrock } else { 296b01c3b58Seschrock len = strlen(dp->dd_compare); 297b01c3b58Seschrock 2983d7072f8Seschrock if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || 2993d7072f8Seschrock strncmp(dp->dd_compare, path, len) != 0) 3003d7072f8Seschrock return; 3013d7072f8Seschrock 3023d7072f8Seschrock /* 3033d7072f8Seschrock * Normally, we want to have an exact match for the comparison 3043d7072f8Seschrock * string. 
However, we allow substring matches in the following 3053d7072f8Seschrock * cases: 3063d7072f8Seschrock * 3073d7072f8Seschrock * <path>: This is a devpath, and the target is one 3083d7072f8Seschrock * of its children. 3093d7072f8Seschrock * 3103d7072f8Seschrock * <path/> This is a devid for a whole disk, and 3113d7072f8Seschrock * the target is one of its children. 3123d7072f8Seschrock */ 3133d7072f8Seschrock if (path[len] != '\0' && path[len] != ':' && 3143d7072f8Seschrock path[len - 1] != '/') 3153d7072f8Seschrock return; 3163d7072f8Seschrock } 3173d7072f8Seschrock 3183d7072f8Seschrock (dp->dd_func)(zhp, nvl, dp->dd_isdisk); 3193d7072f8Seschrock } 3203d7072f8Seschrock 3213c112a2bSEric Taylor void 3223c112a2bSEric Taylor zfs_enable_ds(void *arg) 3233c112a2bSEric Taylor { 3243c112a2bSEric Taylor unavailpool_t *pool = (unavailpool_t *)arg; 3253c112a2bSEric Taylor 3263c112a2bSEric Taylor (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); 3273c112a2bSEric Taylor zpool_close(pool->uap_zhp); 3283c112a2bSEric Taylor free(pool); 3293c112a2bSEric Taylor } 3303c112a2bSEric Taylor 3313d7072f8Seschrock static int 3323d7072f8Seschrock zfs_iter_pool(zpool_handle_t *zhp, void *data) 3333d7072f8Seschrock { 3343d7072f8Seschrock nvlist_t *config, *nvl; 3353d7072f8Seschrock dev_data_t *dp = data; 3363d7072f8Seschrock uint64_t pool_guid; 3373c112a2bSEric Taylor unavailpool_t *pool; 3383d7072f8Seschrock 3393d7072f8Seschrock if ((config = zpool_get_config(zhp, NULL)) != NULL) { 3403d7072f8Seschrock if (dp->dd_pool_guid == 0 || 3413d7072f8Seschrock (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3423d7072f8Seschrock &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { 3433d7072f8Seschrock (void) nvlist_lookup_nvlist(config, 3443d7072f8Seschrock ZPOOL_CONFIG_VDEV_TREE, &nvl); 3453d7072f8Seschrock zfs_iter_vdev(zhp, nvl, data); 3463d7072f8Seschrock } 3473d7072f8Seschrock } 34837e3a0d8SEric Taylor if (g_enumeration_done) { 34937e3a0d8SEric Taylor for (pool = 
list_head(&g_pool_list); pool != NULL; 35037e3a0d8SEric Taylor pool = list_next(&g_pool_list, pool)) { 35137e3a0d8SEric Taylor 35237e3a0d8SEric Taylor if (strcmp(zpool_get_name(zhp), 35337e3a0d8SEric Taylor zpool_get_name(pool->uap_zhp))) 35437e3a0d8SEric Taylor continue; 35537e3a0d8SEric Taylor if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) { 35637e3a0d8SEric Taylor list_remove(&g_pool_list, pool); 35737e3a0d8SEric Taylor (void) tpool_dispatch(g_tpool, zfs_enable_ds, 35837e3a0d8SEric Taylor pool); 35937e3a0d8SEric Taylor break; 36037e3a0d8SEric Taylor } 3613c112a2bSEric Taylor } 3623c112a2bSEric Taylor } 3633d7072f8Seschrock 3643d7072f8Seschrock zpool_close(zhp); 3653d7072f8Seschrock return (0); 3663d7072f8Seschrock } 3673d7072f8Seschrock 3683d7072f8Seschrock /* 3693d7072f8Seschrock * Given a physical device path, iterate over all (pool, vdev) pairs which 3703d7072f8Seschrock * correspond to the given path. 3713d7072f8Seschrock */ 3723d7072f8Seschrock static boolean_t 3733d7072f8Seschrock devpath_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk) 3743d7072f8Seschrock { 3753d7072f8Seschrock dev_data_t data = { 0 }; 3763d7072f8Seschrock 3773d7072f8Seschrock data.dd_compare = devpath; 3783d7072f8Seschrock data.dd_func = func; 3793d7072f8Seschrock data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; 3803d7072f8Seschrock data.dd_found = B_FALSE; 3813d7072f8Seschrock data.dd_isdisk = wholedisk; 3823d7072f8Seschrock 3833d7072f8Seschrock (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 3843d7072f8Seschrock 3853d7072f8Seschrock return (data.dd_found); 3863d7072f8Seschrock } 3873d7072f8Seschrock 3883d7072f8Seschrock /* 3893d7072f8Seschrock * Given a /devices path, lookup the corresponding devid for each minor node, 3903d7072f8Seschrock * and find any vdevs with matching devids. 
Doing this straight up would be 3913d7072f8Seschrock * rather inefficient, O(minor nodes * vdevs in system), so we take advantage of 3923d7072f8Seschrock * the fact that each devid ends with "/<minornode>". Once we find any valid 3933d7072f8Seschrock * minor node, we chop off the portion after the last slash, and then search for 3943d7072f8Seschrock * matching vdevs, which is O(vdevs in system). 3953d7072f8Seschrock */ 3963d7072f8Seschrock static boolean_t 3973d7072f8Seschrock devid_iter(const char *devpath, zfs_process_func_t func, boolean_t wholedisk) 3983d7072f8Seschrock { 3993d7072f8Seschrock size_t len = strlen(devpath) + sizeof ("/devices") + 4003d7072f8Seschrock sizeof (PHYS_PATH) - 1; 4013d7072f8Seschrock char *fullpath; 4023d7072f8Seschrock int fd; 4033d7072f8Seschrock ddi_devid_t devid; 4043d7072f8Seschrock char *devidstr, *fulldevid; 4053d7072f8Seschrock dev_data_t data = { 0 }; 4063d7072f8Seschrock 4073d7072f8Seschrock /* 4083d7072f8Seschrock * Try to open a known minor node. 4093d7072f8Seschrock */ 4103d7072f8Seschrock fullpath = alloca(len); 4113d7072f8Seschrock (void) snprintf(fullpath, len, "/devices%s%s", devpath, PHYS_PATH); 4123d7072f8Seschrock if ((fd = open(fullpath, O_RDONLY)) < 0) 4133d7072f8Seschrock return (B_FALSE); 4143d7072f8Seschrock 4153d7072f8Seschrock /* 4163d7072f8Seschrock * Determine the devid as a string, with no trailing slash for the minor 4173d7072f8Seschrock * node. 
4183d7072f8Seschrock */ 4193d7072f8Seschrock if (devid_get(fd, &devid) != 0) { 4203d7072f8Seschrock (void) close(fd); 4213d7072f8Seschrock return (B_FALSE); 4223d7072f8Seschrock } 4233d7072f8Seschrock (void) close(fd); 4243d7072f8Seschrock 4253d7072f8Seschrock if ((devidstr = devid_str_encode(devid, NULL)) == NULL) { 4263d7072f8Seschrock devid_free(devid); 4273d7072f8Seschrock return (B_FALSE); 4283d7072f8Seschrock } 4293d7072f8Seschrock 4303d7072f8Seschrock len = strlen(devidstr) + 2; 4313d7072f8Seschrock fulldevid = alloca(len); 4323d7072f8Seschrock (void) snprintf(fulldevid, len, "%s/", devidstr); 4333d7072f8Seschrock 4343d7072f8Seschrock data.dd_compare = fulldevid; 4353d7072f8Seschrock data.dd_func = func; 4363d7072f8Seschrock data.dd_prop = ZPOOL_CONFIG_DEVID; 4373d7072f8Seschrock data.dd_found = B_FALSE; 4383d7072f8Seschrock data.dd_isdisk = wholedisk; 4393d7072f8Seschrock 4403d7072f8Seschrock (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 4413d7072f8Seschrock 4423d7072f8Seschrock devid_str_free(devidstr); 44325085d90SEric Taylor devid_free(devid); 4443d7072f8Seschrock 4453d7072f8Seschrock return (data.dd_found); 4463d7072f8Seschrock } 4473d7072f8Seschrock 4483d7072f8Seschrock /* 4493d7072f8Seschrock * This function is called when we receive a devfs add event. This can be 4503d7072f8Seschrock * either a disk event or a lofi event, and the behavior is slightly different 4513d7072f8Seschrock * depending on which it is. 4523d7072f8Seschrock */ 4533d7072f8Seschrock static int 4543d7072f8Seschrock zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) 4553d7072f8Seschrock { 4563d7072f8Seschrock char *devpath, *devname; 4573d7072f8Seschrock char path[PATH_MAX], realpath[PATH_MAX]; 4583d7072f8Seschrock char *colon, *raw; 4593d7072f8Seschrock int ret; 4603d7072f8Seschrock 4613d7072f8Seschrock /* 4623d7072f8Seschrock * The main unit of operation is the physical device path. For disks, 4633d7072f8Seschrock * this is the device node, as all minor nodes are affected. 
For lofi 4643d7072f8Seschrock * devices, this includes the minor path. Unfortunately, this isn't 4653d7072f8Seschrock * represented in the DEV_PHYS_PATH for various reasons. 4663d7072f8Seschrock */ 4673d7072f8Seschrock if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath) != 0) 4683d7072f8Seschrock return (-1); 4693d7072f8Seschrock 4703d7072f8Seschrock /* 4713d7072f8Seschrock * If this is a lofi device, then also get the minor instance name. 4723d7072f8Seschrock * Unfortunately, the current payload doesn't include an easy way to get 4733d7072f8Seschrock * this information. So we cheat by resolving the 'dev_name' (which 4743d7072f8Seschrock * refers to the raw device) and taking the portion between ':(*),raw'. 4753d7072f8Seschrock */ 4763d7072f8Seschrock (void) strlcpy(realpath, devpath, sizeof (realpath)); 4773d7072f8Seschrock if (is_lofi) { 4783d7072f8Seschrock if (nvlist_lookup_string(nvl, DEV_NAME, 4793d7072f8Seschrock &devname) == 0 && 4803d7072f8Seschrock (ret = resolvepath(devname, path, 4813d7072f8Seschrock sizeof (path))) > 0) { 4823d7072f8Seschrock path[ret] = '\0'; 4833d7072f8Seschrock colon = strchr(path, ':'); 4843d7072f8Seschrock if (colon != NULL) 4853d7072f8Seschrock raw = strstr(colon + 1, ",raw"); 4863d7072f8Seschrock if (colon != NULL && raw != NULL) { 4873d7072f8Seschrock *raw = '\0'; 4883d7072f8Seschrock (void) snprintf(realpath, 4893d7072f8Seschrock sizeof (realpath), "%s%s", 4903d7072f8Seschrock devpath, colon); 4913d7072f8Seschrock *raw = ','; 4923d7072f8Seschrock } 4933d7072f8Seschrock } 4943d7072f8Seschrock } 4953d7072f8Seschrock 4963d7072f8Seschrock /* 4973d7072f8Seschrock * Iterate over all vdevs with a matching devid, and then those with a 4983d7072f8Seschrock * matching /devices path. For disks, we only want to pay attention to 4993d7072f8Seschrock * vdevs marked as whole disks. For lofi, we don't care (because we're 5003d7072f8Seschrock * matching an exact minor name). 
5013d7072f8Seschrock */ 5023d7072f8Seschrock if (!devid_iter(realpath, zfs_process_add, !is_lofi)) 5033d7072f8Seschrock (void) devpath_iter(realpath, zfs_process_add, !is_lofi); 5043d7072f8Seschrock 5053d7072f8Seschrock return (0); 5063d7072f8Seschrock } 5073d7072f8Seschrock 5083d7072f8Seschrock /* 5093d7072f8Seschrock * Called when we receive a VDEV_CHECK event, which indicates a device could not 5103d7072f8Seschrock * be opened during initial pool open, but the autoreplace property was set on 5113d7072f8Seschrock * the pool. In this case, we treat it as if it were an add event. 5123d7072f8Seschrock */ 5133d7072f8Seschrock static int 5143d7072f8Seschrock zfs_deliver_check(nvlist_t *nvl) 5153d7072f8Seschrock { 5163d7072f8Seschrock dev_data_t data = { 0 }; 5173d7072f8Seschrock 5183d7072f8Seschrock if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, 5193d7072f8Seschrock &data.dd_pool_guid) != 0 || 5203d7072f8Seschrock nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, 5213d7072f8Seschrock &data.dd_vdev_guid) != 0) 5223d7072f8Seschrock return (0); 5233d7072f8Seschrock 5243d7072f8Seschrock data.dd_isdisk = B_TRUE; 5253d7072f8Seschrock data.dd_func = zfs_process_add; 5263d7072f8Seschrock 5273d7072f8Seschrock (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); 5283d7072f8Seschrock 5293d7072f8Seschrock return (0); 5303d7072f8Seschrock } 5313d7072f8Seschrock 532b98131cfSEric Taylor #define DEVICE_PREFIX "/devices" 533b98131cfSEric Taylor 534b98131cfSEric Taylor static int 535b98131cfSEric Taylor zfsdle_vdev_online(zpool_handle_t *zhp, void *data) 536b98131cfSEric Taylor { 537b98131cfSEric Taylor char *devname = data; 538b98131cfSEric Taylor boolean_t avail_spare, l2cache; 539b98131cfSEric Taylor vdev_state_t newstate; 540b98131cfSEric Taylor nvlist_t *tgt; 541b98131cfSEric Taylor 542b98131cfSEric Taylor syseventd_print(9, "zfsdle_vdev_online: searching for %s in pool %s\n", 543b98131cfSEric Taylor devname, zpool_get_name(zhp)); 544b98131cfSEric Taylor 545b98131cfSEric Taylor if ((tgt 
= zpool_find_vdev_by_physpath(zhp, devname, 546b98131cfSEric Taylor &avail_spare, &l2cache, NULL)) != NULL) { 547b98131cfSEric Taylor char *path, fullpath[MAXPATHLEN]; 548b98131cfSEric Taylor uint64_t wholedisk = 0ULL; 549b98131cfSEric Taylor 550b98131cfSEric Taylor verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, 551b98131cfSEric Taylor &path) == 0); 552b98131cfSEric Taylor verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, 553b98131cfSEric Taylor &wholedisk) == 0); 554b98131cfSEric Taylor 555b98131cfSEric Taylor (void) strlcpy(fullpath, path, sizeof (fullpath)); 556*4263d13fSGeorge Wilson if (wholedisk) { 557b98131cfSEric Taylor fullpath[strlen(fullpath) - 2] = '\0'; 558b98131cfSEric Taylor 559*4263d13fSGeorge Wilson /* 560*4263d13fSGeorge Wilson * We need to reopen the pool associated with this 561*4263d13fSGeorge Wilson * device so that the kernel can update the size 562*4263d13fSGeorge Wilson * of the expanded device. 563*4263d13fSGeorge Wilson */ 564*4263d13fSGeorge Wilson (void) zpool_reopen(zhp); 565*4263d13fSGeorge Wilson } 566*4263d13fSGeorge Wilson 567b98131cfSEric Taylor if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { 568b98131cfSEric Taylor syseventd_print(9, "zfsdle_vdev_online: setting device" 569b98131cfSEric Taylor " device %s to ONLINE state in pool %s.\n", 570b98131cfSEric Taylor fullpath, zpool_get_name(zhp)); 571b98131cfSEric Taylor if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) 572b98131cfSEric Taylor (void) zpool_vdev_online(zhp, fullpath, 0, 573b98131cfSEric Taylor &newstate); 574b98131cfSEric Taylor } 57525085d90SEric Taylor zpool_close(zhp); 576b98131cfSEric Taylor return (1); 577b98131cfSEric Taylor } 57825085d90SEric Taylor zpool_close(zhp); 579b98131cfSEric Taylor return (0); 580b98131cfSEric Taylor } 581b98131cfSEric Taylor 582b98131cfSEric Taylor int 583b98131cfSEric Taylor zfs_deliver_dle(nvlist_t *nvl) 584b98131cfSEric Taylor { 585b98131cfSEric Taylor char *devname; 586b98131cfSEric Taylor if 
(nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) != 0) { 587b98131cfSEric Taylor syseventd_print(9, "zfs_deliver_event: no physpath\n"); 588b98131cfSEric Taylor return (-1); 589b98131cfSEric Taylor } 590b98131cfSEric Taylor if (strncmp(devname, DEVICE_PREFIX, strlen(DEVICE_PREFIX)) != 0) { 591b98131cfSEric Taylor syseventd_print(9, "zfs_deliver_event: invalid " 592b98131cfSEric Taylor "device '%s'", devname); 593b98131cfSEric Taylor return (-1); 594b98131cfSEric Taylor } 595b98131cfSEric Taylor 596b98131cfSEric Taylor /* 597b98131cfSEric Taylor * We try to find the device using the physical 598b98131cfSEric Taylor * path that has been supplied. We need to strip off 599b98131cfSEric Taylor * the /devices prefix before starting our search. 600b98131cfSEric Taylor */ 601b98131cfSEric Taylor devname += strlen(DEVICE_PREFIX); 602b98131cfSEric Taylor if (zpool_iter(g_zfshdl, zfsdle_vdev_online, devname) != 1) { 603b98131cfSEric Taylor syseventd_print(9, "zfs_deliver_event: device '%s' not" 604b98131cfSEric Taylor " found\n", devname); 605b98131cfSEric Taylor return (1); 606b98131cfSEric Taylor } 607b98131cfSEric Taylor return (0); 608b98131cfSEric Taylor } 609b98131cfSEric Taylor 610b98131cfSEric Taylor 6113d7072f8Seschrock /*ARGSUSED*/ 6123d7072f8Seschrock static int 6133d7072f8Seschrock zfs_deliver_event(sysevent_t *ev, int unused) 6143d7072f8Seschrock { 6153d7072f8Seschrock const char *class = sysevent_get_class_name(ev); 6163d7072f8Seschrock const char *subclass = sysevent_get_subclass_name(ev); 6173d7072f8Seschrock nvlist_t *nvl; 6183d7072f8Seschrock int ret; 619b98131cfSEric Taylor boolean_t is_lofi, is_check, is_dle = B_FALSE; 6203d7072f8Seschrock 6213d7072f8Seschrock if (strcmp(class, EC_DEV_ADD) == 0) { 6223d7072f8Seschrock /* 6233d7072f8Seschrock * We're mainly interested in disk additions, but we also listen 6243d7072f8Seschrock * for new lofi devices, to allow for simplified testing. 
6253d7072f8Seschrock */ 6263d7072f8Seschrock if (strcmp(subclass, ESC_DISK) == 0) 6273d7072f8Seschrock is_lofi = B_FALSE; 6283d7072f8Seschrock else if (strcmp(subclass, ESC_LOFI) == 0) 6293d7072f8Seschrock is_lofi = B_TRUE; 6303d7072f8Seschrock else 6313d7072f8Seschrock return (0); 6323d7072f8Seschrock 6333d7072f8Seschrock is_check = B_FALSE; 6343d7072f8Seschrock } else if (strcmp(class, EC_ZFS) == 0 && 6353d7072f8Seschrock strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { 6363d7072f8Seschrock /* 6373d7072f8Seschrock * This event signifies that a device failed to open during pool 6383d7072f8Seschrock * load, but the 'autoreplace' property was set, so we should 6393d7072f8Seschrock * pretend it's just been added. 6403d7072f8Seschrock */ 6413d7072f8Seschrock is_check = B_TRUE; 642b98131cfSEric Taylor } else if (strcmp(class, EC_DEV_STATUS) == 0 && 643b98131cfSEric Taylor strcmp(subclass, ESC_DEV_DLE) == 0) { 644b98131cfSEric Taylor is_dle = B_TRUE; 6453d7072f8Seschrock } else { 6463d7072f8Seschrock return (0); 6473d7072f8Seschrock } 6483d7072f8Seschrock 6493d7072f8Seschrock if (sysevent_get_attr_list(ev, &nvl) != 0) 6503d7072f8Seschrock return (-1); 6513d7072f8Seschrock 652b98131cfSEric Taylor if (is_dle) 653b98131cfSEric Taylor ret = zfs_deliver_dle(nvl); 654b98131cfSEric Taylor else if (is_check) 6553d7072f8Seschrock ret = zfs_deliver_check(nvl); 6563d7072f8Seschrock else 6573d7072f8Seschrock ret = zfs_deliver_add(nvl, is_lofi); 6583d7072f8Seschrock 6593d7072f8Seschrock nvlist_free(nvl); 6603d7072f8Seschrock return (ret); 6613d7072f8Seschrock } 6623d7072f8Seschrock 66337e3a0d8SEric Taylor /*ARGSUSED*/ 66437e3a0d8SEric Taylor void * 66537e3a0d8SEric Taylor zfs_enum_pools(void *arg) 66637e3a0d8SEric Taylor { 66737e3a0d8SEric Taylor (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list); 66837e3a0d8SEric Taylor if (!list_is_empty(&g_pool_list)) 66937e3a0d8SEric Taylor g_tpool = tpool_create(1, sysconf(_SC_NPROCESSORS_ONLN), 67037e3a0d8SEric Taylor 0, NULL); 
67137e3a0d8SEric Taylor g_enumeration_done = B_TRUE; 67237e3a0d8SEric Taylor return (NULL); 67337e3a0d8SEric Taylor } 67437e3a0d8SEric Taylor 6753d7072f8Seschrock static struct slm_mod_ops zfs_mod_ops = { 6763d7072f8Seschrock SE_MAJOR_VERSION, SE_MINOR_VERSION, 10, zfs_deliver_event 6773d7072f8Seschrock }; 6783d7072f8Seschrock 6793d7072f8Seschrock struct slm_mod_ops * 6803d7072f8Seschrock slm_init() 6813d7072f8Seschrock { 6823d7072f8Seschrock if ((g_zfshdl = libzfs_init()) == NULL) 6833d7072f8Seschrock return (NULL); 68437e3a0d8SEric Taylor /* 68537e3a0d8SEric Taylor * collect a list of unavailable pools (asynchronously, 68637e3a0d8SEric Taylor * since this can take a while) 68737e3a0d8SEric Taylor */ 6883c112a2bSEric Taylor list_create(&g_pool_list, sizeof (struct unavailpool), 6893c112a2bSEric Taylor offsetof(struct unavailpool, uap_node)); 69037e3a0d8SEric Taylor if (thr_create(NULL, 0, zfs_enum_pools, NULL, 0, &g_zfs_tid) != 0) 69137e3a0d8SEric Taylor return (NULL); 6923d7072f8Seschrock return (&zfs_mod_ops); 6933d7072f8Seschrock } 6943d7072f8Seschrock 6953d7072f8Seschrock void 6963d7072f8Seschrock slm_fini() 6973d7072f8Seschrock { 6983c112a2bSEric Taylor unavailpool_t *pool; 6993c112a2bSEric Taylor 70037e3a0d8SEric Taylor if (g_tpool != NULL) { 7013c112a2bSEric Taylor tpool_wait(g_tpool); 7023c112a2bSEric Taylor tpool_destroy(g_tpool); 7033c112a2bSEric Taylor } 7043c112a2bSEric Taylor while ((pool = (list_head(&g_pool_list))) != NULL) { 7053c112a2bSEric Taylor list_remove(&g_pool_list, pool); 7063c112a2bSEric Taylor zpool_close(pool->uap_zhp); 7073c112a2bSEric Taylor free(pool); 7083c112a2bSEric Taylor } 70937e3a0d8SEric Taylor (void) thr_join(g_zfs_tid, NULL, NULL); 7103c112a2bSEric Taylor list_destroy(&g_pool_list); 71125085d90SEric Taylor libzfs_fini(g_zfshdl); 7123d7072f8Seschrock } 713