13d7072feschrock/* 23d7072feschrock * CDDL HEADER START 33d7072feschrock * 43d7072feschrock * The contents of this file are subject to the terms of the 53d7072feschrock * Common Development and Distribution License (the "License"). 63d7072feschrock * You may not use this file except in compliance with the License. 73d7072feschrock * 83d7072feschrock * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 93d7072feschrock * or http://www.opensolaris.org/os/licensing. 103d7072feschrock * See the License for the specific language governing permissions 113d7072feschrock * and limitations under the License. 123d7072feschrock * 133d7072feschrock * When distributing Covered Code, include this CDDL HEADER in each 143d7072feschrock * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 153d7072feschrock * If applicable, add the following below this CDDL HEADER, with the 163d7072feschrock * fields enclosed by brackets "[]" replaced with your own identifying 173d7072feschrock * information: Portions Copyright [yyyy] [name of copyright owner] 183d7072feschrock * 193d7072feschrock * CDDL HEADER END 203d7072feschrock */ 213d7072feschrock/* 22b98131cEric Taylor * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 234263d13George Wilson * Copyright (c) 2012 by Delphix. All rights reserved. 24cead1dfHans Rosenfeld * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 253d7072feschrock */ 263d7072feschrock 273d7072feschrock/* 283d7072feschrock * ZFS syseventd module. 293d7072feschrock * 303d7072feschrock * The purpose of this module is to identify when devices are added to the 313d7072feschrock * system, and appropriately online or replace the affected vdevs. 323d7072feschrock * 333d7072feschrock * When a device is added to the system: 343d7072feschrock * 353d7072feschrock * 1. Search for any vdevs whose devid matches that of the newly added 363d7072feschrock * device. 373d7072feschrock * 383d7072feschrock * 2. If no vdevs are found, then search for any vdevs whose devfs path 393d7072feschrock * matches that of the new device. 403d7072feschrock * 413d7072feschrock * 3. If no vdevs match by either method, then ignore the event. 423d7072feschrock * 433d7072feschrock * 4. Attempt to online the device with a flag to indicate that it should 443d7072feschrock * be unspared when resilvering completes. If this succeeds, then the 453d7072feschrock * same device was inserted and we should continue normally. 463d7072feschrock * 473d7072feschrock * 5. If the pool does not have the 'autoreplace' property set, attempt to 483d7072feschrock * online the device again without the unspare flag, which will 493d7072feschrock * generate a FMA fault. 503d7072feschrock * 513d7072feschrock * 6. If the pool has the 'autoreplace' property set, and the matching vdev 523d7072feschrock * is a whole disk, then label the new disk and attempt a 'zpool 533d7072feschrock * replace'. 543d7072feschrock * 553d7072feschrock * The module responds to EC_DEV_ADD events for both disks and lofi devices, 563d7072feschrock * with the latter used for testing. The special ESC_ZFS_VDEV_CHECK event 573d7072feschrock * indicates that a device failed to open during pool load, but the autoreplace 583d7072feschrock * property was set. In this case, we deferred the associated FMA fault until 593d7072feschrock * our module had a chance to process the autoreplace logic. If the device 603d7072feschrock * could not be replaced, then the second online attempt will trigger the FMA 613d7072feschrock * fault that we skipped earlier. 623d7072feschrock */ 633d7072feschrock 643d7072feschrock#include <alloca.h> 653d7072feschrock#include <devid.h> 663d7072feschrock#include <fcntl.h> 673d7072feschrock#include <libnvpair.h> 683d7072feschrock#include <libsysevent.h> 693d7072feschrock#include <libzfs.h> 703d7072feschrock#include <limits.h> 713d7072feschrock#include <stdlib.h> 723d7072feschrock#include <string.h> 733d7072feschrock#include <syslog.h> 743c112a2Eric Taylor#include <sys/list.h> 753d7072feschrock#include <sys/sunddi.h> 763d7072feschrock#include <sys/sysevent/eventdefs.h> 773d7072feschrock#include <sys/sysevent/dev.h> 783c112a2Eric Taylor#include <thread_pool.h> 793d7072feschrock#include <unistd.h> 80b98131cEric Taylor#include "syseventd.h" 813d7072feschrock 823d7072feschrock#if defined(__i386) || defined(__amd64) 833d7072feschrock#define PHYS_PATH ":q" 843d7072feschrock#define RAW_SLICE "p0" 853d7072feschrock#elif defined(__sparc) 863d7072feschrock#define PHYS_PATH ":c" 873d7072feschrock#define RAW_SLICE "s2" 883d7072feschrock#else 893d7072feschrock#error Unknown architecture 903d7072feschrock#endif 913d7072feschrock 923d7072feschrocktypedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); 933d7072feschrock 943d7072feschrocklibzfs_handle_t *g_zfshdl; 953c112a2Eric Taylorlist_t g_pool_list; 963c112a2Eric Taylortpool_t *g_tpool; 9737e3a0dEric Taylorboolean_t g_enumeration_done; 9837e3a0dEric Taylorthread_t g_zfs_tid; 993c112a2Eric Taylor 1003c112a2Eric Taylortypedef struct unavailpool { 1013c112a2Eric Taylor zpool_handle_t *uap_zhp; 1023c112a2Eric Taylor list_node_t uap_node; 1033c112a2Eric Taylor} unavailpool_t; 1043c112a2Eric Taylor 1053c112a2Eric Taylorint 1063c112a2Eric Taylorzfs_toplevel_state(zpool_handle_t *zhp) 1073c112a2Eric Taylor{ 1083c112a2Eric Taylor nvlist_t *nvroot; 1093c112a2Eric Taylor vdev_stat_t *vs; 1103c112a2Eric Taylor unsigned int c; 1113c112a2Eric Taylor 1123c112a2Eric Taylor verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), 1133c112a2Eric Taylor ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1143c112a2Eric Taylor verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, 1153c112a2Eric Taylor (uint64_t **)&vs, &c) == 0); 1163c112a2Eric Taylor return (vs->vs_state); 1173c112a2Eric Taylor} 1183c112a2Eric Taylor 1193c112a2Eric Taylorstatic int 1203c112a2Eric Taylorzfs_unavail_pool(zpool_handle_t *zhp, void *data) 1213c112a2Eric Taylor{ 1223c112a2Eric Taylor if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { 1233c112a2Eric Taylor unavailpool_t *uap; 1243c112a2Eric Taylor uap = malloc(sizeof (unavailpool_t)); 1253c112a2Eric Taylor uap->uap_zhp = zhp; 1263c112a2Eric Taylor list_insert_tail((list_t *)data, uap); 1273c112a2Eric Taylor } else { 1283c112a2Eric Taylor zpool_close(zhp); 1293c112a2Eric Taylor } 1303c112a2Eric Taylor return (0); 1313c112a2Eric Taylor} 1323d7072feschrock 1333d7072feschrock/* 1343d7072feschrock * The device associated with the given vdev (either by devid or physical path) 1353d7072feschrock * has been added to the system. If 'isdisk' is set, then we only attempt a 1363d7072feschrock * replacement if it's a whole disk. This also implies that we should label the 1373d7072feschrock * disk first. 1383d7072feschrock * 1393d7072feschrock * First, we attempt to online the device (making sure to undo any spare 1403d7072feschrock * operation when finished). If this succeeds, then we're done. If it fails, 1413d7072feschrock * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, 1423d7072feschrock * but that the label was not what we expected. If the 'autoreplace' property 1433d7072feschrock * is not set, then we relabel the disk (if specified), and attempt a 'zpool 1443d7072feschrock * replace'. If the online is successful, but the new state is something else 1453d7072feschrock * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of 1463d7072feschrock * race, and we should avoid attempting to relabel the disk. 1473d7072feschrock */ 1483d7072feschrockstatic void 1493d7072feschrockzfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t isdisk) 1503d7072feschrock{ 1513d7072feschrock char *path; 1523d7072feschrock vdev_state_t newstate; 1533d7072feschrock nvlist_t *nvroot, *newvd; 1543d7072feschrock uint64_t wholedisk = 0ULL; 155acd07c6Yuri Pankov uint64_t offline = 0ULL; 156bf82a41eschrock char *physpath = NULL; 1573d7072feschrock char rawpath[PATH_MAX], fullpath[PATH_MAX]; 1587855d95Toomas Soome zpool_boot_label_t boot_type; 1597855d95Toomas Soome uint64_t boot_size; 1603d7072feschrock size_t len; 1613d7072feschrock 1623d7072feschrock if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) 1633d7072feschrock return; 1643d7072feschrock 165bf82a41eschrock (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); 1663d7072feschrock (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); 167acd07c6Yuri Pankov (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); 1683d7072feschrock 1693d7072feschrock /* 1703d7072feschrock * We should have a way to online a device by guid. With the current 1713d7072feschrock * interface, we are forced to chop off the 's0' for whole disks. 1723d7072feschrock */ 1733d7072feschrock (void) strlcpy(fullpath, path, sizeof (fullpath)); 1743d7072feschrock if (wholedisk) 1753d7072feschrock fullpath[strlen(fullpath) - 2] = '\0'; 1763d7072feschrock 1773d7072feschrock /* 1783d7072feschrock * Attempt to online the device. It would be nice to online this by 1793d7072feschrock * GUID, but the current interface only supports lookup by path. 1803d7072feschrock */ 181acd07c6Yuri Pankov if (offline || 182acd07c6Yuri Pankov (zpool_vdev_online(zhp, fullpath, 1833d7072feschrock ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && 184acd07c6Yuri Pankov (newstate == VDEV_STATE_HEALTHY || 185acd07c6Yuri Pankov newstate == VDEV_STATE_DEGRADED))) 1863d7072feschrock return; 1873d7072feschrock 1883d7072feschrock /* 1893d7072feschrock * If the pool doesn't have the autoreplace property set, then attempt a 1903d7072feschrock * true online (without the unspare flag), which will trigger a FMA 1913d7072feschrock * fault. 1923d7072feschrock */ 193990b485lling if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || 1943d7072feschrock (isdisk && !wholedisk)) { 1953d7072feschrock (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, 1963d7072feschrock &newstate); 1973d7072feschrock return; 1983d7072feschrock } 1993d7072feschrock 2003d7072feschrock if (isdisk) { 2013d7072feschrock /* 2023d7072feschrock * If this is a request to label a whole disk, then attempt to 2033d7072feschrock * write out the label. Before we can label the disk, we need 2043d7072feschrock * access to a raw node. Ideally, we'd like to walk the devinfo 2053d7072feschrock * tree and find a raw node from the corresponding parent node. 2063d7072feschrock * This is overly complicated, and since we know how we labeled 2073d7072feschrock * this device in the first place, we know it's save to switch 2083d7072feschrock * from /dev/dsk to /dev/rdsk and append the backup slice. 209c5904d1eschrock * 210c5904d1eschrock * If any part of this process fails, then do a force online to 211c5904d1eschrock * trigger a ZFS fault for the device (and any hot spare 212c5904d1eschrock * replacement). 2133d7072feschrock */ 2146401734Will Andrews if (strncmp(path, ZFS_DISK_ROOTD, 2156401734Will Andrews strlen(ZFS_DISK_ROOTD)) != 0) { 216c5904d1eschrock (void) zpool_vdev_online(zhp, fullpath, 217c5904d1eschrock ZFS_ONLINE_FORCEFAULT, &newstate); 2183d7072feschrock return; 219c5904d1eschrock } 2203d7072feschrock 2213d7072feschrock (void) strlcpy(rawpath, path + 9, sizeof (rawpath)); 2223d7072feschrock len = strlen(rawpath); 2233d7072feschrock rawpath[len - 2] = '\0'; 2243d7072feschrock 2257855d95Toomas Soome if (zpool_is_bootable(zhp)) 2267855d95Toomas Soome boot_type = ZPOOL_COPY_BOOT_LABEL; 2277855d95Toomas Soome else 2287855d95Toomas Soome boot_type = ZPOOL_NO_BOOT_LABEL; 2297855d95Toomas Soome 2307855d95Toomas Soome boot_size = zpool_get_prop_int(zhp, ZPOOL_PROP_BOOTSIZE, NULL); 2317855d95Toomas Soome if (zpool_label_disk(g_zfshdl, zhp, rawpath, 2327855d95Toomas Soome boot_type, boot_size, NULL) != 0) { 233c5904d1eschrock (void) zpool_vdev_online(zhp, fullpath, 234c5904d1eschrock ZFS_ONLINE_FORCEFAULT, &newstate); 2353d7072feschrock return; 236c5904d1eschrock } 2373d7072feschrock } 2383d7072feschrock 2393d7072feschrock /* 2403d7072feschrock * Cosntruct the root vdev to pass to zpool_vdev_attach(). While adding 2413d7072feschrock * the entire vdev structure is harmless, we construct a reduced set of 242bf82a41eschrock * path/physpath/wholedisk to keep it simple. 2433d7072feschrock */ 2443d7072feschrock if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) 2453d7072feschrock return; 2463d7072feschrock 2473d7072feschrock if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { 2483d7072feschrock nvlist_free(nvroot); 2493d7072feschrock return; 2503d7072feschrock } 2513d7072feschrock 2523d7072feschrock if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || 2533d7072feschrock nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || 254bf82a41eschrock (physpath != NULL && nvlist_add_string(newvd, 255bf82a41eschrock ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || 2563d7072feschrock nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || 2573d7072feschrock nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || 2583d7072feschrock nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, 2593d7072feschrock 1) != 0) { 2603d7072feschrock nvlist_free(newvd); 2613d7072feschrock nvlist_free(nvroot); 2623d7072feschrock return; 2633d7072feschrock } 2643d7072feschrock 2653d7072feschrock nvlist_free(newvd); 2663d7072feschrock 2673d7072feschrock (void) zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE); 2683d7072feschrock 2693d7072feschrock nvlist_free(nvroot); 2703d7072feschrock 2713d7072feschrock} 2723d7072feschrock 2733d7072feschrock/* 2743d7072feschrock * Utility functions to find a vdev matching given criteria. 2753d7072feschrock */ 2763d7072feschrocktypedef struct dev_data { 2773d7072feschrock const char *dd_compare; 2783d7072feschrock const char *dd_prop; 2793d7072feschrock zfs_process_func_t dd_func; 2803d7072feschrock boolean_t dd_found; 2813d7072feschrock boolean_t dd_isdisk; 2823d7072feschrock uint64_t dd_pool_guid; 2833d7072feschrock uint64_t dd_vdev_guid; 2843d7072feschrock} dev_data_t; 2853d7072feschrock 2863d7072feschrockstatic void 2873d7072feschrockzfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) 2883d7072feschrock{ 2893d7072feschrock dev_data_t *dp = data; 2903d7072feschrock char *path; 2913d7072feschrock uint_t c, children; 2923d7072feschrock nvlist_t **child; 293b01c3b5eschrock size_t len; 2943d7072feschrock uint64_t guid; 2953d7072feschrock 2963d7072feschrock /* 2973d7072feschrock * First iterate over any children. 2983d7072feschrock */ 2993d7072feschrock if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, 3003d7072feschrock &child, &children) == 0) { 3013d7072feschrock for (c = 0; c < children; c++) 3023d7072feschrock zfs_iter_vdev(zhp, child[c], data); 3033d7072feschrock return; 3043d7072feschrock } 3053d7072feschrock 3063d7072feschrock if (dp->dd_vdev_guid != 0) { 3073d7072feschrock if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, 3083d7072feschrock &guid) != 0 || guid != dp->dd_vdev_guid) 3093d7072feschrock return; 3101437283Hans Rosenfeld } else if (dp->dd_compare != NULL) { 311b01c3b5eschrock len = strlen(dp->dd_compare); 312b01c3b5eschrock 3133d7072feschrock if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || 3143d7072feschrock strncmp(dp->dd_compare, path, len) != 0) 3153d7072feschrock return; 3163d7072feschrock 3173d7072feschrock /* 3183d7072feschrock * Normally, we want to have an exact match for the comparison 3193d7072feschrock * string. However, we allow substring matches in the following 3203d7072feschrock * cases: 3213d7072feschrock * 3223d7072feschrock * <path>: This is a devpath, and the target is one 3233d7072feschrock * of its children. 3243d7072feschrock * 3253d7072feschrock * <path/> This is a devid for a whole disk, and 3263d7072feschrock