/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/abd.h>
#include <sys/mmp.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/time.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_context.h>
#include <sys/callb.h>

/*
 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
 * or opening a pool on more than one host at a time. In particular, it
 * prevents "zpool import -f" on a host from succeeding while the pool is
 * already imported on another host. There are many other ways in which a
 * device could be used by two hosts for different purposes at the same time,
 * resulting in pool damage. This implementation does not attempt to detect
 * those cases.
 *
 * MMP operates by ensuring there are frequent visible changes on disk (a
 * "heartbeat") at all times, and by altering the import process to check
 * for these changes and fail the import when they are detected. This
 * functionality is enabled by setting the 'multihost' pool property to on.
 *
 * Uberblocks written by the txg_sync thread always go into the first
 * (N-MMP_BLOCKS_PER_LABEL) slots; the remaining slots are reserved for MMP.
 * They are used to hold uberblocks which are exactly the same as the last
 * synced uberblock except that the ub_timestamp is frequently updated.
 * Like all other uberblocks, the slot is written with an embedded checksum,
 * and slots with invalid checksums are ignored. This provides the
 * "heartbeat", with no risk of overwriting good uberblocks that must be
 * preserved, e.g. previous txgs and associated block pointers.
 *
 * Two optional fields are added to the uberblock structure: ub_mmp_magic and
 * ub_mmp_delay. The magic field allows zfs to tell whether ub_mmp_delay is
 * valid. The delay field is a decaying average of the amount of time between
 * completion of successive MMP writes, in nanoseconds. It is used to predict
 * how long the import must wait to detect activity in the pool, before
 * concluding it is not in use.
 *
 * During import an activity test may now be performed to determine if
 * the pool is in use. The activity test is typically required if the
 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
 * POOL_STATE_ACTIVE, and the pool is not a root pool.
 *
 * The activity test finds the "best" uberblock (highest txg & timestamp),
 * waits some time, and then finds the "best" uberblock again. If the txg
 * and timestamp in both "best" uberblocks do not match, the pool is in use
 * by another host and the import fails. Since the granularity of the
 * timestamp is in seconds this activity test must take a bare minimum of one
 * second. In order to assure the accuracy of the activity test, the default
 * values result in an activity test duration of 10x the mmp write interval.
 *
 * The "zpool import" activity test can be expected to take a minimum time of
 * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds. If the
 * "best" uberblock has a valid ub_mmp_delay field, then the test may take
 * longer if MMP writes were occurring less frequently than expected.
 * Additionally, the duration is then extended by a random 25% to attempt to
 * detect simultaneous imports, for example when both partner hosts are
 * rebooted at the same time and automatically attempt to import the pool.
 */
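
/*
 * For illustration: assuming the default tunables
 * (zfs_multihost_interval = 1000 ms, zfs_multihost_import_intervals = 10),
 * the minimum activity test duration is
 *
 *	zfs_multihost_import_intervals * zfs_multihost_interval
 *	    = 10 * 1000 ms = 10 seconds
 *
 * and the random 25% extension can stretch it to roughly 12.5 seconds.
 * A larger ub_mmp_delay read from the "best" uberblock lengthens the
 * wait further.
 */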

/*
 * Used to control the frequency of mmp writes which are performed when the
 * 'multihost' pool property is on. This is one factor used to determine the
 * length of the activity check during import.
 *
 * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
 * This means that on average an mmp write will be issued for each leaf vdev
 * every zfs_multihost_interval milliseconds. In practice, the observed period
 * can vary with the I/O load and this observed value is the delay which is
 * stored in the uberblock. The minimum allowed value is 100 ms.
 */
ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;

/*
 * Used to control the duration of the activity test on import. Smaller values
 * of zfs_multihost_import_intervals will reduce the import time but increase
 * the risk of failing to detect an active pool. The total activity check time
 * is never allowed to drop below one second. A value of 0 is ignored and
 * treated as if it were set to 1.
 */
uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;

/*
 * Controls the behavior of the pool when mmp write failures are detected.
 *
 * When zfs_multihost_fail_intervals = 0, mmp write failures are ignored.
 * The failures will still be reported to the ZED, which, depending on its
 * configuration, may take action such as suspending the pool or taking a
 * device offline.
 *
 * When zfs_multihost_fail_intervals > 0, sequential mmp write failures will
 * cause the pool to be suspended. This occurs when
 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
 * passed since the last successful mmp write. This guarantees the activity
 * test will see mmp writes if the pool is imported.
 */
uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
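
/*
 * For illustration: assuming zfs_multihost_interval = 1000 on a pool with
 * eight leaf vdevs, an mmp write is issued roughly every 1000 / 8 = 125 ms,
 * rotating across the leaves, so each individual leaf is written about once
 * per second. With zfs_multihost_fail_intervals = 10, the pool is suspended
 * once 10 * 1000 ms = 10 seconds pass without a successful mmp write.
 */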

char *mmp_tag = "mmp_write_uberblock";
static void mmp_thread(void *arg);

void
mmp_init(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
	mmp->mmp_kstat_id = 1;
}

void
mmp_fini(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_destroy(&mmp->mmp_thread_lock);
	cv_destroy(&mmp->mmp_thread_cv);
	mutex_destroy(&mmp->mmp_io_lock);
}

static void
mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
	mutex_enter(&mmp->mmp_thread_lock);
}

static void
mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
{
	ASSERT(*mpp != NULL);
	*mpp = NULL;
	cv_broadcast(&mmp->mmp_thread_cv);
	CALLB_CPR_EXIT(cpr);	/* drops &mmp->mmp_thread_lock */
	thread_exit();
}

void
mmp_thread_start(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	if (spa_writeable(spa)) {
		mutex_enter(&mmp->mmp_thread_lock);
		if (!mmp->mmp_thread) {
			dprintf("mmp_thread_start pool %s\n",
			    spa->spa_name);
			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
			    spa, 0, &p0, TS_RUN, minclsyspri);
		}
		mutex_exit(&mmp->mmp_thread_lock);
	}
}

void
mmp_thread_stop(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	mmp->mmp_thread_exiting = 1;
	cv_broadcast(&mmp->mmp_thread_cv);

	while (mmp->mmp_thread) {
		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
	}
	mutex_exit(&mmp->mmp_thread_lock);

	ASSERT(mmp->mmp_thread == NULL);
	mmp->mmp_thread_exiting = 0;
}
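
/*
 * The shutdown handshake above: mmp_thread_stop() sets mmp_thread_exiting
 * and broadcasts the cv; the mmp thread notices the flag on its next loop
 * iteration and calls mmp_thread_exit(), which clears mmp->mmp_thread and
 * broadcasts the same cv before thread_exit(), releasing the waiter in
 * mmp_thread_stop().
 */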

typedef enum mmp_vdev_state_flag {
	MMP_FAIL_NOT_WRITABLE	= (1 << 0),
	MMP_FAIL_WRITE_PENDING	= (1 << 1),
} mmp_vdev_state_flag_t;

/*
 * Find a leaf vdev to write an MMP block to. It must not have an outstanding
 * mmp write (if so a new write will also likely block). If there is no usable
 * leaf, a nonzero error value is returned. The error value returned is a bit
 * field.
 *
 * MMP_FAIL_WRITE_PENDING	One or more leaf vdevs are writeable, but have
 *				an outstanding MMP write.
 * MMP_FAIL_NOT_WRITABLE	One or more leaf vdevs are not writeable.
 */

static int
mmp_next_leaf(spa_t *spa)
{
	vdev_t *leaf;
	vdev_t *starting_leaf;
	int fail_mask = 0;

	ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
	ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
	ASSERT(!list_is_empty(&spa->spa_leaf_list));

	if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
		spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
		spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
	}

	leaf = spa->spa_mmp.mmp_last_leaf;
	if (leaf == NULL)
		leaf = list_head(&spa->spa_leaf_list);
	starting_leaf = leaf;

	do {
		leaf = list_next(&spa->spa_leaf_list, leaf);
		if (leaf == NULL)
			leaf = list_head(&spa->spa_leaf_list);

		if (!vdev_writeable(leaf)) {
			fail_mask |= MMP_FAIL_NOT_WRITABLE;
		} else if (leaf->vdev_mmp_pending != 0) {
			fail_mask |= MMP_FAIL_WRITE_PENDING;
		} else {
			spa->spa_mmp.mmp_last_leaf = leaf;
			return (0);
		}
	} while (leaf != starting_leaf);

	ASSERT(fail_mask);

	return (fail_mask);
}
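
/*
 * Because the return value is a bit field, both failure conditions can be
 * reported at once. For example, on a two-leaf pool where one leaf is
 * faulted and the other has an outstanding mmp write, mmp_next_leaf()
 * returns (MMP_FAIL_NOT_WRITABLE | MMP_FAIL_WRITE_PENDING) == 0x3.
 */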

/*
 * MMP writes are issued on a fixed schedule, but may complete at variable,
 * much longer, intervals. The mmp_delay captures long periods between
 * successful writes for any reason, including disk latency, scheduling delays,
 * etc.
 *
 * The mmp_delay is usually calculated as a decaying average, but if the latest
 * delay is higher we do not average it, so that we do not hide sudden spikes
 * which the importing host must wait for.
 *
 * If writes are occurring frequently, such as due to a high rate of txg syncs,
 * the mmp_delay could become very small. Since those short delays depend on
 * activity we cannot count on, we never allow mmp_delay to get lower than the
 * rate expected if only mmp_thread writes occur.
 *
 * If an mmp write was skipped or fails, and we have already waited longer than
 * mmp_delay, we need to update it so the next write reflects the longer delay.
 *
 * Do not set mmp_delay if the multihost property is not on, so as not to
 * trigger an activity check on import.
 */
static void
mmp_delay_update(spa_t *spa, boolean_t write_completed)
{
	mmp_thread_t *mts = &spa->spa_mmp;
	hrtime_t delay = gethrtime() - mts->mmp_last_write;

	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));

	if (spa_multihost(spa) == B_FALSE) {
		mts->mmp_delay = 0;
		return;
	}

	if (delay > mts->mmp_delay)
		mts->mmp_delay = delay;

	if (write_completed == B_FALSE)
		return;

	mts->mmp_last_write = gethrtime();

	/*
	 * Strictly less than, in case mmp_delay was increased above.
	 */
	if (delay < mts->mmp_delay) {
		hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
		    MAX(1, vdev_count_leaves(spa));
		mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
		    min_delay);
	}
}
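
/*
 * For illustration: the decaying average weights history 127/128, so
 * mmp_delay falls slowly. If mmp_delay is 800 ms (in ns) and a write
 * completes after only 200 ms, the new value is
 *
 *	MAX((200 ms + 800 ms * 127) / 128, min_delay) ~= 795 ms
 *
 * while a single slow write (delay > mmp_delay) raises mmp_delay
 * immediately via the earlier unconditional update.
 */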

static void
mmp_write_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	mmp_thread_t *mts = zio->io_private;

	mutex_enter(&mts->mmp_io_lock);
	/*
	 * These values feed the spa_mmp_history kstats, which are still
	 * TBD in this port (see the ZoL porting notes below); they are
	 * captured while mmp_io_lock is held.
	 */
	uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
	hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;

	mmp_delay_update(spa, (zio->io_error == 0));

	vd->vdev_mmp_pending = 0;
	vd->vdev_mmp_kstat_id = 0;

	mutex_exit(&mts->mmp_io_lock);
	spa_config_exit(spa, SCL_STATE, mmp_tag);

	abd_free(zio->io_abd);
}

/*
 * When the uberblock on-disk is updated by a spa_sync,
 * creating a new "best" uberblock, update the one stored
 * in the mmp thread state, used for mmp writes.
 */
void
mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_io_lock);
	mmp->mmp_ub = *ub;
	mmp->mmp_ub.ub_timestamp = gethrestime_sec();
	/*
	 * B_TRUE: the freshly synced uberblock is itself a visible
	 * on-disk change, so it counts as a successful heartbeat.
	 */
	mmp_delay_update(spa, B_TRUE);
	mutex_exit(&mmp->mmp_io_lock);
}

/*
 * Choose a random vdev, label, and MMP block, and write over it
 * with a copy of the last-synced uberblock, whose timestamp
 * has been updated to reflect that the pool is in use.
 */
static void
mmp_write_uberblock(spa_t *spa)
{
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	mmp_thread_t *mmp = &spa->spa_mmp;
	uberblock_t *ub;
	vdev_t *vd = NULL;
	int label, error;
	uint64_t offset;

	hrtime_t lock_acquire_time = gethrtime();
	spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
	lock_acquire_time = gethrtime() - lock_acquire_time;
	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
		zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
		    (u_longlong_t)lock_acquire_time);

	mutex_enter(&mmp->mmp_io_lock);

	error = mmp_next_leaf(spa);

	/*
	 * spa_mmp_history has two types of entries:
	 * Issued MMP write: records time issued, error status, etc.
	 * Skipped MMP write: an MMP write could not be issued because no
	 * suitable leaf vdev was available. See comment above struct
	 * spa_mmp_history for details.
	 */

	if (error) {
		mmp_delay_update(spa, B_FALSE);
		if (mmp->mmp_skip_error == error) {
			/*
			 * ZoL porting note: the following is TBD
			 * spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
			 */
		} else {
			mmp->mmp_skip_error = error;
			/*
			 * ZoL porting note: the following is TBD
			 * spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
			 * gethrestime_sec(), mmp->mmp_delay, NULL, 0,
			 * mmp->mmp_kstat_id++, error);
			 */
		}
		mutex_exit(&mmp->mmp_io_lock);
		spa_config_exit(spa, SCL_STATE, mmp_tag);
		return;
	}

	vd = spa->spa_mmp.mmp_last_leaf;
	mmp->mmp_skip_error = 0;

	if (mmp->mmp_zio_root == NULL)
		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
		    flags | ZIO_FLAG_GODFATHER);

	ub = &mmp->mmp_ub;
	ub->ub_timestamp = gethrestime_sec();
	ub->ub_mmp_magic = MMP_MAGIC;
	ub->ub_mmp_delay = mmp->mmp_delay;
	vd->vdev_mmp_pending = gethrtime();
	vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;

	zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

	mmp->mmp_kstat_id++;
	mutex_exit(&mmp->mmp_io_lock);

	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
	    MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));

	label = spa_get_random(VDEV_LABELS);
	vdev_label_write(zio, vd, label, ub_abd, offset,
	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
	    flags | ZIO_FLAG_DONT_PROPAGATE);

	/*
	 * ZoL porting note: the following is TBD
	 * (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
	 * ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
	 */

	zio_nowait(zio);
}
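
/*
 * The offset computation above selects one of the MMP_BLOCKS_PER_LABEL
 * slots at the end of the uberblock ring. Assuming MMP_BLOCKS_PER_LABEL
 * is 1, spa_get_random(1) is always 0, so every mmp write lands in the
 * last slot, VDEV_UBERBLOCK_COUNT(vd) - 1, of a label chosen at random
 * from the VDEV_LABELS (4) labels; the lower slots remain owned by
 * txg_sync.
 */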

static void
mmp_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t last_spa_suspended = spa_suspended(spa);
	boolean_t last_spa_multihost = spa_multihost(spa);
	callb_cpr_t cpr;
	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));

	mmp_thread_enter(mmp, &cpr);

	/*
	 * The mmp_write_done() function calculates mmp_delay based on the
	 * prior value of mmp_delay and the elapsed time since the last write.
	 * For the first mmp write, there is no "last write", so we start
	 * with fake, but reasonable, default non-zero values.
	 */
	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;

	while (!mmp->mmp_thread_exiting) {
		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
		uint64_t mmp_interval = MSEC2NSEC(
		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
		boolean_t suspended = spa_suspended(spa);
		boolean_t multihost = spa_multihost(spa);
		hrtime_t next_time;

		if (multihost)
			next_time = gethrtime() + mmp_interval /
			    MAX(vdev_count_leaves(spa), 1);
		else
			next_time = gethrtime() +
			    MSEC2NSEC(MMP_DEFAULT_INTERVAL);

		/*
		 * MMP off => on, or suspended => !suspended:
		 * No writes occurred recently. Update mmp_last_write to give
		 * us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mutex_exit(&mmp->mmp_io_lock);
		}

		/*
		 * MMP on => off:
		 * mmp_delay == 0 tells importing node to skip activity check.
		 */
		if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}
		last_spa_multihost = multihost;
		last_spa_suspended = suspended;

		/*
		 * Smooth max_fail_ns when its factors are decreased, because
		 * making (max_fail_ns < mmp_interval) results in the pool
		 * being immediately suspended before writes can occur at the
		 * new higher frequency.
		 */
		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
			max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
			    mmp_fail_intervals)) / 32;
		} else {
			max_fail_ns = mmp_interval * mmp_fail_intervals;
		}
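
		/*
		 * For illustration: if zfs_multihost_interval drops from
		 * 1000 ms to 100 ms with zfs_multihost_fail_intervals = 10,
		 * the target window shrinks from 10 s to 1 s. Rather than
		 * jumping there at once, each pass moves max_fail_ns 1/32
		 * of the remaining distance (10 s becomes ~9.72 s on the
		 * first iteration), giving writes time to occur at the new
		 * rate.
		 */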

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (!suspended && mmp_fail_intervals && multihost &&
		    (gethrtime() - mmp->mmp_last_write) > max_fail_ns) {
			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
			    "succeeded in over %llus; suspending pool",
			    spa_name(spa),
			    NSEC2SEC(gethrtime() - mmp->mmp_last_write));
			zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
		}

		if (multihost && !suspended)
			mmp_write_uberblock(spa);

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig_hrtime(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, next_time);
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	if (mmp->mmp_zio_root)
		zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}

/*
 * Signal the MMP thread to wake it when it is sleeping on its cv. Used
 * when some module parameter has changed and we want the thread to know
 * about it. Only signal if the pool is active and the mmp thread is
 * running; otherwise there is no thread to wake.
 */
static void
mmp_signal_thread(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	if (mmp->mmp_thread)
		cv_broadcast(&mmp->mmp_thread_cv);
	mutex_exit(&mmp->mmp_thread_lock);
}

void
mmp_signal_all_threads(void)
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa))) {
		if (spa->spa_state == POOL_STATE_ACTIVE)
			mmp_signal_thread(spa);
	}
	mutex_exit(&spa_namespace_lock);
}