xref: /illumos-gate/usr/src/uts/common/fs/zfs/mmp.c (revision e0f1c0afa46cc84d4b1e40124032a9a87310386e)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/abd.h>
#include <sys/mmp.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/time.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_context.h>
#include <sys/callb.h>

/*
 * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
 * or opening a pool on more than one host at a time.  In particular, it
 * prevents "zpool import -f" on a host from succeeding while the pool is
 * already imported on another host.  There are many other ways in which a
 * device could be used by two hosts for different purposes at the same time
 * resulting in pool damage.  This implementation does not attempt to detect
 * those cases.
 *
 * MMP operates by ensuring there are frequent visible changes on disk (a
 * "heartbeat") at all times, and by altering the import process to check
 * for these changes and to fail the import when they are detected.  This
 * functionality is enabled by setting the 'multihost' pool property to on.
 *
 * Uberblocks written by the txg_sync thread always go into the first
 * (N-MMP_BLOCKS_PER_LABEL) slots; the remaining slots are reserved for MMP.
 * They are used to hold uberblocks which are exactly the same as the last
 * synced uberblock except that the ub_timestamp is frequently updated.
 * Like all other uberblocks, the slot is written with an embedded checksum,
 * and slots with invalid checksums are ignored.  This provides the
 * "heartbeat", with no risk of overwriting good uberblocks that must be
 * preserved, e.g. previous txgs and associated block pointers.
 *
 * Two optional fields are added to the uberblock structure: ub_mmp_magic and
 * ub_mmp_delay.  The magic field allows zfs to tell whether ub_mmp_delay is
 * valid.  The delay field is a decaying average of the amount of time between
 * completion of successive MMP writes, in nanoseconds.  It is used to predict
 * how long the import must wait to detect activity in the pool, before
 * concluding it is not in use.
 *
 * During import, an activity test may be performed to determine whether
 * the pool is in use.  The activity test is typically required if the
 * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
 * POOL_STATE_ACTIVE, and the pool is not a root pool.
 *
 * The activity test finds the "best" uberblock (highest txg & timestamp),
 * waits some time, and then finds the "best" uberblock again.  If the txg
 * and timestamp in both "best" uberblocks do not match, the pool is in use
 * by another host and the import fails.  Since the granularity of the
 * timestamp is in seconds, this activity test must take a bare minimum of
 * one second.  To ensure the accuracy of the activity test, the default
 * values result in an activity test duration of 10x the mmp write interval.
 *
 * The "zpool import" activity test can be expected to take a minimum time of
 * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds.  If
 * the "best" uberblock has a valid ub_mmp_delay field, the test may take
 * longer when MMP writes were occurring less frequently than expected.
 * Additionally, the duration is extended by a random 25% to attempt to
 * detect simultaneous imports, for example when both partner hosts are
 * rebooted at the same time and automatically attempt to import the pool.
 */
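
/*
 * As a rough illustration of the timing described above, assuming for the
 * sake of example that zfs_multihost_interval is 1000 ms and
 * zfs_multihost_import_intervals is 10 (and ignoring ub_mmp_delay): the
 * activity test waits on the order of 10 * 1000 ms = 10 seconds, and the
 * random 25% extension can stretch that to roughly 12.5 seconds, making it
 * unlikely that two hosts which begin importing at nearly the same moment
 * will both see an apparently idle pool.
 */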

/*
 * Used to control the frequency of mmp writes which are performed when the
 * 'multihost' pool property is on.  This is one factor used to determine the
 * length of the activity check during import.
 *
 * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
 * This means that on average an mmp write will be issued for each leaf vdev
 * every zfs_multihost_interval milliseconds.  In practice, the observed period
 * can vary with the I/O load and this observed value is the delay which is
 * stored in the uberblock.  The minimum allowed value is 100 ms.
 */
ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
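
/*
 * Illustrative arithmetic only (the values below are examples, not
 * defaults enforced here): with zfs_multihost_interval set to 1000 ms and
 * a pool containing 10 leaf vdevs, an mmp write is issued roughly every
 * 1000 / 10 = 100 ms, rotating across the leaves, so each individual leaf
 * vdev still receives about one mmp write per second.
 */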

/*
 * Used to control the duration of the activity test on import.  Smaller values
 * of zfs_multihost_import_intervals will reduce the import time but increase
 * the risk of failing to detect an active pool.  The total activity check time
 * is never allowed to drop below one second.  A value of 0 is ignored and
 * treated as if it were set to 1.
 */
uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;

/*
 * Controls the behavior of the pool when mmp write failures are detected.
 *
 * When zfs_multihost_fail_intervals = 0, mmp write failures are ignored.
 * The failures will still be reported to the ZED, which, depending on its
 * configuration, may take action such as suspending the pool or taking a
 * device offline.
 *
 * When zfs_multihost_fail_intervals > 0, sequential mmp write failures
 * will cause the pool to be suspended.  This occurs when
 * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
 * passed since the last successful mmp write.  This guarantees the activity
 * test will see mmp writes if the pool is imported.
 */
uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
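
/*
 * Worked example of the failure window (purely illustrative values):
 * with zfs_multihost_fail_intervals = 5 and zfs_multihost_interval =
 * 1000 ms, the pool is suspended once 5 * 1000 ms = 5 seconds pass
 * without a successful mmp write.  That window is well inside the ~10
 * second activity test an importing host would run with matching
 * settings, which is what guarantees the test can observe the writes.
 */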

char *mmp_tag = "mmp_write_uberblock";
static void mmp_thread(void *arg);

void
mmp_init(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
	mmp->mmp_kstat_id = 1;
}

void
mmp_fini(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_destroy(&mmp->mmp_thread_lock);
	cv_destroy(&mmp->mmp_thread_cv);
	mutex_destroy(&mmp->mmp_io_lock);
}

static void
mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
{
	CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
	mutex_enter(&mmp->mmp_thread_lock);
}

static void
mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
{
	ASSERT(*mpp != NULL);
	*mpp = NULL;
	cv_broadcast(&mmp->mmp_thread_cv);
	CALLB_CPR_EXIT(cpr);		/* drops &mmp->mmp_thread_lock */
	thread_exit();
}

void
mmp_thread_start(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	if (spa_writeable(spa)) {
		mutex_enter(&mmp->mmp_thread_lock);
		if (!mmp->mmp_thread) {
			dprintf("mmp_thread_start pool %s\n",
			    spa->spa_name);
			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
			    spa, 0, &p0, TS_RUN, minclsyspri);
		}
		mutex_exit(&mmp->mmp_thread_lock);
	}
}

void
mmp_thread_stop(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	mmp->mmp_thread_exiting = 1;
	cv_broadcast(&mmp->mmp_thread_cv);

	while (mmp->mmp_thread) {
		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
	}
	mutex_exit(&mmp->mmp_thread_lock);

	ASSERT(mmp->mmp_thread == NULL);
	mmp->mmp_thread_exiting = 0;
}

typedef enum mmp_vdev_state_flag {
	MMP_FAIL_NOT_WRITABLE	= (1 << 0),
	MMP_FAIL_WRITE_PENDING	= (1 << 1),
} mmp_vdev_state_flag_t;

/*
 * Find a leaf vdev to write an MMP block to.  It must not have an outstanding
 * mmp write (if so, a new write will also likely block).  If there is no
 * usable leaf, a nonzero error value is returned.  The error value returned
 * is a bit field.
 *
 * MMP_FAIL_WRITE_PENDING   One or more leaf vdevs are writeable, but have an
 *                          outstanding MMP write.
 * MMP_FAIL_NOT_WRITABLE    One or more leaf vdevs are not writeable.
 */

static int
mmp_next_leaf(spa_t *spa)
{
	vdev_t *leaf;
	vdev_t *starting_leaf;
	int fail_mask = 0;

	ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
	ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
	ASSERT(!list_is_empty(&spa->spa_leaf_list));

	if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
		spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
		spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
	}

	leaf = spa->spa_mmp.mmp_last_leaf;
	if (leaf == NULL)
		leaf = list_head(&spa->spa_leaf_list);
	starting_leaf = leaf;

	do {
		leaf = list_next(&spa->spa_leaf_list, leaf);
		if (leaf == NULL)
			leaf = list_head(&spa->spa_leaf_list);

		if (!vdev_writeable(leaf)) {
			fail_mask |= MMP_FAIL_NOT_WRITABLE;
		} else if (leaf->vdev_mmp_pending != 0) {
			fail_mask |= MMP_FAIL_WRITE_PENDING;
		} else {
			spa->spa_mmp.mmp_last_leaf = leaf;
			return (0);
		}
	} while (leaf != starting_leaf);

	ASSERT(fail_mask);

	return (fail_mask);
}

/*
 * MMP writes are issued on a fixed schedule, but may complete at variable,
 * much longer, intervals.  The mmp_delay captures long periods between
 * successful writes for any reason, including disk latency, scheduling delays,
 * etc.
 *
 * The mmp_delay is usually calculated as a decaying average, but if the latest
 * delay is higher we do not average it, so that we do not hide sudden spikes
 * which the importing host must wait for.
 *
 * If writes are occurring frequently, such as due to a high rate of txg syncs,
 * the mmp_delay could become very small.  Since those short delays depend on
 * activity we cannot count on, we never allow mmp_delay to get lower than the
 * rate expected if only mmp_thread writes occur.
 *
 * If an mmp write was skipped or fails, and we have already waited longer than
 * mmp_delay, we need to update it so the next write reflects the longer delay.
 *
 * Do not set mmp_delay if the multihost property is not on, so as not to
 * trigger an activity check on import.
 */
static void
mmp_delay_update(spa_t *spa, boolean_t write_completed)
{
	mmp_thread_t *mts = &spa->spa_mmp;
	hrtime_t delay = gethrtime() - mts->mmp_last_write;

	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));

	if (spa_multihost(spa) == B_FALSE) {
		mts->mmp_delay = 0;
		return;
	}

	if (delay > mts->mmp_delay)
		mts->mmp_delay = delay;

	if (write_completed == B_FALSE)
		return;

	mts->mmp_last_write = gethrtime();

	/*
	 * strictly less than, in case delay was changed above.
	 */
	if (delay < mts->mmp_delay) {
		hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
		    MAX(1, vdev_count_leaves(spa));
		mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
		    min_delay);
	}
}
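
/*
 * Numeric sketch of the decaying average above (illustration only): with
 * an existing mmp_delay equivalent to 128 ms and a new, shorter delay of
 * 64 ms, the average moves to (64 + 128 * 127) / 128 =~ 127.5 ms, i.e.
 * roughly a 1/128 step toward the new value, subject to the min_delay
 * floor derived from zfs_multihost_interval.  A longer delay, by
 * contrast, replaces mmp_delay immediately via the earlier check.
 */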

static void
mmp_write_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	mmp_thread_t *mts = zio->io_private;

	mutex_enter(&mts->mmp_io_lock);
	uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
	hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;

	mmp_delay_update(spa, (zio->io_error == 0));

	vd->vdev_mmp_pending = 0;
	vd->vdev_mmp_kstat_id = 0;

	mutex_exit(&mts->mmp_io_lock);
	spa_config_exit(spa, SCL_STATE, mmp_tag);

	abd_free(zio->io_abd);
}

/*
 * When the uberblock on-disk is updated by a spa_sync,
 * creating a new "best" uberblock, update the one stored
 * in the mmp thread state, used for mmp writes.
 */
void
mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_io_lock);
	mmp->mmp_ub = *ub;
	mmp->mmp_ub.ub_timestamp = gethrestime_sec();
	mmp_delay_update(spa, B_TRUE);
	mutex_exit(&mmp->mmp_io_lock);
}

/*
 * Choose a leaf vdev (rotating among the usable leaves), plus a random
 * label and MMP block, and write over that block with a copy of the
 * last-synced uberblock, whose timestamp has been updated to reflect
 * that the pool is in use.
 */
static void
mmp_write_uberblock(spa_t *spa)
{
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	mmp_thread_t *mmp = &spa->spa_mmp;
	uberblock_t *ub;
	vdev_t *vd = NULL;
	int label, error;
	uint64_t offset;

	hrtime_t lock_acquire_time = gethrtime();
	spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
	lock_acquire_time = gethrtime() - lock_acquire_time;
	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
		zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
		    (u_longlong_t)lock_acquire_time);

	mutex_enter(&mmp->mmp_io_lock);

	error = mmp_next_leaf(spa);

	/*
	 * spa_mmp_history has two types of entries:
	 * Issued MMP write: records time issued, error status, etc.
	 * Skipped MMP write: an MMP write could not be issued because no
	 * suitable leaf vdev was available.  See comment above struct
	 * spa_mmp_history for details.
	 */

	if (error) {
		mmp_delay_update(spa, B_FALSE);
		if (mmp->mmp_skip_error == error) {
			/*
			 * ZoL porting note: the following is TBD
			 * spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
			 */
		} else {
			mmp->mmp_skip_error = error;
			/*
			 * ZoL porting note: the following is TBD
			 * spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
			 * gethrestime_sec(), mmp->mmp_delay, NULL, 0,
			 * mmp->mmp_kstat_id++, error);
			 */
		}
		mutex_exit(&mmp->mmp_io_lock);
		spa_config_exit(spa, SCL_STATE, mmp_tag);
		return;
	}

	vd = spa->spa_mmp.mmp_last_leaf;
	mmp->mmp_skip_error = 0;

	if (mmp->mmp_zio_root == NULL)
		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
		    flags | ZIO_FLAG_GODFATHER);

	ub = &mmp->mmp_ub;
	ub->ub_timestamp = gethrestime_sec();
	ub->ub_mmp_magic = MMP_MAGIC;
	ub->ub_mmp_delay = mmp->mmp_delay;
	vd->vdev_mmp_pending = gethrtime();
	vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;

	zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

	mmp->mmp_kstat_id++;
	mutex_exit(&mmp->mmp_io_lock);

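	/*
	 * The write targets one of the MMP_BLOCKS_PER_LABEL slots reserved
	 * at the end of the uberblock ring, in a randomly chosen label.
	 * For example, if the ring holds 128 uberblock slots and one slot
	 * per label is reserved for MMP, only the final slot of one of the
	 * vdev's labels is overwritten here; the slots holding previously
	 * synced uberblocks are never touched.
	 */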
	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
	    MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));

	label = spa_get_random(VDEV_LABELS);
	vdev_label_write(zio, vd, label, ub_abd, offset,
	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
	    flags | ZIO_FLAG_DONT_PROPAGATE);

	/*
	 * ZoL porting note: the following is TBD
	 * (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
	 * ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
	 */

	zio_nowait(zio);
}

static void
mmp_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t last_spa_suspended = spa_suspended(spa);
	boolean_t last_spa_multihost = spa_multihost(spa);
	callb_cpr_t cpr;
	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));

	mmp_thread_enter(mmp, &cpr);

	/*
	 * The mmp_write_done() function calculates mmp_delay based on the
	 * prior value of mmp_delay and the elapsed time since the last write.
	 * For the first mmp write, there is no "last write", so we start
	 * with fake, but reasonable, default non-zero values.
	 */
	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;

	while (!mmp->mmp_thread_exiting) {
		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
		uint64_t mmp_interval = MSEC2NSEC(
		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
		boolean_t suspended = spa_suspended(spa);
		boolean_t multihost = spa_multihost(spa);
		hrtime_t next_time;

		if (multihost)
			next_time = gethrtime() + mmp_interval /
			    MAX(vdev_count_leaves(spa), 1);
		else
			next_time = gethrtime() +
			    MSEC2NSEC(MMP_DEFAULT_INTERVAL);

		/*
		 * MMP off => on, or suspended => !suspended:
		 * No writes occurred recently.  Update mmp_last_write to give
		 * us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mutex_exit(&mmp->mmp_io_lock);
		}

		/*
		 * MMP on => off:
		 * mmp_delay == 0 tells importing node to skip activity check.
		 */
		if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}
		last_spa_multihost = multihost;
		last_spa_suspended = suspended;

		/*
		 * Smooth max_fail_ns when its factors are decreased, because
		 * making (max_fail_ns < mmp_interval) results in the pool being
		 * immediately suspended before writes can occur at the new
		 * higher frequency.
		 */
		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
			max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
			    mmp_fail_intervals)) / 32;
		} else {
			max_fail_ns = mmp_interval * mmp_fail_intervals;
		}
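
		/*
		 * Worked example of the smoothing above (illustration only):
		 * if max_fail_ns was 10 seconds and the tunables are lowered
		 * so that mmp_interval * mmp_fail_intervals is 5 seconds, a
		 * single pass computes (31 * 10s + 5s) / 32 =~ 9.84s, so the
		 * window shrinks gradually rather than dropping below the
		 * time since the last write and suspending the pool at once.
		 */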

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (!suspended && mmp_fail_intervals && multihost &&
		    (gethrtime() - mmp->mmp_last_write) > max_fail_ns) {
			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
			    "succeeded in over %llus; suspending pool",
			    spa_name(spa),
			    NSEC2SEC(gethrtime() - mmp->mmp_last_write));
			zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
		}

		if (multihost && !suspended)
			mmp_write_uberblock(spa);

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig_hrtime(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, next_time);
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	if (mmp->mmp_zio_root)
		zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}

/*
 * Signal the MMP thread to wake it when it is sleeping on
 * its cv.  Used when some module parameter has changed and
 * we want the thread to know about it.
 * Only signal if the pool is active and the mmp thread is
 * running; otherwise there is no thread to wake.
 */
static void
mmp_signal_thread(spa_t *spa)
{
	mmp_thread_t *mmp = &spa->spa_mmp;

	mutex_enter(&mmp->mmp_thread_lock);
	if (mmp->mmp_thread)
		cv_broadcast(&mmp->mmp_thread_cv);
	mutex_exit(&mmp->mmp_thread_lock);
}

void
mmp_signal_all_threads(void)
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa))) {
		if (spa->spa_state == POOL_STATE_ACTIVE)
			mmp_signal_thread(spa);
	}
	mutex_exit(&spa_namespace_lock);
}
583