xref: /illumos-gate/usr/src/uts/common/fs/zfs/mmp.c (revision e0f1c0af)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
23  * Copyright 2019 Joyent, Inc.
24  */
25 
26 #include <sys/abd.h>
27 #include <sys/mmp.h>
28 #include <sys/spa.h>
29 #include <sys/spa_impl.h>
30 #include <sys/time.h>
31 #include <sys/vdev.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zfs_context.h>
34 #include <sys/callb.h>
35 
36 /*
37  * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
38  * or opening a pool on more than one host at a time.  In particular, it
39  * prevents "zpool import -f" on a host from succeeding while the pool is
40  * already imported on another host.  There are many other ways in which a
41  * device could be used by two hosts for different purposes at the same time
42  * resulting in pool damage.  This implementation does not attempt to detect
43  * those cases.
44  *
45  * MMP operates by ensuring there are frequent visible changes on disk (a
46  * "heartbeat") at all times.  And by altering the import process to check
47  * for these changes and failing the import when they are detected.  This
48  * functionality is enabled by setting the 'multihost' pool property to on.
49  *
50  * Uberblocks written by the txg_sync thread always go into the first
51  * (N-MMP_BLOCKS_PER_LABEL) slots; the remaining slots are reserved for MMP.
52  * They are used to hold uberblocks which are exactly the same as the last
53  * synced uberblock except that the ub_timestamp is frequently updated.
54  * Like all other uberblocks, the slot is written with an embedded checksum,
55  * and slots with invalid checksums are ignored.  This provides the
56  * "heartbeat", with no risk of overwriting good uberblocks that must be
57  * preserved, e.g. previous txgs and associated block pointers.
58  *
59  * Two optional fields are added to uberblock structure: ub_mmp_magic and
60  * ub_mmp_delay.  The magic field allows zfs to tell whether ub_mmp_delay is
61  * valid.  The delay field is a decaying average of the amount of time between
62  * completion of successive MMP writes, in nanoseconds.  It is used to predict
63  * how long the import must wait to detect activity in the pool, before
64  * concluding it is not in use.
65  *
66  * During import an activity test may now be performed to determine if
67  * the pool is in use.  The activity test is typically required if the
68  * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
69  * POOL_STATE_ACTIVE, and the pool is not a root pool.
70  *
71  * The activity test finds the "best" uberblock (highest txg & timestamp),
72  * waits some time, and then finds the "best" uberblock again.  If the txg
73  * and timestamp in both "best" uberblocks do not match, the pool is in use
74  * by another host and the import fails.  Since the granularity of the
75  * timestamp is in seconds, this activity test must take a bare minimum of one
76  * second.  To ensure the accuracy of the activity test, the default
77  * values result in an activity test duration of 10x the mmp write interval.
78  *
79  * The "zpool import"  activity test can be expected to take a minimum time of
80  * zfs_multihost_import_intervals * zfs_multihost_interval milliseconds.  If the
81  * "best" uberblock has a valid ub_mmp_delay field, then the duration of the
82  * test may take longer if MMP writes were occurring less frequently than
83  * expected.  Additionally, the duration is then extended by a random 25% to
84  * attempt to to detect simultaneous imports.  For example, if both partner
85  * hosts are rebooted at the same time and automatically attempt to import the
86  * pool.
87  */
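/*
 * Worked example (illustrative only; the activity check itself lives in the
 * import code, not in this file): with zfs_multihost_interval = 1000 ms and
 * zfs_multihost_import_intervals = 10, the minimum activity test is
 *
 *	10 * 1000 ms = 10 seconds
 *
 * If the "best" uberblock's ub_mmp_delay shows that MMP writes were landing
 * less often than expected, the wait is scaled up accordingly, and in either
 * case it is extended by a random 25% so that two hosts importing at the
 * same moment are unlikely to both conclude the pool is idle.
 */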
88 
89 /*
90  * Used to control the frequency of mmp writes which are performed when the
91  * 'multihost' pool property is on.  This is one factor used to determine the
92  * length of the activity check during import.
93  *
94  * The mmp write period is zfs_multihost_interval / leaf-vdevs milliseconds.
95  * This means that on average an mmp write will be issued for each leaf vdev
96  * every zfs_multihost_interval milliseconds.  In practice, the observed period
97  * can vary with the I/O load and this observed value is the delay which is
98  * stored in the uberblock.  The minimum allowed value is 100 ms.
99  */
100 ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
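/*
 * For example, with zfs_multihost_interval at 1000 ms and a pool that has
 * 4 leaf vdevs (hypothetical values), the mmp thread wakes roughly every
 * 1000 / 4 = 250 ms and issues a write to the next leaf in round-robin
 * order, so each leaf still receives an mmp write about once per second.
 */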
101 
102 /*
103  * Used to control the duration of the activity test on import.  Smaller values
104  * of zfs_multihost_import_intervals will reduce the import time but increase
105  * the risk of failing to detect an active pool.  The total activity check time
106  * is never allowed to drop below one second.  A value of 0 is ignored and
107  * treated as if it were set to 1.
108  */
109 uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
110 
111 /*
112  * Controls the behavior of the pool when mmp write failures are detected.
113  *
114  * When zfs_multihost_fail_intervals = 0, mmp write failures are ignored.
115  * The failures will still be reported to the ZED, which, depending on its
116  * configuration, may take action such as suspending the pool or taking a
117  * device offline.
118  *
119  * When zfs_multihost_fail_intervals > 0, sequential mmp write failures
120  * will cause the pool to be suspended.  This occurs when
121  * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have
122  * passed since the last successful mmp write.  This guarantees that the
123  * activity test performed during an import will see mmp writes if the
124  * pool is imported.
125  */
126 uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
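/*
 * For example, with zfs_multihost_interval = 1000 ms and
 * zfs_multihost_fail_intervals = 5 (hypothetical values), the pool would be
 * suspended once 5 * 1000 ms = 5 seconds elapse without a successful mmp
 * write.
 */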
127 
128 char *mmp_tag = "mmp_write_uberblock";
129 static void mmp_thread(void *arg);
130 
131 void
132 mmp_init(spa_t *spa)
133 {
134 	mmp_thread_t *mmp = &spa->spa_mmp;
135 
136 	mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
137 	cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
138 	mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
139 	mmp->mmp_kstat_id = 1;
140 }
141 
142 void
143 mmp_fini(spa_t *spa)
144 {
145 	mmp_thread_t *mmp = &spa->spa_mmp;
146 
147 	mutex_destroy(&mmp->mmp_thread_lock);
148 	cv_destroy(&mmp->mmp_thread_cv);
149 	mutex_destroy(&mmp->mmp_io_lock);
150 }
151 
152 static void
153 mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
154 {
155 	CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
156 	mutex_enter(&mmp->mmp_thread_lock);
157 }
158 
159 static void
160 mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
161 {
162 	ASSERT(*mpp != NULL);
163 	*mpp = NULL;
164 	cv_broadcast(&mmp->mmp_thread_cv);
165 	CALLB_CPR_EXIT(cpr);		/* drops &mmp->mmp_thread_lock */
166 	thread_exit();
167 }
168 
169 void
170 mmp_thread_start(spa_t *spa)
171 {
172 	mmp_thread_t *mmp = &spa->spa_mmp;
173 
174 	if (spa_writeable(spa)) {
175 		mutex_enter(&mmp->mmp_thread_lock);
176 		if (!mmp->mmp_thread) {
177 			dprintf("mmp_thread_start pool %s\n",
178 			    spa->spa_name);
179 			mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
180 			    spa, 0, &p0, TS_RUN, minclsyspri);
181 		}
182 		mutex_exit(&mmp->mmp_thread_lock);
183 	}
184 }
185 
186 void
187 mmp_thread_stop(spa_t *spa)
188 {
189 	mmp_thread_t *mmp = &spa->spa_mmp;
190 
191 	mutex_enter(&mmp->mmp_thread_lock);
192 	mmp->mmp_thread_exiting = 1;
193 	cv_broadcast(&mmp->mmp_thread_cv);
194 
195 	while (mmp->mmp_thread) {
196 		cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
197 	}
198 	mutex_exit(&mmp->mmp_thread_lock);
199 
200 	ASSERT(mmp->mmp_thread == NULL);
201 	mmp->mmp_thread_exiting = 0;
202 }
203 
204 typedef enum mmp_vdev_state_flag {
205 	MMP_FAIL_NOT_WRITABLE	= (1 << 0),
206 	MMP_FAIL_WRITE_PENDING	= (1 << 1),
207 } mmp_vdev_state_flag_t;
208 
209 /*
210  * Find a leaf vdev to write an MMP block to.  It must not have an outstanding
211  * mmp write (if it did, a new write would also likely block).  If there is
212  * no usable leaf, a nonzero error value is returned.  The error value
213  * returned is a bit field.
214  *
215  * MMP_FAIL_WRITE_PENDING   One or more leaf vdevs are writeable, but have an
216  *                          outstanding MMP write.
217  * MMP_FAIL_NOT_WRITABLE    One or more leaf vdevs are not writeable.
218  */
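/*
 * For example, a return value of
 * (MMP_FAIL_NOT_WRITABLE | MMP_FAIL_WRITE_PENDING) means the walk over the
 * leaves found at least one that was not writeable and at least one with a
 * write still in flight, but none that could be used right now.
 */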
219 
220 static int
221 mmp_next_leaf(spa_t *spa)
222 {
223 	vdev_t *leaf;
224 	vdev_t *starting_leaf;
225 	int fail_mask = 0;
226 
227 	ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
228 	ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
229 	ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
230 	ASSERT(!list_is_empty(&spa->spa_leaf_list));
231 
232 	if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
233 		spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
234 		spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
235 	}
236 
237 	leaf = spa->spa_mmp.mmp_last_leaf;
238 	if (leaf == NULL)
239 		leaf = list_head(&spa->spa_leaf_list);
240 	starting_leaf = leaf;
241 
242 	do {
243 		leaf = list_next(&spa->spa_leaf_list, leaf);
244 		if (leaf == NULL)
245 			leaf = list_head(&spa->spa_leaf_list);
246 
247 		if (!vdev_writeable(leaf)) {
248 			fail_mask |= MMP_FAIL_NOT_WRITABLE;
249 		} else if (leaf->vdev_mmp_pending != 0) {
250 			fail_mask |= MMP_FAIL_WRITE_PENDING;
251 		} else {
252 			spa->spa_mmp.mmp_last_leaf = leaf;
253 			return (0);
254 		}
255 	} while (leaf != starting_leaf);
256 
257 	ASSERT(fail_mask);
258 
259 	return (fail_mask);
260 }
261 
262 /*
263  * MMP writes are issued on a fixed schedule, but may complete at variable,
264  * much longer, intervals.  The mmp_delay captures long periods between
265  * successful writes for any reason, including disk latency, scheduling delays,
266  * etc.
267  *
268  * The mmp_delay is usually calculated as a decaying average, but if the latest
269  * delay is higher we do not average it, so that we do not hide sudden spikes
270  * which the importing host must wait for.
271  *
272  * If writes are occurring frequently, such as due to a high rate of txg syncs,
273  * the mmp_delay could become very small.  Since those short delays depend on
274  * activity we cannot count on, we never allow mmp_delay to get lower than the
275  * rate expected if only mmp_thread writes occur.
276  *
277  * If an mmp write was skipped or fails, and we have already waited longer than
278  * mmp_delay, we need to update it so the next write reflects the longer delay.
279  *
280  * Do not set mmp_delay if the multihost property is not on, so as not to
281  * trigger an activity check on import.
282  */
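/*
 * Illustrative numbers: if mmp_delay has settled at 250 ms and one write
 * then takes 2 seconds to complete, mmp_delay jumps straight to 2 seconds
 * (spikes are never averaged away).  Subsequent faster writes pull it back
 * down by roughly 1/128 of the difference per write, but never below
 * MSEC2NSEC(zfs_multihost_interval) / leaf-vdev-count.
 */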
283 static void
284 mmp_delay_update(spa_t *spa, boolean_t write_completed)
285 {
286 	mmp_thread_t *mts = &spa->spa_mmp;
287 	hrtime_t delay = gethrtime() - mts->mmp_last_write;
288 
289 	ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
290 
291 	if (spa_multihost(spa) == B_FALSE) {
292 		mts->mmp_delay = 0;
293 		return;
294 	}
295 
296 	if (delay > mts->mmp_delay)
297 		mts->mmp_delay = delay;
298 
299 	if (write_completed == B_FALSE)
300 		return;
301 
302 	mts->mmp_last_write = gethrtime();
303 
304 	/*
305 	 * strictly less than, in case delay was changed above.
306 	 */
307 	if (delay < mts->mmp_delay) {
308 		hrtime_t min_delay = MSEC2NSEC(zfs_multihost_interval) /
309 		    MAX(1, vdev_count_leaves(spa));
310 		mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
311 		    min_delay);
312 	}
313 }
314 
315 static void
316 mmp_write_done(zio_t *zio)
317 {
318 	spa_t *spa = zio->io_spa;
319 	vdev_t *vd = zio->io_vd;
320 	mmp_thread_t *mts = zio->io_private;
321 
322 	mutex_enter(&mts->mmp_io_lock);
323 	uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
324 	hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
325 
326 	mmp_delay_update(spa, (zio->io_error == 0));
327 
328 	vd->vdev_mmp_pending = 0;
329 	vd->vdev_mmp_kstat_id = 0;
330 
331 	mutex_exit(&mts->mmp_io_lock);
332 	spa_config_exit(spa, SCL_STATE, mmp_tag);
333 
334 	abd_free(zio->io_abd);
335 }
336 
337 /*
338  * When the on-disk uberblock is updated by spa_sync,
339  * creating a new "best" uberblock, update the copy stored
340  * in the mmp thread state, which is used for mmp writes.
341  */
342 void
343 mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
344 {
345 	mmp_thread_t *mmp = &spa->spa_mmp;
346 
347 	mutex_enter(&mmp->mmp_io_lock);
348 	mmp->mmp_ub = *ub;
349 	mmp->mmp_ub.ub_timestamp = gethrestime_sec();
350 	mmp_delay_update(spa, B_TRUE);
351 	mutex_exit(&mmp->mmp_io_lock);
352 }
353 
354 /*
355  * Choose a leaf vdev (via mmp_next_leaf()), then a random label and MMP
356  * slot within it, and write over that slot with a copy of the last-synced
357  * uberblock, whose timestamp has been updated to show the pool is in use.
358  */
359 static void
360 mmp_write_uberblock(spa_t *spa)
361 {
362 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
363 	mmp_thread_t *mmp = &spa->spa_mmp;
364 	uberblock_t *ub;
365 	vdev_t *vd = NULL;
366 	int label, error;
367 	uint64_t offset;
368 
369 	hrtime_t lock_acquire_time = gethrtime();
370 	spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
371 	lock_acquire_time = gethrtime() - lock_acquire_time;
372 	if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
373 		zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
374 		    (u_longlong_t)lock_acquire_time);
375 
376 	mutex_enter(&mmp->mmp_io_lock);
377 
378 	error = mmp_next_leaf(spa);
379 
380 	/*
381 	 * spa_mmp_history has two types of entries:
382 	 * Issued MMP write: records time issued, error status, etc.
383 	 * Skipped MMP write: an MMP write could not be issued because no
384 	 * suitable leaf vdev was available.  See comment above struct
385 	 * spa_mmp_history for details.
386 	 */
387 
388 	if (error) {
389 		mmp_delay_update(spa, B_FALSE);
390 		if (mmp->mmp_skip_error == error) {
391 			/*
392 			 * ZoL porting note: the following is TBD
393 			 * spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
394 			 */
395 		} else {
396 			mmp->mmp_skip_error = error;
397 			/*
398 			 * ZoL porting note: the following is TBD
399 			 * spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
400 			 * gethrestime_sec(), mmp->mmp_delay, NULL, 0,
401 			 * mmp->mmp_kstat_id++, error);
402 			 */
403 		}
404 		mutex_exit(&mmp->mmp_io_lock);
405 		spa_config_exit(spa, SCL_STATE, mmp_tag);
406 		return;
407 	}
408 
409 	vd = spa->spa_mmp.mmp_last_leaf;
410 	mmp->mmp_skip_error = 0;
411 
412 	if (mmp->mmp_zio_root == NULL)
413 		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
414 		    flags | ZIO_FLAG_GODFATHER);
415 
416 	ub = &mmp->mmp_ub;
417 	ub->ub_timestamp = gethrestime_sec();
418 	ub->ub_mmp_magic = MMP_MAGIC;
419 	ub->ub_mmp_delay = mmp->mmp_delay;
420 	vd->vdev_mmp_pending = gethrtime();
421 	vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;
422 
423 	zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
424 	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
425 	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
426 	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
427 
428 	mmp->mmp_kstat_id++;
429 	mutex_exit(&mmp->mmp_io_lock);
430 
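	/*
	 * Pick one of the MMP-reserved uberblock slots (the last
	 * MMP_BLOCKS_PER_LABEL slots) in a randomly chosen label; see the
	 * comment at the top of this file.
	 */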
431 	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
432 	    MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));
433 
434 	label = spa_get_random(VDEV_LABELS);
435 	vdev_label_write(zio, vd, label, ub_abd, offset,
436 	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
437 	    flags | ZIO_FLAG_DONT_PROPAGATE);
438 
439 	/*
440 	 * ZoL porting note: the following is TBD
441 	 * (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
442 	 * ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
443 	 */
444 
445 	zio_nowait(zio);
446 }
447 
448 static void
449 mmp_thread(void *arg)
450 {
451 	spa_t *spa = (spa_t *)arg;
452 	mmp_thread_t *mmp = &spa->spa_mmp;
453 	boolean_t last_spa_suspended = spa_suspended(spa);
454 	boolean_t last_spa_multihost = spa_multihost(spa);
455 	callb_cpr_t cpr;
456 	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
457 	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
458 
459 	mmp_thread_enter(mmp, &cpr);
460 
461 	/*
462 	 * The mmp_write_done() function calculates mmp_delay based on the
463 	 * prior value of mmp_delay and the elapsed time since the last write.
464 	 * For the first mmp write, there is no "last write", so we start
465 	 * with fake, but reasonable, default non-zero values.
466 	 */
467 	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
468 	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
469 	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;
470 
471 	while (!mmp->mmp_thread_exiting) {
472 		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
473 		uint64_t mmp_interval = MSEC2NSEC(
474 		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
475 		boolean_t suspended = spa_suspended(spa);
476 		boolean_t multihost = spa_multihost(spa);
477 		hrtime_t next_time;
478 
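		/*
		 * Schedule the next wakeup: with multihost on, spread one
		 * write per mmp_interval across the leaf vdevs; with it off,
		 * simply poll at the default rate in case the property is
		 * turned on later.
		 */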
479 		if (multihost)
480 			next_time = gethrtime() + mmp_interval /
481 			    MAX(vdev_count_leaves(spa), 1);
482 		else
483 			next_time = gethrtime() +
484 			    MSEC2NSEC(MMP_DEFAULT_INTERVAL);
485 
486 		/*
487 		 * MMP off => on, or suspended => !suspended:
488 		 * No writes occurred recently.  Update mmp_last_write to give
489 		 * us some time to try.
490 		 */
491 		if ((!last_spa_multihost && multihost) ||
492 		    (last_spa_suspended && !suspended)) {
493 			mutex_enter(&mmp->mmp_io_lock);
494 			mmp->mmp_last_write = gethrtime();
495 			mutex_exit(&mmp->mmp_io_lock);
496 		}
497 
498 		/*
499 		 * MMP on => off:
500 		 * mmp_delay == 0 tells importing node to skip activity check.
501 		 */
502 		if (last_spa_multihost && !multihost) {
503 			mutex_enter(&mmp->mmp_io_lock);
504 			mmp->mmp_delay = 0;
505 			mutex_exit(&mmp->mmp_io_lock);
506 		}
507 		last_spa_multihost = multihost;
508 		last_spa_suspended = suspended;
509 
510 		/*
511 		 * Smooth max_fail_ns when its factors are decreased, because
512 		 * making (max_fail_ns < mmp_interval) results in the pool being
513 		 * immediately suspended before writes can occur at the new
514 		 * higher frequency.
515 		 */
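		/*
		 * For example (hypothetical values): if zfs_multihost_interval
		 * is lowered from 1000 ms to 100 ms while
		 * zfs_multihost_fail_intervals = 5, max_fail_ns drifts from
		 * 5 s toward 500 ms by 1/32 of the remaining difference per
		 * loop pass instead of dropping all at once.
		 */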
516 		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
517 			max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
518 			    mmp_fail_intervals)) / 32;
519 		} else {
520 			max_fail_ns = mmp_interval * mmp_fail_intervals;
521 		}
522 
523 		/*
524 		 * Suspend the pool if no MMP write has succeeded in over
525 		 * mmp_interval * mmp_fail_intervals nanoseconds.
526 		 */
527 		if (!suspended && mmp_fail_intervals && multihost &&
528 		    (gethrtime() - mmp->mmp_last_write) > max_fail_ns) {
529 			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
530 			    "succeeded in over %llus; suspending pool",
531 			    spa_name(spa),
532 			    NSEC2SEC(gethrtime() - mmp->mmp_last_write));
533 			zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
534 		}
535 
536 		if (multihost && !suspended)
537 			mmp_write_uberblock(spa);
538 
539 		CALLB_CPR_SAFE_BEGIN(&cpr);
540 		(void) cv_timedwait_sig_hrtime(&mmp->mmp_thread_cv,
541 		    &mmp->mmp_thread_lock, next_time);
542 		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
543 	}
544 
545 	/* Outstanding writes are allowed to complete. */
546 	if (mmp->mmp_zio_root)
547 		zio_wait(mmp->mmp_zio_root);
548 
549 	mmp->mmp_zio_root = NULL;
550 	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
551 }
552 
553 /*
554  * Signal the MMP thread to wake it when it is sleeping on
555  * its cv.  Used when some module parameter has changed and
556  * we want the thread to know about it.
557  * Only signal if the pool is active and the mmp thread is
558  * running; otherwise there is no thread to wake.
559  */
560 static void
561 mmp_signal_thread(spa_t *spa)
562 {
563 	mmp_thread_t *mmp = &spa->spa_mmp;
564 
565 	mutex_enter(&mmp->mmp_thread_lock);
566 	if (mmp->mmp_thread)
567 		cv_broadcast(&mmp->mmp_thread_cv);
568 	mutex_exit(&mmp->mmp_thread_lock);
569 }
570 
571 void
572 mmp_signal_all_threads(void)
573 {
574 	spa_t *spa = NULL;
575 
576 	mutex_enter(&spa_namespace_lock);
577 	while ((spa = spa_next(spa))) {
578 		if (spa->spa_state == POOL_STATE_ACTIVE)
579 			mmp_signal_thread(spa);
580 	}
581 	mutex_exit(&spa_namespace_lock);
582 }
583