1#!/bin/ksh -p
2
3#
4# CDDL HEADER START
5#
6# This file and its contents are supplied under the terms of the
7# Common Development and Distribution License ("CDDL"), version 1.0.
8# You may only use this file in accordance with the terms of version
9# 1.0 of the CDDL.
10#
11# A full copy of the text of the CDDL should have accompanied this
12# source.  A copy of the CDDL is also available via the Internet at
13# http://www.illumos.org/license/CDDL.
14#
15# CDDL HEADER END
16#
17
18#
19# Copyright (c) 2019, Datto Inc. All rights reserved.
20#
21
22. $STF_SUITE/include/libtest.shlib
23. $STF_SUITE/tests/functional/resilver/resilver.cfg
24
# Path of the sysevent helper program; it is run further down with
# "-o <file>" and an event name to record resilver start events.
SYSEVENT="${STF_SUITE}/tests/functional/resilver/sysevent"
26
27#
28# DESCRIPTION:
29# Testing resilver restart logic both with and without the deferred resilver
30# feature enabled, verifying that resilver is not restarted when it is
# unnecessary.
32#
33# STRATEGY:
34# 1. Create a pool
# 2. Create four filesystems with the primary cache disabled to force reads
36# 3. Write four files simultaneously, one to each filesystem
37# 4. Do with and without deferred resilvers enabled
38#    a. Replace a vdev with a spare & suspend resilver immediately
39#    b. Verify resilver starts properly
40#    c. Offline / online another vdev to introduce a new DTL range
#    d. Verify the resilver restarts or is deferred
#    e. Inject read errors on the vdev that was offlined / onlined
43#    f. Verify that resilver did not restart
44#    g. Unsuspend resilver and wait for it to finish
45#    h. Verify that there are two resilvers and nothing is deferred
46#
47
#
# Undo everything this test changed: stop the event monitor, remove its
# output file, restore the saved tunables, clear zinject handlers, destroy
# the pool and delete the backing vdev files. Registered via log_onexit.
#
# The monitor process and its temporary file are handled first. The later
# steps run under log_must (defined outside this file; presumably it aborts
# the script on failure), and handling these two up front means such an
# abort cannot leave the background process or the temp file behind.
#
function cleanup
{
	# stop the writer before deleting the file it writes to
	if [[ -n "$EVTPID" ]]
	then
		kill "$EVTPID"
	fi
	if [[ -n "$EVTFILE" ]]
	then
		rm -f "$EVTFILE"
	fi

	# restore the values captured before the test modified them
	log_must set_tunable32 zfs_resilver_min_time_ms $ORIG_RESILVER_MIN_TIME
	log_must set_tunable32 zfs_scan_suspend_progress \
	    $ORIG_SCAN_SUSPEND_PROGRESS
	log_must zinject -c all
	destroy_pool $TESTPOOL
	rm -f "${VDEV_FILES[@]}" "$SPARE_VDEV_FILE"
}
59
#
# Count the resilver start events recorded so far and, optionally, check
# which vdev has the resilver defer flag set.
#
# Arguments:
#   msg   - text appended to the log messages (may be empty)
#   cnt   - expected number of lines in $EVTFILE
#   defer - empty : skip the deferred-resilver check
#           '-'   : no vdev may have the resilver defer flag
#           other : the single child index expected to have the flag
#
# Reads EVTFILE and TESTPOOL; sets RESILVERS and VDEV_DEFERS. Mismatches
# are reported through log_fail. Returns 0 when every check passes.
#
function verify_restarts # <msg> <cnt> <defer>
{
	typeset msg=$1
	typeset cnt=$2
	typeset defer=$3

	# each line of the event file is taken to be one resilver start
	RESILVERS=$(wc -l "$EVTFILE" | awk '{ print $1 }')
	log_note "expected $cnt resilver start(s)$msg, found $RESILVERS"
	if [[ "$RESILVERS" -ne "$cnt" ]]
	then
		log_fail "expected $cnt resilver start(s)$msg, found $RESILVERS"
	fi

	[[ -z "$defer" ]] && return 0

	# Scan the zdb config dump: remember the digits of the most recent
	# line mentioning "children" and print them whenever a line ends
	# with the resilver_defer feature name.
	VDEV_DEFERS=$(zdb -C $TESTPOOL | awk '
	    /children/ { gsub(/[^0-9]/, ""); child = $0 }
	    /com\.datto:resilver_defer$/ { print child }
	')

	if [[ "$defer" == "-" ]]
	then
		if [[ -n "$VDEV_DEFERS" ]]
		then
			log_fail "didn't expect any vdevs to have resilver deferred"
		fi
		return 0
	fi

	# Compare as strings: a numeric test would treat an empty result as 0
	# and wrongly accept "no vdev deferred" when vdev 0 is expected.
	if [[ "$VDEV_DEFERS" != "$defer" ]]
	then
		log_fail "resilver deferred set on unexpected vdev: $VDEV_DEFERS"
	fi
	return 0
}
91
log_assert "Check for unnecessary resilver restarts"

# remember the current tunable values so cleanup can restore them
ORIG_RESILVER_MIN_TIME=$(get_tunable zfs_resilver_min_time_ms)
ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable zfs_scan_suspend_progress)

# Expected results at the four verify_restarts checkpoints of a pass:
# RESTARTS / VDEVS are used while the resilver_defer feature is disabled,
# DEFER_RESTARTS / DEFER_VDEVS replace them once it is enabled. The *VDEVS
# entries are passed as verify_restarts' <defer> argument.
set -A RESTARTS -- '1' '2' '2' '2'
set -A VDEVS -- '' '' '' ''
set -A DEFER_RESTARTS -- '1' '1' '1' '2'
set -A DEFER_VDEVS -- '-' '2' '2' '-'

# arguments for the first "zpool replace": swap in the spare
VDEV_REPLACE="${VDEV_FILES[1]} $SPARE_VDEV_FILE"

log_onexit cleanup

# Monitor for resilver start events and log them to $EVTFILE as they occur.
# Fail early if no temporary file could be created; an empty EVTFILE would
# otherwise be handed to the monitor and to the later event counting.
EVTFILE=$(mktemp /tmp/resilver_events.XXXXXX)
log_must test -n "$EVTFILE"
EVTPID=$($SYSEVENT -o "$EVTFILE" ESC_ZFS_resilver_start)
log_must test -n "$EVTPID"

log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE

# start with the feature disabled; it is enabled for the second pass
log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL \
    raidz ${VDEV_FILES[@]}
115
# create 4 filesystems
for fs in fs{0..3}
do
	log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL/$fs
done

# simultaneously write 16M to each of them
set -A DATAPATHS /$TESTPOOL/fs{0..3}/dat.0
log_note "Writing data files"
dd_pids=""
for path in ${DATAPATHS[@]}
do
	dd if=/dev/urandom of=$path bs=1M count=16 > /dev/null 2>&1 &
	dd_pids="$dd_pids $!"
done

# A bare "wait" discards the writers' exit statuses, so wait for each one
# by pid and fail the test if any of the data files could not be written.
for pid in $dd_pids
do
	wait $pid || log_fail "failed to write data file (dd pid $pid)"
done
130
# Run the scenario twice: first with the deferred resilver feature
# disabled, then with it enabled.
for mode in "without" "with"
do
	log_note "Testing $mode deferred resilvers"

	if [[ $mode == "with" ]]; then
		# second pass: enable the feature, switch to the expected
		# values for deferred resilvers and replace in the opposite
		# direction (spare back to the original file)
		log_must zpool set feature@resilver_defer=enabled $TESTPOOL
		RESTARTS=( "${DEFER_RESTARTS[@]}" )
		VDEVS=( "${DEFER_VDEVS[@]}" )
		VDEV_REPLACE="$SPARE_VDEV_FILE ${VDEV_FILES[1]}"
	fi

	# start each pass with an empty event file
	: > $EVTFILE

	# limit scanning time
	log_must set_tunable32 zfs_resilver_min_time_ms 50

	# kick off a resilver, then suspend the scan as soon as possible
	log_must zpool replace $TESTPOOL $VDEV_REPLACE
	log_must set_tunable32 zfs_scan_suspend_progress 1

	# checkpoint 0: only the initial resilver start is expected
	verify_restarts '' "${RESTARTS[0]}" "${VDEVS[0]}"

	# Offline then online a vdev to introduce a new DTL range after the
	# current scan, which should restart (or defer) the resilver.
	log_must zpool offline $TESTPOOL ${VDEV_FILES[2]}
	log_must zpool sync $TESTPOOL
	log_must zpool online $TESTPOOL ${VDEV_FILES[2]}
	log_must zpool sync $TESTPOOL

	# checkpoint 1: 2 resilver starts w/o defer, 1 with defer
	verify_restarts ' after offline/online' "${RESTARTS[1]}" "${VDEVS[1]}"

	# Inject read io errors on the vdev, read a data file back, and make
	# sure that this does not restart the resilver.
	log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL
	log_must cat ${DATAPATHS[1]} > /dev/null
	log_must zinject -c all

	# checkpoint 2: still 2 resilver starts w/o defer, 1 with defer
	verify_restarts ' after zinject' "${RESTARTS[2]}" "${VDEVS[2]}"

	# let the suspended resilver run again
	log_must set_tunable32 zfs_scan_suspend_progress 0
	log_must set_tunable32 zfs_resilver_min_time_ms 3000

	# poll once a second, for at most 60 attempts, until resilvered
	tries=0
	while (( tries < 60 ))
	do
		is_pool_resilvered $TESTPOOL && break
		sleep 1
		(( tries += 1 ))
	done
	if ! is_pool_resilvered $TESTPOOL; then
		log_fail "resilver timed out"
	fi

	# wait for a few txg's to see if a resilver happens
	log_must zpool sync $TESTPOOL
	log_must zpool sync $TESTPOOL

	# checkpoint 3: there should now be 2 resilver starts
	verify_restarts ' after resilver' "${RESTARTS[3]}" "${VDEVS[3]}"
done

log_pass "Resilver did not restart unnecessarily"
197