xref: /illumos-gate/usr/src/uts/common/fs/zfs/dsl_deadlist.c (revision cde58dbc6a23d4d38db7c8866312be83221c765f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/dsl_dataset.h>
26 #include <sys/dmu.h>
27 #include <sys/refcount.h>
28 #include <sys/zap.h>
29 #include <sys/zfs_context.h>
30 #include <sys/dsl_pool.h>
31 
32 static int
33 dsl_deadlist_compare(const void *arg1, const void *arg2)
34 {
35 	const dsl_deadlist_entry_t *dle1 = arg1;
36 	const dsl_deadlist_entry_t *dle2 = arg2;
37 
38 	if (dle1->dle_mintxg < dle2->dle_mintxg)
39 		return (-1);
40 	else if (dle1->dle_mintxg > dle2->dle_mintxg)
41 		return (+1);
42 	else
43 		return (0);
44 }
45 
46 static void
47 dsl_deadlist_load_tree(dsl_deadlist_t *dl)
48 {
49 	zap_cursor_t zc;
50 	zap_attribute_t za;
51 
52 	ASSERT(!dl->dl_oldfmt);
53 	if (dl->dl_havetree)
54 		return;
55 
56 	avl_create(&dl->dl_tree, dsl_deadlist_compare,
57 	    sizeof (dsl_deadlist_entry_t),
58 	    offsetof(dsl_deadlist_entry_t, dle_node));
59 	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
60 	    zap_cursor_retrieve(&zc, &za) == 0;
61 	    zap_cursor_advance(&zc)) {
62 		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
63 		dle->dle_mintxg = strtonum(za.za_name, NULL);
64 		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
65 		    za.za_first_integer));
66 		avl_add(&dl->dl_tree, dle);
67 	}
68 	zap_cursor_fini(&zc);
69 	dl->dl_havetree = B_TRUE;
70 }
71 
72 void
73 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
74 {
75 	dmu_object_info_t doi;
76 
77 	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
78 	dl->dl_os = os;
79 	dl->dl_object = object;
80 	VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
81 	dmu_object_info_from_db(dl->dl_dbuf, &doi);
82 	if (doi.doi_type == DMU_OT_BPOBJ) {
83 		dmu_buf_rele(dl->dl_dbuf, dl);
84 		dl->dl_dbuf = NULL;
85 		dl->dl_oldfmt = B_TRUE;
86 		VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
87 		return;
88 	}
89 
90 	dl->dl_oldfmt = B_FALSE;
91 	dl->dl_phys = dl->dl_dbuf->db_data;
92 	dl->dl_havetree = B_FALSE;
93 }
94 
95 void
96 dsl_deadlist_close(dsl_deadlist_t *dl)
97 {
98 	void *cookie = NULL;
99 	dsl_deadlist_entry_t *dle;
100 
101 	if (dl->dl_oldfmt) {
102 		dl->dl_oldfmt = B_FALSE;
103 		bpobj_close(&dl->dl_bpobj);
104 		return;
105 	}
106 
107 	if (dl->dl_havetree) {
108 		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
109 		    != NULL) {
110 			bpobj_close(&dle->dle_bpobj);
111 			kmem_free(dle, sizeof (*dle));
112 		}
113 		avl_destroy(&dl->dl_tree);
114 	}
115 	dmu_buf_rele(dl->dl_dbuf, dl);
116 	mutex_destroy(&dl->dl_lock);
117 	dl->dl_dbuf = NULL;
118 	dl->dl_phys = NULL;
119 }
120 
121 uint64_t
122 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
123 {
124 	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
125 		return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
126 	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
127 	    sizeof (dsl_deadlist_phys_t), tx));
128 }
129 
130 void
131 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
132 {
133 	dmu_object_info_t doi;
134 	zap_cursor_t zc;
135 	zap_attribute_t za;
136 
137 	VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
138 	if (doi.doi_type == DMU_OT_BPOBJ) {
139 		bpobj_free(os, dlobj, tx);
140 		return;
141 	}
142 
143 	for (zap_cursor_init(&zc, os, dlobj);
144 	    zap_cursor_retrieve(&zc, &za) == 0;
145 	    zap_cursor_advance(&zc))
146 		bpobj_free(os, za.za_first_integer, tx);
147 	zap_cursor_fini(&zc);
148 	VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
149 }
150 
151 void
152 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
153 {
154 	dsl_deadlist_entry_t dle_tofind;
155 	dsl_deadlist_entry_t *dle;
156 	avl_index_t where;
157 
158 	if (dl->dl_oldfmt) {
159 		bpobj_enqueue(&dl->dl_bpobj, bp, tx);
160 		return;
161 	}
162 
163 	dsl_deadlist_load_tree(dl);
164 
165 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
166 	mutex_enter(&dl->dl_lock);
167 	dl->dl_phys->dl_used +=
168 	    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
169 	dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
170 	dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
171 	mutex_exit(&dl->dl_lock);
172 
173 	dle_tofind.dle_mintxg = bp->blk_birth;
174 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
175 	if (dle == NULL)
176 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
177 	else
178 		dle = AVL_PREV(&dl->dl_tree, dle);
179 	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
180 }
181 
182 /*
183  * Insert new key in deadlist, which must be > all current entries.
184  * mintxg is not inclusive.
185  */
186 void
187 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
188 {
189 	uint64_t obj;
190 	dsl_deadlist_entry_t *dle;
191 
192 	if (dl->dl_oldfmt)
193 		return;
194 
195 	dsl_deadlist_load_tree(dl);
196 
197 	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
198 	dle->dle_mintxg = mintxg;
199 	obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
200 	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
201 	avl_add(&dl->dl_tree, dle);
202 
203 	VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
204 	    mintxg, obj, tx));
205 }
206 
207 /*
208  * Remove this key, merging its entries into the previous key.
209  */
210 void
211 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
212 {
213 	dsl_deadlist_entry_t dle_tofind;
214 	dsl_deadlist_entry_t *dle, *dle_prev;
215 
216 	if (dl->dl_oldfmt)
217 		return;
218 
219 	dsl_deadlist_load_tree(dl);
220 
221 	dle_tofind.dle_mintxg = mintxg;
222 	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
223 	dle_prev = AVL_PREV(&dl->dl_tree, dle);
224 
225 	bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
226 	    dle->dle_bpobj.bpo_object, tx);
227 
228 	avl_remove(&dl->dl_tree, dle);
229 	bpobj_close(&dle->dle_bpobj);
230 	kmem_free(dle, sizeof (*dle));
231 
232 	VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
233 }
234 
235 /*
236  * Walk ds's snapshots to regenerate generate ZAP & AVL.
237  */
238 static void
239 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
240     uint64_t mrs_obj, dmu_tx_t *tx)
241 {
242 	dsl_deadlist_t dl;
243 	dsl_pool_t *dp = dmu_objset_pool(os);
244 
245 	dsl_deadlist_open(&dl, os, dlobj);
246 	if (dl.dl_oldfmt) {
247 		dsl_deadlist_close(&dl);
248 		return;
249 	}
250 
251 	while (mrs_obj != 0) {
252 		dsl_dataset_t *ds;
253 		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
254 		dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
255 		mrs_obj = ds->ds_phys->ds_prev_snap_obj;
256 		dsl_dataset_rele(ds, FTAG);
257 	}
258 	dsl_deadlist_close(&dl);
259 }
260 
261 uint64_t
262 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
263     uint64_t mrs_obj, dmu_tx_t *tx)
264 {
265 	dsl_deadlist_entry_t *dle;
266 	uint64_t newobj;
267 
268 	newobj = dsl_deadlist_alloc(dl->dl_os, tx);
269 
270 	if (dl->dl_oldfmt) {
271 		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
272 		return (newobj);
273 	}
274 
275 	dsl_deadlist_load_tree(dl);
276 
277 	for (dle = avl_first(&dl->dl_tree); dle;
278 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
279 		uint64_t obj;
280 
281 		if (dle->dle_mintxg >= maxtxg)
282 			break;
283 
284 		obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
285 		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
286 		    dle->dle_mintxg, obj, tx));
287 	}
288 	return (newobj);
289 }
290 
291 void
292 dsl_deadlist_space(dsl_deadlist_t *dl,
293     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
294 {
295 	if (dl->dl_oldfmt) {
296 		VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
297 		    usedp, compp, uncompp));
298 		return;
299 	}
300 
301 	mutex_enter(&dl->dl_lock);
302 	*usedp = dl->dl_phys->dl_used;
303 	*compp = dl->dl_phys->dl_comp;
304 	*uncompp = dl->dl_phys->dl_uncomp;
305 	mutex_exit(&dl->dl_lock);
306 }
307 
308 /*
309  * return space used in the range (mintxg, maxtxg].
310  * Includes maxtxg, does not include mintxg.
311  * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
312  * UINT64_MAX).
313  */
314 void
315 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
316     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
317 {
318 	dsl_deadlist_entry_t dle_tofind;
319 	dsl_deadlist_entry_t *dle;
320 	avl_index_t where;
321 
322 	if (dl->dl_oldfmt) {
323 		VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
324 		    mintxg, maxtxg, usedp, compp, uncompp));
325 		return;
326 	}
327 
328 	dsl_deadlist_load_tree(dl);
329 	*usedp = *compp = *uncompp = 0;
330 
331 	dle_tofind.dle_mintxg = mintxg;
332 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
333 	/*
334 	 * If we don't find this mintxg, there shouldn't be anything
335 	 * after it either.
336 	 */
337 	ASSERT(dle != NULL ||
338 	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
339 	for (; dle && dle->dle_mintxg < maxtxg;
340 	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
341 		uint64_t used, comp, uncomp;
342 
343 		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
344 		    &used, &comp, &uncomp));
345 
346 		*usedp += used;
347 		*compp += comp;
348 		*uncompp += uncomp;
349 	}
350 }
351 
352 static void
353 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
354     dmu_tx_t *tx)
355 {
356 	dsl_deadlist_entry_t dle_tofind;
357 	dsl_deadlist_entry_t *dle;
358 	avl_index_t where;
359 	uint64_t used, comp, uncomp;
360 	bpobj_t bpo;
361 
362 	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
363 	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
364 	bpobj_close(&bpo);
365 
366 	dsl_deadlist_load_tree(dl);
367 
368 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
369 	mutex_enter(&dl->dl_lock);
370 	dl->dl_phys->dl_used += used;
371 	dl->dl_phys->dl_comp += comp;
372 	dl->dl_phys->dl_uncomp += uncomp;
373 	mutex_exit(&dl->dl_lock);
374 
375 	dle_tofind.dle_mintxg = birth;
376 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
377 	if (dle == NULL)
378 		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
379 	bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
380 }
381 
382 static int
383 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
384 {
385 	dsl_deadlist_t *dl = arg;
386 	dsl_deadlist_insert(dl, bp, tx);
387 	return (0);
388 }
389 
390 /*
391  * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
392  * an empty deadlist.
393  */
394 void
395 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
396 {
397 	zap_cursor_t zc;
398 	zap_attribute_t za;
399 	dmu_buf_t *bonus;
400 	dsl_deadlist_phys_t *dlp;
401 	dmu_object_info_t doi;
402 
403 	VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
404 	if (doi.doi_type == DMU_OT_BPOBJ) {
405 		bpobj_t bpo;
406 		VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
407 		VERIFY3U(0, ==, bpobj_iterate(&bpo,
408 		    dsl_deadlist_insert_cb, dl, tx));
409 		bpobj_close(&bpo);
410 		return;
411 	}
412 
413 	for (zap_cursor_init(&zc, dl->dl_os, obj);
414 	    zap_cursor_retrieve(&zc, &za) == 0;
415 	    zap_cursor_advance(&zc)) {
416 		uint64_t mintxg = strtonum(za.za_name, NULL);
417 		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
418 		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
419 	}
420 	zap_cursor_fini(&zc);
421 
422 	VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
423 	dlp = bonus->db_data;
424 	dmu_buf_will_dirty(bonus, tx);
425 	bzero(dlp, sizeof (*dlp));
426 	dmu_buf_rele(bonus, FTAG);
427 }
428 
429 /*
430  * Remove entries on dl that are >= mintxg, and put them on the bpobj.
431  */
432 void
433 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
434     dmu_tx_t *tx)
435 {
436 	dsl_deadlist_entry_t dle_tofind;
437 	dsl_deadlist_entry_t *dle;
438 	avl_index_t where;
439 
440 	ASSERT(!dl->dl_oldfmt);
441 	dmu_buf_will_dirty(dl->dl_dbuf, tx);
442 	dsl_deadlist_load_tree(dl);
443 
444 	dle_tofind.dle_mintxg = mintxg;
445 	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
446 	if (dle == NULL)
447 		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
448 	while (dle) {
449 		uint64_t used, comp, uncomp;
450 		dsl_deadlist_entry_t *dle_next;
451 
452 		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
453 
454 		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
455 		    &used, &comp, &uncomp));
456 		mutex_enter(&dl->dl_lock);
457 		ASSERT3U(dl->dl_phys->dl_used, >=, used);
458 		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
459 		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
460 		dl->dl_phys->dl_used -= used;
461 		dl->dl_phys->dl_comp -= comp;
462 		dl->dl_phys->dl_uncomp -= uncomp;
463 		mutex_exit(&dl->dl_lock);
464 
465 		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
466 		    dle->dle_mintxg, tx));
467 
468 		dle_next = AVL_NEXT(&dl->dl_tree, dle);
469 		avl_remove(&dl->dl_tree, dle);
470 		bpobj_close(&dle->dle_bpobj);
471 		kmem_free(dle, sizeof (*dle));
472 		dle = dle_next;
473 	}
474 }
475