xref: /illumos-gate/usr/src/uts/common/fs/zfs/bpobj.c (revision b420f3adeb349714478d1a7813d2c0e069d41555)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 by Delphix. All rights reserved.
24  */
25 
26 #include <sys/bpobj.h>
27 #include <sys/zfs_context.h>
28 #include <sys/refcount.h>
29 #include <sys/dsl_pool.h>
30 
31 uint64_t
32 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
33 {
34 	int size;
35 
36 	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
37 		size = BPOBJ_SIZE_V0;
38 	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
39 		size = BPOBJ_SIZE_V1;
40 	else
41 		size = sizeof (bpobj_phys_t);
42 
43 	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
44 	    DMU_OT_BPOBJ_HDR, size, tx));
45 }
46 
47 void
48 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
49 {
50 	int64_t i;
51 	bpobj_t bpo;
52 	dmu_object_info_t doi;
53 	int epb;
54 	dmu_buf_t *dbuf = NULL;
55 
56 	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
57 
58 	mutex_enter(&bpo.bpo_lock);
59 
60 	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
61 		goto out;
62 
63 	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
64 	epb = doi.doi_data_block_size / sizeof (uint64_t);
65 
66 	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
67 		uint64_t *objarray;
68 		uint64_t offset, blkoff;
69 
70 		offset = i * sizeof (uint64_t);
71 		blkoff = P2PHASE(i, epb);
72 
73 		if (dbuf == NULL || dbuf->db_offset > offset) {
74 			if (dbuf)
75 				dmu_buf_rele(dbuf, FTAG);
76 			VERIFY3U(0, ==, dmu_buf_hold(os,
77 			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
78 		}
79 
80 		ASSERT3U(offset, >=, dbuf->db_offset);
81 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
82 
83 		objarray = dbuf->db_data;
84 		bpobj_free(os, objarray[blkoff], tx);
85 	}
86 	if (dbuf) {
87 		dmu_buf_rele(dbuf, FTAG);
88 		dbuf = NULL;
89 	}
90 	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
91 
92 out:
93 	mutex_exit(&bpo.bpo_lock);
94 	bpobj_close(&bpo);
95 
96 	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
97 }
98 
99 int
100 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
101 {
102 	dmu_object_info_t doi;
103 	int err;
104 
105 	err = dmu_object_info(os, object, &doi);
106 	if (err)
107 		return (err);
108 
109 	bzero(bpo, sizeof (*bpo));
110 	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
111 
112 	ASSERT(bpo->bpo_dbuf == NULL);
113 	ASSERT(bpo->bpo_phys == NULL);
114 	ASSERT(object != 0);
115 	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
116 	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
117 
118 	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
119 	if (err)
120 		return (err);
121 
122 	bpo->bpo_os = os;
123 	bpo->bpo_object = object;
124 	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
125 	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
126 	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
127 	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
128 	return (0);
129 }
130 
131 void
132 bpobj_close(bpobj_t *bpo)
133 {
134 	/* Lame workaround for closing a bpobj that was never opened. */
135 	if (bpo->bpo_object == 0)
136 		return;
137 
138 	dmu_buf_rele(bpo->bpo_dbuf, bpo);
139 	if (bpo->bpo_cached_dbuf != NULL)
140 		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
141 	bpo->bpo_dbuf = NULL;
142 	bpo->bpo_phys = NULL;
143 	bpo->bpo_cached_dbuf = NULL;
144 	bpo->bpo_object = 0;
145 
146 	mutex_destroy(&bpo->bpo_lock);
147 }
148 
149 static int
150 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
151     boolean_t free)
152 {
153 	dmu_object_info_t doi;
154 	int epb;
155 	int64_t i;
156 	int err = 0;
157 	dmu_buf_t *dbuf = NULL;
158 
159 	mutex_enter(&bpo->bpo_lock);
160 
161 	if (free)
162 		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
163 
164 	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
165 		blkptr_t *bparray;
166 		blkptr_t *bp;
167 		uint64_t offset, blkoff;
168 
169 		offset = i * sizeof (blkptr_t);
170 		blkoff = P2PHASE(i, bpo->bpo_epb);
171 
172 		if (dbuf == NULL || dbuf->db_offset > offset) {
173 			if (dbuf)
174 				dmu_buf_rele(dbuf, FTAG);
175 			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
176 			    FTAG, &dbuf, 0);
177 			if (err)
178 				break;
179 		}
180 
181 		ASSERT3U(offset, >=, dbuf->db_offset);
182 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
183 
184 		bparray = dbuf->db_data;
185 		bp = &bparray[blkoff];
186 		err = func(arg, bp, tx);
187 		if (err)
188 			break;
189 		if (free) {
190 			bpo->bpo_phys->bpo_bytes -=
191 			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
192 			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
193 			if (bpo->bpo_havecomp) {
194 				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
195 				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
196 			}
197 			bpo->bpo_phys->bpo_num_blkptrs--;
198 			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
199 		}
200 	}
201 	if (dbuf) {
202 		dmu_buf_rele(dbuf, FTAG);
203 		dbuf = NULL;
204 	}
205 	if (free) {
206 		i++;
207 		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
208 		    i * sizeof (blkptr_t), -1ULL, tx));
209 	}
210 	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
211 		goto out;
212 
213 	ASSERT(bpo->bpo_havecomp);
214 	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
215 	if (err) {
216 		mutex_exit(&bpo->bpo_lock);
217 		return (err);
218 	}
219 	epb = doi.doi_data_block_size / sizeof (uint64_t);
220 
221 	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
222 		uint64_t *objarray;
223 		uint64_t offset, blkoff;
224 		bpobj_t sublist;
225 		uint64_t used_before, comp_before, uncomp_before;
226 		uint64_t used_after, comp_after, uncomp_after;
227 
228 		offset = i * sizeof (uint64_t);
229 		blkoff = P2PHASE(i, epb);
230 
231 		if (dbuf == NULL || dbuf->db_offset > offset) {
232 			if (dbuf)
233 				dmu_buf_rele(dbuf, FTAG);
234 			err = dmu_buf_hold(bpo->bpo_os,
235 			    bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
236 			if (err)
237 				break;
238 		}
239 
240 		ASSERT3U(offset, >=, dbuf->db_offset);
241 		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
242 
243 		objarray = dbuf->db_data;
244 		err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
245 		if (err)
246 			break;
247 		if (free) {
248 			err = bpobj_space(&sublist,
249 			    &used_before, &comp_before, &uncomp_before);
250 			if (err)
251 				break;
252 		}
253 		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
254 		if (free) {
255 			VERIFY3U(0, ==, bpobj_space(&sublist,
256 			    &used_after, &comp_after, &uncomp_after));
257 			bpo->bpo_phys->bpo_bytes -= used_before - used_after;
258 			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
259 			bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
260 			bpo->bpo_phys->bpo_uncomp -=
261 			    uncomp_before - uncomp_after;
262 		}
263 
264 		bpobj_close(&sublist);
265 		if (err)
266 			break;
267 		if (free) {
268 			err = dmu_object_free(bpo->bpo_os,
269 			    objarray[blkoff], tx);
270 			if (err)
271 				break;
272 			bpo->bpo_phys->bpo_num_subobjs--;
273 			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
274 		}
275 	}
276 	if (dbuf) {
277 		dmu_buf_rele(dbuf, FTAG);
278 		dbuf = NULL;
279 	}
280 	if (free) {
281 		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
282 		    bpo->bpo_phys->bpo_subobjs,
283 		    (i + 1) * sizeof (uint64_t), -1ULL, tx));
284 	}
285 
286 out:
287 	/* If there are no entries, there should be no bytes. */
288 	ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
289 	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
290 	    bpo->bpo_phys->bpo_bytes == 0);
291 
292 	mutex_exit(&bpo->bpo_lock);
293 	return (err);
294 }
295 
296 /*
297  * Iterate and remove the entries.  If func returns nonzero, iteration
298  * will stop and that entry will not be removed.
299  */
300 int
301 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
302 {
303 	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
304 }
305 
306 /*
307  * Iterate the entries.  If func returns nonzero, iteration will stop.
308  */
309 int
310 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
311 {
312 	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
313 }
314 
315 void
316 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
317 {
318 	bpobj_t subbpo;
319 	uint64_t used, comp, uncomp, subsubobjs;
320 
321 	ASSERT(bpo->bpo_havesubobj);
322 	ASSERT(bpo->bpo_havecomp);
323 
324 	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
325 	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
326 
327 	if (used == 0) {
328 		/* No point in having an empty subobj. */
329 		bpobj_close(&subbpo);
330 		bpobj_free(bpo->bpo_os, subobj, tx);
331 		return;
332 	}
333 
334 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
335 	if (bpo->bpo_phys->bpo_subobjs == 0) {
336 		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
337 		    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
338 	}
339 
340 	mutex_enter(&bpo->bpo_lock);
341 	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
342 	    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
343 	    sizeof (subobj), &subobj, tx);
344 	bpo->bpo_phys->bpo_num_subobjs++;
345 
346 	/*
347 	 * If subobj has only one block of subobjs, then move subobj's
348 	 * subobjs to bpo's subobj list directly.  This reduces
349 	 * recursion in bpobj_iterate due to nested subobjs.
350 	 */
351 	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
352 	if (subsubobjs != 0) {
353 		dmu_object_info_t doi;
354 
355 		VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
356 		if (doi.doi_max_offset == doi.doi_data_block_size) {
357 			dmu_buf_t *subdb;
358 			uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
359 
360 			VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
361 			    0, FTAG, &subdb, 0));
362 			dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
363 			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
364 			    numsubsub * sizeof (subobj), subdb->db_data, tx);
365 			dmu_buf_rele(subdb, FTAG);
366 			bpo->bpo_phys->bpo_num_subobjs += numsubsub;
367 
368 			dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
369 			subbpo.bpo_phys->bpo_subobjs = 0;
370 			VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
371 			    subsubobjs, tx));
372 		}
373 	}
374 	bpo->bpo_phys->bpo_bytes += used;
375 	bpo->bpo_phys->bpo_comp += comp;
376 	bpo->bpo_phys->bpo_uncomp += uncomp;
377 	mutex_exit(&bpo->bpo_lock);
378 
379 	bpobj_close(&subbpo);
380 }
381 
382 void
383 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
384 {
385 	blkptr_t stored_bp = *bp;
386 	uint64_t offset;
387 	int blkoff;
388 	blkptr_t *bparray;
389 
390 	ASSERT(!BP_IS_HOLE(bp));
391 
392 	/* We never need the fill count. */
393 	stored_bp.blk_fill = 0;
394 
395 	/* The bpobj will compress better if we can leave off the checksum */
396 	if (!BP_GET_DEDUP(bp))
397 		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
398 
399 	mutex_enter(&bpo->bpo_lock);
400 
401 	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
402 	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
403 
404 	if (bpo->bpo_cached_dbuf == NULL ||
405 	    offset < bpo->bpo_cached_dbuf->db_offset ||
406 	    offset >= bpo->bpo_cached_dbuf->db_offset +
407 	    bpo->bpo_cached_dbuf->db_size) {
408 		if (bpo->bpo_cached_dbuf)
409 			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
410 		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
411 		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
412 	}
413 
414 	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
415 	bparray = bpo->bpo_cached_dbuf->db_data;
416 	bparray[blkoff] = stored_bp;
417 
418 	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
419 	bpo->bpo_phys->bpo_num_blkptrs++;
420 	bpo->bpo_phys->bpo_bytes +=
421 	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
422 	if (bpo->bpo_havecomp) {
423 		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
424 		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
425 	}
426 	mutex_exit(&bpo->bpo_lock);
427 }
428 
429 struct space_range_arg {
430 	spa_t *spa;
431 	uint64_t mintxg;
432 	uint64_t maxtxg;
433 	uint64_t used;
434 	uint64_t comp;
435 	uint64_t uncomp;
436 };
437 
438 /* ARGSUSED */
439 static int
440 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
441 {
442 	struct space_range_arg *sra = arg;
443 
444 	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
445 		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
446 			sra->used += bp_get_dsize_sync(sra->spa, bp);
447 		else
448 			sra->used += bp_get_dsize(sra->spa, bp);
449 		sra->comp += BP_GET_PSIZE(bp);
450 		sra->uncomp += BP_GET_UCSIZE(bp);
451 	}
452 	return (0);
453 }
454 
455 int
456 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
457 {
458 	mutex_enter(&bpo->bpo_lock);
459 
460 	*usedp = bpo->bpo_phys->bpo_bytes;
461 	if (bpo->bpo_havecomp) {
462 		*compp = bpo->bpo_phys->bpo_comp;
463 		*uncompp = bpo->bpo_phys->bpo_uncomp;
464 		mutex_exit(&bpo->bpo_lock);
465 		return (0);
466 	} else {
467 		mutex_exit(&bpo->bpo_lock);
468 		return (bpobj_space_range(bpo, 0, UINT64_MAX,
469 		    usedp, compp, uncompp));
470 	}
471 }
472 
473 /*
474  * Return the amount of space in the bpobj which is:
475  * mintxg < blk_birth <= maxtxg
476  */
477 int
478 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
479     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
480 {
481 	struct space_range_arg sra = { 0 };
482 	int err;
483 
484 	/*
485 	 * As an optimization, if they want the whole txg range, just
486 	 * get bpo_bytes rather than iterating over the bps.
487 	 */
488 	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
489 		return (bpobj_space(bpo, usedp, compp, uncompp));
490 
491 	sra.spa = dmu_objset_spa(bpo->bpo_os);
492 	sra.mintxg = mintxg;
493 	sra.maxtxg = maxtxg;
494 
495 	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
496 	*usedp = sra.used;
497 	*compp = sra.comp;
498 	*uncompp = sra.uncomp;
499 	return (err);
500 }
501