xref: /illumos-gate/usr/src/uts/common/fs/zfs/range_tree.c (revision bf16b11e8deb633dd6c4296d46e92399d1582df4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/dmu.h>
32 #include <sys/dnode.h>
33 #include <sys/zio.h>
34 #include <sys/range_tree.h>
35 
36 static kmem_cache_t *range_seg_cache;
37 
38 void
39 range_tree_init(void)
40 {
41 	ASSERT(range_seg_cache == NULL);
42 	range_seg_cache = kmem_cache_create("range_seg_cache",
43 	    sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
44 }
45 
46 void
47 range_tree_fini(void)
48 {
49 	kmem_cache_destroy(range_seg_cache);
50 	range_seg_cache = NULL;
51 }
52 
53 void
54 range_tree_stat_verify(range_tree_t *rt)
55 {
56 	range_seg_t *rs;
57 	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
58 	int i;
59 
60 	for (rs = avl_first(&rt->rt_root); rs != NULL;
61 	    rs = AVL_NEXT(&rt->rt_root, rs)) {
62 		uint64_t size = rs->rs_end - rs->rs_start;
63 		int idx	= highbit64(size) - 1;
64 
65 		hist[idx]++;
66 		ASSERT3U(hist[idx], !=, 0);
67 	}
68 
69 	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
70 		if (hist[i] != rt->rt_histogram[i]) {
71 			zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu",
72 			    i, hist, hist[i], rt->rt_histogram[i]);
73 		}
74 		VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
75 	}
76 }
77 
78 static void
79 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
80 {
81 	uint64_t size = rs->rs_end - rs->rs_start;
82 	int idx = highbit64(size) - 1;
83 
84 	ASSERT3U(idx, <,
85 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
86 
87 	ASSERT(MUTEX_HELD(rt->rt_lock));
88 	rt->rt_histogram[idx]++;
89 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
90 }
91 
92 static void
93 range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
94 {
95 	uint64_t size = rs->rs_end - rs->rs_start;
96 	int idx = highbit64(size) - 1;
97 
98 	ASSERT3U(idx, <,
99 	    sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
100 
101 	ASSERT(MUTEX_HELD(rt->rt_lock));
102 	ASSERT3U(rt->rt_histogram[idx], !=, 0);
103 	rt->rt_histogram[idx]--;
104 }
105 
106 /*
107  * NOTE: caller is responsible for all locking.
108  */
109 static int
110 range_tree_seg_compare(const void *x1, const void *x2)
111 {
112 	const range_seg_t *r1 = x1;
113 	const range_seg_t *r2 = x2;
114 
115 	if (r1->rs_start < r2->rs_start) {
116 		if (r1->rs_end > r2->rs_start)
117 			return (0);
118 		return (-1);
119 	}
120 	if (r1->rs_start > r2->rs_start) {
121 		if (r1->rs_start < r2->rs_end)
122 			return (0);
123 		return (1);
124 	}
125 	return (0);
126 }
127 
128 range_tree_t *
129 range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
130 {
131 	range_tree_t *rt;
132 
133 	rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
134 
135 	avl_create(&rt->rt_root, range_tree_seg_compare,
136 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
137 
138 	rt->rt_lock = lp;
139 	rt->rt_ops = ops;
140 	rt->rt_arg = arg;
141 
142 	if (rt->rt_ops != NULL)
143 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
144 
145 	return (rt);
146 }
147 
148 void
149 range_tree_destroy(range_tree_t *rt)
150 {
151 	VERIFY0(rt->rt_space);
152 
153 	if (rt->rt_ops != NULL)
154 		rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
155 
156 	avl_destroy(&rt->rt_root);
157 	kmem_free(rt, sizeof (*rt));
158 }
159 
160 void
161 range_tree_add(void *arg, uint64_t start, uint64_t size)
162 {
163 	range_tree_t *rt = arg;
164 	avl_index_t where;
165 	range_seg_t rsearch, *rs_before, *rs_after, *rs;
166 	uint64_t end = start + size;
167 	boolean_t merge_before, merge_after;
168 
169 	ASSERT(MUTEX_HELD(rt->rt_lock));
170 	VERIFY(size != 0);
171 
172 	rsearch.rs_start = start;
173 	rsearch.rs_end = end;
174 	rs = avl_find(&rt->rt_root, &rsearch, &where);
175 
176 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
177 		zfs_panic_recover("zfs: allocating allocated segment"
178 		    "(offset=%llu size=%llu)\n",
179 		    (longlong_t)start, (longlong_t)size);
180 		return;
181 	}
182 
183 	/* Make sure we don't overlap with either of our neighbors */
184 	VERIFY(rs == NULL);
185 
186 	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
187 	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
188 
189 	merge_before = (rs_before != NULL && rs_before->rs_end == start);
190 	merge_after = (rs_after != NULL && rs_after->rs_start == end);
191 
192 	if (merge_before && merge_after) {
193 		avl_remove(&rt->rt_root, rs_before);
194 		if (rt->rt_ops != NULL) {
195 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
196 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
197 		}
198 
199 		range_tree_stat_decr(rt, rs_before);
200 		range_tree_stat_decr(rt, rs_after);
201 
202 		rs_after->rs_start = rs_before->rs_start;
203 		kmem_cache_free(range_seg_cache, rs_before);
204 		rs = rs_after;
205 	} else if (merge_before) {
206 		if (rt->rt_ops != NULL)
207 			rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
208 
209 		range_tree_stat_decr(rt, rs_before);
210 
211 		rs_before->rs_end = end;
212 		rs = rs_before;
213 	} else if (merge_after) {
214 		if (rt->rt_ops != NULL)
215 			rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
216 
217 		range_tree_stat_decr(rt, rs_after);
218 
219 		rs_after->rs_start = start;
220 		rs = rs_after;
221 	} else {
222 		rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
223 		rs->rs_start = start;
224 		rs->rs_end = end;
225 		avl_insert(&rt->rt_root, rs, where);
226 	}
227 
228 	if (rt->rt_ops != NULL)
229 		rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
230 
231 	range_tree_stat_incr(rt, rs);
232 	rt->rt_space += size;
233 }
234 
235 void
236 range_tree_remove(void *arg, uint64_t start, uint64_t size)
237 {
238 	range_tree_t *rt = arg;
239 	avl_index_t where;
240 	range_seg_t rsearch, *rs, *newseg;
241 	uint64_t end = start + size;
242 	boolean_t left_over, right_over;
243 
244 	ASSERT(MUTEX_HELD(rt->rt_lock));
245 	VERIFY3U(size, !=, 0);
246 	VERIFY3U(size, <=, rt->rt_space);
247 
248 	rsearch.rs_start = start;
249 	rsearch.rs_end = end;
250 	rs = avl_find(&rt->rt_root, &rsearch, &where);
251 
252 	/* Make sure we completely overlap with someone */
253 	if (rs == NULL) {
254 		zfs_panic_recover("zfs: freeing free segment "
255 		    "(offset=%llu size=%llu)",
256 		    (longlong_t)start, (longlong_t)size);
257 		return;
258 	}
259 	VERIFY3U(rs->rs_start, <=, start);
260 	VERIFY3U(rs->rs_end, >=, end);
261 
262 	left_over = (rs->rs_start != start);
263 	right_over = (rs->rs_end != end);
264 
265 	range_tree_stat_decr(rt, rs);
266 
267 	if (rt->rt_ops != NULL)
268 		rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
269 
270 	if (left_over && right_over) {
271 		newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
272 		newseg->rs_start = end;
273 		newseg->rs_end = rs->rs_end;
274 		range_tree_stat_incr(rt, newseg);
275 
276 		rs->rs_end = start;
277 
278 		avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
279 		if (rt->rt_ops != NULL)
280 			rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
281 	} else if (left_over) {
282 		rs->rs_end = start;
283 	} else if (right_over) {
284 		rs->rs_start = end;
285 	} else {
286 		avl_remove(&rt->rt_root, rs);
287 		kmem_cache_free(range_seg_cache, rs);
288 		rs = NULL;
289 	}
290 
291 	if (rs != NULL) {
292 		range_tree_stat_incr(rt, rs);
293 
294 		if (rt->rt_ops != NULL)
295 			rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
296 	}
297 
298 	rt->rt_space -= size;
299 }
300 
301 static range_seg_t *
302 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
303 {
304 	avl_index_t where;
305 	range_seg_t rsearch;
306 	uint64_t end = start + size;
307 
308 	ASSERT(MUTEX_HELD(rt->rt_lock));
309 	VERIFY(size != 0);
310 
311 	rsearch.rs_start = start;
312 	rsearch.rs_end = end;
313 	return (avl_find(&rt->rt_root, &rsearch, &where));
314 }
315 
316 static range_seg_t *
317 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
318 {
319 	range_seg_t *rs = range_tree_find_impl(rt, start, size);
320 	if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size)
321 		return (rs);
322 	return (NULL);
323 }
324 
325 void
326 range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size)
327 {
328 	range_seg_t *rs;
329 
330 	mutex_enter(rt->rt_lock);
331 	rs = range_tree_find(rt, off, size);
332 	if (rs != NULL)
333 		panic("freeing free block; rs=%p", (void *)rs);
334 	mutex_exit(rt->rt_lock);
335 }
336 
337 boolean_t
338 range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
339 {
340 	return (range_tree_find(rt, start, size) != NULL);
341 }
342 
343 /*
344  * Ensure that this range is not in the tree, regardless of whether
345  * it is currently in the tree.
346  */
347 void
348 range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
349 {
350 	range_seg_t *rs;
351 
352 	while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
353 		uint64_t free_start = MAX(rs->rs_start, start);
354 		uint64_t free_end = MIN(rs->rs_end, start + size);
355 		range_tree_remove(rt, free_start, free_end - free_start);
356 	}
357 }
358 
359 void
360 range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
361 {
362 	range_tree_t *rt;
363 
364 	ASSERT(MUTEX_HELD((*rtsrc)->rt_lock));
365 	ASSERT0(range_tree_space(*rtdst));
366 	ASSERT0(avl_numnodes(&(*rtdst)->rt_root));
367 
368 	rt = *rtsrc;
369 	*rtsrc = *rtdst;
370 	*rtdst = rt;
371 }
372 
373 void
374 range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
375 {
376 	range_seg_t *rs;
377 	void *cookie = NULL;
378 
379 	ASSERT(MUTEX_HELD(rt->rt_lock));
380 
381 	if (rt->rt_ops != NULL)
382 		rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
383 
384 	while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
385 		if (func != NULL)
386 			func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
387 		kmem_cache_free(range_seg_cache, rs);
388 	}
389 
390 	bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
391 	rt->rt_space = 0;
392 }
393 
394 void
395 range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
396 {
397 	range_seg_t *rs;
398 
399 	ASSERT(MUTEX_HELD(rt->rt_lock));
400 
401 	for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs))
402 		func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
403 }
404 
405 uint64_t
406 range_tree_space(range_tree_t *rt)
407 {
408 	return (rt->rt_space);
409 }
410