1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2013 Martin Matuska. All rights reserved.
25 * Copyright (c) 2014 Joyent, Inc. All rights reserved.
26 */
27
28#include <sys/dmu.h>
29#include <sys/dmu_objset.h>
30#include <sys/dmu_tx.h>
31#include <sys/dsl_dataset.h>
32#include <sys/dsl_dir.h>
33#include <sys/dsl_prop.h>
34#include <sys/dsl_synctask.h>
35#include <sys/dsl_deleg.h>
36#include <sys/dmu_impl.h>
37#include <sys/spa.h>
38#include <sys/metaslab.h>
39#include <sys/zap.h>
40#include <sys/zio.h>
41#include <sys/arc.h>
42#include <sys/sunddi.h>
43#include <sys/zfeature.h>
44#include <sys/policy.h>
45#include <sys/zfs_znode.h>
46#include "zfs_namecheck.h"
47#include "zfs_prop.h"
48
49/*
50 * Filesystem and Snapshot Limits
51 * ------------------------------
52 *
53 * These limits are used to restrict the number of filesystems and/or snapshots
54 * that can be created at a given level in the tree or below. A typical
55 * use-case is with a delegated dataset where the administrator wants to ensure
56 * that a user within the zone is not creating too many additional filesystems
57 * or snapshots, even though they're not exceeding their space quota.
58 *
59 * The filesystem and snapshot counts are stored as extensible properties. This
60 * capability is controlled by a feature flag and must be enabled to be used.
61 * Once enabled, the feature is not active until the first limit is set. At
62 * that point, future operations to create/destroy filesystems or snapshots
63 * will validate and update the counts.
64 *
65 * Because the count properties will not exist before the feature is active,
66 * the counts are updated when a limit is first set on an uninitialized
67 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
68 * all of the nested filesystems/snapshots. Thus, a new leaf node has a
69 * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
70 * snapshot count properties on a node indicate uninitialized counts on that
71 * node.) When first setting a limit on an uninitialized node, the code starts
72 * at the filesystem with the new limit and descends into all sub-filesystems
73 * to add the count properties.
74 *
75 * In practice this is lightweight since a limit is typically set when the
76 * filesystem is created and thus has no children. Once valid, changing the
77 * limit value won't require a re-traversal since the counts are already valid.
78 * When recursively fixing the counts, if a node with a limit is encountered
79 * during the descent, the counts are known to be valid and there is no need to
80 * descend into that filesystem's children. The counts on filesystems above the
81 * one with the new limit will still be uninitialized, unless a limit is
82 * eventually set on one of those filesystems. The counts are always recursively
83 * updated when a limit is set on a dataset, unless there is already a limit.
84 * When a new limit value is set on a filesystem with an existing limit, it is
85 * possible for the new limit to be less than the current count at that level
86 * since a user who can change the limit is also allowed to exceed the limit.
87 *
88 * Once the feature is active, then whenever a filesystem or snapshot is
89 * created, the code recurses up the tree, validating the new count against the
90 * limit at each initialized level. In practice, most levels will not have a
91 * limit set. If there is a limit at any initialized level up the tree, the
92 * check must pass or the creation will fail. Likewise, when a filesystem or
93 * snapshot is destroyed, the counts are recursively adjusted all the way up
 * the initialized nodes in the tree. Renaming a filesystem into a different point
95 * in the tree will first validate, then update the counts on each branch up to
96 * the common ancestor. A receive will also validate the counts and then update
97 * them.
98 *
99 * An exception to the above behavior is that the limit is not enforced if the
100 * user has permission to modify the limit. This is primarily so that
101 * recursive snapshots in the global zone always work. We want to prevent a
102 * denial-of-service in which a lower level delegated dataset could max out its
103 * limit and thus block recursive snapshots from being taken in the global zone.
104 * Because of this, it is possible for the snapshot count to be over the limit
105 * and snapshots taken in the global zone could cause a lower level dataset to
106 * hit or exceed its limit. The administrator taking the global zone recursive
107 * snapshot should be aware of this side-effect and behave accordingly.
108 * For consistency, the filesystem limit is also not enforced if the user can
109 * modify the limit.
110 *
111 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
112 * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
113 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
114 * dsl_dir_init_fs_ss_count().
115 *
116 * There is a special case when we receive a filesystem that already exists. In
117 * this case a temporary clone name of %X is created (see dmu_recv_begin). We
118 * never update the filesystem counts for temporary clones.
119 *
120 * Likewise, we do not update the snapshot counts for temporary snapshots,
121 * such as those created by zfs diff.
122 */
123
124static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
125
126/* ARGSUSED */
127static void
128dsl_dir_evict(dmu_buf_t *db, void *arg)
129{
130	dsl_dir_t *dd = arg;
131	dsl_pool_t *dp = dd->dd_pool;
132	int t;
133
134	for (t = 0; t < TXG_SIZE; t++) {
135		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
136		ASSERT(dd->dd_tempreserved[t] == 0);
137		ASSERT(dd->dd_space_towrite[t] == 0);
138	}
139
140	if (dd->dd_parent)
141		dsl_dir_rele(dd->dd_parent, dd);
142
143	spa_close(dd->dd_pool->dp_spa, dd);
144
145	/*
146	 * The props callback list should have been cleaned up by
147	 * objset_evict().
148	 */
149	list_destroy(&dd->dd_prop_cbs);
150	mutex_destroy(&dd->dd_lock);
151	kmem_free(dd, sizeof (dsl_dir_t));
152}
153
154int
155dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
156    const char *tail, void *tag, dsl_dir_t **ddp)
157{
158	dmu_buf_t *dbuf;
159	dsl_dir_t *dd;
160	int err;
161
162	ASSERT(dsl_pool_config_held(dp));
163
164	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
165	if (err != 0)
166		return (err);
167	dd = dmu_buf_get_user(dbuf);
168#ifdef ZFS_DEBUG
169	{
170		dmu_object_info_t doi;
171		dmu_object_info_from_db(dbuf, &doi);
172		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
173		ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
174	}
175#endif
176	if (dd == NULL) {
177		dsl_dir_t *winner;
178
179		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
180		dd->dd_object = ddobj;
181		dd->dd_dbuf = dbuf;
182		dd->dd_pool = dp;
183		dd->dd_phys = dbuf->db_data;
184		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
185
186		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
187		    offsetof(dsl_prop_cb_record_t, cbr_node));
188
189		dsl_dir_snap_cmtime_update(dd);
190
191		if (dd->dd_phys->dd_parent_obj) {
192			err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
193			    NULL, dd, &dd->dd_parent);
194			if (err != 0)
195				goto errout;
196			if (tail) {
197#ifdef ZFS_DEBUG
198				uint64_t foundobj;
199
200				err = zap_lookup(dp->dp_meta_objset,
201				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
202				    tail, sizeof (foundobj), 1, &foundobj);
203				ASSERT(err || foundobj == ddobj);
204#endif
205				(void) strcpy(dd->dd_myname, tail);
206			} else {
207				err = zap_value_search(dp->dp_meta_objset,
208				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
209				    ddobj, 0, dd->dd_myname);
210			}
211			if (err != 0)
212				goto errout;
213		} else {
214			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
215		}
216
217		if (dsl_dir_is_clone(dd)) {
218			dmu_buf_t *origin_bonus;
219			dsl_dataset_phys_t *origin_phys;
220
221			/*
222			 * We can't open the origin dataset, because
223			 * that would require opening this dsl_dir.
224			 * Just look at its phys directly instead.
225			 */
226			err = dmu_bonus_hold(dp->dp_meta_objset,
227			    dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
228			if (err != 0)
229				goto errout;
230			origin_phys = origin_bonus->db_data;
231			dd->dd_origin_txg =
232			    origin_phys->ds_creation_txg;
233			dmu_buf_rele(origin_bonus, FTAG);
234		}
235
236		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
237		    dsl_dir_evict);
238		if (winner) {
239			if (dd->dd_parent)
240				dsl_dir_rele(dd->dd_parent, dd);
241			mutex_destroy(&dd->dd_lock);
242			kmem_free(dd, sizeof (dsl_dir_t));
243			dd = winner;
244		} else {
245			spa_open_ref(dp->dp_spa, dd);
246		}
247	}
248
249	/*
250	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
251	 * holds on the spa.  We need the open-to-close holds because
252	 * otherwise the spa_refcnt wouldn't change when we open a
253	 * dir which the spa also has open, so we could incorrectly
254	 * think it was OK to unload/export/destroy the pool.  We need
255	 * the instantiate-to-evict hold because the dsl_dir_t has a
256	 * pointer to the dd_pool, which has a pointer to the spa_t.
257	 */
258	spa_open_ref(dp->dp_spa, tag);
259	ASSERT3P(dd->dd_pool, ==, dp);
260	ASSERT3U(dd->dd_object, ==, ddobj);
261	ASSERT3P(dd->dd_dbuf, ==, dbuf);
262	*ddp = dd;
263	return (0);
264
265errout:
266	if (dd->dd_parent)
267		dsl_dir_rele(dd->dd_parent, dd);
268	mutex_destroy(&dd->dd_lock);
269	kmem_free(dd, sizeof (dsl_dir_t));
270	dmu_buf_rele(dbuf, tag);
271	return (err);
272}
273
274void
275dsl_dir_rele(dsl_dir_t *dd, void *tag)
276{
277	dprintf_dd(dd, "%s\n", "");
278	spa_close(dd->dd_pool->dp_spa, tag);
279	dmu_buf_rele(dd->dd_dbuf, tag);
280}
281
282/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
283void
284dsl_dir_name(dsl_dir_t *dd, char *buf)
285{
286	if (dd->dd_parent) {
287		dsl_dir_name(dd->dd_parent, buf);
288		(void) strcat(buf, "/");
289	} else {
290		buf[0] = '\0';
291	}
292	if (!MUTEX_HELD(&dd->dd_lock)) {
293		/*
294		 * recursive mutex so that we can use
295		 * dprintf_dd() with dd_lock held
296		 */
297		mutex_enter(&dd->dd_lock);
298		(void) strcat(buf, dd->dd_myname);
299		mutex_exit(&dd->dd_lock);
300	} else {
301		(void) strcat(buf, dd->dd_myname);
302	}
303}
304
305/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
306int
307dsl_dir_namelen(dsl_dir_t *dd)
308{
309	int result = 0;
310
311	if (dd->dd_parent) {
312		/* parent's name + 1 for the "/" */
313		result = dsl_dir_namelen(dd->dd_parent) + 1;
314	}
315
316	if (!MUTEX_HELD(&dd->dd_lock)) {
317		/* see dsl_dir_name */
318		mutex_enter(&dd->dd_lock);
319		result += strlen(dd->dd_myname);
320		mutex_exit(&dd->dd_lock);
321	} else {
322		result += strlen(dd->dd_myname);
323	}
324
325	return (result);
326}
327
328static int
329getcomponent(const char *path, char *component, const char **nextp)
330{
331	char *p;
332
333	if ((path == NULL) || (path[0] == '\0'))
334		return (SET_ERROR(ENOENT));
335	/* This would be a good place to reserve some namespace... */
336	p = strpbrk(path, "/@");
337	if (p && (p[1] == '/' || p[1] == '@')) {
338		/* two separators in a row */
339		return (SET_ERROR(EINVAL));
340	}
341	if (p == NULL || p == path) {
342		/*
343		 * if the first thing is an @ or /, it had better be an
344		 * @ and it had better not have any more ats or slashes,
345		 * and it had better have something after the @.
346		 */
347		if (p != NULL &&
348		    (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
349			return (SET_ERROR(EINVAL));
350		if (strlen(path) >= MAXNAMELEN)
351			return (SET_ERROR(ENAMETOOLONG));
352		(void) strcpy(component, path);
353		p = NULL;
354	} else if (p[0] == '/') {
355		if (p - path >= MAXNAMELEN)
356			return (SET_ERROR(ENAMETOOLONG));
357		(void) strncpy(component, path, p - path);
358		component[p - path] = '\0';
359		p++;
360	} else if (p[0] == '@') {
361		/*
362		 * if the next separator is an @, there better not be
363		 * any more slashes.
364		 */
365		if (strchr(path, '/'))
366			return (SET_ERROR(EINVAL));
367		if (p - path >= MAXNAMELEN)
368			return (SET_ERROR(ENAMETOOLONG));
369		(void) strncpy(component, path, p - path);
370		component[p - path] = '\0';
371	} else {
372		panic("invalid p=%p", (void *)p);
373	}
374	*nextp = p;
375	return (0);
376}
377
378/*
379 * Return the dsl_dir_t, and possibly the last component which couldn't
380 * be found in *tail.  The name must be in the specified dsl_pool_t.  This
381 * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
382 * path is bogus, or if tail==NULL and we couldn't parse the whole name.
383 * (*tail)[0] == '@' means that the last component is a snapshot.
384 */
385int
386dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
387    dsl_dir_t **ddp, const char **tailp)
388{
389	char buf[MAXNAMELEN];
390	const char *spaname, *next, *nextnext = NULL;
391	int err;
392	dsl_dir_t *dd;
393	uint64_t ddobj;
394
395	err = getcomponent(name, buf, &next);
396	if (err != 0)
397		return (err);
398
399	/* Make sure the name is in the specified pool. */
400	spaname = spa_name(dp->dp_spa);
401	if (strcmp(buf, spaname) != 0)
402		return (SET_ERROR(EINVAL));
403
404	ASSERT(dsl_pool_config_held(dp));
405
406	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
407	if (err != 0) {
408		return (err);
409	}
410
411	while (next != NULL) {
412		dsl_dir_t *child_ds;
413		err = getcomponent(next, buf, &nextnext);
414		if (err != 0)
415			break;
416		ASSERT(next[0] != '\0');
417		if (next[0] == '@')
418			break;
419		dprintf("looking up %s in obj%lld\n",
420		    buf, dd->dd_phys->dd_child_dir_zapobj);
421
422		err = zap_lookup(dp->dp_meta_objset,
423		    dd->dd_phys->dd_child_dir_zapobj,
424		    buf, sizeof (ddobj), 1, &ddobj);
425		if (err != 0) {
426			if (err == ENOENT)
427				err = 0;
428			break;
429		}
430
431		err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
432		if (err != 0)
433			break;
434		dsl_dir_rele(dd, tag);
435		dd = child_ds;
436		next = nextnext;
437	}
438
439	if (err != 0) {
440		dsl_dir_rele(dd, tag);
441		return (err);
442	}
443
444	/*
445	 * It's an error if there's more than one component left, or
446	 * tailp==NULL and there's any component left.
447	 */
448	if (next != NULL &&
449	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
450		/* bad path name */
451		dsl_dir_rele(dd, tag);
452		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
453		err = SET_ERROR(ENOENT);
454	}
455	if (tailp != NULL)
456		*tailp = next;
457	*ddp = dd;
458	return (err);
459}
460
461/*
462 * If the counts are already initialized for this filesystem and its
463 * descendants then do nothing, otherwise initialize the counts.
464 *
465 * The counts on this filesystem, and those below, may be uninitialized due to
466 * either the use of a pre-existing pool which did not support the
467 * filesystem/snapshot limit feature, or one in which the feature had not yet
468 * been enabled.
469 *
470 * Recursively descend the filesystem tree and update the filesystem/snapshot
471 * counts on each filesystem below, then update the cumulative count on the
472 * current filesystem. If the filesystem already has a count set on it,
473 * then we know that its counts, and the counts on the filesystems below it,
474 * are already correct, so we don't have to update this filesystem.
475 */
476static void
477dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
478{
479	uint64_t my_fs_cnt = 0;
480	uint64_t my_ss_cnt = 0;
481	dsl_pool_t *dp = dd->dd_pool;
482	objset_t *os = dp->dp_meta_objset;
483	zap_cursor_t *zc;
484	zap_attribute_t *za;
485	dsl_dataset_t *ds;
486
487	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
488	ASSERT(dsl_pool_config_held(dp));
489	ASSERT(dmu_tx_is_syncing(tx));
490
491	dsl_dir_zapify(dd, tx);
492
493	/*
494	 * If the filesystem count has already been initialized then we
495	 * don't need to recurse down any further.
496	 */
497	if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
498		return;
499
500	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
501	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
502
503	/* Iterate my child dirs */
504	for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
505	    zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
506		dsl_dir_t *chld_dd;
507		uint64_t count;
508
509		VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
510		    &chld_dd));
511
512		/*
513		 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
514		 * temporary datasets.
515		 */
516		if (chld_dd->dd_myname[0] == '$' ||
517		    chld_dd->dd_myname[0] == '%') {
518			dsl_dir_rele(chld_dd, FTAG);
519			continue;
520		}
521
522		my_fs_cnt++;	/* count this child */
523
524		dsl_dir_init_fs_ss_count(chld_dd, tx);
525
526		VERIFY0(zap_lookup(os, chld_dd->dd_object,
527		    DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
528		my_fs_cnt += count;
529		VERIFY0(zap_lookup(os, chld_dd->dd_object,
530		    DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
531		my_ss_cnt += count;
532
533		dsl_dir_rele(chld_dd, FTAG);
534	}
535	zap_cursor_fini(zc);
536	/* Count my snapshots (we counted children's snapshots above) */
537	VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
538	    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
539
540	for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
541	    zap_cursor_retrieve(zc, za) == 0;
542	    zap_cursor_advance(zc)) {
543		/* Don't count temporary snapshots */
544		if (za->za_name[0] != '%')
545			my_ss_cnt++;
546	}
547	zap_cursor_fini(zc);
548
549	dsl_dataset_rele(ds, FTAG);
550
551	kmem_free(zc, sizeof (zap_cursor_t));
552	kmem_free(za, sizeof (zap_attribute_t));
553
554	/* we're in a sync task, update counts */
555	dmu_buf_will_dirty(dd->dd_dbuf, tx);
556	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
557	    sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
558	VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
559	    sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
560}
561
562static int
563dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
564{
565	char *ddname = (char *)arg;
566	dsl_pool_t *dp = dmu_tx_pool(tx);
567	dsl_dataset_t *ds;
568	dsl_dir_t *dd;
569	int error;
570
571	error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
572	if (error != 0)
573		return (error);
574
575	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
576		dsl_dataset_rele(ds, FTAG);
577		return (SET_ERROR(ENOTSUP));
578	}
579
580	dd = ds->ds_dir;
581	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
582	    dsl_dir_is_zapified(dd) &&
583	    zap_contains(dp->dp_meta_objset, dd->dd_object,
584	    DD_FIELD_FILESYSTEM_COUNT) == 0) {
585		dsl_dataset_rele(ds, FTAG);
586		return (SET_ERROR(EALREADY));
587	}
588
589	dsl_dataset_rele(ds, FTAG);
590	return (0);
591}
592
593static void
594dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
595{
596	char *ddname = (char *)arg;
597	dsl_pool_t *dp = dmu_tx_pool(tx);
598	dsl_dataset_t *ds;
599	spa_t *spa;
600
601	VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
602
603	spa = dsl_dataset_get_spa(ds);
604
605	if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
606		/*
607		 * Since the feature was not active and we're now setting a
608		 * limit, increment the feature-active counter so that the
609		 * feature becomes active for the first time.
610		 *
611		 * We are already in a sync task so we can update the MOS.
612		 */
613		spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
614	}
615
616	/*
617	 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
618	 * we need to ensure the counts are correct. Descend down the tree from
619	 * this point and update all of the counts to be accurate.
620	 */
621	dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
622
623	dsl_dataset_rele(ds, FTAG);
624}
625
626/*
627 * Make sure the feature is enabled and activate it if necessary.
628 * Since we're setting a limit, ensure the on-disk counts are valid.
629 * This is only called by the ioctl path when setting a limit value.
630 *
631 * We do not need to validate the new limit, since users who can change the
632 * limit are also allowed to exceed the limit.
633 */
634int
635dsl_dir_activate_fs_ss_limit(const char *ddname)
636{
637	int error;
638
639	error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
640	    dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
641	    ZFS_SPACE_CHECK_RESERVED);
642
643	if (error == EALREADY)
644		error = 0;
645
646	return (error);
647}
648
649/*
650 * Used to determine if the filesystem_limit or snapshot_limit should be
651 * enforced. We allow the limit to be exceeded if the user has permission to
652 * write the property value. We pass in the creds that we got in the open
653 * context since we will always be the GZ root in syncing context. We also have
654 * to handle the case where we are allowed to change the limit on the current
655 * dataset, but there may be another limit in the tree above.
656 *
657 * We can never modify these two properties within a non-global zone. In
658 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
659 * can't use that function since we are already holding the dp_config_rwlock.
660 * In addition, we already have the dd and dealing with snapshots is simplified
661 * in this code.
662 */
663
/*
 * Verdict returned by dsl_enforce_ds_ss_limits() and consumed by
 * dsl_fs_ss_limit_check() for one dsl_dir.
 */
typedef enum {
	/* compare the count against the limit on this dir */
	ENFORCE_ALWAYS,
	/* do not check any limit, on this dir or above it */
	ENFORCE_NEVER,
	/* skip the limit on this dir, but keep checking its ancestors */
	ENFORCE_ABOVE
} enforce_res_t;
669
670static enforce_res_t
671dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
672{
673	enforce_res_t enforce = ENFORCE_ALWAYS;
674	uint64_t obj;
675	dsl_dataset_t *ds;
676	uint64_t zoned;
677
678	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
679	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
680
681#ifdef _KERNEL
682	if (crgetzoneid(cr) != GLOBAL_ZONEID)
683		return (ENFORCE_ALWAYS);
684
685	if (secpolicy_zfs(cr) == 0)
686		return (ENFORCE_NEVER);
687#endif
688
689	if ((obj = dd->dd_phys->dd_head_dataset_obj) == 0)
690		return (ENFORCE_ALWAYS);
691
692	ASSERT(dsl_pool_config_held(dd->dd_pool));
693
694	if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
695		return (ENFORCE_ALWAYS);
696
697	if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
698		/* Only root can access zoned fs's from the GZ */
699		enforce = ENFORCE_ALWAYS;
700	} else {
701		if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
702			enforce = ENFORCE_ABOVE;
703	}
704
705	dsl_dataset_rele(ds, FTAG);
706	return (enforce);
707}
708
709/*
710 * Check if adding additional child filesystem(s) would exceed any filesystem
711 * limits or adding additional snapshot(s) would exceed any snapshot limits.
712 * The prop argument indicates which limit to check.
713 *
714 * Note that all filesystem limits up to the root (or the highest
715 * initialized) filesystem or the given ancestor must be satisfied.
716 */
717int
718dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
719    dsl_dir_t *ancestor, cred_t *cr)
720{
721	objset_t *os = dd->dd_pool->dp_meta_objset;
722	uint64_t limit, count;
723	char *count_prop;
724	enforce_res_t enforce;
725	int err = 0;
726
727	ASSERT(dsl_pool_config_held(dd->dd_pool));
728	ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
729	    prop == ZFS_PROP_SNAPSHOT_LIMIT);
730
731	/*
732	 * If we're allowed to change the limit, don't enforce the limit
733	 * e.g. this can happen if a snapshot is taken by an administrative
734	 * user in the global zone (i.e. a recursive snapshot by root).
735	 * However, we must handle the case of delegated permissions where we
736	 * are allowed to change the limit on the current dataset, but there
737	 * is another limit in the tree above.
738	 */
739	enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
740	if (enforce == ENFORCE_NEVER)
741		return (0);
742
743	/*
744	 * e.g. if renaming a dataset with no snapshots, count adjustment
745	 * is 0.
746	 */
747	if (delta == 0)
748		return (0);
749
750	if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
751		/*
752		 * We don't enforce the limit for temporary snapshots. This is
753		 * indicated by a NULL cred_t argument.
754		 */
755		if (cr == NULL)
756			return (0);
757
758		count_prop = DD_FIELD_SNAPSHOT_COUNT;
759	} else {
760		count_prop = DD_FIELD_FILESYSTEM_COUNT;
761	}
762
763	/*
764	 * If an ancestor has been provided, stop checking the limit once we
765	 * hit that dir. We need this during rename so that we don't overcount
766	 * the check once we recurse up to the common ancestor.
767	 */
768	if (ancestor == dd)
769		return (0);
770
771	/*
772	 * If we hit an uninitialized node while recursing up the tree, we can
773	 * stop since we know there is no limit here (or above). The counts are
774	 * not valid on this node and we know we won't touch this node's counts.
775	 */
776	if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
777	    count_prop, sizeof (count), 1, &count) == ENOENT)
778		return (0);
779
780	err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
781	    B_FALSE);
782	if (err != 0)
783		return (err);
784
785	/* Is there a limit which we've hit? */
786	if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
787		return (SET_ERROR(EDQUOT));
788
789	if (dd->dd_parent != NULL)
790		err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
791		    ancestor, cr);
792
793	return (err);
794}
795
796/*
797 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
798 * parents. When a new filesystem/snapshot is created, increment the count on
799 * all parents, and when a filesystem/snapshot is destroyed, decrement the
800 * count.
801 */
802void
803dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
804    dmu_tx_t *tx)
805{
806	int err;
807	objset_t *os = dd->dd_pool->dp_meta_objset;
808	uint64_t count;
809
810	ASSERT(dsl_pool_config_held(dd->dd_pool));
811	ASSERT(dmu_tx_is_syncing(tx));
812	ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
813	    strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
814
815	/*
816	 * When we receive an incremental stream into a filesystem that already
817	 * exists, a temporary clone is created.  We don't count this temporary
818	 * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
819	 * $MOS & $ORIGIN) objsets.
820	 */
821	if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
822	    strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
823		return;
824
825	/*
826	 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
827	 */
828	if (delta == 0)
829		return;
830
831	/*
832	 * If we hit an uninitialized node while recursing up the tree, we can
833	 * stop since we know the counts are not valid on this node and we
834	 * know we shouldn't touch this node's counts. An uninitialized count
835	 * on the node indicates that either the feature has not yet been
836	 * activated or there are no limits on this part of the tree.
837	 */
838	if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
839	    prop, sizeof (count), 1, &count)) == ENOENT)
840		return;
841	VERIFY0(err);
842
843	count += delta;
844	/* Use a signed verify to make sure we're not neg. */
845	VERIFY3S(count, >=, 0);
846
847	VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
848	    tx));
849
850	/* Roll up this additional count into our ancestors */
851	if (dd->dd_parent != NULL)
852		dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
853}
854
855uint64_t
856dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
857    dmu_tx_t *tx)
858{
859	objset_t *mos = dp->dp_meta_objset;
860	uint64_t ddobj;
861	dsl_dir_phys_t *ddphys;
862	dmu_buf_t *dbuf;
863
864	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
865	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
866	if (pds) {
867		VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
868		    name, sizeof (uint64_t), 1, &ddobj, tx));
869	} else {
870		/* it's the root dir */
871		VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
872		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
873	}
874	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
875	dmu_buf_will_dirty(dbuf, tx);
876	ddphys = dbuf->db_data;
877
878	ddphys->dd_creation_time = gethrestime_sec();
879	if (pds) {
880		ddphys->dd_parent_obj = pds->dd_object;
881
882		/* update the filesystem counts */
883		dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
884	}
885	ddphys->dd_props_zapobj = zap_create(mos,
886	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
887	ddphys->dd_child_dir_zapobj = zap_create(mos,
888	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
889	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
890		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
891	dmu_buf_rele(dbuf, FTAG);
892
893	return (ddobj);
894}
895
896boolean_t
897dsl_dir_is_clone(dsl_dir_t *dd)
898{
899	return (dd->dd_phys->dd_origin_obj &&
900	    (dd->dd_pool->dp_origin_snap == NULL ||
901	    dd->dd_phys->dd_origin_obj !=
902	    dd->dd_pool->dp_origin_snap->ds_object));
903}
904
905void
906dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
907{
908	mutex_enter(&dd->dd_lock);
909	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
910	    dd->dd_phys->dd_used_bytes);
911	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
912	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
913	    dd->dd_phys->dd_reserved);
914	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
915	    dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
916	    (dd->dd_phys->dd_uncompressed_bytes * 100 /
917	    dd->dd_phys->dd_compressed_bytes));
918	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
919	    dd->dd_phys->dd_uncompressed_bytes);
920	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
921		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
922		    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
923		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
924		    dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
925		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
926		    dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
927		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
928		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
929		    dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
930	}
931	mutex_exit(&dd->dd_lock);
932
933	if (dsl_dir_is_zapified(dd)) {
934		uint64_t count;
935		objset_t *os = dd->dd_pool->dp_meta_objset;
936
937		if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
938		    sizeof (count), 1, &count) == 0) {
939			dsl_prop_nvlist_add_uint64(nv,
940			    ZFS_PROP_FILESYSTEM_COUNT, count);
941		}
942		if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
943		    sizeof (count), 1, &count) == 0) {
944			dsl_prop_nvlist_add_uint64(nv,
945			    ZFS_PROP_SNAPSHOT_COUNT, count);
946		}
947	}
948
949	if (dsl_dir_is_clone(dd)) {
950		dsl_dataset_t *ds;
951		char buf[MAXNAMELEN];
952
953		VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
954		    dd->dd_phys->dd_origin_obj, FTAG, &ds));
955		dsl_dataset_name(ds, buf);
956		dsl_dataset_rele(ds, FTAG);
957		dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
958	}
959}
960
961void
962dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
963{
964	dsl_pool_t *dp = dd->dd_pool;
965
966	ASSERT(dd->dd_phys);
967
968	if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
969		/* up the hold count until we can be written out */
970		dmu_buf_add_ref(dd->dd_dbuf, dd);
971	}
972}
973
974static int64_t
975parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
976{
977	uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
978	uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
979	return (new_accounted - old_accounted);
980}
981
982void
983dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
984{
985	ASSERT(dmu_tx_is_syncing(tx));
986
987	mutex_enter(&dd->dd_lock);
988	ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
989	dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
990	    dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
991	dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
992	mutex_exit(&dd->dd_lock);
993
994	/* release the hold from dsl_dir_dirty */
995	dmu_buf_rele(dd->dd_dbuf, dd);
996}
997
998static uint64_t
999dsl_dir_space_towrite(dsl_dir_t *dd)
1000{
1001	uint64_t space = 0;
1002	int i;
1003
1004	ASSERT(MUTEX_HELD(&dd->dd_lock));
1005
1006	for (i = 0; i < TXG_SIZE; i++) {
1007		space += dd->dd_space_towrite[i&TXG_MASK];
1008		ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
1009	}
1010	return (space);
1011}
1012
1013/*
1014 * How much space would dd have available if ancestor had delta applied
1015 * to it?  If ondiskonly is set, we're only interested in what's
1016 * on-disk, not estimated pending changes.
1017 */
1018uint64_t
1019dsl_dir_space_available(dsl_dir_t *dd,
1020    dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
1021{
1022	uint64_t parentspace, myspace, quota, used;
1023
1024	/*
1025	 * If there are no restrictions otherwise, assume we have
1026	 * unlimited space available.
1027	 */
1028	quota = UINT64_MAX;
1029	parentspace = UINT64_MAX;
1030
1031	if (dd->dd_parent != NULL) {
1032		parentspace = dsl_dir_space_available(dd->dd_parent,
1033		    ancestor, delta, ondiskonly);
1034	}
1035
1036	mutex_enter(&dd->dd_lock);
1037	if (dd->dd_phys->dd_quota != 0)
1038		quota = dd->dd_phys->dd_quota;
1039	used = dd->dd_phys->dd_used_bytes;
1040	if (!ondiskonly)
1041		used += dsl_dir_space_towrite(dd);
1042
1043	if (dd->dd_parent == NULL) {
1044		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
1045		quota = MIN(quota, poolsize);
1046	}
1047
1048	if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
1049		/*
1050		 * We have some space reserved, in addition to what our
1051		 * parent gave us.
1052		 */
1053		parentspace += dd->dd_phys->dd_reserved - used;
1054	}
1055
1056	if (dd == ancestor) {
1057		ASSERT(delta <= 0);
1058		ASSERT(used >= -delta);
1059		used += delta;
1060		if (parentspace != UINT64_MAX)
1061			parentspace -= delta;
1062	}
1063
1064	if (used > quota) {
1065		/* over quota */
1066		myspace = 0;
1067	} else {
1068		/*
1069		 * the lesser of the space provided by our parent and
1070		 * the space left in our quota
1071		 */
1072		myspace = MIN(parentspace, quota - used);
1073	}
1074
1075	mutex_exit(&dd->dd_lock);
1076
1077	return (myspace);
1078}
1079
/*
 * One entry in the list handed back as the cookie from
 * dsl_dir_tempreserve_space().
 */
struct tempreserve {
	list_node_t tr_node;	/* linkage on the reservation list */
	dsl_dir_t *tr_ds;	/* dir charged; NULL for the ARC entry */
	uint64_t tr_size;	/* amount to give back on clear */
};
1085
/*
 * Try to reserve asize bytes against dd for tx's txg, then recurse into
 * the parent with whatever amount parent_delta() reports.
 *
 * On the first level (first is set and tx has an objset),
 * dsl_dataset_check_quota() is called with checkrefquota and is handed
 * pointers to used_on_disk and ref_rsrv.  The dir quota is ignored when
 * ignorequota or netfree is set, or when no quota is set.  At the root dir
 * the limit is further capped by the adjusted pool size less deferred
 * frees, in which case the failure code becomes ENOSPC instead of EDQUOT.
 *
 * Each level that accepts the charge bumps dd_tempreserved[] for the txg
 * and appends a struct tempreserve to tr_list.  Nothing is undone here on
 * failure; dsl_dir_tempreserve_space() passes tr_list to
 * dsl_dir_tempreserve_clear() in that case.
 *
 * Returns 0 on success; otherwise the error from
 * dsl_dataset_check_quota(), or EDQUOT, ENOSPC or ERESTART.
 */
static int
dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
    boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
    dmu_tx_t *tx, boolean_t first)
{
	uint64_t txg = tx->tx_txg;
	uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
	uint64_t deferred = 0;
	struct tempreserve *tr;
	int retval = EDQUOT;
	int txgidx = txg & TXG_MASK;
	int i;
	uint64_t ref_rsrv = 0;

	ASSERT3U(txg, !=, 0);
	ASSERT3S(asize, >, 0);

	mutex_enter(&dd->dd_lock);

	/*
	 * Check against the dsl_dir's quota.  We don't add in the delta
	 * when checking for over-quota because they get one free hit.
	 */
	est_inflight = dsl_dir_space_towrite(dd);
	for (i = 0; i < TXG_SIZE; i++)
		est_inflight += dd->dd_tempreserved[i];
	used_on_disk = dd->dd_phys->dd_used_bytes;

	/*
	 * On the first iteration, fetch the dataset's used-on-disk and
	 * refreservation values. Also, if checkrefquota is set, test if
	 * allocating this space would exceed the dataset's refquota.
	 */
	if (first && tx->tx_objset) {
		int error;
		dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;

		error = dsl_dataset_check_quota(ds, checkrefquota,
		    asize, est_inflight, &used_on_disk, &ref_rsrv);
		if (error) {
			mutex_exit(&dd->dd_lock);
			return (error);
		}
	}

	/*
	 * If this transaction will result in a net free of space,
	 * we want to let it through.
	 */
	if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
		quota = UINT64_MAX;
	else
		quota = dd->dd_phys->dd_quota;

	/*
	 * Adjust the quota against the actual pool size at the root
	 * minus any outstanding deferred frees.
	 * To ensure that it's possible to remove files from a full
	 * pool without inducing transient overcommits, we throttle
	 * netfree transactions against a quota that is slightly larger,
	 * but still within the pool's allocation slop.  In cases where
	 * we're very close to full, this will allow a steady trickle of
	 * removes to get through.
	 */
	if (dd->dd_parent == NULL) {
		spa_t *spa = dd->dd_pool->dp_spa;
		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
		deferred = metaslab_class_get_deferred(spa_normal_class(spa));
		if (poolsize - deferred < quota) {
			quota = poolsize - deferred;
			/* the pool, not a quota, is now the binding limit */
			retval = ENOSPC;
		}
	}

	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes (which
	 * may free up space for us).
	 */
	if (used_on_disk + est_inflight >= quota) {
		if (est_inflight > 0 || used_on_disk < quota ||
		    (retval == ENOSPC && used_on_disk < quota + deferred))
			retval = ERESTART;
		dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
		    "quota=%lluK tr=%lluK err=%d\n",
		    used_on_disk>>10, est_inflight>>10,
		    quota>>10, asize>>10, retval);
		mutex_exit(&dd->dd_lock);
		return (SET_ERROR(retval));
	}

	/* We need to up our estimated delta before dropping dd_lock */
	dd->dd_tempreserved[txgidx] += asize;

	/* amount to charge the parent, computed while still under dd_lock */
	parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
	    asize - ref_rsrv);
	mutex_exit(&dd->dd_lock);

	/* record this level so dsl_dir_tempreserve_clear() can undo it */
	tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
	tr->tr_ds = dd;
	tr->tr_size = asize;
	list_insert_tail(tr_list, tr);

	/* see if it's OK with our parent */
	if (dd->dd_parent && parent_rsrv) {
		/*
		 * A dir with no head dataset object has its quota ignored
		 * at the parent level (passed as ignorequota).
		 */
		boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);

		return (dsl_dir_tempreserve_impl(dd->dd_parent,
		    parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
	} else {
		return (0);
	}
}
1200
1201/*
1202 * Reserve space in this dsl_dir, to be used in this tx's txg.
1203 * After the space has been dirtied (and dsl_dir_willuse_space()
1204 * has been called), the reservation should be canceled, using
1205 * dsl_dir_tempreserve_clear().
1206 */
1207int
1208dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
1209    uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
1210{
1211	int err;
1212	list_t *tr_list;
1213
1214	if (asize == 0) {
1215		*tr_cookiep = NULL;
1216		return (0);
1217	}
1218
1219	tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
1220	list_create(tr_list, sizeof (struct tempreserve),
1221	    offsetof(struct tempreserve, tr_node));
1222	ASSERT3S(asize, >, 0);
1223	ASSERT3S(fsize, >=, 0);
1224
1225	err = arc_tempreserve_space(lsize, tx->tx_txg);
1226	if (err == 0) {
1227		struct tempreserve *tr;
1228
1229		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
1230		tr->tr_size = lsize;
1231		list_insert_tail(tr_list, tr);
1232	} else {
1233		if (err == EAGAIN) {
1234			/*
1235			 * If arc_memory_throttle() detected that pageout
1236			 * is running and we are low on memory, we delay new
1237			 * non-pageout transactions to give pageout an
1238			 * advantage.
1239			 *
1240			 * It is unfortunate to be delaying while the caller's
1241			 * locks are held.
1242			 */
1243			txg_delay(dd->dd_pool, tx->tx_txg,
1244			    MSEC2NSEC(10), MSEC2NSEC(10));
1245			err = SET_ERROR(ERESTART);
1246		}
1247	}
1248
1249	if (err == 0) {
1250		err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
1251		    FALSE, asize > usize, tr_list, tx, TRUE);
1252	}
1253
1254	if (err != 0)
1255		dsl_dir_tempreserve_clear(tr_list, tx);
1256	else
1257		*tr_cookiep = tr_list;
1258
1259	return (err);
1260}
1261
1262/*
1263 * Clear a temporary reservation that we previously made with
1264 * dsl_dir_tempreserve_space().
1265 */
1266void
1267dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
1268{
1269	int txgidx = tx->tx_txg & TXG_MASK;
1270	list_t *tr_list = tr_cookie;
1271	struct tempreserve *tr;
1272
1273	ASSERT3U(tx->tx_txg, !=, 0);
1274
1275	if (tr_cookie == NULL)
1276		return;
1277
1278	while ((tr = list_head(tr_list)) != NULL) {
1279		if (tr->tr_ds) {
1280			mutex_enter(&tr->tr_ds->dd_lock);
1281			ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
1282			    tr->tr_size);
1283			tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
1284			mutex_exit(&tr->tr_ds->dd_lock);
1285		} else {
1286			arc_tempreserve_clear(tr->tr_size);
1287		}
1288		list_remove(tr_list, tr);
1289		kmem_free(tr, sizeof (struct tempreserve));
1290	}
1291
1292	kmem_free(tr_list, sizeof (list_t));
1293}
1294
1295/*
1296 * This should be called from open context when we think we're going to write
1297 * or free space, for example when dirtying data. Be conservative; it's okay
1298 * to write less space or free more, but we don't want to write more or free
1299 * less than the amount specified.
1300 */
1301void
1302dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
1303{
1304	int64_t parent_space;
1305	uint64_t est_used;
1306
1307	mutex_enter(&dd->dd_lock);
1308	if (space > 0)
1309		dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
1310
1311	est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
1312	parent_space = parent_delta(dd, est_used, space);
1313	mutex_exit(&dd->dd_lock);
1314
1315	/* Make sure that we clean up dd_space_to* */
1316	dsl_dir_dirty(dd, tx);
1317
1318	/* XXX this is potentially expensive and unnecessary... */
1319	if (parent_space && dd->dd_parent)
1320		dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
1321}
1322
/*
 * Call from syncing context when we actually write/free space for this dd.
 *
 * Applies the signed deltas 'used', 'compressed' and 'uncompressed' to the
 * dir's on-disk totals and, when the used breakdown is enabled, to the
 * 'type' bucket.  The change is then rolled up into the parent as
 * DD_USED_CHILD: the parent is charged the amount from parent_delta(), and
 * the remainder (used - accounted_delta) is shifted between its
 * DD_USED_CHILD_RSRV and DD_USED_CHILD buckets.
 */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
    int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
	int64_t accounted_delta;

	/*
	 * dsl_dataset_set_refreservation_sync_impl() calls this with
	 * dd_lock held, so that it can atomically update
	 * ds->ds_reserved and the dsl_dir accounting, so that
	 * dsl_dataset_check_quota() can see dataset and dir accounting
	 * consistently.
	 */
	boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(type < DD_USED_NUM);

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	if (needlock)
		mutex_enter(&dd->dd_lock);
	/* must be computed before dd_used_bytes is updated below */
	accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
	/* a negative delta may not take a counter below zero */
	ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
	ASSERT(compressed >= 0 ||
	    dd->dd_phys->dd_compressed_bytes >= -compressed);
	ASSERT(uncompressed >= 0 ||
	    dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
	dd->dd_phys->dd_used_bytes += used;
	dd->dd_phys->dd_uncompressed_bytes += uncompressed;
	dd->dd_phys->dd_compressed_bytes += compressed;

	if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		ASSERT(used > 0 ||
		    dd->dd_phys->dd_used_breakdown[type] >= -used);
		dd->dd_phys->dd_used_breakdown[type] += used;
#ifdef DEBUG
		/* the breakdown buckets must always sum to dd_used_bytes */
		dd_used_t t;
		uint64_t u = 0;
		for (t = 0; t < DD_USED_NUM; t++)
			u += dd->dd_phys->dd_used_breakdown[t];
		ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
#endif
	}
	if (needlock)
		mutex_exit(&dd->dd_lock);

	if (dd->dd_parent != NULL) {
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    accounted_delta, compressed, uncompressed, tx);
		dsl_dir_transfer_space(dd->dd_parent,
		    used - accounted_delta,
		    DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
	}
}
1379
1380void
1381dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
1382    dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
1383{
1384	ASSERT(dmu_tx_is_syncing(tx));
1385	ASSERT(oldtype < DD_USED_NUM);
1386	ASSERT(newtype < DD_USED_NUM);
1387
1388	if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
1389		return;
1390
1391	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1392	mutex_enter(&dd->dd_lock);
1393	ASSERT(delta > 0 ?
1394	    dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
1395	    dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
1396	ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
1397	dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
1398	dd->dd_phys->dd_used_breakdown[newtype] += delta;
1399	mutex_exit(&dd->dd_lock);
1400}
1401
/*
 * Argument passed to the check and sync functions of the quota and
 * reservation sync tasks.
 */
typedef struct dsl_dir_set_qr_arg {
	const char *ddsqra_name;	/* name used to hold the dataset */
	zprop_source_t ddsqra_source;	/* source given to the prop calls */
	uint64_t ddsqra_value;		/* requested quota or reservation */
} dsl_dir_set_qr_arg_t;
1407
1408static int
1409dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
1410{
1411	dsl_dir_set_qr_arg_t *ddsqra = arg;
1412	dsl_pool_t *dp = dmu_tx_pool(tx);
1413	dsl_dataset_t *ds;
1414	int error;
1415	uint64_t towrite, newval;
1416
1417	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1418	if (error != 0)
1419		return (error);
1420
1421	error = dsl_prop_predict(ds->ds_dir, "quota",
1422	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1423	if (error != 0) {
1424		dsl_dataset_rele(ds, FTAG);
1425		return (error);
1426	}
1427
1428	if (newval == 0) {
1429		dsl_dataset_rele(ds, FTAG);
1430		return (0);
1431	}
1432
1433	mutex_enter(&ds->ds_dir->dd_lock);
1434	/*
1435	 * If we are doing the preliminary check in open context, and
1436	 * there are pending changes, then don't fail it, since the
1437	 * pending changes could under-estimate the amount of space to be
1438	 * freed up.
1439	 */
1440	towrite = dsl_dir_space_towrite(ds->ds_dir);
1441	if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1442	    (newval < ds->ds_dir->dd_phys->dd_reserved ||
1443	    newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
1444		error = SET_ERROR(ENOSPC);
1445	}
1446	mutex_exit(&ds->ds_dir->dd_lock);
1447	dsl_dataset_rele(ds, FTAG);
1448	return (error);
1449}
1450
1451static void
1452dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
1453{
1454	dsl_dir_set_qr_arg_t *ddsqra = arg;
1455	dsl_pool_t *dp = dmu_tx_pool(tx);
1456	dsl_dataset_t *ds;
1457	uint64_t newval;
1458
1459	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1460
1461	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1462		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
1463		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1464		    &ddsqra->ddsqra_value, tx);
1465
1466		VERIFY0(dsl_prop_get_int_ds(ds,
1467		    zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
1468	} else {
1469		newval = ddsqra->ddsqra_value;
1470		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1471		    zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
1472	}
1473
1474	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1475	mutex_enter(&ds->ds_dir->dd_lock);
1476	ds->ds_dir->dd_phys->dd_quota = newval;
1477	mutex_exit(&ds->ds_dir->dd_lock);
1478	dsl_dataset_rele(ds, FTAG);
1479}
1480
1481int
1482dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1483{
1484	dsl_dir_set_qr_arg_t ddsqra;
1485
1486	ddsqra.ddsqra_name = ddname;
1487	ddsqra.ddsqra_source = source;
1488	ddsqra.ddsqra_value = quota;
1489
1490	return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
1491	    dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
1492}
1493
1494int
1495dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
1496{
1497	dsl_dir_set_qr_arg_t *ddsqra = arg;
1498	dsl_pool_t *dp = dmu_tx_pool(tx);
1499	dsl_dataset_t *ds;
1500	dsl_dir_t *dd;
1501	uint64_t newval, used, avail;
1502	int error;
1503
1504	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1505	if (error != 0)
1506		return (error);
1507	dd = ds->ds_dir;
1508
1509	/*
1510	 * If we are doing the preliminary check in open context, the
1511	 * space estimates may be inaccurate.
1512	 */
1513	if (!dmu_tx_is_syncing(tx)) {
1514		dsl_dataset_rele(ds, FTAG);
1515		return (0);
1516	}
1517
1518	error = dsl_prop_predict(ds->ds_dir,
1519	    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1520	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1521	if (error != 0) {
1522		dsl_dataset_rele(ds, FTAG);
1523		return (error);
1524	}
1525
1526	mutex_enter(&dd->dd_lock);
1527	used = dd->dd_phys->dd_used_bytes;
1528	mutex_exit(&dd->dd_lock);
1529
1530	if (dd->dd_parent) {
1531		avail = dsl_dir_space_available(dd->dd_parent,
1532		    NULL, 0, FALSE);
1533	} else {
1534		avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1535	}
1536
1537	if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
1538		uint64_t delta = MAX(used, newval) -
1539		    MAX(used, dd->dd_phys->dd_reserved);
1540
1541		if (delta > avail ||
1542		    (dd->dd_phys->dd_quota > 0 &&
1543		    newval > dd->dd_phys->dd_quota))
1544			error = SET_ERROR(ENOSPC);
1545	}
1546
1547	dsl_dataset_rele(ds, FTAG);
1548	return (error);
1549}
1550
1551void
1552dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1553{
1554	uint64_t used;
1555	int64_t delta;
1556
1557	dmu_buf_will_dirty(dd->dd_dbuf, tx);
1558
1559	mutex_enter(&dd->dd_lock);
1560	used = dd->dd_phys->dd_used_bytes;
1561	delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
1562	dd->dd_phys->dd_reserved = value;
1563
1564	if (dd->dd_parent != NULL) {
1565		/* Roll up this additional usage into our ancestors */
1566		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1567		    delta, 0, 0, tx);
1568	}
1569	mutex_exit(&dd->dd_lock);
1570}
1571
1572
1573static void
1574dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
1575{
1576	dsl_dir_set_qr_arg_t *ddsqra = arg;
1577	dsl_pool_t *dp = dmu_tx_pool(tx);
1578	dsl_dataset_t *ds;
1579	uint64_t newval;
1580
1581	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1582
1583	if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1584		dsl_prop_set_sync_impl(ds,
1585		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1586		    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1587		    &ddsqra->ddsqra_value, tx);
1588
1589		VERIFY0(dsl_prop_get_int_ds(ds,
1590		    zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
1591	} else {
1592		newval = ddsqra->ddsqra_value;
1593		spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1594		    zfs_prop_to_name(ZFS_PROP_RESERVATION),
1595		    (longlong_t)newval);
1596	}
1597
1598	dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
1599	dsl_dataset_rele(ds, FTAG);
1600}
1601
1602int
1603dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1604    uint64_t reservation)
1605{
1606	dsl_dir_set_qr_arg_t ddsqra;
1607
1608	ddsqra.ddsqra_name = ddname;
1609	ddsqra.ddsqra_source = source;
1610	ddsqra.ddsqra_value = reservation;
1611
1612	return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
1613	    dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
1614}
1615
1616static dsl_dir_t *
1617closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1618{
1619	for (; ds1; ds1 = ds1->dd_parent) {
1620		dsl_dir_t *dd;
1621		for (dd = ds2; dd; dd = dd->dd_parent) {
1622			if (ds1 == dd)
1623				return (dd);
1624		}
1625	}
1626	return (NULL);
1627}
1628
1629/*
1630 * If delta is applied to dd, how much of that delta would be applied to
1631 * ancestor?  Syncing context only.
1632 */
1633static int64_t
1634would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1635{
1636	if (dd == ancestor)
1637		return (delta);
1638
1639	mutex_enter(&dd->dd_lock);
1640	delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1641	mutex_exit(&dd->dd_lock);
1642	return (would_change(dd->dd_parent, delta, ancestor));
1643}
1644
/*
 * Argument passed to the check and sync functions of the rename sync task.
 */
typedef struct dsl_dir_rename_arg {
	const char *ddra_oldname;	/* current name of the dir */
	const char *ddra_newname;	/* full new name */
	cred_t *ddra_cred;		/* CRED() of the dsl_dir_rename() caller */
} dsl_dir_rename_arg_t;
1650
1651/* ARGSUSED */
1652static int
1653dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1654{
1655	int *deltap = arg;
1656	char namebuf[MAXNAMELEN];
1657
1658	dsl_dataset_name(ds, namebuf);
1659
1660	if (strlen(namebuf) + *deltap >= MAXNAMELEN)
1661		return (SET_ERROR(ENAMETOOLONG));
1662	return (0);
1663}
1664
1665static int
1666dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1667{
1668	dsl_dir_rename_arg_t *ddra = arg;
1669	dsl_pool_t *dp = dmu_tx_pool(tx);
1670	dsl_dir_t *dd, *newparent;
1671	const char *mynewname;
1672	int error;
1673	int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
1674
1675	/* target dir should exist */
1676	error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1677	if (error != 0)
1678		return (error);
1679
1680	/* new parent should exist */
1681	error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1682	    &newparent, &mynewname);
1683	if (error != 0) {
1684		dsl_dir_rele(dd, FTAG);
1685		return (error);
1686	}
1687
1688	/* can't rename to different pool */
1689	if (dd->dd_pool != newparent->dd_pool) {
1690		dsl_dir_rele(newparent, FTAG);
1691		dsl_dir_rele(dd, FTAG);
1692		return (SET_ERROR(ENXIO));
1693	}
1694
1695	/* new name should not already exist */
1696	if (mynewname == NULL) {
1697		dsl_dir_rele(newparent, FTAG);
1698		dsl_dir_rele(dd, FTAG);
1699		return (SET_ERROR(EEXIST));
1700	}
1701
1702	/* if the name length is growing, validate child name lengths */
1703	if (delta > 0) {
1704		error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
1705		    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1706		if (error != 0) {
1707			dsl_dir_rele(newparent, FTAG);
1708			dsl_dir_rele(dd, FTAG);
1709			return (error);
1710		}
1711	}
1712
1713	if (dmu_tx_is_syncing(tx)) {
1714		if (spa_feature_is_active(dp->dp_spa,
1715		    SPA_FEATURE_FS_SS_LIMIT)) {
1716			/*
1717			 * Although this is the check function and we don't
1718			 * normally make on-disk changes in check functions,
1719			 * we need to do that here.
1720			 *
1721			 * Ensure this portion of the tree's counts have been
1722			 * initialized in case the new parent has limits set.
1723			 */
1724			dsl_dir_init_fs_ss_count(dd, tx);
1725		}
1726	}
1727
1728	if (newparent != dd->dd_parent) {
1729		/* is there enough space? */
1730		uint64_t myspace =
1731		    MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1732		objset_t *os = dd->dd_pool->dp_meta_objset;
1733		uint64_t fs_cnt = 0;
1734		uint64_t ss_cnt = 0;
1735
1736		if (dsl_dir_is_zapified(dd)) {
1737			int err;
1738
1739			err = zap_lookup(os, dd->dd_object,
1740			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
1741			    &fs_cnt);
1742			if (err != ENOENT && err != 0) {
1743				dsl_dir_rele(newparent, FTAG);
1744				dsl_dir_rele(dd, FTAG);
1745				return (err);
1746			}
1747
1748			/*
1749			 * have to add 1 for the filesystem itself that we're
1750			 * moving
1751			 */
1752			fs_cnt++;
1753
1754			err = zap_lookup(os, dd->dd_object,
1755			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
1756			    &ss_cnt);
1757			if (err != ENOENT && err != 0) {
1758				dsl_dir_rele(newparent, FTAG);
1759				dsl_dir_rele(dd, FTAG);
1760				return (err);
1761			}
1762		}
1763
1764		/* no rename into our descendant */
1765		if (closest_common_ancestor(dd, newparent) == dd) {
1766			dsl_dir_rele(newparent, FTAG);
1767			dsl_dir_rele(dd, FTAG);
1768			return (SET_ERROR(EINVAL));
1769		}
1770
1771		error = dsl_dir_transfer_possible(dd->dd_parent,
1772		    newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
1773		if (error != 0) {
1774			dsl_dir_rele(newparent, FTAG);
1775			dsl_dir_rele(dd, FTAG);
1776			return (error);
1777		}
1778	}
1779
1780	dsl_dir_rele(newparent, FTAG);
1781	dsl_dir_rele(dd, FTAG);
1782	return (0);
1783}
1784
/*
 * Sync-task sync function for renaming a dsl_dir.
 *
 * Logs the rename.  When the parent changes, it moves the dir's
 * filesystem/snapshot counts and its space accounting (including any
 * unused reservation) from the old parent to the new one.  It then
 * re-links the dir: the entry is removed from the old parent's child-dir
 * ZAP, dd_myname and the parent pointers are updated, and the entry is
 * added to the new parent's child-dir ZAP.
 */
static void
dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dir_rename_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *dd, *newparent;
	const char *mynewname;
	int error;
	objset_t *mos = dp->dp_meta_objset;

	VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
	VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
	    &mynewname));

	/* Log this before we change the name. */
	spa_history_log_internal_dd(dd, "rename", tx,
	    "-> %s", ddra->ddra_newname);

	if (newparent != dd->dd_parent) {
		objset_t *os = dd->dd_pool->dp_meta_objset;
		uint64_t fs_cnt = 0;
		uint64_t ss_cnt = 0;

		/*
		 * We already made sure the dd counts were initialized in the
		 * check function.
		 */
		if (spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_FS_SS_LIMIT)) {
			VERIFY0(zap_lookup(os, dd->dd_object,
			    DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
			    &fs_cnt));
			/* add 1 for the filesystem itself that we're moving */
			fs_cnt++;

			VERIFY0(zap_lookup(os, dd->dd_object,
			    DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
			    &ss_cnt));
		}

		/* move the counts from the old parent to the new one */
		dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
		    DD_FIELD_FILESYSTEM_COUNT, tx);
		dsl_fs_ss_count_adjust(newparent, fs_cnt,
		    DD_FIELD_FILESYSTEM_COUNT, tx);

		dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
		    DD_FIELD_SNAPSHOT_COUNT, tx);
		dsl_fs_ss_count_adjust(newparent, ss_cnt,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

		/* move the space accounting the same way */
		dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
		    -dd->dd_phys->dd_used_bytes,
		    -dd->dd_phys->dd_compressed_bytes,
		    -dd->dd_phys->dd_uncompressed_bytes, tx);
		dsl_dir_diduse_space(newparent, DD_USED_CHILD,
		    dd->dd_phys->dd_used_bytes,
		    dd->dd_phys->dd_compressed_bytes,
		    dd->dd_phys->dd_uncompressed_bytes, tx);

		/* reservation beyond current usage is charged separately */
		if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
			uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
			    dd->dd_phys->dd_used_bytes;

			dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
			    -unused_rsrv, 0, 0, tx);
			dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
			    unused_rsrv, 0, 0, tx);
		}
	}

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/* remove from old parent zapobj */
	/*
	 * NOTE(review): a zap_remove() failure is only caught by ASSERT0,
	 * while the zap_add() below uses VERIFY0 -- confirm whether that
	 * difference is intended.
	 */
	error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
	    dd->dd_myname, tx);
	ASSERT0(error);

	/*
	 * NOTE(review): unbounded copy; presumably the length of mynewname
	 * was validated before this point -- confirm against the callers.
	 */
	(void) strcpy(dd->dd_myname, mynewname);
	/* swap the hold on the old parent for one on the new parent */
	dsl_dir_rele(dd->dd_parent, dd);
	dd->dd_phys->dd_parent_obj = newparent->dd_object;
	VERIFY0(dsl_dir_hold_obj(dp,
	    newparent->dd_object, NULL, dd, &dd->dd_parent));

	/* add to new parent zapobj */
	VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
	    dd->dd_myname, 8, 1, &dd->dd_object, tx));

	dsl_prop_notify_all(dd);

	dsl_dir_rele(newparent, FTAG);
	dsl_dir_rele(dd, FTAG);
}
1877
1878int
1879dsl_dir_rename(const char *oldname, const char *newname)
1880{
1881	dsl_dir_rename_arg_t ddra;
1882
1883	ddra.ddra_oldname = oldname;
1884	ddra.ddra_newname = newname;
1885	ddra.ddra_cred = CRED();
1886
1887	return (dsl_sync_task(oldname,
1888	    dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
1889	    3, ZFS_SPACE_CHECK_RESERVED));
1890}
1891
1892int
1893dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
1894    uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
1895{
1896	dsl_dir_t *ancestor;
1897	int64_t adelta;
1898	uint64_t avail;
1899	int err;
1900
1901	ancestor = closest_common_ancestor(sdd, tdd);
1902	adelta = would_change(sdd, -space, ancestor);
1903	avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1904	if (avail < space)
1905		return (SET_ERROR(ENOSPC));
1906
1907	err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
1908	    ancestor, cr);
1909	if (err != 0)
1910		return (err);
1911	err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
1912	    ancestor, cr);
1913	if (err != 0)
1914		return (err);
1915
1916	return (0);
1917}
1918
1919timestruc_t
1920dsl_dir_snap_cmtime(dsl_dir_t *dd)
1921{
1922	timestruc_t t;
1923
1924	mutex_enter(&dd->dd_lock);
1925	t = dd->dd_snap_cmtime;
1926	mutex_exit(&dd->dd_lock);
1927
1928	return (t);
1929}
1930
1931void
1932dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1933{
1934	timestruc_t t;
1935
1936	gethrestime(&t);
1937	mutex_enter(&dd->dd_lock);
1938	dd->dd_snap_cmtime = t;
1939	mutex_exit(&dd->dd_lock);
1940}
1941
1942void
1943dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
1944{
1945	objset_t *mos = dd->dd_pool->dp_meta_objset;
1946	dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
1947}
1948
1949boolean_t
1950dsl_dir_is_zapified(dsl_dir_t *dd)
1951{
1952	dmu_object_info_t doi;
1953
1954	dmu_object_info_from_db(dd->dd_dbuf, &doi);
1955	return (doi.doi_type == DMU_OTN_ZAP_METADATA);
1956}
1957