/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";

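/*
 * Write len bytes to the stream vnode, folding them into the running
 * fletcher-4 checksum first.  Every record header and payload passes
 * through here, which is why callers must supply 8-byte-aligned lengths.
 */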
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT0(len % 8);

	fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += len;
	mutex_exit(&ds->ds_sendstream_lock);

	return (dsp->dsa_err);
}

static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed.  This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object+offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If we are doing a non-incremental send, then there can't
	 * be any data in the dataset we're receiving into.  Therefore
	 * a free record would simply be a no-op.  Save space by not
	 * sending it to begin with.
	 */
	if (!dsp->dsa_incremental)
		return (0);

	if (length != -1ULL && offset + length < offset)
		length = -1ULL;

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records; DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation; push out the pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + blksz - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
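	/*
	 * Include the block's checksum and DDT key properties; a
	 * dedup-aware consumer of the stream can use these to replace
	 * duplicate blocks with DRR_WRITE_BYREF records referring back
	 * to this one.
	 */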
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/* See comment in dump_free(). */
	if (!dsp->dsa_incremental)
		return (0);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated; push out the pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));

	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
		return (SET_ERROR(EINTR));
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
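/*
 * BP_SPAN() is the number of bytes of object data covered by one block
 * pointer at the given indirection level.  For example, with 128K data
 * blocks (dn_datablkszsec == 256) and 16K indirect blocks
 * (dn_indblkshift == 14, so each indirect holds 128 blkptrs), a level-1
 * blkptr spans 256 << (9 + (14 - 7)) == 16M, i.e. 128 data blocks.
 */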

/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	dmu_sendarg_t *dsp = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (SET_ERROR(EINTR));

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		blk = abuf->b_data;
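		/*
		 * Each dnode block holds 1 << (DNODE_BLOCK_SHIFT -
		 * DNODE_SHIFT) dnodes (32, with the traditional 16K dnode
		 * blocks and 512-byte dnodes); dnobj below is the object
		 * number of the i'th dnode in this block.
		 */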
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(dsp, dnobj, blk+i);
			if (err != 0)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10c;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

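/*
 * A send stream consists of one DRR_BEGIN record, followed by the
 * DRR_OBJECT/DRR_FREEOBJECTS/DRR_WRITE/DRR_FREE/DRR_SPILL records
 * emitted by backup_cb() in increasing (object, offset) order, and a
 * final DRR_END record carrying the fletcher-4 checksum of everything
 * that precedes it.
 */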
/*
 * Releases dp, ds, and fromds, using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
    dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;

	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) {
		dsl_dataset_rele(fromds, tag);
		dsl_dataset_rele(ds, tag);
		dsl_pool_rele(dp, tag);
		return (SET_ERROR(EXDEV));
	}

	err = dmu_objset_from_ds(ds, &os);
	if (err != 0) {
		if (fromds != NULL)
			dsl_dataset_rele(fromds, tag);
		dsl_dataset_rele(ds, tag);
		dsl_pool_rele(dp, tag);
		return (err);
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			if (fromds != NULL)
				dsl_dataset_rele(fromds, tag);
			dsl_dataset_rele(ds, tag);
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			DMU_SET_FEATUREFLAGS(
			    drr->drr_u.drr_begin.drr_versioninfo,
			    DMU_BACKUP_FEATURE_SA_SPILL);
		}
	}
#endif

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds != NULL)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds != NULL) {
		fromtxg = fromds->ds_phys->ds_creation_txg;
		dsl_dataset_rele(fromds, tag);
		fromds = NULL;
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_incremental = (fromtxg != 0);

	mutex_enter(&ds->ds_sendstream_lock);
	list_insert_head(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, tag);

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, dsp);

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

out:
	mutex_enter(&ds->ds_sendstream_lock);
	list_remove(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(ds, FTAG);
	dsl_dataset_rele(ds, tag);

	return (err);
}

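/*
 * Like dmu_send(), but identifies the snapshots by object number
 * within the named pool instead of by dataset name.
 */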
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
	}

	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
}

int
dmu_send(const char *tosnap, const char *fromsnap,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	if (strchr(tosnap, '@') == NULL)
		return (SET_ERROR(EINVAL));
	if (fromsnap != NULL && strchr(fromsnap, '@') == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
	}
	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
}

int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size;

	ASSERT(dsl_pool_config_held(dp));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds))
		return (SET_ERROR(EXDEV));

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = ds->ds_phys->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation counteract compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
	if (err != 0)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);
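	/*
	 * For example, with recordsize=128K a 1 GiB estimate covers 8192
	 * blocks, so we subtract 8192 * sizeof (blkptr_t) (128 bytes
	 * each, 1 MiB total) for indirects and add one
	 * dmu_replay_record_t of stream overhead per block.
	 */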

	*sizep = size;

	return (0);
}

typedef struct dmu_recv_begin_arg {
	const char *drba_origin;
	dmu_recv_cookie_t *drba_cookie;
	cred_t *drba_cred;
	uint64_t drba_snapobj;
} dmu_recv_begin_arg_t;

static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	if (fromguid != 0) {
		dsl_dataset_t *snap;
		uint64_t obj = ds->ds_phys->ds_prev_snap_obj;

		/* Find snapshot in this dir that matches fromguid. */
		while (obj != 0) {
			error = dsl_dataset_hold_obj(dp, obj, FTAG,
			    &snap);
			if (error != 0)
				return (SET_ERROR(ENODEV));
			if (snap->ds_dir != ds->ds_dir) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ENODEV));
			}
			if (snap->ds_phys->ds_guid == fromguid)
				break;
			obj = snap->ds_phys->ds_prev_snap_obj;
			dsl_dataset_rele(snap, FTAG);
		}
		if (obj == 0)
			return (SET_ERROR(ENODEV));

		if (drba->drba_cookie->drc_force) {
			drba->drba_snapobj = obj;
		} else {
			/*
			 * If we are not forcing, there must be no
			 * changes since fromsnap.
			 */
			if (dsl_dataset_modified_since_snap(ds, snap)) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ETXTBSY));
			}
			drba->drba_snapobj = ds->ds_prev->ds_object;
		}

		dsl_dataset_rele(snap, FTAG);
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (SET_ERROR(ENODEV));
		drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj;
	}

	return (0);
}

static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
	    DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA) {
		return (SET_ERROR(ENOTSUP));
	}

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid);
		dsl_dataset_rele(ds, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[MAXNAMELEN];

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
			return (SET_ERROR(ENOENT));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
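		/*
		 * strlcpy() copies at most size - 1 bytes, so this copies
		 * only the portion of tofs before its last '/'.
		 */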
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
		if (error != 0)
			return (error);

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;
			error = dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			if (!dsl_dataset_is_snapshot(origin)) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
			if (origin->ds_phys->ds_guid != fromguid) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENODEV));
			}
			dsl_dataset_rele(origin, FTAG);
		}
		dsl_dataset_rele(ds, FTAG);
		error = 0;
	}
	return (error);
}

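/*
 * Create the dataset we will receive into: a temporary %recv clone of
 * the existing target fs, or a brand-new fs (possibly a clone) when the
 * target does not yet exist.  The dataset is owned with dmu_recv_tag
 * and marked DS_FLAG_INCONSISTENT until the receive completes.
 */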
static void
dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	const char *tofs = drba->drba_cookie->drc_tofs;
	dsl_dataset_t *ds, *newds;
	uint64_t dsobj;
	int error;
	uint64_t crflags;

	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
	    DS_FLAG_CI_DATASET : 0;

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* create temporary clone */
		dsl_dataset_t *snap = NULL;
		if (drba->drba_snapobj != 0) {
			VERIFY0(dsl_dataset_hold_obj(dp,
			    drba->drba_snapobj, FTAG, &snap));
		}
		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
		    snap, crflags, drba->drba_cred, tx);
		dsl_dataset_rele(snap, FTAG);
		dsl_dataset_rele(ds, FTAG);
	} else {
		dsl_dir_t *dd;
		const char *tail;
		dsl_dataset_t *origin = NULL;

		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));

		if (drba->drba_origin != NULL) {
			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin));
		}

		/* Create new dataset. */
		dsobj = dsl_dataset_create_sync(dd,
		    strrchr(tofs, '/') + 1,
		    origin, crflags, drba->drba_cred, tx);
		if (origin != NULL)
			dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(dd, FTAG);
		drba->drba_cookie->drc_newfs = B_TRUE;
	}
	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));

	dmu_buf_will_dirty(newds->ds_dbuf, tx);
	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
	}

	drba->drba_cookie->drc_ds = newds;

	spa_history_log_internal_ds(newds, "receive", tx, "");
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
{
	dmu_recv_begin_arg_t drba = { 0 };
	dmu_replay_record_t *drr;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_tofs = tofs;
	drc->drc_force = force;

	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		drc->drc_byteswap = B_TRUE;
	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
		return (SET_ERROR(EINVAL));

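	/*
	 * The stream checksum covers the BEGIN record as it appeared on
	 * the wire, so reconstruct that record and fold it into
	 * drc_cksum before byteswapping drrb in place below.
	 */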
	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin = *drc->drc_drrb;
	if (drc->drc_byteswap) {
		fletcher_4_incremental_byteswap(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	} else {
		fletcher_4_incremental_native(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	}
	kmem_free(drr, sizeof (dmu_replay_record_t));

	if (drc->drc_byteswap) {
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	drba.drba_origin = origin;
	drba.drba_cookie = drc;
	drba.drba_cred = CRED();

	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
	    &drba, 5));
}

struct restorearg {
	int err;
	boolean_t byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t *guid_to_ds_map;
};

typedef struct guid_map_entry {
	uint64_t	guid;
	dsl_dataset_t	*gme_ds;
	avl_node_t	avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_long_rele(gmep->gme_ds, gmep);
		dsl_dataset_rele(gmep->gme_ds, gmep);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

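/*
 * Read exactly len bytes from the stream into ra->buf, folding them
 * into the running fletcher-4 checksum.  Returns NULL (with ra->err
 * set) on a short read or I/O error.
 */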
static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT0(len % 8);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = SET_ERROR(EINVAL);
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err != 0)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}

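/*
 * Byteswap the fixed-width fields of a record header in place.  Record
 * payloads (bonus buffers, write data) are byteswapped separately, via
 * dmu_ot_byteswap[], once their DMU type is known.
 */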
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}

static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (SET_ERROR(EINVAL));
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (SET_ERROR(EINVAL));

	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err != 0)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err != 0) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err != 0) {
		return (SET_ERROR(EINVAL));
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (SET_ERROR(EINVAL));

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_long_object(os, obj);
		if (err != 0)
			return (err);
	}
	return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t	where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (SET_ERROR(EINVAL));

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (SET_ERROR(EINVAL));
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (SET_ERROR(EINVAL));
	} else {
		ref_os = os;
	}

	if ((err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) != 0)
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}

static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (SET_ERROR(EINVAL));

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}

/* used to destroy the drc_ds on error */
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
	char name[MAXNAMELEN];
	dsl_dataset_name(drc->drc_ds, name);
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	(void) dsl_destroy_head(name);
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	ra.byteswap = drc->drc_byteswap;
	ra.cksum = drc->drc_cksum;
	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));

	ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			ra.err = SET_ERROR(EBADF);
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err != 0) {
			cleanup_fd = -1;
			goto out;
		}

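		/*
		 * *action_handlep is 0 for the first substream of a
		 * compound stream: create the guid map and register an
		 * onexit callback to free it when cleanup_fd is closed.
		 * Later substreams pass the handle back in and reuse the
		 * same map.
		 */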
		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err != 0)
				goto out;
		} else {
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = SET_ERROR(EINTR);
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = SET_ERROR(ECKSUM);
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = SET_ERROR(EINVAL);
			goto out;
		}
		pcksum = ra.cksum;
	}
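	/*
	 * We get here only when a record handler or restore_read() has
	 * failed; a well-formed stream exits through the DRR_END case
	 * above, hence the assertion.
	 */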
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		dmu_recv_cleanup_ds(drc);
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}

static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds can not
			 * have any snaps of its own).
			 */
			uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
			while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					return (error);
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0) {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					return (error);
			}
		}
		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE);
	}
	return (error);
}

static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
			 */
			uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj;
			while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		VERIFY3P(drc->drc_ds->ds_prev, ==,
		    origin_head->ds_prev);

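		/*
		 * Swap the contents of the temporary %recv clone into the
		 * existing head dataset, snapshot the head as the newly
		 * received tosnap, then destroy the now-empty clone.
		 */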
		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		origin_head->ds_prev->ds_phys->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		origin_head->ds_prev->ds_phys->ds_guid =
		    drc->drc_drrb->drr_toguid;
		origin_head->ds_prev->ds_phys->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		ds->ds_prev->ds_phys->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
		ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
	}
	drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode,
	 * we can evict its bonus buffer.
	 */
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	drc->drc_ds = NULL;
}

static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
	if (err == 0) {
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		dsl_dataset_long_hold(snapds, gmep);
	} else {
		kmem_free(gmep, sizeof (*gmep));
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}

static int dmu_recv_end_modified_blocks = 3;

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	int error;
	char name[MAXNAMELEN];

#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks);

	if (error != 0)
		dmu_recv_cleanup_ds(drc);
	return (error);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	int error;

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs,
		    drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj);
	}
	return (error);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	drc->drc_owner = owner;

	if (drc->drc_newfs)
		return (dmu_recv_new_end(drc));
	else
		return (dmu_recv_existing_end(drc));
}

/*
 * Return TRUE if this objset is currently being received into.
 */
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	return (os->os_dsl_dataset != NULL &&
	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}