1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
24 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
25 * Copyright 2013 Saso Kiselkov. All rights reserved.
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa.h>
30#include <sys/spa_impl.h>
31#include <sys/zio.h>
32#include <sys/zio_checksum.h>
33#include <sys/zil.h>
34#include <sys/abd.h>
35#include <zfs_fletcher.h>
36
37/*
38 * Checksum vectors.
39 *
40 * In the SPA, everything is checksummed.  We support checksum vectors
41 * for three distinct reasons:
42 *
43 *   1. Different kinds of data need different levels of protection.
44 *	For SPA metadata, we always want a very strong checksum.
45 *	For user data, we let users make the trade-off between speed
46 *	and checksum strength.
47 *
48 *   2. Cryptographic hash and MAC algorithms are an area of active research.
49 *	It is likely that in future hash functions will be at least as strong
50 *	as current best-of-breed, and may be substantially faster as well.
51 *	We want the ability to take advantage of these new hashes as soon as
52 *	they become available.
53 *
54 *   3. If someone develops hardware that can compute a strong hash quickly,
55 *	we want the ability to take advantage of that hardware.
56 *
57 * Of course, we don't want a checksum upgrade to invalidate existing
58 * data, so we store the checksum *function* in eight bits of the bp.
59 * This gives us room for up to 256 different checksum functions.
60 *
61 * When writing a block, we always checksum it with the latest-and-greatest
62 * checksum function of the appropriate strength.  When reading a block,
63 * we compare the expected checksum against the actual checksum, which we
64 * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
65 *
66 * SALTED CHECKSUMS
67 *
68 * To enable the use of less secure hash algorithms with dedup, we
69 * introduce the notion of salted checksums (MACs, really).  A salted
70 * checksum is fed both a random 256-bit value (the salt) and the data
71 * to be checksummed.  This salt is kept secret (stored on the pool, but
72 * never shown to the user).  Thus even if an attacker knew of collision
73 * weaknesses in the hash algorithm, they won't be able to mount a known
74 * plaintext attack on the DDT, since the actual hash value cannot be
75 * known ahead of time.  How the salt is used is algorithm-specific
76 * (some might simply prefix it to the data block, others might need to
77 * utilize a full-blown HMAC).  On disk the salt is stored in a ZAP
78 * object in the MOS (DMU_POOL_CHECKSUM_SALT).
79 *
80 * CONTEXT TEMPLATES
81 *
82 * Some hashing algorithms need to perform a substantial amount of
83 * initialization work (e.g. salted checksums above may need to pre-hash
84 * the salt) before being able to process data.  Performing this
85 * redundant work for each block would be wasteful, so we instead allow
86 * a checksum algorithm to do the work once (the first time it's used)
87 * and then keep this pre-initialized context as a template inside the
88 * spa_t (spa_cksum_tmpls).  If the zio_checksum_info_t contains
89 * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
90 * construct and destruct the pre-initialized checksum context.  The
91 * pre-initialized context is then reused during each checksum
92 * invocation and passed to the checksum function.
93 */
94
95/*ARGSUSED*/
96static void
97abd_checksum_off(abd_t *abd, uint64_t size,
98    const void *ctx_template, zio_cksum_t *zcp)
99{
100	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
101}
102
103/*ARGSUSED*/
104void
105abd_fletcher_2_native(abd_t *abd, uint64_t size,
106    const void *ctx_template, zio_cksum_t *zcp)
107{
108	fletcher_init(zcp);
109	(void) abd_iterate_func(abd, 0, size,
110	    fletcher_2_incremental_native, zcp);
111}
112
113/*ARGSUSED*/
114void
115abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
116    const void *ctx_template, zio_cksum_t *zcp)
117{
118	fletcher_init(zcp);
119	(void) abd_iterate_func(abd, 0, size,
120	    fletcher_2_incremental_byteswap, zcp);
121}
122
123/*ARGSUSED*/
124void
125abd_fletcher_4_native(abd_t *abd, uint64_t size,
126    const void *ctx_template, zio_cksum_t *zcp)
127{
128	fletcher_init(zcp);
129	(void) abd_iterate_func(abd, 0, size,
130	    fletcher_4_incremental_native, zcp);
131}
132
133/*ARGSUSED*/
134void
135abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
136    const void *ctx_template, zio_cksum_t *zcp)
137{
138	fletcher_init(zcp);
139	(void) abd_iterate_func(abd, 0, size,
140	    fletcher_4_incremental_byteswap, zcp);
141}
142
143zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
144	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
145	{{NULL, NULL}, NULL, NULL, 0, "on"},
146	{{abd_checksum_off,		abd_checksum_off},
147	    NULL, NULL, 0, "off"},
148	{{abd_checksum_SHA256,		abd_checksum_SHA256},
149	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
150	    "label"},
151	{{abd_checksum_SHA256,		abd_checksum_SHA256},
152	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
153	    "gang_header"},
154	{{abd_fletcher_2_native,	abd_fletcher_2_byteswap},
155	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
156	{{abd_fletcher_2_native,	abd_fletcher_2_byteswap},
157	    NULL, NULL, 0, "fletcher2"},
158	{{abd_fletcher_4_native,	abd_fletcher_4_byteswap},
159	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
160	{{abd_checksum_SHA256,		abd_checksum_SHA256},
161	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
162	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
163	{{abd_fletcher_4_native,	abd_fletcher_4_byteswap},
164	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
165	{{abd_checksum_off,		abd_checksum_off},
166	    NULL, NULL, 0, "noparity"},
167	{{abd_checksum_SHA512_native,	abd_checksum_SHA512_byteswap},
168	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
169	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
170	{{abd_checksum_skein_native,	abd_checksum_skein_byteswap},
171	    abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
172	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
173	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
174	{{abd_checksum_edonr_native,	abd_checksum_edonr_byteswap},
175	    abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
176	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
177	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
178};
179
180/*
181 * The flag corresponding to the "verify" in dedup=[checksum,]verify
182 * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
183 */
184spa_feature_t
185zio_checksum_to_feature(enum zio_checksum cksum)
186{
187	VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
188
189	switch (cksum) {
190	case ZIO_CHECKSUM_SHA512:
191		return (SPA_FEATURE_SHA512);
192	case ZIO_CHECKSUM_SKEIN:
193		return (SPA_FEATURE_SKEIN);
194	case ZIO_CHECKSUM_EDONR:
195		return (SPA_FEATURE_EDONR);
196	}
197	return (SPA_FEATURE_NONE);
198}
199
200enum zio_checksum
201zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
202{
203	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
204	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
205	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
206
207	if (child == ZIO_CHECKSUM_INHERIT)
208		return (parent);
209
210	if (child == ZIO_CHECKSUM_ON)
211		return (ZIO_CHECKSUM_ON_VALUE);
212
213	return (child);
214}
215
216enum zio_checksum
217zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
218    enum zio_checksum parent)
219{
220	ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
221	ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
222	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
223
224	if (child == ZIO_CHECKSUM_INHERIT)
225		return (parent);
226
227	if (child == ZIO_CHECKSUM_ON)
228		return (spa_dedup_checksum(spa));
229
230	if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
231		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
232
233	ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
234	    ZCHECKSUM_FLAG_DEDUP) ||
235	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
236
237	return (child);
238}
239
240/*
241 * Set the external verifier for a gang block based on <vdev, offset, txg>,
242 * a tuple which is guaranteed to be unique for the life of the pool.
243 */
244static void
245zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
246{
247	const dva_t *dva = BP_IDENTITY(bp);
248	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
249
250	ASSERT(BP_IS_GANG(bp));
251
252	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
253}
254
255/*
256 * Set the external verifier for a label block based on its offset.
257 * The vdev is implicit, and the txg is unknowable at pool open time --
258 * hence the logic in vdev_uberblock_load() to find the most recent copy.
259 */
260static void
261zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
262{
263	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
264}
265
266/*
267 * Calls the template init function of a checksum which supports context
268 * templates and installs the template into the spa_t.
269 */
270static void
271zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
272{
273	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
274
275	if (ci->ci_tmpl_init == NULL)
276		return;
277	if (spa->spa_cksum_tmpls[checksum] != NULL)
278		return;
279
280	VERIFY(ci->ci_tmpl_free != NULL);
281	mutex_enter(&spa->spa_cksum_tmpls_lock);
282	if (spa->spa_cksum_tmpls[checksum] == NULL) {
283		spa->spa_cksum_tmpls[checksum] =
284		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
285		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
286	}
287	mutex_exit(&spa->spa_cksum_tmpls_lock);
288}
289
290/* convenience function to update a checksum to accomodate an encryption MAC */
291static void
292zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor)
293{
294	/*
295	 * Weak checksums do not have their entropy spread evenly
296	 * across the bits of the checksum. Therefore, when truncating
297	 * a weak checksum we XOR the first 2 words with the last 2 so
298	 * that we don't "lose" any entropy unnecessarily.
299	 */
300	if (xor) {
301		cksum->zc_word[0] ^= cksum->zc_word[2];
302		cksum->zc_word[1] ^= cksum->zc_word[3];
303	}
304
305	cksum->zc_word[2] = saved->zc_word[2];
306	cksum->zc_word[3] = saved->zc_word[3];
307}
308
309/*
310 * Generate the checksum.
311 */
312void
313zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
314    abd_t *abd, uint64_t size)
315{
316	static const uint64_t zec_magic = ZEC_MAGIC;
317	blkptr_t *bp = zio->io_bp;
318	uint64_t offset = zio->io_offset;
319	zio_checksum_info_t *ci;
320	zio_cksum_t cksum, saved;
321	spa_t *spa = zio->io_spa;
322	boolean_t insecure;
323
324	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
325	ci = &zio_checksum_table[checksum];
326	ASSERT(ci->ci_func[0] != NULL);
327	insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0;
328
329	zio_checksum_template_init(checksum, spa);
330
331	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
332		zio_eck_t eck;
333		size_t eck_offset;
334
335		bzero(&saved, sizeof (zio_cksum_t));
336
337		if (checksum == ZIO_CHECKSUM_ZILOG2) {
338			zil_chain_t zilc;
339			abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
340
341			size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ,
342			    uint64_t);
343			eck = zilc.zc_eck;
344			eck_offset = offsetof(zil_chain_t, zc_eck);
345		} else {
346			eck_offset = size - sizeof (zio_eck_t);
347			abd_copy_to_buf_off(&eck, abd, eck_offset,
348			    sizeof (zio_eck_t));
349		}
350
351		if (checksum == ZIO_CHECKSUM_GANG_HEADER) {
352			zio_checksum_gang_verifier(&eck.zec_cksum, bp);
353		} else if (checksum == ZIO_CHECKSUM_LABEL) {
354			zio_checksum_label_verifier(&eck.zec_cksum, offset);
355		} else {
356			saved = eck.zec_cksum;
357			eck.zec_cksum = bp->blk_cksum;
358		}
359
360		abd_copy_from_buf_off(abd, &zec_magic,
361		    eck_offset + offsetof(zio_eck_t, zec_magic),
362		    sizeof (zec_magic));
363		abd_copy_from_buf_off(abd, &eck.zec_cksum,
364		    eck_offset + offsetof(zio_eck_t, zec_cksum),
365		    sizeof (zio_cksum_t));
366
367		ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
368		    &cksum);
369		if (bp != NULL && BP_USES_CRYPT(bp) &&
370		    BP_GET_TYPE(bp) != DMU_OT_OBJSET)
371			zio_checksum_handle_crypt(&cksum, &saved, insecure);
372
373		abd_copy_from_buf_off(abd, &cksum,
374		    eck_offset + offsetof(zio_eck_t, zec_cksum),
375		    sizeof (zio_cksum_t));
376	} else {
377		saved = bp->blk_cksum;
378		ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
379		    &cksum);
380		if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
381			zio_checksum_handle_crypt(&cksum, &saved, insecure);
382		bp->blk_cksum = cksum;
383	}
384}
385
386int
387zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
388    enum zio_checksum checksum, abd_t *abd, uint64_t size,
389    uint64_t offset, zio_bad_cksum_t *info)
390{
391	zio_checksum_info_t *ci;
392	zio_cksum_t actual_cksum, expected_cksum;
393	zio_eck_t eck;
394	int byteswap;
395
396	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
397		return (SET_ERROR(EINVAL));
398
399	ci = &zio_checksum_table[checksum];
400
401	if (ci->ci_func[0] == NULL)
402		return (SET_ERROR(EINVAL));
403
404	zio_checksum_template_init(checksum, spa);
405
406	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
407		zio_cksum_t verifier;
408		size_t eck_offset;
409
410		if (checksum == ZIO_CHECKSUM_ZILOG2) {
411			zil_chain_t zilc;
412			uint64_t nused;
413
414			abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
415
416			eck = zilc.zc_eck;
417			eck_offset = offsetof(zil_chain_t, zc_eck) +
418			    offsetof(zio_eck_t, zec_cksum);
419
420			if (eck.zec_magic == ZEC_MAGIC) {
421				nused = zilc.zc_nused;
422			} else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) {
423				nused = BSWAP_64(zilc.zc_nused);
424			} else {
425				return (SET_ERROR(ECKSUM));
426			}
427
428			if (nused > size) {
429				return (SET_ERROR(ECKSUM));
430			}
431
432			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
433		} else {
434			eck_offset = size - sizeof (zio_eck_t);
435			abd_copy_to_buf_off(&eck, abd, eck_offset,
436			    sizeof (zio_eck_t));
437			eck_offset += offsetof(zio_eck_t, zec_cksum);
438		}
439
440		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
441			zio_checksum_gang_verifier(&verifier, bp);
442		else if (checksum == ZIO_CHECKSUM_LABEL)
443			zio_checksum_label_verifier(&verifier, offset);
444		else
445			verifier = bp->blk_cksum;
446
447		byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC));
448
449		if (byteswap)
450			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
451
452		expected_cksum = eck.zec_cksum;
453
454		abd_copy_from_buf_off(abd, &verifier, eck_offset,
455		    sizeof (zio_cksum_t));
456
457		ci->ci_func[byteswap](abd, size,
458		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
459
460		abd_copy_from_buf_off(abd, &expected_cksum, eck_offset,
461		    sizeof (zio_cksum_t));
462
463		if (byteswap) {
464			byteswap_uint64_array(&expected_cksum,
465			    sizeof (zio_cksum_t));
466		}
467	} else {
468		byteswap = BP_SHOULD_BYTESWAP(bp);
469		expected_cksum = bp->blk_cksum;
470		ci->ci_func[byteswap](abd, size,
471		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
472	}
473
474	/*
475	 * MAC checksums are a special case since half of this checksum will
476	 * actually be the encryption MAC. This will be verified by the
477	 * decryption process, so we just check the truncated checksum now.
478	 * Objset blocks use embedded MACs so we don't truncate the checksum
479	 * for them.
480	 */
481	if (bp != NULL && BP_USES_CRYPT(bp) &&
482	    BP_GET_TYPE(bp) != DMU_OT_OBJSET) {
483		if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) {
484			actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2];
485			actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3];
486		}
487
488		actual_cksum.zc_word[2] = 0;
489		actual_cksum.zc_word[3] = 0;
490		expected_cksum.zc_word[2] = 0;
491		expected_cksum.zc_word[3] = 0;
492	}
493
494	if (info != NULL) {
495		info->zbc_expected = expected_cksum;
496		info->zbc_actual = actual_cksum;
497		info->zbc_checksum_name = ci->ci_name;
498		info->zbc_byteswapped = byteswap;
499		info->zbc_injected = 0;
500		info->zbc_has_cksum = 1;
501	}
502	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
503		return (SET_ERROR(ECKSUM));
504
505	return (0);
506}
507
508int
509zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
510{
511	blkptr_t *bp = zio->io_bp;
512	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
513	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
514	int error;
515	uint64_t size = (bp == NULL ? zio->io_size :
516	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
517	uint64_t offset = zio->io_offset;
518	abd_t *data = zio->io_abd;
519	spa_t *spa = zio->io_spa;
520
521	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
522	    offset, info);
523
524	if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
525		error = zio_handle_fault_injection(zio, ECKSUM);
526		if (error != 0)
527			info->zbc_injected = 1;
528	}
529
530	return (error);
531}
532
533/*
534 * Called by a spa_t that's about to be deallocated. This steps through
535 * all of the checksum context templates and deallocates any that were
536 * initialized using the algorithm-specific template init function.
537 */
538void
539zio_checksum_templates_free(spa_t *spa)
540{
541	for (enum zio_checksum checksum = 0;
542	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
543		if (spa->spa_cksum_tmpls[checksum] != NULL) {
544			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
545
546			VERIFY(ci->ci_tmpl_free != NULL);
547			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
548			spa->spa_cksum_tmpls[checksum] = NULL;
549		}
550	}
551}
552