xref: /illumos-gate/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c (revision ece0bc84)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 #include <lz4.h>
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 #define	ECKSUM	666
32 
33 #define	ASSERT3S(x, y, z)	((void)0)
34 #define	ASSERT3U(x, y, z)	((void)0)
35 #define	ASSERT3P(x, y, z)	((void)0)
36 #define	ASSERT0(x)		((void)0)
37 #define	ASSERT(x)		((void)0)
38 
39 #define	kmem_alloc(size, flag)	zfs_alloc((size))
40 #define	kmem_free(ptr, size)	zfs_free((ptr), (size))
41 
42 static void
43 zfs_init_crc(void)
44 {
45 	int i, j;
46 	uint64_t *ct;
47 
48 	/*
49 	 * Calculate the crc64 table (used for the zap hash
50 	 * function).
51 	 */
52 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
53 		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
54 		for (i = 0; i < 256; i++) {
55 			ct = zfs_crc64_table + i;
56 			for (*ct = i, j = 8; j > 0; j--)
57 				*ct = (*ct >> 1) ^
58 				    (-(*ct & 1) & ZFS_CRC64_POLY);
59 		}
60 	}
61 }
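
/*
 * For reference, one step of the reflected CRC-64 defined by this table,
 * as zap_hash() below applies it to each input byte -- a sketch, not a
 * routine the loader actually carries:
 *
 *	uint64_t
 *	crc64_step(uint64_t crc, uint8_t b)
 *	{
 *		return ((crc >> 8) ^ zfs_crc64_table[(crc ^ b) & 0xFF]);
 *	}
 *
 * In the generator above, -(*ct & 1) is all-ones exactly when the low bit
 * is set, so ZFS_CRC64_POLY is XORed in only when a 1 bit shifts out.
 */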
62 
63 static void
64 zio_checksum_off(const void *buf __unused, uint64_t size __unused,
65     const void *ctx_template __unused, zio_cksum_t *zcp)
66 {
67 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
68 }
69 
70 /*
71  * Signature for checksum functions.
72  */
73 typedef void zio_checksum_t(const void *data, uint64_t size,
74     const void *ctx_template, zio_cksum_t *zcp);
75 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
76 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
77 
78 typedef enum zio_checksum_flags {
79 	/* Strong enough for metadata? */
80 	ZCHECKSUM_FLAG_METADATA = (1 << 1),
81 	/* ZIO embedded checksum */
82 	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
83 	/* Strong enough for dedup (without verification)? */
84 	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
85 	/* Uses salt value */
86 	ZCHECKSUM_FLAG_SALTED = (1 << 4),
87 	/* Strong enough for nopwrite? */
88 	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
89 } zio_checksum_flags_t;
90 
91 /*
92  * Information about each checksum function.
93  */
94 typedef struct zio_checksum_info {
95 	/* checksum function for each byteorder */
96 	zio_checksum_t			*ci_func[2];
97 	zio_checksum_tmpl_init_t	*ci_tmpl_init;
98 	zio_checksum_tmpl_free_t	*ci_tmpl_free;
99 	zio_checksum_flags_t		ci_flags;
100 	const char			*ci_name;	/* descriptive name */
101 } zio_checksum_info_t;
102 
103 #include "blkptr.c"
104 
105 #include "fletcher.c"
106 #include "sha256.c"
107 #include "skein_zfs.c"
108 #include "edonr_zfs.c"
109 
110 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
111 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
112 	{{NULL, NULL}, NULL, NULL, 0, "on"},
113 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
114 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
115 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
116 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
117 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
118 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
119 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
120 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
121 	    0, "fletcher2"},
122 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
123 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
124 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
125 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
126 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
127 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
128 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
129 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
130 	    0, "noparity"},
131 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
132 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
133 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
135 	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
136 	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
137 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
138 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
139 	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
140 	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
141 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
142 	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
143 };
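
/*
 * A consumer indexes this table twice: the block pointer selects the
 * algorithm, and the block's byte order selects the function. A minimal
 * sketch of the dispatch (zio_checksum_verify() below does this in full):
 *
 *	zio_checksum_info_t *ci = &zio_checksum_table[BP_GET_CHECKSUM(bp)];
 *	ci->ci_func[BP_SHOULD_BYTESWAP(bp)](data, size, ctx, &cksum);
 */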
144 
145 /*
146  * Common signature for all zio compress/decompress functions.
147  */
148 typedef size_t zio_compress_func_t(void *src, void *dst,
149     size_t s_len, size_t d_len, int);
150 typedef int zio_decompress_func_t(void *src, void *dst,
151     size_t s_len, size_t d_len, int);
152 
153 extern int gzip_decompress(void *src, void *dst,
154     size_t s_len, size_t d_len, int);
155 /*
156  * Information about each compression function.
157  */
158 typedef struct zio_compress_info {
159 	zio_compress_func_t	*ci_compress;	/* compression function */
160 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
161 	int			ci_level;	/* level parameter */
162 	const char		*ci_name;	/* algorithm name */
163 } zio_compress_info_t;
164 
165 #include "lzjb.c"
166 #include "zle.c"
167 
168 /*
169  * Compression vectors.
170  */
171 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
172 	{NULL,			NULL,			0,	"inherit"},
173 	{NULL,			NULL,			0,	"on"},
174 	{NULL,			NULL,			0,	"uncompressed"},
175 	{NULL,			lzjb_decompress,	0,	"lzjb"},
176 	{NULL,			NULL,			0,	"empty"},
177 	{NULL,			gzip_decompress,	1,	"gzip-1"},
178 	{NULL,			gzip_decompress,	2,	"gzip-2"},
179 	{NULL,			gzip_decompress,	3,	"gzip-3"},
180 	{NULL,			gzip_decompress,	4,	"gzip-4"},
181 	{NULL,			gzip_decompress,	5,	"gzip-5"},
182 	{NULL,			gzip_decompress,	6,	"gzip-6"},
183 	{NULL,			gzip_decompress,	7,	"gzip-7"},
184 	{NULL,			gzip_decompress,	8,	"gzip-8"},
185 	{NULL,			gzip_decompress,	9,	"gzip-9"},
186 	{NULL,			zle_decompress,		64,	"zle"},
187 	{NULL,			lz4_decompress,		0,	"lz4"},
188 };
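
/*
 * Decompression dispatch is likewise by table index; a sketch for, e.g.,
 * an lz4-compressed block (zio_decompress_data() below does this after
 * validating the index):
 *
 *	zio_compress_info_t *ci = &zio_compress_table[ZIO_COMPRESS_LZ4];
 *	error = ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
 */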
189 
190 static void
191 byteswap_uint64_array(void *vbuf, size_t size)
192 {
193 	uint64_t *buf = vbuf;
194 	size_t count = size >> 3;
195 	size_t i;
196 
197 	ASSERT((size & 7) == 0);
198 
199 	for (i = 0; i < count; i++)
200 		buf[i] = BSWAP_64(buf[i]);
201 }
202 
203 /*
204  * Set the external verifier for a gang block based on <vdev, offset, txg>,
205  * a tuple which is guaranteed to be unique for the life of the pool.
206  */
207 static void
208 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
209 {
210 	const dva_t *dva = BP_IDENTITY(bp);
211 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
212 
213 	ASSERT(BP_IS_GANG(bp));
214 
215 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
216 }
217 
218 /*
219  * Set the external verifier for a label block based on its offset.
220  * The vdev is implicit, and the txg is unknowable at pool open time --
221  * hence the logic in vdev_uberblock_load() to find the most recent copy.
222  */
223 static void
224 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
225 {
226 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
227 }
228 
229 /*
230  * Calls the template init function of a checksum which supports context
231  * templates and installs the template into the spa_t.
232  */
233 static void
234 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
235 {
236 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
237 
238 	if (ci->ci_tmpl_init == NULL)
239 		return;
240 
241 	if (spa->spa_cksum_tmpls[checksum] != NULL)
242 		return;
243 
244 	spa->spa_cksum_tmpls[checksum] =
245 	    ci->ci_tmpl_init(&spa->spa_cksum_salt);
248 }
249 
250 /*
251  * Called by a spa_t that's about to be deallocated. This steps through
252  * all of the checksum context templates and deallocates any that were
253  * initialized using the algorithm-specific template init function.
254  */
255 void
256 zio_checksum_templates_free(spa_t *spa)
257 {
258 	for (enum zio_checksum checksum = 0;
259 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
260 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
261 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
262 
263 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
264 			spa->spa_cksum_tmpls[checksum] = NULL;
265 		}
266 	}
267 }
268 
269 static int
270 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
271 {
272 	uint64_t size;
273 	unsigned int checksum;
274 	zio_checksum_info_t *ci;
275 	void *ctx = NULL;
276 	zio_cksum_t actual_cksum, expected_cksum, verifier;
277 	int byteswap;
278 
279 	checksum = BP_GET_CHECKSUM(bp);
280 	size = BP_GET_PSIZE(bp);
281 
282 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
283 		return (EINVAL);
284 	ci = &zio_checksum_table[checksum];
285 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
286 		return (EINVAL);
287 
288 	if (spa != NULL) {
289 		zio_checksum_template_init(checksum, (spa_t *)spa);
290 		ctx = spa->spa_cksum_tmpls[checksum];
291 	}
292 
293 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
294 		zio_eck_t *eck;
295 
296 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
297 		    checksum == ZIO_CHECKSUM_LABEL);
298 
299 		eck = (zio_eck_t *)((char *)data + size) - 1;
300 
301 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
302 			zio_checksum_gang_verifier(&verifier, bp);
303 		else if (checksum == ZIO_CHECKSUM_LABEL)
304 			zio_checksum_label_verifier(&verifier,
305 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
306 		else
307 			verifier = bp->blk_cksum;
308 
309 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
310 
311 		if (byteswap)
312 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
313 
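		/*
		 * The checksum is embedded in the block itself, so compute
		 * it with the verifier temporarily substituted for the
		 * stored checksum, then restore the stored value below.
		 */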
314 		expected_cksum = eck->zec_cksum;
315 		eck->zec_cksum = verifier;
316 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
317 		eck->zec_cksum = expected_cksum;
318 
319 		if (byteswap)
320 			byteswap_uint64_array(&expected_cksum,
321 			    sizeof (zio_cksum_t));
322 	} else {
323 		byteswap = BP_SHOULD_BYTESWAP(bp);
324 		expected_cksum = bp->blk_cksum;
325 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
326 	}
327 
328 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
329 		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
330 		return (EIO);
331 	}
332 
333 	return (0);
334 }
335 
336 static int
337 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
338     void *dest, uint64_t destsize)
339 {
340 	zio_compress_info_t *ci;
341 
342 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
343 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
344 		return (EIO);
345 	}
346 
347 	ci = &zio_compress_table[cpfunc];
348 	if (!ci->ci_decompress) {
349 		printf("ZFS: unsupported compression algorithm %s\n",
350 		    ci->ci_name);
351 		return (EIO);
352 	}
353 
354 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
355 }
356 
357 static uint64_t
358 zap_hash(uint64_t salt, const char *name)
359 {
360 	const uint8_t *cp;
361 	uint8_t c;
362 	uint64_t crc = salt;
363 
364 	ASSERT(crc != 0);
365 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
366 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
367 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
368 
369 	/*
370 	 * Only use 28 bits, since we need 4 bits in the cookie for the
371 	 * collision differentiator.  We MUST use the high bits, since
372 	 * those are the ones that we first pay attention to when
373 	 * choosing the bucket.
374 	 */
375 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
376 
377 	return (crc);
378 }
379 
380 static void *zfs_alloc(size_t size);
381 static void zfs_free(void *ptr, size_t size);
382 
383 typedef struct raidz_col {
384 	uint64_t rc_devidx;		/* child device index for I/O */
385 	uint64_t rc_offset;		/* device offset */
386 	uint64_t rc_size;		/* I/O size */
387 	void *rc_data;			/* I/O data */
388 	int rc_error;			/* I/O error for this device */
389 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
390 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
391 } raidz_col_t;
392 
393 typedef struct raidz_map {
394 	uint64_t rm_cols;		/* Regular column count */
395 	uint64_t rm_scols;		/* Count including skipped columns */
396 	uint64_t rm_bigcols;		/* Number of oversized columns */
397 	uint64_t rm_asize;		/* Actual total I/O size */
398 	uint64_t rm_missingdata;	/* Count of missing data devices */
399 	uint64_t rm_missingparity;	/* Count of missing parity devices */
400 	uint64_t rm_firstdatacol;	/* First data column/parity count */
401 	uint64_t rm_nskip;		/* Skipped sectors for padding */
402 	uint64_t rm_skipstart;		/* Column index of padding start */
403 	uintptr_t rm_reports;		/* # of referencing checksum reports */
404 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
405 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
406 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
407 } raidz_map_t;
408 
409 #define	VDEV_RAIDZ_P		0
410 #define	VDEV_RAIDZ_Q		1
411 #define	VDEV_RAIDZ_R		2
412 
413 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
414 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
415 
416 /*
417  * We provide a mechanism to perform the field multiplication operation on a
418  * 64-bit value all at once rather than a byte at a time. This works by
419  * creating a mask from the top bit in each byte and using that to
420  * conditionally apply the XOR of 0x1d.
421  */
422 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
423 { \
424 	(mask) = (x) & 0x8080808080808080ULL; \
425 	(mask) = ((mask) << 1) - ((mask) >> 7); \
426 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
427 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
428 }
429 
430 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
431 { \
432 	VDEV_RAIDZ_64MUL_2((x), mask); \
433 	VDEV_RAIDZ_64MUL_2((x), mask); \
434 }
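
/*
 * A worked example for one byte lane: multiplying 0x80 by 2 must reduce by
 * the polynomial, since the top bit is set. With x = 0x80, mask = 0x80, so
 * ((mask) << 1) - ((mask) >> 7) = 0x100 - 0x01 = 0xff fills the lane; the
 * shifted value ((x) << 1) & 0xfe = 0x00 is then XORed with 0xff & 0x1d,
 * giving 0x1d -- the same byte VDEV_RAIDZ_MUL_2(0x80) yields once truncated
 * to 8 bits.
 */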
435 
436 /*
437  * These two tables represent powers and logs of 2 in the Galois field defined
438  * above. These values were computed by repeatedly multiplying by 2 as above.
439  */
440 static const uint8_t vdev_raidz_pow2[256] = {
441 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
442 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
443 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
444 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
445 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
446 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
447 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
448 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
449 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
450 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
451 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
452 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
453 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
454 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
455 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
456 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
457 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
458 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
459 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
460 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
461 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
462 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
463 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
464 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
465 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
466 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
467 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
468 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
469 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
470 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
471 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
472 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
473 };
474 static const uint8_t vdev_raidz_log2[256] = {
475 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
476 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
477 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
478 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
479 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
480 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
481 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
482 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
483 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
484 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
485 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
486 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
487 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
488 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
489 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
490 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
491 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
492 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
493 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
494 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
495 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
496 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
497 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
498 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
499 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
500 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
501 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
502 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
503 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
504 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
505 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
506 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
507 };
508 
509 /*
510  * Multiply a given number by 2 raised to the given power.
511  */
512 static uint8_t
513 vdev_raidz_exp2(uint8_t a, int exp)
514 {
515 	if (a == 0)
516 		return (0);
517 
518 	ASSERT(exp >= 0);
519 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
520 
521 	exp += vdev_raidz_log2[a];
522 	if (exp > 255)
523 		exp -= 255;
524 
525 	return (vdev_raidz_pow2[exp]);
526 }
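
/*
 * For example, vdev_raidz_exp2(0x02, 2) evaluates
 * vdev_raidz_pow2[vdev_raidz_log2[0x02] + 2] = vdev_raidz_pow2[3] = 0x08,
 * i.e. 2 * 2^2 = 8 in the field.
 */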
527 
528 static void
529 vdev_raidz_generate_parity_p(raidz_map_t *rm)
530 {
531 	uint64_t *p, *src, pcount __unused, ccount, i;
532 	int c;
533 
534 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
535 
536 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
537 		src = rm->rm_col[c].rc_data;
538 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
539 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
540 
541 		if (c == rm->rm_firstdatacol) {
542 			ASSERT(ccount == pcount);
543 			for (i = 0; i < ccount; i++, src++, p++) {
544 				*p = *src;
545 			}
546 		} else {
547 			ASSERT(ccount <= pcount);
548 			for (i = 0; i < ccount; i++, src++, p++) {
549 				*p ^= *src;
550 			}
551 		}
552 	}
553 }
554 
555 static void
556 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
557 {
558 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
559 	int c;
560 
561 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
562 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
563 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
564 
565 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
566 		src = rm->rm_col[c].rc_data;
567 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
568 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
569 
570 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
571 
572 		if (c == rm->rm_firstdatacol) {
573 			ASSERT(ccnt == pcnt || ccnt == 0);
574 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
575 				*p = *src;
576 				*q = *src;
577 			}
578 			for (; i < pcnt; i++, src++, p++, q++) {
579 				*p = 0;
580 				*q = 0;
581 			}
582 		} else {
583 			ASSERT(ccnt <= pcnt);
584 
585 			/*
586 			 * Apply the algorithm described above by multiplying
587 			 * the previous result and adding in the new value.
588 			 */
589 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
590 				*p ^= *src;
591 
592 				VDEV_RAIDZ_64MUL_2(*q, mask);
593 				*q ^= *src;
594 			}
595 
596 			/*
597 			 * Treat short columns as though they are full of 0s.
598 			 * Note that there's therefore nothing needed for P.
599 			 */
600 			for (; i < pcnt; i++, q++) {
601 				VDEV_RAIDZ_64MUL_2(*q, mask);
602 			}
603 		}
604 	}
605 }
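
/*
 * Note that the loop above evaluates Q by Horner's rule: every pass doubles
 * the accumulated value in the field and XORs in the next data column, so
 * the first data column ends up scaled by the highest power of 2 -- the
 * (2^n-1 .. 4 2 1) row of the dispersal matrix described below.
 */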
606 
607 static void
608 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
609 {
610 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
611 	int c;
612 
613 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
614 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
615 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
616 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
617 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
618 
619 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
620 		src = rm->rm_col[c].rc_data;
621 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
622 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
623 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
624 
625 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
626 
627 		if (c == rm->rm_firstdatacol) {
628 			ASSERT(ccnt == pcnt || ccnt == 0);
629 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
630 				*p = *src;
631 				*q = *src;
632 				*r = *src;
633 			}
634 			for (; i < pcnt; i++, src++, p++, q++, r++) {
635 				*p = 0;
636 				*q = 0;
637 				*r = 0;
638 			}
639 		} else {
640 			ASSERT(ccnt <= pcnt);
641 
642 			/*
643 			 * Apply the algorithm described above by multiplying
644 			 * the previous result and adding in the new value.
645 			 */
646 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
647 				*p ^= *src;
648 
649 				VDEV_RAIDZ_64MUL_2(*q, mask);
650 				*q ^= *src;
651 
652 				VDEV_RAIDZ_64MUL_4(*r, mask);
653 				*r ^= *src;
654 			}
655 
656 			/*
657 			 * Treat short columns as though they are full of 0s.
658 			 * Note that there's therefore nothing needed for P.
659 			 */
660 			for (; i < pcnt; i++, q++, r++) {
661 				VDEV_RAIDZ_64MUL_2(*q, mask);
662 				VDEV_RAIDZ_64MUL_4(*r, mask);
663 			}
664 		}
665 	}
666 }
667 
668 /*
669  * Generate RAID parity in the first virtual columns according to the number of
670  * parity columns available.
671  */
672 static void
673 vdev_raidz_generate_parity(raidz_map_t *rm)
674 {
675 	switch (rm->rm_firstdatacol) {
676 	case 1:
677 		vdev_raidz_generate_parity_p(rm);
678 		break;
679 	case 2:
680 		vdev_raidz_generate_parity_pq(rm);
681 		break;
682 	case 3:
683 		vdev_raidz_generate_parity_pqr(rm);
684 		break;
685 	default:
686 		panic("invalid RAID-Z configuration");
687 	}
688 }
689 
690 /* BEGIN CSTYLED */
691 /*
692  * In the general case of reconstruction, we must solve the system of linear
693  * equations defined by the coefficients used to generate parity as well as
694  * the contents of the data and parity disks. This can be expressed with
695  * vectors for the original data (D) and the actual data (d) and parity (p)
696  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
697  *
698  *            __   __                     __     __
699  *            |     |         __     __   |  p_0  |
700  *            |  V  |         |  D_0  |   | p_m-1 |
701  *            |     |    x    |   :   | = |  d_0  |
702  *            |  I  |         | D_n-1 |   |   :   |
703  *            |     |         ~~     ~~   | d_n-1 |
704  *            ~~   ~~                     ~~     ~~
705  *
706  * I is simply a square identity matrix of size n, and V is a Vandermonde
707  * matrix defined by the coefficients we chose for the various parity columns
708  * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
709  * computation, and for linear separability.
710  *
711  *      __               __               __     __
712  *      |   1   ..  1 1 1 |               |  p_0  |
713  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
714  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
715  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
716  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
717  *      |   :       : : : |   |   :   |   |  d_2  |
718  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
719  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
720  *      |   0   ..  0 0 1 |               | d_n-1 |
721  *      ~~               ~~               ~~     ~~
722  *
723  * Note that I, V, d, and p are known. To compute D, we must invert the
724  * matrix and use the known data and parity values to reconstruct the unknown
725  * data values. We begin by removing the rows in V|I and d|p that correspond
726  * to failed or missing columns; we then make V|I square (n x n) and d|p
727  * sized n by removing rows corresponding to unused parity from the bottom up
728  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
729  * using Gauss-Jordan elimination. In the example below we use m=3 parity
730  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
731  *           __                               __
732  *           |  1   1   1   1   1   1   1   1  |
733  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
734  *           |  19 205 116  29  64  16  4   1  |      / /
735  *           |  1   0   0   0   0   0   0   0  |     / /
736  *           |  0   1   0   0   0   0   0   0  | <--' /
737  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
738  *           |  0   0   0   1   0   0   0   0  |
739  *           |  0   0   0   0   1   0   0   0  |
740  *           |  0   0   0   0   0   1   0   0  |
741  *           |  0   0   0   0   0   0   1   0  |
742  *           |  0   0   0   0   0   0   0   1  |
743  *           ~~                               ~~
744  *           __                               __
745  *           |  1   1   1   1   1   1   1   1  |
746  *           |  19 205 116  29  64  16  4   1  |
747  *           |  1   0   0   0   0   0   0   0  |
748  *           |  0   0   0   1   0   0   0   0  |
749  *  (V|I)' = |  0   0   0   0   1   0   0   0  |
750  *           |  0   0   0   0   0   1   0   0  |
751  *           |  0   0   0   0   0   0   1   0  |
752  *           |  0   0   0   0   0   0   0   1  |
753  *           ~~                               ~~
757  *
758  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
759  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
760  * matrix is not singular.
761  * __                                                                 __
762  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
763  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
764  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
765  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
766  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
767  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
768  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
769  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
770  * ~~                                                                 ~~
771  * __                                                                 __
772  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
773  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
774  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
775  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
776  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
777  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
778  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
779  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
780  * ~~                                                                 ~~
781  * __                                                                 __
782  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
783  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
784  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
785  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
786  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
787  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
788  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
789  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
790  * ~~                                                                 ~~
791  * __                                                                 __
792  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
793  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
794  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
795  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
796  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
797  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
798  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
799  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
800  * ~~                                                                 ~~
801  * __                                                                 __
802  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
803  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
804  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
805  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
806  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
807  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
808  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
809  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
810  * ~~                                                                 ~~
811  * __                                                                 __
812  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
813  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
814  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
815  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
816  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
817  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
818  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
819  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
820  * ~~                                                                 ~~
821  *                   __                               __
822  *                   |  0   0   1   0   0   0   0   0  |
823  *                   | 167 100  5   41 159 169 217 208 |
824  *                   | 166 100  4   40 158 168 216 209 |
825  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
826  *                   |  0   0   0   0   1   0   0   0  |
827  *                   |  0   0   0   0   0   1   0   0  |
828  *                   |  0   0   0   0   0   0   1   0  |
829  *                   |  0   0   0   0   0   0   0   1  |
830  *                   ~~                               ~~
831  *
832  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
833  * of the missing data.
834  *
835  * As is apparent from the example above, the only non-trivial rows in the
836  * inverse matrix correspond to the data disks that we're trying to
837  * reconstruct. Indeed, those are the only rows we need as the others would
838  * only be useful for reconstructing data known or assumed to be valid. For
839  * that reason, we only build the coefficients in the rows that correspond to
840  * targeted columns.
841  */
842 /* END CSTYLED */
843 
844 static void
845 vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
846     uint8_t **rows)
847 {
848 	int i, j;
849 	int pow;
850 
851 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
852 
853 	/*
854 	 * Fill in the missing rows of interest.
855 	 */
856 	for (i = 0; i < nmap; i++) {
857 		ASSERT3S(0, <=, map[i]);
858 		ASSERT3S(map[i], <=, 2);
859 
860 		pow = map[i] * n;
861 		if (pow > 255)
862 			pow -= 255;
863 		ASSERT(pow <= 255);
864 
865 		for (j = 0; j < n; j++) {
866 			pow -= map[i];
867 			if (pow < 0)
868 				pow += 255;
869 			rows[i][j] = vdev_raidz_pow2[pow];
870 		}
871 	}
872 }
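
/*
 * For parity index p = map[i], the loop above yields
 * rows[i][j] = vdev_raidz_pow2[p * (n - 1 - j)]: all 1s for P, the powers
 * 2^(n-1) .. 2, 1 for Q, and 4^(n-1) .. 16, 4, 1 for R -- exactly the rows
 * of the dispersal matrix shown above.
 */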
873 
874 static void
875 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
876     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
877 {
878 	int i, j, ii, jj;
879 	uint8_t log;
880 
881 	/*
882 	 * Assert that the first nmissing entries from the array of used
883 	 * columns correspond to parity columns and that subsequent entries
884 	 * correspond to data columns.
885 	 */
886 	for (i = 0; i < nmissing; i++) {
887 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
888 	}
889 	for (; i < n; i++) {
890 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
891 	}
892 
893 	/*
894 	 * First initialize the storage where we'll compute the inverse rows.
895 	 */
896 	for (i = 0; i < nmissing; i++) {
897 		for (j = 0; j < n; j++) {
898 			invrows[i][j] = (i == j) ? 1 : 0;
899 		}
900 	}
901 
902 	/*
903 	 * Subtract all trivial rows from the rows of consequence.
904 	 */
905 	for (i = 0; i < nmissing; i++) {
906 		for (j = nmissing; j < n; j++) {
907 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
908 			jj = used[j] - rm->rm_firstdatacol;
909 			ASSERT3S(jj, <, n);
910 			invrows[i][j] = rows[i][jj];
911 			rows[i][jj] = 0;
912 		}
913 	}
914 
915 	/*
916 	 * For each of the rows of interest, we must normalize it and subtract
917 	 * a multiple of it from the other rows.
918 	 */
919 	for (i = 0; i < nmissing; i++) {
920 		for (j = 0; j < missing[i]; j++) {
921 			ASSERT3U(rows[i][j], ==, 0);
922 		}
923 		ASSERT3U(rows[i][missing[i]], !=, 0);
924 
925 		/*
926 		 * Compute the inverse of the first element and multiply each
927 		 * element in the row by that value.
928 		 */
929 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
930 
931 		for (j = 0; j < n; j++) {
932 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
933 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
934 		}
935 
936 		for (ii = 0; ii < nmissing; ii++) {
937 			if (i == ii)
938 				continue;
939 
940 			ASSERT3U(rows[ii][missing[i]], !=, 0);
941 
942 			log = vdev_raidz_log2[rows[ii][missing[i]]];
943 
944 			for (j = 0; j < n; j++) {
945 				rows[ii][j] ^=
946 				    vdev_raidz_exp2(rows[i][j], log);
947 				invrows[ii][j] ^=
948 				    vdev_raidz_exp2(invrows[i][j], log);
949 			}
950 		}
951 	}
952 
953 	/*
954 	 * Verify that the data that is left in the rows is properly part of
955 	 * an identity matrix.
956 	 */
957 	for (i = 0; i < nmissing; i++) {
958 		for (j = 0; j < n; j++) {
959 			if (j == missing[i]) {
960 				ASSERT3U(rows[i][j], ==, 1);
961 			} else {
962 				ASSERT3U(rows[i][j], ==, 0);
963 			}
964 		}
965 	}
966 }
967 
968 static void
969 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
970     int *missing, uint8_t **invrows, const uint8_t *used)
971 {
972 	int i, j, x, cc, c;
973 	uint8_t *src;
974 	uint64_t ccount;
975 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
976 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
977 	uint8_t log, val;
978 	int ll;
979 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
980 	uint8_t *p, *pp;
981 	size_t psize;
982 
983 	log = 0;	/* gcc */
984 	psize = sizeof (invlog[0][0]) * n * nmissing;
985 	p = zfs_alloc(psize);
986 
987 	for (pp = p, i = 0; i < nmissing; i++) {
988 		invlog[i] = pp;
989 		pp += n;
990 	}
991 
992 	for (i = 0; i < nmissing; i++) {
993 		for (j = 0; j < n; j++) {
994 			ASSERT3U(invrows[i][j], !=, 0);
995 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
996 		}
997 	}
998 
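	/*
	 * Walk each surviving column once; for every missing column, the
	 * field product invrows[cc][i] * src[x] is accumulated into the
	 * target buffer, with the multiply done in the log domain via the
	 * pow2/log2 tables above.
	 */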
999 	for (i = 0; i < n; i++) {
1000 		c = used[i];
1001 		ASSERT3U(c, <, rm->rm_cols);
1002 
1003 		src = rm->rm_col[c].rc_data;
1004 		ccount = rm->rm_col[c].rc_size;
1005 		for (j = 0; j < nmissing; j++) {
1006 			cc = missing[j] + rm->rm_firstdatacol;
1007 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1008 			ASSERT3U(cc, <, rm->rm_cols);
1009 			ASSERT3U(cc, !=, c);
1010 
1011 			dst[j] = rm->rm_col[cc].rc_data;
1012 			dcount[j] = rm->rm_col[cc].rc_size;
1013 		}
1014 
1015 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1016 
1017 		for (x = 0; x < ccount; x++, src++) {
1018 			if (*src != 0)
1019 				log = vdev_raidz_log2[*src];
1020 
1021 			for (cc = 0; cc < nmissing; cc++) {
1022 				if (x >= dcount[cc])
1023 					continue;
1024 
1025 				if (*src == 0) {
1026 					val = 0;
1027 				} else {
1028 					if ((ll = log + invlog[cc][i]) >= 255)
1029 						ll -= 255;
1030 					val = vdev_raidz_pow2[ll];
1031 				}
1032 
1033 				if (i == 0)
1034 					dst[cc][x] = val;
1035 				else
1036 					dst[cc][x] ^= val;
1037 			}
1038 		}
1039 	}
1040 
1041 	zfs_free(p, psize);
1042 }
1043 
1044 static int
1045 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1046 {
1047 	int n, i, c, t, tt;
1048 	int nmissing_rows;
1049 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1050 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1051 
1052 	uint8_t *p, *pp;
1053 	size_t psize;
1054 
1055 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1056 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1057 	uint8_t *used;
1058 
1059 	int code = 0;
1060 
1061 
1062 	n = rm->rm_cols - rm->rm_firstdatacol;
1063 
1064 	/*
1065 	 * Figure out which data columns are missing.
1066 	 */
1067 	nmissing_rows = 0;
1068 	for (t = 0; t < ntgts; t++) {
1069 		if (tgts[t] >= rm->rm_firstdatacol) {
1070 			missing_rows[nmissing_rows++] =
1071 			    tgts[t] - rm->rm_firstdatacol;
1072 		}
1073 	}
1074 
1075 	/*
1076 	 * Figure out which parity columns to use to help generate the missing
1077 	 * data columns.
1078 	 */
1079 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1080 		ASSERT(tt < ntgts);
1081 		ASSERT(c < rm->rm_firstdatacol);
1082 
1083 		/*
1084 		 * Skip any targeted parity columns.
1085 		 */
1086 		if (c == tgts[tt]) {
1087 			tt++;
1088 			continue;
1089 		}
1090 
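		/*
		 * Record which parity columns participate; the return
		 * value is this bitmask (bit 0 = P, bit 1 = Q, bit 2 = R).
		 */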
1091 		code |= 1 << c;
1092 
1093 		parity_map[i] = c;
1094 		i++;
1095 	}
1096 
1097 	ASSERT(code != 0);
1098 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1099 
1100 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1101 	    nmissing_rows * n + sizeof (used[0]) * n;
1102 	p = kmem_alloc(psize, KM_SLEEP);
1103 
1104 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1105 		rows[i] = pp;
1106 		pp += n;
1107 		invrows[i] = pp;
1108 		pp += n;
1109 	}
1110 	used = pp;
1111 
1112 	for (i = 0; i < nmissing_rows; i++) {
1113 		used[i] = parity_map[i];
1114 	}
1115 
1116 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1117 		if (tt < nmissing_rows &&
1118 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1119 			tt++;
1120 			continue;
1121 		}
1122 
1123 		ASSERT3S(i, <, n);
1124 		used[i] = c;
1125 		i++;
1126 	}
1127 
1128 	/*
1129 	 * Initialize the interesting rows of the matrix.
1130 	 */
1131 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1132 
1133 	/*
1134 	 * Invert the matrix.
1135 	 */
1136 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1137 	    invrows, used);
1138 
1139 	/*
1140 	 * Reconstruct the missing data using the generated matrix.
1141 	 */
1142 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1143 	    invrows, used);
1144 
1145 	kmem_free(p, psize);
1146 
1147 	return (code);
1148 }
1149 
1150 static int
1151 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1152 {
1153 	int tgts[VDEV_RAIDZ_MAXPARITY];
1154 	int ntgts;
1155 	int i, c;
1156 	int code;
1157 	int nbadparity, nbaddata;
1158 
1159 	/*
1160 	 * The tgts list must already be sorted.
1161 	 */
1162 	for (i = 1; i < nt; i++) {
1163 		ASSERT(t[i] > t[i - 1]);
1164 	}
1165 
1166 	nbadparity = rm->rm_firstdatacol;
1167 	nbaddata = rm->rm_cols - nbadparity;
1168 	ntgts = 0;
1169 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1170 		if (i < nt && c == t[i]) {
1171 			tgts[ntgts++] = c;
1172 			i++;
1173 		} else if (rm->rm_col[c].rc_error != 0) {
1174 			tgts[ntgts++] = c;
1175 		} else if (c >= rm->rm_firstdatacol) {
1176 			nbaddata--;
1177 		} else {
1178 			nbadparity--;
1179 		}
1180 	}
1181 
1182 	ASSERT(ntgts >= nt);
1183 	ASSERT(nbaddata >= 0);
1184 	ASSERT(nbaddata + nbadparity == ntgts);
1185 
1186 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1187 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1188 	ASSERT(code > 0);
1189 	return (code);
1190 }
1191 
1192 static raidz_map_t *
1193 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1194     uint64_t dcols, uint64_t nparity)
1195 {
1196 	raidz_map_t *rm;
1197 	uint64_t b = offset >> unit_shift;
1198 	uint64_t s = size >> unit_shift;
1199 	uint64_t f = b % dcols;
1200 	uint64_t o = (b / dcols) << unit_shift;
1201 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1202 
1203 	q = s / (dcols - nparity);
1204 	r = s - q * (dcols - nparity);
1205 	bc = (r == 0 ? 0 : r + nparity);
1206 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
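
	/*
	 * For example, with dcols = 5, nparity = 1 and s = 7 sectors:
	 * q = 1, r = 3 and bc = 4, so the parity column and the first three
	 * data columns get q + 1 = 2 sectors each, the last data column gets
	 * q = 1, and tot = 9 sectors are used in all.
	 */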
1207 
1208 	if (q == 0) {
1209 		acols = bc;
1210 		scols = MIN(dcols, roundup(bc, nparity + 1));
1211 	} else {
1212 		acols = dcols;
1213 		scols = dcols;
1214 	}
1215 
1216 	ASSERT3U(acols, <=, scols);
1217 
1218 	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1219 
1220 	rm->rm_cols = acols;
1221 	rm->rm_scols = scols;
1222 	rm->rm_bigcols = bc;
1223 	rm->rm_skipstart = bc;
1224 	rm->rm_missingdata = 0;
1225 	rm->rm_missingparity = 0;
1226 	rm->rm_firstdatacol = nparity;
1227 	rm->rm_reports = 0;
1228 	rm->rm_freed = 0;
1229 	rm->rm_ecksuminjected = 0;
1230 
1231 	asize = 0;
1232 
1233 	for (c = 0; c < scols; c++) {
1234 		col = f + c;
1235 		coff = o;
1236 		if (col >= dcols) {
1237 			col -= dcols;
1238 			coff += 1ULL << unit_shift;
1239 		}
1240 		rm->rm_col[c].rc_devidx = col;
1241 		rm->rm_col[c].rc_offset = coff;
1242 		rm->rm_col[c].rc_data = NULL;
1243 		rm->rm_col[c].rc_error = 0;
1244 		rm->rm_col[c].rc_tried = 0;
1245 		rm->rm_col[c].rc_skipped = 0;
1246 
1247 		if (c >= acols)
1248 			rm->rm_col[c].rc_size = 0;
1249 		else if (c < bc)
1250 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1251 		else
1252 			rm->rm_col[c].rc_size = q << unit_shift;
1253 
1254 		asize += rm->rm_col[c].rc_size;
1255 	}
1256 
1257 	ASSERT3U(asize, ==, tot << unit_shift);
1258 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1259 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1260 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1261 	ASSERT3U(rm->rm_nskip, <=, nparity);
1262 
1263 	for (c = 0; c < rm->rm_firstdatacol; c++)
1264 		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1265 
1266 	rm->rm_col[c].rc_data = data;
1267 
1268 	for (c = c + 1; c < acols; c++)
1269 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1270 		    rm->rm_col[c - 1].rc_size;
1271 
1272 	/*
1273 	 * If all data stored spans all columns, there's a danger that parity
1274 	 * will always be on the same device and, since parity isn't read
1275 	 * during normal operation, that that device's I/O bandwidth won't be
1276 	 * used effectively. We therefore switch the parity every 1MB.
1277 	 *
1278 	 * ... at least that was, ostensibly, the theory. As a practical
1279 	 * matter unless we juggle the parity between all devices evenly, we
1280 	 * won't see any benefit. Further, occasional writes that aren't a
1281 	 * multiple of the LCM of the number of children and the minimum
1282 	 * stripe width are sufficient to avoid pessimal behavior.
1283 	 * Unfortunately, this decision created an implicit on-disk format
1284 	 * requirement that we need to support for all eternity, but only
1285 	 * for single-parity RAID-Z.
1286 	 *
1287 	 * If we intend to skip a sector in the zeroth column for padding
1288 	 * we must make sure to note this swap. We will never intend to
1289 	 * skip the first column since at least one data and one parity
1290 	 * column must appear in each row.
1291 	 */
1292 	ASSERT(rm->rm_cols >= 2);
1293 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1294 
1295 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1296 		devidx = rm->rm_col[0].rc_devidx;
1297 		o = rm->rm_col[0].rc_offset;
1298 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1299 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1300 		rm->rm_col[1].rc_devidx = devidx;
1301 		rm->rm_col[1].rc_offset = o;
1302 
1303 		if (rm->rm_skipstart == 0)
1304 			rm->rm_skipstart = 1;
1305 	}
1306 
1307 	return (rm);
1308 }
1309 
1310 static void
1311 vdev_raidz_map_free(raidz_map_t *rm)
1312 {
1313 	int c;
1314 
1315 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1316 		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1317 
1318 	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1319 }
1320 
1321 static vdev_t *
1322 vdev_child(vdev_t *pvd, uint64_t devidx)
1323 {
1324 	vdev_t *cvd;
1325 
1326 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1327 		if (cvd->v_id == devidx)
1328 			break;
1329 	}
1330 
1331 	return (cvd);
1332 }
1333 
1334 /*
1335  * In the kernel, this wrapper also tracks whether any errors were injected
1336  * so that ereports can note it; the boot code simply verifies the checksum.
1337  */
1338 static int
1339 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1340     uint64_t size __unused)
1341 {
1342 
1343 	return (zio_checksum_verify(spa, bp, data));
1344 }
1345 
1346 /*
1347  * Generate the parity from the data columns. If we tried and were able to
1348  * read the parity without error, verify that the generated parity matches the
1349  * data we read. If it doesn't, we fire off a checksum error. Return the
1350  * number of such failures.
1351  */
1352 static int
1353 raidz_parity_verify(raidz_map_t *rm)
1354 {
1355 	void *orig[VDEV_RAIDZ_MAXPARITY];
1356 	int c, ret = 0;
1357 	raidz_col_t *rc;
1358 
1359 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1360 		rc = &rm->rm_col[c];
1361 		if (!rc->rc_tried || rc->rc_error != 0)
1362 			continue;
1363 		orig[c] = zfs_alloc(rc->rc_size);
1364 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1365 	}
1366 
1367 	vdev_raidz_generate_parity(rm);
1368 
1369 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1370 		rc = &rm->rm_col[c];
1371 		if (!rc->rc_tried || rc->rc_error != 0)
1372 			continue;
1373 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1374 			rc->rc_error = ECKSUM;
1375 			ret++;
1376 		}
1377 		zfs_free(orig[c], rc->rc_size);
1378 	}
1379 
1380 	return (ret);
1381 }
1382 
1383 /*
1384  * Iterate over all combinations of bad data and attempt a reconstruction.
1385  * Note that the algorithm below is non-optimal because it doesn't take into
1386  * account how reconstruction is actually performed. For example, with
1387  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1388  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1389  * cases we'd only use parity information in column 0.
1390  */
1391 static int
1392 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1393     void *data, off_t offset __unused, uint64_t bytes, int total_errors,
1394     int data_errors)
1395 {
1396 	raidz_col_t *rc;
1397 	void *orig[VDEV_RAIDZ_MAXPARITY];
1398 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1399 	int *tgts = &tstore[1];
1400 	int current, next, i, c, n;
1401 	int code, ret = 0;
1402 
1403 	ASSERT(total_errors < rm->rm_firstdatacol);
1404 
1405 	/*
1406 	 * This simplifies one edge condition.
1407 	 */
1408 	tgts[-1] = -1;
1409 
1410 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1411 		/*
1412 		 * Initialize the targets array by finding the first n columns
1413 		 * that contain no error.
1414 		 *
1415 		 * If there were no data errors, we need to ensure that we're
1416 		 * always explicitly attempting to reconstruct at least one
1417 		 * data column. To do this, we simply push the highest target
1418 		 * up into the data columns.
1419 		 */
1420 		for (c = 0, i = 0; i < n; i++) {
1421 			if (i == n - 1 && data_errors == 0 &&
1422 			    c < rm->rm_firstdatacol) {
1423 				c = rm->rm_firstdatacol;
1424 			}
1425 
1426 			while (rm->rm_col[c].rc_error != 0) {
1427 				c++;
1428 				ASSERT3S(c, <, rm->rm_cols);
1429 			}
1430 
1431 			tgts[i] = c++;
1432 		}
1433 
1434 		/*
1435 		 * Setting tgts[n] simplifies the other edge condition.
1436 		 */
1437 		tgts[n] = rm->rm_cols;
1438 
1439 		/*
1440 		 * These buffers were allocated in previous iterations.
1441 		 */
1442 		for (i = 0; i < n - 1; i++) {
1443 			ASSERT(orig[i] != NULL);
1444 		}
1445 
1446 		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1447 
1448 		current = 0;
1449 		next = tgts[current];
1450 
1451 		while (current != n) {
1452 			tgts[current] = next;
1453 			current = 0;
1454 
1455 			/*
1456 			 * Save off the original data that we're going to
1457 			 * attempt to reconstruct.
1458 			 */
1459 			for (i = 0; i < n; i++) {
1460 				ASSERT(orig[i] != NULL);
1461 				c = tgts[i];
1462 				ASSERT3S(c, >=, 0);
1463 				ASSERT3S(c, <, rm->rm_cols);
1464 				rc = &rm->rm_col[c];
1465 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1466 			}
1467 
1468 			/*
1469 			 * Attempt a reconstruction and exit the outer loop on
1470 			 * success.
1471 			 */
1472 			code = vdev_raidz_reconstruct(rm, tgts, n);
1473 			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1474 				for (i = 0; i < n; i++) {
1475 					c = tgts[i];
1476 					rc = &rm->rm_col[c];
1477 					ASSERT(rc->rc_error == 0);
1478 					rc->rc_error = ECKSUM;
1479 				}
1480 
1481 				ret = code;
1482 				goto done;
1483 			}
1484 
1485 			/*
1486 			 * Restore the original data.
1487 			 */
1488 			for (i = 0; i < n; i++) {
1489 				c = tgts[i];
1490 				rc = &rm->rm_col[c];
1491 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1492 			}
1493 
1494 			do {
1495 				/*
1496 				 * Find the next valid column after the current
1497 				 * position.
1498 				 */
1499 				for (next = tgts[current] + 1;
1500 				    next < rm->rm_cols &&
1501 				    rm->rm_col[next].rc_error != 0; next++)
1502 					continue;
1503 
1504 				ASSERT(next <= tgts[current + 1]);
1505 
1506 				/*
1507 				 * If that spot is available, we're done here.
1508 				 */
1509 				if (next != tgts[current + 1])
1510 					break;
1511 
1512 				/*
1513 				 * Otherwise, find the next valid column after
1514 				 * the previous position.
1515 				 */
1516 				for (c = tgts[current - 1] + 1;
1517 				    rm->rm_col[c].rc_error != 0; c++)
1518 					continue;
1519 
1520 				tgts[current] = c;
1521 				current++;
1522 
1523 			} while (current != n);
1524 		}
1525 	}
1526 	n--;
1527 done:
1528 	for (i = n - 1; i >= 0; i--) {
1529 		zfs_free(orig[i], rm->rm_col[0].rc_size);
1530 	}
1531 
1532 	return (ret);
1533 }
1534 
1535 static int
1536 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1537     off_t offset, size_t bytes)
1538 {
1539 	vdev_t *tvd = vd->v_top;
1540 	vdev_t *cvd;
1541 	raidz_map_t *rm;
1542 	raidz_col_t *rc;
1543 	int c, error;
1544 	int unexpected_errors;
1545 	int parity_errors;
1546 	int parity_untried;
1547 	int data_errors;
1548 	int total_errors;
1549 	int n;
1550 	int tgts[VDEV_RAIDZ_MAXPARITY];
1551 	int code;
1552 
1553 	rc = NULL;	/* gcc */
1554 	error = 0;
1555 
1556 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1557 	    vd->v_nchildren, vd->v_nparity);
1558 
1559 	/*
1560 	 * Iterate over the columns in reverse order so that we hit the parity
1561 	 * last -- any errors along the way will force us to read the parity.
1562 	 */
1563 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1564 		rc = &rm->rm_col[c];
1565 		cvd = vdev_child(vd, rc->rc_devidx);
1566 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1567 			if (c >= rm->rm_firstdatacol)
1568 				rm->rm_missingdata++;
1569 			else
1570 				rm->rm_missingparity++;
1571 			rc->rc_error = ENXIO;
1572 			rc->rc_tried = 1;	/* don't even try */
1573 			rc->rc_skipped = 1;
1574 			continue;
1575 		}
1576 #if 0		/* XXX: Too hard for the boot code. */
1577 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1578 			if (c >= rm->rm_firstdatacol)
1579 				rm->rm_missingdata++;
1580 			else
1581 				rm->rm_missingparity++;
1582 			rc->rc_error = ESTALE;
1583 			rc->rc_skipped = 1;
1584 			continue;
1585 		}
1586 #endif
1587 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1588 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1589 			    rc->rc_offset, rc->rc_size);
1590 			rc->rc_tried = 1;
1591 			rc->rc_skipped = 0;
1592 		}
1593 	}
1594 
1595 reconstruct:
1596 	unexpected_errors = 0;
1597 	parity_errors = 0;
1598 	parity_untried = 0;
1599 	data_errors = 0;
1600 	total_errors = 0;
1601 
1602 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1603 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1604 
1605 	for (c = 0; c < rm->rm_cols; c++) {
1606 		rc = &rm->rm_col[c];
1607 
1608 		if (rc->rc_error) {
1609 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1610 
1611 			if (c < rm->rm_firstdatacol)
1612 				parity_errors++;
1613 			else
1614 				data_errors++;
1615 
1616 			if (!rc->rc_skipped)
1617 				unexpected_errors++;
1618 
1619 			total_errors++;
1620 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1621 			parity_untried++;
1622 		}
1623 	}
1624 
1625 	/*
1626 	 * There are three potential phases for a read:
1627 	 *	1. produce valid data from the columns read
1628 	 *	2. read all disks and try again
1629 	 *	3. perform combinatorial reconstruction
1630 	 *
1631 	 * Each phase is progressively both more expensive and less likely to
1632 	 * occur. If we encounter more errors than we can repair or all phases
1633 	 * fail, we have no choice but to return an error.
1634 	 */
1635 
1636 	/*
1637 	 * If the number of errors we saw was correctable -- less than or equal
1638 	 * to the number of parity disks read -- attempt to produce data that
1639 	 * has a valid checksum. Naturally, this case applies in the absence of
1640 	 * any errors.
1641 	 */
1642 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1643 		int rv;
1644 
1645 		if (data_errors == 0) {
1646 			rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
1647 			if (rv == 0) {
1648 				/*
1649 				 * If we read parity information (unnecessarily
1650 				 * as it happens since no reconstruction was
1651 				 * needed) regenerate and verify the parity.
1652 				 * We also regenerate parity when resilvering
1653 				 * so we can write it out to the failed device
1654 				 * later.
1655 				 */
1656 				if (parity_errors + parity_untried <
1657 				    rm->rm_firstdatacol) {
1658 					n = raidz_parity_verify(rm);
1659 					unexpected_errors += n;
1660 					ASSERT(parity_errors + n <=
1661 					    rm->rm_firstdatacol);
1662 				}
1663 				goto done;
1664 			}
1665 		} else {
1666 			/*
1667 			 * We either attempt to read all the parity columns or
1668 			 * none of them. If we didn't try to read parity, we
1669 			 * wouldn't be here in the correctable case. There must
1670 			 * also have been fewer parity errors than parity
1671 			 * columns or, again, we wouldn't be in this code path.
1672 			 */
1673 			ASSERT(parity_untried == 0);
1674 			ASSERT(parity_errors < rm->rm_firstdatacol);
1675 
1676 			/*
1677 			 * Identify the data columns that reported an error.
1678 			 */
1679 			n = 0;
1680 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1681 				rc = &rm->rm_col[c];
1682 				if (rc->rc_error != 0) {
1683 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1684 					tgts[n++] = c;
1685 				}
1686 			}
1687 
1688 			ASSERT(rm->rm_firstdatacol >= n);
1689 
1690 			code = vdev_raidz_reconstruct(rm, tgts, n);
1691 
1692 			rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
1693 			if (rv == 0) {
1694 				/*
1695 				 * If we read more parity disks than were used
1696 				 * for reconstruction, confirm that the other
1697 				 * parity disks produced correct data. This
1698 				 * routine is suboptimal in that it regenerates
1699 				 * the parity that we already used in addition
1700 				 * to the parity that we're attempting to
1701 				 * verify, but this should be a relatively
1702 				 * uncommon case, and can be optimized if it
1703 				 * becomes a problem. Note that we regenerate
1704 				 * parity when resilvering so we can write it
1705 				 * out to failed devices later.
1706 				 */
1707 				if (parity_errors < rm->rm_firstdatacol - n) {
1708 					n = raidz_parity_verify(rm);
1709 					unexpected_errors += n;
1710 					ASSERT(parity_errors + n <=
1711 					    rm->rm_firstdatacol);
1712 				}
1713 
1714 				goto done;
1715 			}
1716 		}
1717 	}
1718 
1719 	/*
1720 	 * This isn't a typical situation -- either we got a read
1721 	 * error or a child silently returned bad data. Read every
1722 	 * block so we can try again with as much data and parity as
1723 	 * we can track down. If we've already been through once
1724 	 * before, all children will be marked as tried so we'll
1725 	 * proceed to combinatorial reconstruction.
1726 	 */
1727 	unexpected_errors = 1;
1728 	rm->rm_missingdata = 0;
1729 	rm->rm_missingparity = 0;
1730 
1731 	n = 0;
1732 	for (c = 0; c < rm->rm_cols; c++) {
1733 		rc = &rm->rm_col[c];
1734 
1735 		if (rc->rc_tried)
1736 			continue;
1737 
1738 		cvd = vdev_child(vd, rc->rc_devidx);
1739 		ASSERT(cvd != NULL);
1740 		rc->rc_error = cvd->v_read(cvd, NULL,
1741 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1742 		if (rc->rc_error == 0)
1743 			n++;
1744 		rc->rc_tried = 1;
1745 		rc->rc_skipped = 0;
1746 	}
1747 	/*
1748 	 * If we managed to read anything more, retry the
1749 	 * reconstruction.
1750 	 */
1751 	if (n > 0)
1752 		goto reconstruct;
1753 
1754 	/*
1755 	 * At this point we've attempted to reconstruct the data given the
1756 	 * errors we detected, and we've attempted to read all columns. There
1757 	 * must, therefore, be one or more additional problems -- silent errors
1758 	 * resulting in invalid data rather than explicit I/O errors resulting
1759 	 * in absent data. We check if there is enough additional data to
1760 	 * possibly reconstruct the data and then perform combinatorial
1761 	 * reconstruction over all possible combinations. If that fails,
1762 	 * we're cooked.
1763 	 */
1764 	if (total_errors > rm->rm_firstdatacol) {
1765 		error = EIO;
1766 	} else if (total_errors < rm->rm_firstdatacol &&
1767 	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
1768 	    total_errors, data_errors)) != 0) {
1769 		/*
1770 		 * If we didn't use all the available parity for the
1771 		 * combinatorial reconstruction, verify that the remaining
1772 		 * parity is correct.
1773 		 */
1774 		if (code != (1 << rm->rm_firstdatacol) - 1)
1775 			(void) raidz_parity_verify(rm);
1776 	} else {
1777 		/*
1778 		 * We're here because either:
1779 		 *
1780 		 *	total_errors == rm->rm_firstdatacol, or
1781 		 *	vdev_raidz_combrec() failed
1782 		 *
1783 		 * In either case, there is enough bad data to prevent
1784 		 * reconstruction.
1785 		 *
1786 		 * Start checksum ereports for all children which haven't
1787 		 * failed and for which the I/O wasn't speculative.
1788 		 */
1789 		error = ECKSUM;
1790 	}
1791 
1792 done:
1793 	vdev_raidz_map_free(rm);
1794 
1795 	return (error);
1796 }
1797