/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
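
/*
 * A useful property of the table above (and the reason for the sentinel
 * test in zfs_init_crc()): for i = 128 = 0x80, the inner loop shifts the
 * single set bit right seven times and only folds in the polynomial on
 * the eighth step, so
 *
 *	zfs_crc64_table[128] == ZFS_CRC64_POLY
 *
 * once the table has been generated.  zap_hash() below asserts the same
 * invariant before using the table.
 */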

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"
#include "edonr_zfs.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
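
/*
 * Note that every entry above carries both a native and a byteswapped
 * implementation (ci_func[0] and ci_func[1]): the boot loader must be able
 * to verify blocks from pools written on either endianness.
 */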

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "lz4.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			gzip_decompress,	1,	"gzip-1"},
	{NULL,			gzip_decompress,	2,	"gzip-2"},
	{NULL,			gzip_decompress,	3,	"gzip-3"},
	{NULL,			gzip_decompress,	4,	"gzip-4"},
	{NULL,			gzip_decompress,	5,	"gzip-5"},
	{NULL,			gzip_decompress,	6,	"gzip-6"},
	{NULL,			gzip_decompress,	7,	"gzip-7"},
	{NULL,			gzip_decompress,	8,	"gzip-8"},
	{NULL,			gzip_decompress,	9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};
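
/*
 * All of the ci_compress vectors above are NULL: the boot loader only ever
 * reads from the pool, so just the decompress side of each algorithm is
 * wired up.
 */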

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	spa->spa_cksum_tmpls[checksum] =
	    ci->ci_tmpl_init(&spa->spa_cksum_salt);
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
void
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, (spa_t *) spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
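
/*
 * Usage sketch (illustrative; pbuf/psize and lbuf/lsize are hypothetical
 * physical and logical buffers for a compressed block):
 *
 *	if (zio_decompress_data(BP_GET_COMPRESS(bp), pbuf, psize,
 *	    lbuf, lsize) != 0)
 *		return (EIO);
 */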

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
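
/*
 * Usage sketch (illustrative): callers hash an attribute name with the
 * ZAP's per-object salt and use the high-order bits of the result to
 * choose a hash bucket, e.g.
 *
 *	uint64_t h = zap_hash(zap->zap_salt, "bootfs");
 *
 * where "zap" is a hypothetical fat-ZAP header; zfs_init_crc() must have
 * run first so that zfs_crc64_table is populated.
 */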

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
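
/*
 * Worked example of the 64-bit trick: with x = 0x0000000000000280 (the
 * bytes 0x80 and 0x02), mask becomes 0xff for the low byte only, so
 * VDEV_RAIDZ_64MUL_2() yields 0x000000000000041d -- byte 0x80 overflows
 * and is reduced by the polynomial to 0x1d, while byte 0x02 simply
 * doubles to 0x04, exactly as eight independent VDEV_RAIDZ_MUL_2()
 * operations would.
 */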

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
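
/*
 * Example, using the tables above: multiplying 0x80 by 2 is
 * vdev_raidz_exp2(0x80, 1).  vdev_raidz_log2[0x80] is 0x07, so the
 * exponent becomes 8, and vdev_raidz_pow2[8] is 0x1d -- the same
 * reduction by the polynomial that VDEV_RAIDZ_MUL_2(0x80) performs.
 */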

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

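/*
 * The parity functions below compute, for data columns D_0 .. D_n-1
 * (D_0 being the first data column):
 *
 *	P = D_0 + D_1 + ... + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + D_n-1
 *
 * where addition is XOR and multiplication is in GF(2^8).  Q and R are
 * evaluated Horner-style: each pass multiplies the running parity by 2
 * (or 4) with VDEV_RAIDZ_64MUL_2/4 and XORs in the next column; this is
 * the algorithm that the loop comments below refer to.
 */
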
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
 * computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *  (V|I)' = |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
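
/*
 * For instance, with n = 3 data columns, a map[] entry of 1 (the row
 * generated from the Q parity) produces pow = 3 and then the row
 *
 *	{ vdev_raidz_pow2[2], vdev_raidz_pow2[1], vdev_raidz_pow2[0] }
 *	    = { 4, 2, 1 }
 *
 * while a map[] entry of 0 (the P row) leaves pow at 0 and produces
 * { 1, 1, 1 }, matching the rows of the dispersal matrix V above.
 */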

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
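
/*
 * Geometry example (hypothetical layout): reading 1536 bytes from a
 * single-parity raidz of 5 children with 512-byte sectors gives s = 3,
 * q = 0, r = 3, so bc = acols = 4: one 512-byte parity column followed by
 * three 512-byte data columns, with tot = 4 sectors and rm_nskip = 0.
 */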

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * In the kernel this wrapper also tracks whether any errors were injected,
 * so that the ereports it generates can note it; the boot code has no
 * error injection or ereports and simply delegates to zio_checksum_verify().
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read. If it doesn't, we fire off a checksum error. Return
 * the number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed and whose I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}