xref: /illumos-gate/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c (revision 8eef2ab6218419d91e0aefc223c9f2cd1c8e9a2c)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
		for (i = 0; i < 256; i++) {
			ct = zfs_crc64_table + i;
			for (*ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^
				    (-(*ct & 1) & ZFS_CRC64_POLY);
		}
	}
}
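
/*
 * The loop above builds the table for the reflected CRC-64 polynomial
 * used by the ZAP hash.  In each step, -(*ct & 1) is all ones when the
 * low bit is set and zero otherwise, so the update is a branch-free
 * form of the usual bit-at-a-time recurrence:
 *
 *	crc = (crc >> 1) ^ ((crc & 1) ? ZFS_CRC64_POLY : 0);
 *
 * The table[128] guard works because entry 128 of a fully initialized
 * table is exactly ZFS_CRC64_POLY.
 */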

static void
zio_checksum_off(const void *buf __unused, uint64_t size __unused,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"
#include "edonr_zfs.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
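
/*
 * Lookup sketch (illustrative, mirroring zio_checksum_verify() below):
 * the algorithm is identified by the enum stored in the block pointer,
 * and the byteorder-appropriate function is taken from ci_func[]:
 *
 *	zio_checksum_info_t *ci = &zio_checksum_table[BP_GET_CHECKSUM(bp)];
 *	ci->ci_func[byteswap](data, size, ctx, &cksum);
 */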

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			gzip_decompress,	1,	"gzip-1"},
	{NULL,			gzip_decompress,	2,	"gzip-2"},
	{NULL,			gzip_decompress,	3,	"gzip-3"},
	{NULL,			gzip_decompress,	4,	"gzip-4"},
	{NULL,			gzip_decompress,	5,	"gzip-5"},
	{NULL,			gzip_decompress,	6,	"gzip-6"},
	{NULL,			gzip_decompress,	7,	"gzip-7"},
	{NULL,			gzip_decompress,	8,	"gzip-8"},
	{NULL,			gzip_decompress,	9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};
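
/*
 * Note that every ci_compress entry is NULL: the boot code only ever
 * reads pools, so just the decompression half of each algorithm is
 * wired up, and the table is consumed through zio_decompress_data()
 * below.
 */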

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	spa->spa_cksum_tmpls[checksum] =
	    ci->ci_tmpl_init(&spa->spa_cksum_salt);
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
void
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, (spa_t *)spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

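		/*
		 * The embedded checksum lives in a zio_eck_t trailer at
		 * the end of the buffer.  Save the stored value, write
		 * the computed verifier in its place, checksum the whole
		 * buffer, and then restore the original contents.
		 */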
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
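
/*
 * With ZAP_HASHBITS == 28 the mask above clears the low 36 bits, so
 * only the high 28 bits of the CRC survive; the low 4 bits of a ZAP
 * cursor cookie are then free to carry the collision differentiator.
 */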

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
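
/*
 * Worked example for a single byte lane: for x == 0x80 the mask byte is
 * 0x80, and ((mask) << 1) - ((mask) >> 7) turns it into 0xff, which
 * selects the 0x1d reduction; ((0x80 << 1) & 0xfe) is 0x00, so the lane
 * becomes 0x00 ^ 0x1d == 0x1d, exactly VDEV_RAIDZ_MUL_2(0x80).  The
 * 0xfe mask keeps each byte's shift from carrying into its neighbor.
 */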

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
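
/*
 * Example: vdev_raidz_exp2(0x02, 3) looks up log2[0x02] == 1, adds the
 * exponent to get 4, and returns pow2[4] == 0x10 -- i.e. 2 * 2^3 == 16,
 * which needs no polynomial reduction.  Exponents are reduced mod 255
 * because the multiplicative group of GF(2^8) has 255 elements.
 */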

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
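
/*
 * P is thus the plain XOR (even parity) of all data columns; columns
 * shorter than the parity column simply contribute nothing beyond
 * their length.
 */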

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Q is a polynomial in the generator 2 evaluated
			 * Horner-style: multiply the accumulated result by 2
			 * and add in the new column's value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Q and R are polynomials in the generators 2 and 4,
			 * evaluated Horner-style: multiply the accumulated
			 * result by the generator and add in the new column's
			 * value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speed of
 * computation, and linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *  (V|I)' = |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
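
/*
 * Each generated row is a row of the dispersal matrix shown above:
 * rows[i][j] == (2^map[i])^(n-1-j), computed in log space by stepping
 * the exponent down by map[i] for each successive column.  For P
 * (map[i] == 0) this yields a row of all 1s; for Q (map[i] == 1) it
 * yields 2^(n-1), ..., 4, 2, 1.
 */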

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data remaining in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
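
/*
 * This is Gauss-Jordan elimination over GF(2^8): "subtraction" is XOR,
 * and division is done through the log/exp tables by multiplying by
 * the inverse, using log(1/a) == 255 - log(a).
 */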

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

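	/*
	 * Compute D = (V|I)'^-1 x (d|p)' one surviving column at a time:
	 * for each source byte, add log(invrows[cc][i]) to log(*src) to
	 * multiply in the field, and XOR the product into each target
	 * column.  Caching the logs in invlog avoids a table lookup in
	 * the inner loop.
	 */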
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
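
/*
 * The return value is a bitmap of the parity columns that took part in
 * the reconstruction: bit 0 for P, bit 1 for Q, bit 2 for R.  Callers
 * use it to tell which parity, if any, went unused and still needs to
 * be verified.
 */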

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

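/*
 * Geometry example (illustrative): on a 5-wide raidz1 (dcols == 5,
 * nparity == 1), a request of 8 sectors gives q == 2, r == 0, so every
 * column -- one parity plus four data -- carries exactly two sectors
 * and no skip sectors are needed.  A request of 7 sectors instead
 * gives q == 1, r == 3: the first bc == 4 columns carry two sectors,
 * the last carries one, and one skip sector pads the stripe out to a
 * multiple of nparity + 1.
 */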
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * The kernel version of this routine also notes whether any errors
 * were injected, so that the ereports it generates can record it; the
 * boot code has neither error injection nor ereports, so this reduces
 * to a thin wrapper around zio_checksum_verify().
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size __unused)
{

	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset __unused, uint64_t bytes, int total_errors,
    int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm->rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * In the kernel, checksum ereports would be started here
		 * for all children which hadn't failed, provided the I/O
		 * wasn't speculative; the boot code simply returns the
		 * error.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1796