xref: /illumos-gate/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c (revision 3e8c7f16)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 #include <lz4.h>
28 
/* CRC-64 table for the ZAP hash; filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Private errno-like value used for checksum failures in the loader. */
#define	ECKSUM	666

/* The boot loader compiles out all ASSERT-style checks. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)
38 
39 static void
zfs_init_crc(void)40 zfs_init_crc(void)
41 {
42 	int i, j;
43 	uint64_t *ct;
44 
45 	/*
46 	 * Calculate the crc64 table (used for the zap hash
47 	 * function).
48 	 */
49 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
50 		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
51 		for (i = 0; i < 256; i++) {
52 			ct = zfs_crc64_table + i;
53 			for (*ct = i, j = 8; j > 0; j--)
54 				*ct = (*ct >> 1) ^
55 				    (-(*ct & 1) & ZFS_CRC64_POLY);
56 		}
57 	}
58 }
59 
/*
 * Checksum vector for the "off" algorithm: unconditionally produces an
 * all-zero checksum, ignoring the data entirely.
 */
static void
zio_checksum_off(const void *buf __unused, uint64_t size __unused,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
66 
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Allocates a per-pool context template from the pool's checksum salt. */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
/* Releases a template produced by the matching tmpl_init function. */
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder (native, byteswapped) */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;	/* NULL if no template */
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;
99 
100 #include "blkptr.c"
101 
102 #include "fletcher.c"
103 #include "sha256.c"
104 #include "skein_zfs.c"
105 #include "edonr_zfs.c"
106 
/*
 * Checksum vectors, indexed by enum zio_checksum; the entry order must
 * therefore match that enum exactly.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	/* skein and edonr are salted and need per-pool context templates */
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
141 
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/* gzip decompression lives elsewhere; only the decompressor is needed. */
extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
161 
162 #include "lzjb.c"
163 #include "zle.c"
164 
/*
 * Compression vectors, indexed by enum zio_compress; entry order must
 * match that enum.  The boot loader only reads (decompresses), so every
 * ci_compress slot is NULL.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			gzip_decompress,	1,	"gzip-1"},
	{NULL,			gzip_decompress,	2,	"gzip-2"},
	{NULL,			gzip_decompress,	3,	"gzip-3"},
	{NULL,			gzip_decompress,	4,	"gzip-4"},
	{NULL,			gzip_decompress,	5,	"gzip-5"},
	{NULL,			gzip_decompress,	6,	"gzip-6"},
	{NULL,			gzip_decompress,	7,	"gzip-7"},
	{NULL,			gzip_decompress,	8,	"gzip-8"},
	{NULL,			gzip_decompress,	9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};
186 
/*
 * Byte-swap each 64-bit word of the buffer in place.  size is in bytes
 * and must be a multiple of 8 (the ASSERT is compiled out in the loader).
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* size_t, to match count (was int: sign mismatch) */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
199 
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 * The fourth checksum word is always zero.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
214 
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
225 
226 /*
227  * Calls the template init function of a checksum which supports context
228  * templates and installs the template into the spa_t.
229  */
230 static void
zio_checksum_template_init(enum zio_checksum checksum,spa_t * spa)231 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
232 {
233 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
234 
235 	if (ci->ci_tmpl_init == NULL)
236 		return;
237 
238 	if (spa->spa_cksum_tmpls[checksum] != NULL)
239 		return;
240 
241 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
242 		spa->spa_cksum_tmpls[checksum] =
243 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
244 	}
245 }
246 
247 /*
248  * Called by a spa_t that's about to be deallocated. This steps through
249  * all of the checksum context templates and deallocates any that were
250  * initialized using the algorithm-specific template init function.
251  */
252 void
zio_checksum_templates_free(spa_t * spa)253 zio_checksum_templates_free(spa_t *spa)
254 {
255 	for (enum zio_checksum checksum = 0;
256 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
257 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
258 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
259 
260 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
261 			spa->spa_cksum_tmpls[checksum] = NULL;
262 		}
263 	}
264 }
265 
/*
 * Verify a block's checksum against the expected value stored in the
 * block pointer (or, for embedded checksums, in the trailing zio_eck_t
 * inside the data itself).  Returns 0 on success, EINVAL for an
 * unknown/unusable checksum algorithm, or EIO on mismatch.
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		/* Lazily create the salted context template, if any. */
		zio_checksum_template_init(checksum, (spa_t *)spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* The embedded checksum sits at the very end of the data. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			/* Defensive: unreachable given the ASSERT above. */
			verifier = bp->blk_cksum;

		/* A swapped magic tells us the block was written LE<->BE. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored
		 * checksum while computing, then restore the original.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* External checksum: expected value lives in the blkptr. */
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}
332 
333 static int
zio_decompress_data(int cpfunc,void * src,uint64_t srcsize,void * dest,uint64_t destsize)334 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
335     void *dest, uint64_t destsize)
336 {
337 	zio_compress_info_t *ci;
338 
339 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
340 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
341 		return (EIO);
342 	}
343 
344 	ci = &zio_compress_table[cpfunc];
345 	if (!ci->ci_decompress) {
346 		printf("ZFS: unsupported compression algorithm %s\n",
347 		    ci->ci_name);
348 		return (EIO);
349 	}
350 
351 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
352 }
353 
354 static uint64_t
zap_hash(uint64_t salt,const char * name)355 zap_hash(uint64_t salt, const char *name)
356 {
357 	const uint8_t *cp;
358 	uint8_t c;
359 	uint64_t crc = salt;
360 
361 	ASSERT(crc != 0);
362 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
363 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
364 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
365 
366 	/*
367 	 * Only use 28 bits, since we need 4 bits in the cookie for the
368 	 * collision differentiator.  We MUST use the high bits, since
369 	 * those are the onces that we first pay attention to when
370 	 * chosing the bucket.
371 	 */
372 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
373 
374 	return (crc);
375 }
376 
/*
 * Per-column state for a single RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry and state of one logical RAID-Z I/O mapped onto physical
 * columns.  Parity columns occupy rm_col[0 .. rm_firstdatacol - 1];
 * data columns follow.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
402 
/* Indices of the parity columns within a raidz map. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* Multiply one byte by 2 (resp. 4) in GF(2^8) with reduction by 0x1d. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Wrapped in do/while (0) so each macro expands as a single statement
 * and is safe in unbraced if/else bodies (the bare { } block was not).
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
429 
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
/* pow2[i] = 2^i in GF(2^8); note pow2[255] wraps back to 0x01. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* log2[x] = discrete log base 2 of x in GF(2^8); log2[0] is unused. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
502 
503 /*
504  * Multiply a given number by 2 raised to the given power.
505  */
506 static uint8_t
vdev_raidz_exp2(uint8_t a,int exp)507 vdev_raidz_exp2(uint8_t a, int exp)
508 {
509 	if (a == 0)
510 		return (0);
511 
512 	ASSERT(exp >= 0);
513 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
514 
515 	exp += vdev_raidz_log2[a];
516 	if (exp > 255)
517 		exp -= 255;
518 
519 	return (vdev_raidz_pow2[exp]);
520 }
521 
522 static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)523 vdev_raidz_generate_parity_p(raidz_map_t *rm)
524 {
525 	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
526 	int c;
527 
528 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
529 
530 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
531 		src = rm->rm_col[c].rc_data;
532 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
533 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
534 
535 		if (c == rm->rm_firstdatacol) {
536 			ASSERT(ccount == pcount);
537 			for (i = 0; i < ccount; i++, src++, p++) {
538 				*p = *src;
539 			}
540 		} else {
541 			ASSERT(ccount <= pcount);
542 			for (i = 0; i < ccount; i++, src++, p++) {
543 				*p ^= *src;
544 			}
545 		}
546 	}
547 }
548 
/*
 * Generate P (XOR) and Q parity.  Q accumulates by multiplying the
 * running value by 2 in GF(2^8), then XORing in the next data column,
 * so column c contributes with coefficient 2^(n-1-c).
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	/* P and Q columns always have identical sizes. */
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			/* Seed P and Q with the first data column... */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* ...and zero-fill any remainder. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
600 
/*
 * Generate P (XOR), Q and R parity.  Q and R accumulate like PQ above,
 * except R multiplies the running value by 4 in GF(2^8) each step, so
 * the three parity rows use coefficients 1, 2^k and 4^k respectively.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	/* P, Q and R columns always have identical sizes. */
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			/* Seed all three parity columns with a copy... */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* ...and zero-fill any remainder. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
661 
662 /*
663  * Generate RAID parity in the first virtual columns according to the number of
664  * parity columns available.
665  */
666 static void
vdev_raidz_generate_parity(raidz_map_t * rm)667 vdev_raidz_generate_parity(raidz_map_t *rm)
668 {
669 	switch (rm->rm_firstdatacol) {
670 	case 1:
671 		vdev_raidz_generate_parity_p(rm);
672 		break;
673 	case 2:
674 		vdev_raidz_generate_parity_pq(rm);
675 		break;
676 	case 3:
677 		vdev_raidz_generate_parity_pqr(rm);
678 		break;
679 	default:
680 		panic("invalid RAID-Z configuration");
681 	}
682 }
683 
684 /* BEGIN CSTYLED */
685 /*
686  * In the general case of reconstruction, we must solve the system of linear
687  * equations defined by the coefficients used to generate parity as well as
688  * the contents of the data and parity disks. This can be expressed with
689  * vectors for the original data (D) and the actual data (d) and parity (p)
690  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
691  *
692  *            __   __                     __     __
693  *            |     |         __     __   |  p_0  |
694  *            |  V  |         |  D_0  |   | p_m-1 |
695  *            |     |    x    |   :   | = |  d_0  |
696  *            |  I  |         | D_n-1 |   |   :   |
697  *            |     |         ~~     ~~   | d_n-1 |
698  *            ~~   ~~                     ~~     ~~
699  *
700  * I is simply a square identity matrix of size n, and V is a vandermonde
701  * matrix defined by the coefficients we chose for the various parity columns
702  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
703  * computation as well as linear separability.
704  *
705  *      __               __               __     __
706  *      |   1   ..  1 1 1 |               |  p_0  |
707  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
708  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
709  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
710  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
711  *      |   :       : : : |   |   :   |   |  d_2  |
712  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
713  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
714  *      |   0   ..  0 0 1 |               | d_n-1 |
715  *      ~~               ~~               ~~     ~~
716  *
717  * Note that I, V, d, and p are known. To compute D, we must invert the
718  * matrix and use the known data and parity values to reconstruct the unknown
719  * data values. We begin by removing the rows in V|I and d|p that correspond
720  * to failed or missing columns; we then make V|I square (n x n) and d|p
721  * sized n by removing rows corresponding to unused parity from the bottom up
722  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
723  * using Gauss-Jordan elimination. In the example below we use m=3 parity
724  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
725  *           __                               __
726  *           |  1   1   1   1   1   1   1   1  |
727  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
728  *           |  19 205 116  29  64  16  4   1  |      / /
729  *           |  1   0   0   0   0   0   0   0  |     / /
730  *           |  0   1   0   0   0   0   0   0  | <--' /
731  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
732  *           |  0   0   0   1   0   0   0   0  |
733  *           |  0   0   0   0   1   0   0   0  |
734  *           |  0   0   0   0   0   1   0   0  |
735  *           |  0   0   0   0   0   0   1   0  |
736  *           |  0   0   0   0   0   0   0   1  |
737  *           ~~                               ~~
738  *           __                               __
739  *           |  1   1   1   1   1   1   1   1  |
740  *           | 128  64  32  16  8   4   2   1  |
741  *           |  19 205 116  29  64  16  4   1  |
742  *           |  1   0   0   0   0   0   0   0  |
743  *           |  0   1   0   0   0   0   0   0  |
744  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
745  *           |  0   0   0   1   0   0   0   0  |
746  *           |  0   0   0   0   1   0   0   0  |
747  *           |  0   0   0   0   0   1   0   0  |
748  *           |  0   0   0   0   0   0   1   0  |
749  *           |  0   0   0   0   0   0   0   1  |
750  *           ~~                               ~~
751  *
752  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
753  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
754  * matrix is not singular.
755  * __                                                                 __
756  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
757  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
758  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
759  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
760  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
761  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
762  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
763  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
764  * ~~                                                                 ~~
765  * __                                                                 __
766  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
767  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
768  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
769  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
770  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
771  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
772  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
773  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
774  * ~~                                                                 ~~
775  * __                                                                 __
776  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
777  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
778  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
779  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
780  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
781  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
782  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
783  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
784  * ~~                                                                 ~~
785  * __                                                                 __
786  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
787  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
788  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
789  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
790  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
791  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
792  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
793  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
794  * ~~                                                                 ~~
795  * __                                                                 __
796  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
797  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
798  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
799  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
800  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
801  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
802  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
803  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
804  * ~~                                                                 ~~
805  * __                                                                 __
806  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
807  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
808  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
809  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
810  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
811  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
812  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
813  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
814  * ~~                                                                 ~~
815  *                   __                               __
816  *                   |  0   0   1   0   0   0   0   0  |
817  *                   | 167 100  5   41 159 169 217 208 |
818  *                   | 166 100  4   40 158 168 216 209 |
819  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
820  *                   |  0   0   0   0   1   0   0   0  |
821  *                   |  0   0   0   0   0   1   0   0  |
822  *                   |  0   0   0   0   0   0   1   0  |
823  *                   |  0   0   0   0   0   0   0   1  |
824  *                   ~~                               ~~
825  *
826  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
827  * of the missing data.
828  *
829  * As is apparent from the example above, the only non-trivial rows in the
830  * inverse matrix correspond to the data disks that we're trying to
831  * reconstruct. Indeed, those are the only rows we need as the others would
832  * only be useful for reconstructing data known or assumed to be valid. For
833  * that reason, we only build the coefficients in the rows that correspond to
834  * targeted columns.
835  */
836 /* END CSTYLED */
837 
838 static void
vdev_raidz_matrix_init(raidz_map_t * rm __unused,int n,int nmap,int * map,uint8_t ** rows)839 vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
840     uint8_t **rows)
841 {
842 	int i, j;
843 	int pow;
844 
845 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
846 
847 	/*
848 	 * Fill in the missing rows of interest.
849 	 */
850 	for (i = 0; i < nmap; i++) {
851 		ASSERT3S(0, <=, map[i]);
852 		ASSERT3S(map[i], <=, 2);
853 
854 		pow = map[i] * n;
855 		if (pow > 255)
856 			pow -= 255;
857 		ASSERT(pow <= 255);
858 
859 		for (j = 0; j < n; j++) {
860 			pow -= map[i];
861 			if (pow < 0)
862 				pow += 255;
863 			rows[i][j] = vdev_raidz_pow2[pow];
864 		}
865 	}
866 }
867 
/*
 * Reduce the rows of interest of the (parity | identity) matrix to the
 * identity, accumulating the same row operations into invrows[] so that on
 * return invrows[] holds the rows of the inverse needed to recompute the
 * missing data columns.  All arithmetic is in GF(2^8): addition is XOR and
 * multiplication/division go through the log/exp tables.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate the pivot column from every other row via XOR. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
961 
/*
 * Regenerate the missing data columns: compute D = (V|I)'^-1 x (d|p)' using
 * the inverse rows produced by vdev_raidz_matrix_invert().  Multiplications
 * in GF(2^8) are performed by adding discrete logs modulo 255; the logs of
 * the inverse-row coefficients are precomputed into invlog[] so the inner
 * loop reduces to table lookups and XORs.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One contiguous allocation carved into nmissing rows of n logs. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the log of every inverse-row coefficient. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate the contribution of each surviving column in turn. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log of zero is undefined; zero is special-cased. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Destination columns may be shorter. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = *src * invrows[cc][i] */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * destination; later columns XOR into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
1041 
/*
 * Reconstruct the targeted columns (tgts[0 .. ntgts-1]) using the general
 * matrix method: build the rows of the parity-generation matrix that
 * correspond to the surviving parity columns, invert them, and multiply the
 * surviving data and parity through the inverse.  Returns a bitmask
 * ("code") of the parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One allocation holds the rows[] and invrows[] matrices
	 * (nmissing_rows x n bytes each) followed by the used[] array.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first nmissing_rows entries of used[] are parity columns. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * Fill the remainder of used[] with the surviving data columns;
	 * note that `i' deliberately continues from the loop above.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}
1151 
1152 static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1153 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1154 {
1155 	int tgts[VDEV_RAIDZ_MAXPARITY];
1156 	int ntgts;
1157 	int i, c;
1158 	int code;
1159 	int nbadparity, nbaddata;
1160 
1161 	/*
1162 	 * The tgts list must already be sorted.
1163 	 */
1164 	for (i = 1; i < nt; i++) {
1165 		ASSERT(t[i] > t[i - 1]);
1166 	}
1167 
1168 	nbadparity = rm->rm_firstdatacol;
1169 	nbaddata = rm->rm_cols - nbadparity;
1170 	ntgts = 0;
1171 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1172 		if (i < nt && c == t[i]) {
1173 			tgts[ntgts++] = c;
1174 			i++;
1175 		} else if (rm->rm_col[c].rc_error != 0) {
1176 			tgts[ntgts++] = c;
1177 		} else if (c >= rm->rm_firstdatacol) {
1178 			nbaddata--;
1179 		} else {
1180 			nbadparity--;
1181 		}
1182 	}
1183 
1184 	ASSERT(ntgts >= nt);
1185 	ASSERT(nbaddata >= 0);
1186 	ASSERT(nbaddata + nbadparity == ntgts);
1187 
1188 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1189 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1190 	ASSERT(code > 0);
1191 	return (code);
1192 }
1193 
/*
 * Lay out the logical range [offset, offset + size) across the children of
 * a raidz group:
 *	unit_shift	log2 of the device sector size
 *	dcols		number of children in the group
 *	nparity		number of parity columns (up to VDEV_RAIDZ_MAXPARITY)
 * Parity columns get freshly allocated buffers; data columns are carved
 * directly out of the caller's `data' buffer.  Returns NULL on allocation
 * failure.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;	/* starting sector */
	uint64_t s = size >> unit_shift;	/* I/O size in sectors */
	uint64_t f = b % dcols;			/* child of the first column */
	uint64_t o = (b / dcols) << unit_shift;	/* byte offset on each child */
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * q = full rows of data sectors, r = leftover data sectors,
	 * bc = number of "big" columns carrying one extra sector,
	 * tot = total sectors including parity.
	 */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrapping past the last child advances one sector row. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/*
	 * Allocate parity buffers; on failure unwind what was allocated
	 * so far (free(NULL) for the failed slot is harmless).
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	/*
	 * The first data column points at the caller's buffer; remaining
	 * data columns follow contiguously after it.
	 */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1321 
1322 static void
vdev_raidz_map_free(raidz_map_t * rm)1323 vdev_raidz_map_free(raidz_map_t *rm)
1324 {
1325 	int c;
1326 
1327 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1328 		free(rm->rm_col[c].rc_data);
1329 
1330 	free(rm);
1331 }
1332 
1333 static vdev_t *
vdev_child(vdev_t * pvd,uint64_t devidx)1334 vdev_child(vdev_t *pvd, uint64_t devidx)
1335 {
1336 	vdev_t *cvd;
1337 
1338 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1339 		if (cvd->v_id == devidx)
1340 			break;
1341 	}
1342 
1343 	return (cvd);
1344 }
1345 
/*
 * In the kernel this wrapper also records whether any injected errors
 * were seen so ereports can note it; the boot code keeps no such state
 * and simply verifies the checksum.
 */
1350 static int
raidz_checksum_verify(const spa_t * spa,const blkptr_t * bp,void * data,uint64_t size __unused)1351 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1352     uint64_t size __unused)
1353 {
1354 
1355 	return (zio_checksum_verify(spa, bp, data));
1356 }
1357 
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/*
	 * Save a copy of each parity column that was actually read, so we
	 * can compare against the regenerated parity below.  On allocation
	 * failure the slot is left NULL and counted as a mismatch rather
	 * than aborting the verification.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	/* Recompute the parity in place from the data columns. */
	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		/* A NULL saved copy (OOM above) also counts as a failure. */
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}
1399 
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * Returns the parity-usage code of a successful reconstruction, or
 * 0/ENOMEM if every combination failed or scratch space ran out.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset __unused, uint64_t bytes, int total_errors,
    int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tgts points one past tstore[0] so tgts[-1] is valid storage. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try ever-larger numbers of assumed-bad columns. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* Column 0 is the largest, so its size bounds every copy. */
		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		/* Enumerate every combination of n error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* Undo the final for-loop increment so `done' frees the right count. */
	n--;
done:
	/* free(NULL) on the failed-malloc slot is harmless. */
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
1555 
/*
 * Read a block from a raidz vdev into `data', reconstructing from parity
 * when children are missing or return bad data.  The read proceeds in up
 * to three phases (detailed below): use the columns read, re-read every
 * column and retry, then combinatorial reconstruction.  Returns 0 on
 * success, or ENOMEM/EIO/ECKSUM on failure.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/* Parity is only read up front if data is already missing. */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Re-tally the error counts; we may loop back here after re-reads. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1820