xref: /illumos-gate/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c (revision 199767f8)
1*199767f8SToomas Soome /*
2*199767f8SToomas Soome  * CDDL HEADER START
3*199767f8SToomas Soome  *
4*199767f8SToomas Soome  * The contents of this file are subject to the terms of the
5*199767f8SToomas Soome  * Common Development and Distribution License (the "License").
6*199767f8SToomas Soome  * You may not use this file except in compliance with the License.
7*199767f8SToomas Soome  *
8*199767f8SToomas Soome  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*199767f8SToomas Soome  * or http://www.opensolaris.org/os/licensing.
10*199767f8SToomas Soome  * See the License for the specific language governing permissions
11*199767f8SToomas Soome  * and limitations under the License.
12*199767f8SToomas Soome  *
13*199767f8SToomas Soome  * When distributing Covered Code, include this CDDL HEADER in each
14*199767f8SToomas Soome  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*199767f8SToomas Soome  * If applicable, add the following below this CDDL HEADER, with the
16*199767f8SToomas Soome  * fields enclosed by brackets "[]" replaced with your own identifying
17*199767f8SToomas Soome  * information: Portions Copyright [yyyy] [name of copyright owner]
18*199767f8SToomas Soome  *
19*199767f8SToomas Soome  * CDDL HEADER END
20*199767f8SToomas Soome  */
21*199767f8SToomas Soome /*
22*199767f8SToomas Soome  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23*199767f8SToomas Soome  * Use is subject to license terms.
24*199767f8SToomas Soome  */
25*199767f8SToomas Soome 
26*199767f8SToomas Soome #include <sys/cdefs.h>
27*199767f8SToomas Soome 
/*
 * CRC-64 lookup table, one entry per byte value.  It is filled in by
 * zfs_init_crc() and read by zap_hash().
 */
static uint64_t zfs_crc64_table[256];

/*
 * Locally defined error number.  It is not referenced in the visible part
 * of this file (presumably a "checksum error" code -- confirm at the use
 * sites).
 */
#define	ECKSUM	666

/*
 * Assertions are compiled out here: every ASSERT variant expands to a
 * no-op and its arguments are never evaluated.
 */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/*
 * panic() prints its printf-style message and then spins forever; control
 * never returns to the caller.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

/*
 * Map the kernel allocator names onto zfs_alloc()/zfs_free().  The
 * kmem_alloc() flag argument is discarded.
 */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
45*199767f8SToomas Soome 
46*199767f8SToomas Soome static void
47*199767f8SToomas Soome zfs_init_crc(void)
48*199767f8SToomas Soome {
49*199767f8SToomas Soome 	int i, j;
50*199767f8SToomas Soome 	uint64_t *ct;
51*199767f8SToomas Soome 
52*199767f8SToomas Soome 	/*
53*199767f8SToomas Soome 	 * Calculate the crc64 table (used for the zap hash
54*199767f8SToomas Soome 	 * function).
55*199767f8SToomas Soome 	 */
56*199767f8SToomas Soome 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
57*199767f8SToomas Soome 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
58*199767f8SToomas Soome 		for (i = 0; i < 256; i++)
59*199767f8SToomas Soome 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
60*199767f8SToomas Soome 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
61*199767f8SToomas Soome 	}
62*199767f8SToomas Soome }
63*199767f8SToomas Soome 
64*199767f8SToomas Soome static void
65*199767f8SToomas Soome zio_checksum_off(const void *buf, uint64_t size,
66*199767f8SToomas Soome     const void *ctx_template, zio_cksum_t *zcp)
67*199767f8SToomas Soome {
68*199767f8SToomas Soome 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
69*199767f8SToomas Soome }
70*199767f8SToomas Soome 
/*
 * Signature for checksum functions.
 *
 * A checksum function takes a data buffer and its size, an optional
 * context template, and a zio_cksum_t to receive the result.  The callers
 * in this file always pass NULL for ctx_template.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Creates a context template from a salt; returns an opaque pointer. */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
/* Releases a context template created by the matching init function. */
typedef void zio_checksum_tmpl_free_t(void *ctx_template);
78*199767f8SToomas Soome 
/*
 * Property bits of a checksum algorithm.  They are OR-ed together in the
 * ci_flags member of zio_checksum_info_t.  Bit 0 is not assigned.
 */
typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = 0x02,
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = 0x04,
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = 0x08,
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = 0x10,
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = 0x20
} zio_checksum_flags_t;
91*199767f8SToomas Soome 
92*199767f8SToomas Soome /*
93*199767f8SToomas Soome  * Information about each checksum function.
94*199767f8SToomas Soome  */
95*199767f8SToomas Soome typedef struct zio_checksum_info {
96*199767f8SToomas Soome 	/* checksum function for each byteorder */
97*199767f8SToomas Soome 	zio_checksum_t			*ci_func[2];
98*199767f8SToomas Soome 	zio_checksum_tmpl_init_t	*ci_tmpl_init;
99*199767f8SToomas Soome 	zio_checksum_tmpl_free_t	*ci_tmpl_free;
100*199767f8SToomas Soome 	zio_checksum_flags_t		ci_flags;
101*199767f8SToomas Soome 	const char			*ci_name;	/* descriptive name */
102*199767f8SToomas Soome } zio_checksum_info_t;
103*199767f8SToomas Soome 
104*199767f8SToomas Soome #include "blkptr.c"
105*199767f8SToomas Soome 
106*199767f8SToomas Soome #include "fletcher.c"
107*199767f8SToomas Soome #include "sha256.c"
108*199767f8SToomas Soome 
109*199767f8SToomas Soome static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
110*199767f8SToomas Soome 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
111*199767f8SToomas Soome 	{{NULL, NULL}, NULL, NULL, 0, "on"},
112*199767f8SToomas Soome 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
113*199767f8SToomas Soome 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
114*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
115*199767f8SToomas Soome 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
116*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
117*199767f8SToomas Soome 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
118*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
119*199767f8SToomas Soome 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
120*199767f8SToomas Soome 	    0, "fletcher2"},
121*199767f8SToomas Soome 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
122*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
123*199767f8SToomas Soome 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
124*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
125*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
126*199767f8SToomas Soome 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
127*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
128*199767f8SToomas Soome 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
129*199767f8SToomas Soome 	    0, "noparity"},
130*199767f8SToomas Soome 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
131*199767f8SToomas Soome 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
132*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
133*199767f8SToomas Soome 	/* no skein and edonr for now */
134*199767f8SToomas Soome 	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
135*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED |
136*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_NOPWRITE, "skein"},
137*199767f8SToomas Soome 	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
138*199767f8SToomas Soome 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
139*199767f8SToomas Soome };
140*199767f8SToomas Soome 
/*
 * Common signature for all zio compress/decompress functions.
 *
 * The arguments are the source and destination buffers, their lengths,
 * and a trailing int; zio_decompress_data() passes the table entry's
 * ci_level as that last argument.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/* The gzip decompressor is defined outside this file. */
extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
151*199767f8SToomas Soome /*
152*199767f8SToomas Soome  * Information about each compression function.
153*199767f8SToomas Soome  */
154*199767f8SToomas Soome typedef struct zio_compress_info {
155*199767f8SToomas Soome 	zio_compress_func_t	*ci_compress;	/* compression function */
156*199767f8SToomas Soome 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
157*199767f8SToomas Soome 	int			ci_level;	/* level parameter */
158*199767f8SToomas Soome 	const char		*ci_name;	/* algorithm name */
159*199767f8SToomas Soome } zio_compress_info_t;
160*199767f8SToomas Soome 
161*199767f8SToomas Soome #include "lzjb.c"
162*199767f8SToomas Soome #include "zle.c"
163*199767f8SToomas Soome #include "lz4.c"
164*199767f8SToomas Soome 
165*199767f8SToomas Soome /*
166*199767f8SToomas Soome  * Compression vectors.
167*199767f8SToomas Soome  */
168*199767f8SToomas Soome static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
169*199767f8SToomas Soome 	{NULL,			NULL,			0,	"inherit"},
170*199767f8SToomas Soome 	{NULL,			NULL,			0,	"on"},
171*199767f8SToomas Soome 	{NULL,			NULL,			0,	"uncompressed"},
172*199767f8SToomas Soome 	{NULL,			lzjb_decompress,	0,	"lzjb"},
173*199767f8SToomas Soome 	{NULL,			NULL,			0,	"empty"},
174*199767f8SToomas Soome 	{NULL,			gzip_decompress,	1,	"gzip-1"},
175*199767f8SToomas Soome 	{NULL,			gzip_decompress,	2,	"gzip-2"},
176*199767f8SToomas Soome 	{NULL,			gzip_decompress,	3,	"gzip-3"},
177*199767f8SToomas Soome 	{NULL,			gzip_decompress,	4,	"gzip-4"},
178*199767f8SToomas Soome 	{NULL,			gzip_decompress,	5,	"gzip-5"},
179*199767f8SToomas Soome 	{NULL,			gzip_decompress,	6,	"gzip-6"},
180*199767f8SToomas Soome 	{NULL,			gzip_decompress,	7,	"gzip-7"},
181*199767f8SToomas Soome 	{NULL,			gzip_decompress,	8,	"gzip-8"},
182*199767f8SToomas Soome 	{NULL,			gzip_decompress,	9,	"gzip-9"},
183*199767f8SToomas Soome 	{NULL,			zle_decompress,		64,	"zle"},
184*199767f8SToomas Soome 	{NULL,			lz4_decompress,		0,	"lz4"},
185*199767f8SToomas Soome };
186*199767f8SToomas Soome 
/*
 * Byte-swap, in place, every 64-bit word of a buffer.
 *
 * vbuf: the buffer, accessed as an array of uint64_t.
 * size: the buffer length in bytes.  It is expected to be a multiple of 8;
 *       any bytes beyond the last whole word are left untouched.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* same type as count: no signed/unsigned mismatch */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
199*199767f8SToomas Soome 
200*199767f8SToomas Soome /*
201*199767f8SToomas Soome  * Set the external verifier for a gang block based on <vdev, offset, txg>,
202*199767f8SToomas Soome  * a tuple which is guaranteed to be unique for the life of the pool.
203*199767f8SToomas Soome  */
204*199767f8SToomas Soome static void
205*199767f8SToomas Soome zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
206*199767f8SToomas Soome {
207*199767f8SToomas Soome 	const dva_t *dva = BP_IDENTITY(bp);
208*199767f8SToomas Soome 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
209*199767f8SToomas Soome 
210*199767f8SToomas Soome 	ASSERT(BP_IS_GANG(bp));
211*199767f8SToomas Soome 
212*199767f8SToomas Soome 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
213*199767f8SToomas Soome }
214*199767f8SToomas Soome 
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 *
 * The verifier stored in *zcp is <offset, 0, 0, 0>.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
225*199767f8SToomas Soome 
/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 *
 * As compiled, this function has no effect.  It returns early when the
 * checksum has no ci_tmpl_init hook, and the code that would create and
 * install a template is disabled with #if 0.
 *
 * The checksum argument indexes zio_checksum_table without a range check;
 * the caller is expected to have validated it.
 *
 * NOTE(review): the disabled code dereferences spa, while
 * zio_checksum_verify() passes NULL.  That call must change before this
 * code is enabled.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, const spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;
#if 0	/* for now we don't have anything here */
	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	VERIFY(ci->ci_tmpl_free != NULL);
	mutex_enter(&spa->spa_cksum_tmpls_lock);
	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
	}
	mutex_exit(&spa->spa_cksum_tmpls_lock);
#endif
}
251*199767f8SToomas Soome 
/*
 * Verify the checksum of the block described by bp against its contents.
 *
 * bp:   block pointer supplying the checksum id, the physical size and,
 *       for non-embedded checksums, the expected checksum.
 * data: the block contents, BP_GET_PSIZE(bp) bytes.  The buffer must be
 *       writable: for embedded checksums the trailing zio_eck_t is
 *       temporarily modified and then restored.
 *
 * Returns 0 when the checksum matches, EINVAL when the checksum id is out
 * of range or has no checksum functions in zio_checksum_table, and EIO on
 * a mismatch.
 */
static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	/* Reject ids outside the table and entries without functions. */
	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	zio_checksum_template_init(checksum, NULL);
	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		/*
		 * ASSERT is compiled out in this file.  Other table entries
		 * flagged EMBEDDED reach the final else branch below and use
		 * bp->blk_cksum as the verifier.
		 */
		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* The embedded checksum is the last zio_eck_t of the block. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* A byteswapped magic means the block has the other byteorder. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * The checksum is computed over the block with the verifier
		 * stored in place of the embedded checksum.  Save the stored
		 * value, substitute the verifier, compute, then restore the
		 * block.  The order of these four statements matters.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, NULL, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Non-embedded: always uses the native-byteorder function. */
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, NULL, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}
312*199767f8SToomas Soome 
313*199767f8SToomas Soome static int
314*199767f8SToomas Soome zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
315*199767f8SToomas Soome 	void *dest, uint64_t destsize)
316*199767f8SToomas Soome {
317*199767f8SToomas Soome 	zio_compress_info_t *ci;
318*199767f8SToomas Soome 
319*199767f8SToomas Soome 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
320*199767f8SToomas Soome 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
321*199767f8SToomas Soome 		return (EIO);
322*199767f8SToomas Soome 	}
323*199767f8SToomas Soome 
324*199767f8SToomas Soome 	ci = &zio_compress_table[cpfunc];
325*199767f8SToomas Soome 	if (!ci->ci_decompress) {
326*199767f8SToomas Soome 		printf("ZFS: unsupported compression algorithm %s\n",
327*199767f8SToomas Soome 		    ci->ci_name);
328*199767f8SToomas Soome 		return (EIO);
329*199767f8SToomas Soome 	}
330*199767f8SToomas Soome 
331*199767f8SToomas Soome 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
332*199767f8SToomas Soome }
333*199767f8SToomas Soome 
334*199767f8SToomas Soome static uint64_t
335*199767f8SToomas Soome zap_hash(uint64_t salt, const char *name)
336*199767f8SToomas Soome {
337*199767f8SToomas Soome 	const uint8_t *cp;
338*199767f8SToomas Soome 	uint8_t c;
339*199767f8SToomas Soome 	uint64_t crc = salt;
340*199767f8SToomas Soome 
341*199767f8SToomas Soome 	ASSERT(crc != 0);
342*199767f8SToomas Soome 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
343*199767f8SToomas Soome 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
344*199767f8SToomas Soome 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
345*199767f8SToomas Soome 
346*199767f8SToomas Soome 	/*
347*199767f8SToomas Soome 	 * Only use 28 bits, since we need 4 bits in the cookie for the
348*199767f8SToomas Soome 	 * collision differentiator.  We MUST use the high bits, since
349*199767f8SToomas Soome 	 * those are the onces that we first pay attention to when
350*199767f8SToomas Soome 	 * chosing the bucket.
351*199767f8SToomas Soome 	 */
352*199767f8SToomas Soome 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
353*199767f8SToomas Soome 
354*199767f8SToomas Soome 	return (crc);
355*199767f8SToomas Soome }
356*199767f8SToomas Soome 
/*
 * Forward declarations of the allocator routines that the kmem_alloc()
 * and kmem_free() macros expand to.
 */
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);
359*199767f8SToomas Soome 
/*
 * One column of a RAID-Z I/O.
 */
struct raidz_col {
	/* child device index for I/O */
	uint64_t rc_devidx;
	/* device offset */
	uint64_t rc_offset;
	/* I/O size */
	uint64_t rc_size;
	/* I/O data */
	void *rc_data;
	/* I/O error for this device */
	int rc_error;
	/* Did we attempt this I/O column? */
	uint8_t rc_tried;
	/* Did we skip this I/O column? */
	uint8_t rc_skipped;
};
typedef struct raidz_col raidz_col_t;
369*199767f8SToomas Soome 
370*199767f8SToomas Soome typedef struct raidz_map {
371*199767f8SToomas Soome 	uint64_t rm_cols;		/* Regular column count */
372*199767f8SToomas Soome 	uint64_t rm_scols;		/* Count including skipped columns */
373*199767f8SToomas Soome 	uint64_t rm_bigcols;		/* Number of oversized columns */
374*199767f8SToomas Soome 	uint64_t rm_asize;		/* Actual total I/O size */
375*199767f8SToomas Soome 	uint64_t rm_missingdata;	/* Count of missing data devices */
376*199767f8SToomas Soome 	uint64_t rm_missingparity;	/* Count of missing parity devices */
377*199767f8SToomas Soome 	uint64_t rm_firstdatacol;	/* First data column/parity count */
378*199767f8SToomas Soome 	uint64_t rm_nskip;		/* Skipped sectors for padding */
379*199767f8SToomas Soome 	uint64_t rm_skipstart;		/* Column index of padding start */
380*199767f8SToomas Soome 	uintptr_t rm_reports;		/* # of referencing checksum reports */
381*199767f8SToomas Soome 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
382*199767f8SToomas Soome 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
383*199767f8SToomas Soome 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
384*199767f8SToomas Soome } raidz_map_t;
385*199767f8SToomas Soome 
/* Indices of the parity columns within raidz_map_t.rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a byte by 2 (and by 4) in the Galois field: shift left by one
 * and XOR in 0x1d when the top bit was set.  The result is not truncated
 * to 8 bits; the caller is expected to store it in a uint8_t.  The
 * argument is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) != 0 ? 0x1d : 0) ^ ((x) << 1))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2((VDEV_RAIDZ_MUL_2(x))))
392*199767f8SToomas Soome 
/*
 * Field multiplication on a 64-bit value, treating it as eight independent
 * bytes, all at once rather than a byte at a time.  A mask is built from
 * the top bit of each byte (0xff for bytes whose top bit is set, 0x00
 * otherwise) and used to conditionally apply the XOR of 0x1d.
 *
 * Both macros expand to a brace-enclosed block, update x in place, use
 * mask as scratch storage, and evaluate their arguments more than once.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = ((mask) & 0x1d1d1d1d1d1d1d1dULL) ^ \
	    (((x) << 1) & 0xfefefefefefefefeULL); \
}

/* Multiply by 4: two successive multiplications by 2. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
}
412*199767f8SToomas Soome 
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] is 2 raised to the power i.  Entries 0 and 255 are
 * both 0x01, so an exponent of 255 is a valid index.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Base-2 logarithms in the same field, indexed by the value whose log is
 * wanted.  Entry 0 is a placeholder; vdev_raidz_exp2() returns early for a
 * zero operand and does not look it up.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
485*199767f8SToomas Soome 
486*199767f8SToomas Soome /*
487*199767f8SToomas Soome  * Multiply a given number by 2 raised to the given power.
488*199767f8SToomas Soome  */
489*199767f8SToomas Soome static uint8_t
490*199767f8SToomas Soome vdev_raidz_exp2(uint8_t a, int exp)
491*199767f8SToomas Soome {
492*199767f8SToomas Soome 	if (a == 0)
493*199767f8SToomas Soome 		return (0);
494*199767f8SToomas Soome 
495*199767f8SToomas Soome 	ASSERT(exp >= 0);
496*199767f8SToomas Soome 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
497*199767f8SToomas Soome 
498*199767f8SToomas Soome 	exp += vdev_raidz_log2[a];
499*199767f8SToomas Soome 	if (exp > 255)
500*199767f8SToomas Soome 		exp -= 255;
501*199767f8SToomas Soome 
502*199767f8SToomas Soome 	return (vdev_raidz_pow2[exp]);
503*199767f8SToomas Soome }
504*199767f8SToomas Soome 
505*199767f8SToomas Soome static void
506*199767f8SToomas Soome vdev_raidz_generate_parity_p(raidz_map_t *rm)
507*199767f8SToomas Soome {
508*199767f8SToomas Soome 	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
509*199767f8SToomas Soome 	int c;
510*199767f8SToomas Soome 
511*199767f8SToomas Soome 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
512*199767f8SToomas Soome 
513*199767f8SToomas Soome 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
514*199767f8SToomas Soome 		src = rm->rm_col[c].rc_data;
515*199767f8SToomas Soome 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
516*199767f8SToomas Soome 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
517*199767f8SToomas Soome 
518*199767f8SToomas Soome 		if (c == rm->rm_firstdatacol) {
519*199767f8SToomas Soome 			ASSERT(ccount == pcount);
520*199767f8SToomas Soome 			for (i = 0; i < ccount; i++, src++, p++) {
521*199767f8SToomas Soome 				*p = *src;
522*199767f8SToomas Soome 			}
523*199767f8SToomas Soome 		} else {
524*199767f8SToomas Soome 			ASSERT(ccount <= pcount);
525*199767f8SToomas Soome 			for (i = 0; i < ccount; i++, src++, p++) {
526*199767f8SToomas Soome 				*p ^= *src;
527*199767f8SToomas Soome 			}
528*199767f8SToomas Soome 		}
529*199767f8SToomas Soome 	}
530*199767f8SToomas Soome }
531*199767f8SToomas Soome 
532*199767f8SToomas Soome static void
533*199767f8SToomas Soome vdev_raidz_generate_parity_pq(raidz_map_t *rm)
534*199767f8SToomas Soome {
535*199767f8SToomas Soome 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
536*199767f8SToomas Soome 	int c;
537*199767f8SToomas Soome 
538*199767f8SToomas Soome 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
539*199767f8SToomas Soome 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
540*199767f8SToomas Soome 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
541*199767f8SToomas Soome 
542*199767f8SToomas Soome 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
543*199767f8SToomas Soome 		src = rm->rm_col[c].rc_data;
544*199767f8SToomas Soome 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
545*199767f8SToomas Soome 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
546*199767f8SToomas Soome 
547*199767f8SToomas Soome 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
548*199767f8SToomas Soome 
549*199767f8SToomas Soome 		if (c == rm->rm_firstdatacol) {
550*199767f8SToomas Soome 			ASSERT(ccnt == pcnt || ccnt == 0);
551*199767f8SToomas Soome 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
552*199767f8SToomas Soome 				*p = *src;
553*199767f8SToomas Soome 				*q = *src;
554*199767f8SToomas Soome 			}
555*199767f8SToomas Soome 			for (; i < pcnt; i++, src++, p++, q++) {
556*199767f8SToomas Soome 				*p = 0;
557*199767f8SToomas Soome 				*q = 0;
558*199767f8SToomas Soome 			}
559*199767f8SToomas Soome 		} else {
560*199767f8SToomas Soome 			ASSERT(ccnt <= pcnt);
561*199767f8SToomas Soome 
562*199767f8SToomas Soome 			/*
563*199767f8SToomas Soome 			 * Apply the algorithm described above by multiplying
564*199767f8SToomas Soome 			 * the previous result and adding in the new value.
565*199767f8SToomas Soome 			 */
566*199767f8SToomas Soome 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
567*199767f8SToomas Soome 				*p ^= *src;
568*199767f8SToomas Soome 
569*199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_2(*q, mask);
570*199767f8SToomas Soome 				*q ^= *src;
571*199767f8SToomas Soome 			}
572*199767f8SToomas Soome 
573*199767f8SToomas Soome 			/*
574*199767f8SToomas Soome 			 * Treat short columns as though they are full of 0s.
575*199767f8SToomas Soome 			 * Note that there's therefore nothing needed for P.
576*199767f8SToomas Soome 			 */
577*199767f8SToomas Soome 			for (; i < pcnt; i++, q++) {
578*199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_2(*q, mask);
579*199767f8SToomas Soome 			}
580*199767f8SToomas Soome 		}
581*199767f8SToomas Soome 	}
582*199767f8SToomas Soome }
583*199767f8SToomas Soome 
584*199767f8SToomas Soome static void
585*199767f8SToomas Soome vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
586*199767f8SToomas Soome {
587*199767f8SToomas Soome 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
588*199767f8SToomas Soome 	int c;
589*199767f8SToomas Soome 
590*199767f8SToomas Soome 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
591*199767f8SToomas Soome 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
592*199767f8SToomas Soome 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
593*199767f8SToomas Soome 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
594*199767f8SToomas Soome 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
595*199767f8SToomas Soome 
596*199767f8SToomas Soome 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
597*199767f8SToomas Soome 		src = rm->rm_col[c].rc_data;
598*199767f8SToomas Soome 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
599*199767f8SToomas Soome 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
600*199767f8SToomas Soome 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
601*199767f8SToomas Soome 
602*199767f8SToomas Soome 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
603*199767f8SToomas Soome 
604*199767f8SToomas Soome 		if (c == rm->rm_firstdatacol) {
605*199767f8SToomas Soome 			ASSERT(ccnt == pcnt || ccnt == 0);
606*199767f8SToomas Soome 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
607*199767f8SToomas Soome 				*p = *src;
608*199767f8SToomas Soome 				*q = *src;
609*199767f8SToomas Soome 				*r = *src;
610*199767f8SToomas Soome 			}
611*199767f8SToomas Soome 			for (; i < pcnt; i++, src++, p++, q++, r++) {
612*199767f8SToomas Soome 				*p = 0;
613*199767f8SToomas Soome 				*q = 0;
614*199767f8SToomas Soome 				*r = 0;
615*199767f8SToomas Soome 			}
616*199767f8SToomas Soome 		} else {
617*199767f8SToomas Soome 			ASSERT(ccnt <= pcnt);
618*199767f8SToomas Soome 
619*199767f8SToomas Soome 			/*
620*199767f8SToomas Soome 			 * Apply the algorithm described above by multiplying
621*199767f8SToomas Soome 			 * the previous result and adding in the new value.
622*199767f8SToomas Soome 			 */
623*199767f8SToomas Soome 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
624*199767f8SToomas Soome 				*p ^= *src;
625*199767f8SToomas Soome 
626*199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_2(*q, mask);
627*199767f8SToomas Soome 				*q ^= *src;
628*199767f8SToomas Soome 
629*199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_4(*r, mask);
630*199767f8SToomas Soome 				*r ^= *src;
631*199767f8SToomas Soome 			}
632*199767f8SToomas Soome 
633*199767f8SToomas Soome 			/*
634*199767f8SToomas Soome 			 * Treat short columns as though they are full of 0s.
635*199767f8SToomas Soome 			 * Note that there's therefore nothing needed for P.
636*199767f8SToomas Soome 			 */
637*199767f8SToomas Soome 			for (; i < pcnt; i++, q++, r++) {
638*199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_2(*q, mask);
639*199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_4(*r, mask);
640*199767f8SToomas Soome 			}
641*199767f8SToomas Soome 		}
642*199767f8SToomas Soome 	}
643*199767f8SToomas Soome }
644*199767f8SToomas Soome 
645*199767f8SToomas Soome /*
646*199767f8SToomas Soome  * Generate RAID parity in the first virtual columns according to the number of
647*199767f8SToomas Soome  * parity columns available.
648*199767f8SToomas Soome  */
649*199767f8SToomas Soome static void
650*199767f8SToomas Soome vdev_raidz_generate_parity(raidz_map_t *rm)
651*199767f8SToomas Soome {
652*199767f8SToomas Soome 	switch (rm->rm_firstdatacol) {
653*199767f8SToomas Soome 	case 1:
654*199767f8SToomas Soome 		vdev_raidz_generate_parity_p(rm);
655*199767f8SToomas Soome 		break;
656*199767f8SToomas Soome 	case 2:
657*199767f8SToomas Soome 		vdev_raidz_generate_parity_pq(rm);
658*199767f8SToomas Soome 		break;
659*199767f8SToomas Soome 	case 3:
660*199767f8SToomas Soome 		vdev_raidz_generate_parity_pqr(rm);
661*199767f8SToomas Soome 		break;
662*199767f8SToomas Soome 	default:
663*199767f8SToomas Soome 		panic("invalid RAID-Z configuration");
664*199767f8SToomas Soome 	}
665*199767f8SToomas Soome }
666*199767f8SToomas Soome 
667*199767f8SToomas Soome /* BEGIN CSTYLED */
668*199767f8SToomas Soome /*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
674*199767f8SToomas Soome  *
675*199767f8SToomas Soome  *            __   __                     __     __
676*199767f8SToomas Soome  *            |     |         __     __   |  p_0  |
677*199767f8SToomas Soome  *            |  V  |         |  D_0  |   | p_m-1 |
678*199767f8SToomas Soome  *            |     |    x    |   :   | = |  d_0  |
679*199767f8SToomas Soome  *            |  I  |         | D_n-1 |   |   :   |
680*199767f8SToomas Soome  *            |     |         ~~     ~~   | d_n-1 |
681*199767f8SToomas Soome  *            ~~   ~~                     ~~     ~~
682*199767f8SToomas Soome  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
687*199767f8SToomas Soome  *
688*199767f8SToomas Soome  *      __               __               __     __
689*199767f8SToomas Soome  *      |   1   ..  1 1 1 |               |  p_0  |
690*199767f8SToomas Soome  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
691*199767f8SToomas Soome  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
692*199767f8SToomas Soome  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
693*199767f8SToomas Soome  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
694*199767f8SToomas Soome  *      |   :       : : : |   |   :   |   |  d_2  |
695*199767f8SToomas Soome  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
696*199767f8SToomas Soome  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
697*199767f8SToomas Soome  *      |   0   ..  0 0 1 |               | d_n-1 |
698*199767f8SToomas Soome  *      ~~               ~~               ~~     ~~
699*199767f8SToomas Soome  *
700*199767f8SToomas Soome  * Note that I, V, d, and p are known. To compute D, we must invert the
701*199767f8SToomas Soome  * matrix and use the known data and parity values to reconstruct the unknown
702*199767f8SToomas Soome  * data values. We begin by removing the rows in V|I and d|p that correspond
703*199767f8SToomas Soome  * to failed or missing columns; we then make V|I square (n x n) and d|p
704*199767f8SToomas Soome  * sized n by removing rows corresponding to unused parity from the bottom up
705*199767f8SToomas Soome  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
706*199767f8SToomas Soome  * using Gauss-Jordan elimination. In the example below we use m=3 parity
707*199767f8SToomas Soome  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
708*199767f8SToomas Soome  *           __                               __
709*199767f8SToomas Soome  *           |  1   1   1   1   1   1   1   1  |
710*199767f8SToomas Soome  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
711*199767f8SToomas Soome  *           |  19 205 116  29  64  16  4   1  |      / /
712*199767f8SToomas Soome  *           |  1   0   0   0   0   0   0   0  |     / /
713*199767f8SToomas Soome  *           |  0   1   0   0   0   0   0   0  | <--' /
714*199767f8SToomas Soome  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
715*199767f8SToomas Soome  *           |  0   0   0   1   0   0   0   0  |
716*199767f8SToomas Soome  *           |  0   0   0   0   1   0   0   0  |
717*199767f8SToomas Soome  *           |  0   0   0   0   0   1   0   0  |
718*199767f8SToomas Soome  *           |  0   0   0   0   0   0   1   0  |
719*199767f8SToomas Soome  *           |  0   0   0   0   0   0   0   1  |
720*199767f8SToomas Soome  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
734*199767f8SToomas Soome  *
735*199767f8SToomas Soome  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
736*199767f8SToomas Soome  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
737*199767f8SToomas Soome  * matrix is not singular.
738*199767f8SToomas Soome  * __                                                                 __
739*199767f8SToomas Soome  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
740*199767f8SToomas Soome  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
741*199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
742*199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
743*199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
744*199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
745*199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
746*199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
747*199767f8SToomas Soome  * ~~                                                                 ~~
748*199767f8SToomas Soome  * __                                                                 __
749*199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
750*199767f8SToomas Soome  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
751*199767f8SToomas Soome  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
752*199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
753*199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
754*199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
755*199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
756*199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
757*199767f8SToomas Soome  * ~~                                                                 ~~
758*199767f8SToomas Soome  * __                                                                 __
759*199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
760*199767f8SToomas Soome  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
761*199767f8SToomas Soome  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
762*199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
763*199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
764*199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
765*199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
766*199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
767*199767f8SToomas Soome  * ~~                                                                 ~~
768*199767f8SToomas Soome  * __                                                                 __
769*199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
770*199767f8SToomas Soome  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
771*199767f8SToomas Soome  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
772*199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
773*199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
774*199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
775*199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
776*199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
777*199767f8SToomas Soome  * ~~                                                                 ~~
778*199767f8SToomas Soome  * __                                                                 __
779*199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
780*199767f8SToomas Soome  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
781*199767f8SToomas Soome  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
782*199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
783*199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
784*199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
785*199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
786*199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
787*199767f8SToomas Soome  * ~~                                                                 ~~
788*199767f8SToomas Soome  * __                                                                 __
789*199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
790*199767f8SToomas Soome  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
791*199767f8SToomas Soome  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
792*199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
793*199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
794*199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
795*199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
796*199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
797*199767f8SToomas Soome  * ~~                                                                 ~~
798*199767f8SToomas Soome  *                   __                               __
799*199767f8SToomas Soome  *                   |  0   0   1   0   0   0   0   0  |
800*199767f8SToomas Soome  *                   | 167 100  5   41 159 169 217 208 |
801*199767f8SToomas Soome  *                   | 166 100  4   40 158 168 216 209 |
802*199767f8SToomas Soome  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
803*199767f8SToomas Soome  *                   |  0   0   0   0   1   0   0   0  |
804*199767f8SToomas Soome  *                   |  0   0   0   0   0   1   0   0  |
805*199767f8SToomas Soome  *                   |  0   0   0   0   0   0   1   0  |
806*199767f8SToomas Soome  *                   |  0   0   0   0   0   0   0   1  |
807*199767f8SToomas Soome  *                   ~~                               ~~
808*199767f8SToomas Soome  *
809*199767f8SToomas Soome  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
810*199767f8SToomas Soome  * of the missing data.
811*199767f8SToomas Soome  *
812*199767f8SToomas Soome  * As is apparent from the example above, the only non-trivial rows in the
813*199767f8SToomas Soome  * inverse matrix correspond to the data disks that we're trying to
814*199767f8SToomas Soome  * reconstruct. Indeed, those are the only rows we need as the others would
815*199767f8SToomas Soome  * only be useful for reconstructing data known or assumed to be valid. For
816*199767f8SToomas Soome  * that reason, we only build the coefficients in the rows that correspond to
817*199767f8SToomas Soome  * targeted columns.
818*199767f8SToomas Soome  */
819*199767f8SToomas Soome /* END CSTYLED */
820*199767f8SToomas Soome 
821*199767f8SToomas Soome static void
822*199767f8SToomas Soome vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
823*199767f8SToomas Soome     uint8_t **rows)
824*199767f8SToomas Soome {
825*199767f8SToomas Soome 	int i, j;
826*199767f8SToomas Soome 	int pow;
827*199767f8SToomas Soome 
828*199767f8SToomas Soome 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
829*199767f8SToomas Soome 
830*199767f8SToomas Soome 	/*
831*199767f8SToomas Soome 	 * Fill in the missing rows of interest.
832*199767f8SToomas Soome 	 */
833*199767f8SToomas Soome 	for (i = 0; i < nmap; i++) {
834*199767f8SToomas Soome 		ASSERT3S(0, <=, map[i]);
835*199767f8SToomas Soome 		ASSERT3S(map[i], <=, 2);
836*199767f8SToomas Soome 
837*199767f8SToomas Soome 		pow = map[i] * n;
838*199767f8SToomas Soome 		if (pow > 255)
839*199767f8SToomas Soome 			pow -= 255;
840*199767f8SToomas Soome 		ASSERT(pow <= 255);
841*199767f8SToomas Soome 
842*199767f8SToomas Soome 		for (j = 0; j < n; j++) {
843*199767f8SToomas Soome 			pow -= map[i];
844*199767f8SToomas Soome 			if (pow < 0)
845*199767f8SToomas Soome 				pow += 255;
846*199767f8SToomas Soome 			rows[i][j] = vdev_raidz_pow2[pow];
847*199767f8SToomas Soome 		}
848*199767f8SToomas Soome 	}
849*199767f8SToomas Soome }
850*199767f8SToomas Soome 
851*199767f8SToomas Soome static void
852*199767f8SToomas Soome vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
853*199767f8SToomas Soome     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
854*199767f8SToomas Soome {
855*199767f8SToomas Soome 	int i, j, ii, jj;
856*199767f8SToomas Soome 	uint8_t log;
857*199767f8SToomas Soome 
858*199767f8SToomas Soome 	/*
859*199767f8SToomas Soome 	 * Assert that the first nmissing entries from the array of used
860*199767f8SToomas Soome 	 * columns correspond to parity columns and that subsequent entries
861*199767f8SToomas Soome 	 * correspond to data columns.
862*199767f8SToomas Soome 	 */
863*199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
864*199767f8SToomas Soome 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
865*199767f8SToomas Soome 	}
866*199767f8SToomas Soome 	for (; i < n; i++) {
867*199767f8SToomas Soome 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
868*199767f8SToomas Soome 	}
869*199767f8SToomas Soome 
870*199767f8SToomas Soome 	/*
871*199767f8SToomas Soome 	 * First initialize the storage where we'll compute the inverse rows.
872*199767f8SToomas Soome 	 */
873*199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
874*199767f8SToomas Soome 		for (j = 0; j < n; j++) {
875*199767f8SToomas Soome 			invrows[i][j] = (i == j) ? 1 : 0;
876*199767f8SToomas Soome 		}
877*199767f8SToomas Soome 	}
878*199767f8SToomas Soome 
879*199767f8SToomas Soome 	/*
880*199767f8SToomas Soome 	 * Subtract all trivial rows from the rows of consequence.
881*199767f8SToomas Soome 	 */
882*199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
883*199767f8SToomas Soome 		for (j = nmissing; j < n; j++) {
884*199767f8SToomas Soome 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
885*199767f8SToomas Soome 			jj = used[j] - rm->rm_firstdatacol;
886*199767f8SToomas Soome 			ASSERT3S(jj, <, n);
887*199767f8SToomas Soome 			invrows[i][j] = rows[i][jj];
888*199767f8SToomas Soome 			rows[i][jj] = 0;
889*199767f8SToomas Soome 		}
890*199767f8SToomas Soome 	}
891*199767f8SToomas Soome 
892*199767f8SToomas Soome 	/*
893*199767f8SToomas Soome 	 * For each of the rows of interest, we must normalize it and subtract
894*199767f8SToomas Soome 	 * a multiple of it from the other rows.
895*199767f8SToomas Soome 	 */
896*199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
897*199767f8SToomas Soome 		for (j = 0; j < missing[i]; j++) {
898*199767f8SToomas Soome 			ASSERT3U(rows[i][j], ==, 0);
899*199767f8SToomas Soome 		}
900*199767f8SToomas Soome 		ASSERT3U(rows[i][missing[i]], !=, 0);
901*199767f8SToomas Soome 
902*199767f8SToomas Soome 		/*
903*199767f8SToomas Soome 		 * Compute the inverse of the first element and multiply each
904*199767f8SToomas Soome 		 * element in the row by that value.
905*199767f8SToomas Soome 		 */
906*199767f8SToomas Soome 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
907*199767f8SToomas Soome 
908*199767f8SToomas Soome 		for (j = 0; j < n; j++) {
909*199767f8SToomas Soome 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
910*199767f8SToomas Soome 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
911*199767f8SToomas Soome 		}
912*199767f8SToomas Soome 
913*199767f8SToomas Soome 		for (ii = 0; ii < nmissing; ii++) {
914*199767f8SToomas Soome 			if (i == ii)
915*199767f8SToomas Soome 				continue;
916*199767f8SToomas Soome 
917*199767f8SToomas Soome 			ASSERT3U(rows[ii][missing[i]], !=, 0);
918*199767f8SToomas Soome 
919*199767f8SToomas Soome 			log = vdev_raidz_log2[rows[ii][missing[i]]];
920*199767f8SToomas Soome 
921*199767f8SToomas Soome 			for (j = 0; j < n; j++) {
922*199767f8SToomas Soome 				rows[ii][j] ^=
923*199767f8SToomas Soome 				    vdev_raidz_exp2(rows[i][j], log);
924*199767f8SToomas Soome 				invrows[ii][j] ^=
925*199767f8SToomas Soome 				    vdev_raidz_exp2(invrows[i][j], log);
926*199767f8SToomas Soome 			}
927*199767f8SToomas Soome 		}
928*199767f8SToomas Soome 	}
929*199767f8SToomas Soome 
930*199767f8SToomas Soome 	/*
931*199767f8SToomas Soome 	 * Verify that the data that is left in the rows are properly part of
932*199767f8SToomas Soome 	 * an identity matrix.
933*199767f8SToomas Soome 	 */
934*199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
935*199767f8SToomas Soome 		for (j = 0; j < n; j++) {
936*199767f8SToomas Soome 			if (j == missing[i]) {
937*199767f8SToomas Soome 				ASSERT3U(rows[i][j], ==, 1);
938*199767f8SToomas Soome 			} else {
939*199767f8SToomas Soome 				ASSERT3U(rows[i][j], ==, 0);
940*199767f8SToomas Soome 			}
941*199767f8SToomas Soome 		}
942*199767f8SToomas Soome 	}
943*199767f8SToomas Soome }
944*199767f8SToomas Soome 
945*199767f8SToomas Soome static void
946*199767f8SToomas Soome vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
947*199767f8SToomas Soome     int *missing, uint8_t **invrows, const uint8_t *used)
948*199767f8SToomas Soome {
949*199767f8SToomas Soome 	int i, j, x, cc, c;
950*199767f8SToomas Soome 	uint8_t *src;
951*199767f8SToomas Soome 	uint64_t ccount;
952*199767f8SToomas Soome 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
953*199767f8SToomas Soome 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
954*199767f8SToomas Soome 	uint8_t log, val;
955*199767f8SToomas Soome 	int ll;
956*199767f8SToomas Soome 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
957*199767f8SToomas Soome 	uint8_t *p, *pp;
958*199767f8SToomas Soome 	size_t psize;
959*199767f8SToomas Soome 
960*199767f8SToomas Soome 	log = 0;	/* gcc */
961*199767f8SToomas Soome 	psize = sizeof (invlog[0][0]) * n * nmissing;
962*199767f8SToomas Soome 	p = zfs_alloc(psize);
963*199767f8SToomas Soome 
964*199767f8SToomas Soome 	for (pp = p, i = 0; i < nmissing; i++) {
965*199767f8SToomas Soome 		invlog[i] = pp;
966*199767f8SToomas Soome 		pp += n;
967*199767f8SToomas Soome 	}
968*199767f8SToomas Soome 
969*199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
970*199767f8SToomas Soome 		for (j = 0; j < n; j++) {
971*199767f8SToomas Soome 			ASSERT3U(invrows[i][j], !=, 0);
972*199767f8SToomas Soome 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
973*199767f8SToomas Soome 		}
974*199767f8SToomas Soome 	}
975*199767f8SToomas Soome 
976*199767f8SToomas Soome 	for (i = 0; i < n; i++) {
977*199767f8SToomas Soome 		c = used[i];
978*199767f8SToomas Soome 		ASSERT3U(c, <, rm->rm_cols);
979*199767f8SToomas Soome 
980*199767f8SToomas Soome 		src = rm->rm_col[c].rc_data;
981*199767f8SToomas Soome 		ccount = rm->rm_col[c].rc_size;
982*199767f8SToomas Soome 		for (j = 0; j < nmissing; j++) {
983*199767f8SToomas Soome 			cc = missing[j] + rm->rm_firstdatacol;
984*199767f8SToomas Soome 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
985*199767f8SToomas Soome 			ASSERT3U(cc, <, rm->rm_cols);
986*199767f8SToomas Soome 			ASSERT3U(cc, !=, c);
987*199767f8SToomas Soome 
988*199767f8SToomas Soome 			dst[j] = rm->rm_col[cc].rc_data;
989*199767f8SToomas Soome 			dcount[j] = rm->rm_col[cc].rc_size;
990*199767f8SToomas Soome 		}
991*199767f8SToomas Soome 
992*199767f8SToomas Soome 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
993*199767f8SToomas Soome 
994*199767f8SToomas Soome 		for (x = 0; x < ccount; x++, src++) {
995*199767f8SToomas Soome 			if (*src != 0)
996*199767f8SToomas Soome 				log = vdev_raidz_log2[*src];
997*199767f8SToomas Soome 
998*199767f8SToomas Soome 			for (cc = 0; cc < nmissing; cc++) {
999*199767f8SToomas Soome 				if (x >= dcount[cc])
1000*199767f8SToomas Soome 					continue;
1001*199767f8SToomas Soome 
1002*199767f8SToomas Soome 				if (*src == 0) {
1003*199767f8SToomas Soome 					val = 0;
1004*199767f8SToomas Soome 				} else {
1005*199767f8SToomas Soome 					if ((ll = log + invlog[cc][i]) >= 255)
1006*199767f8SToomas Soome 						ll -= 255;
1007*199767f8SToomas Soome 					val = vdev_raidz_pow2[ll];
1008*199767f8SToomas Soome 				}
1009*199767f8SToomas Soome 
1010*199767f8SToomas Soome 				if (i == 0)
1011*199767f8SToomas Soome 					dst[cc][x] = val;
1012*199767f8SToomas Soome 				else
1013*199767f8SToomas Soome 					dst[cc][x] ^= val;
1014*199767f8SToomas Soome 			}
1015*199767f8SToomas Soome 		}
1016*199767f8SToomas Soome 	}
1017*199767f8SToomas Soome 
1018*199767f8SToomas Soome 	zfs_free(p, psize);
1019*199767f8SToomas Soome }
1020*199767f8SToomas Soome 
1021*199767f8SToomas Soome static int
1022*199767f8SToomas Soome vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1023*199767f8SToomas Soome {
1024*199767f8SToomas Soome 	int n, i, c, t, tt;
1025*199767f8SToomas Soome 	int nmissing_rows;
1026*199767f8SToomas Soome 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1027*199767f8SToomas Soome 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1028*199767f8SToomas Soome 
1029*199767f8SToomas Soome 	uint8_t *p, *pp;
1030*199767f8SToomas Soome 	size_t psize;
1031*199767f8SToomas Soome 
1032*199767f8SToomas Soome 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1033*199767f8SToomas Soome 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1034*199767f8SToomas Soome 	uint8_t *used;
1035*199767f8SToomas Soome 
1036*199767f8SToomas Soome 	int code = 0;
1037*199767f8SToomas Soome 
1038*199767f8SToomas Soome 
1039*199767f8SToomas Soome 	n = rm->rm_cols - rm->rm_firstdatacol;
1040*199767f8SToomas Soome 
1041*199767f8SToomas Soome 	/*
1042*199767f8SToomas Soome 	 * Figure out which data columns are missing.
1043*199767f8SToomas Soome 	 */
1044*199767f8SToomas Soome 	nmissing_rows = 0;
1045*199767f8SToomas Soome 	for (t = 0; t < ntgts; t++) {
1046*199767f8SToomas Soome 		if (tgts[t] >= rm->rm_firstdatacol) {
1047*199767f8SToomas Soome 			missing_rows[nmissing_rows++] =
1048*199767f8SToomas Soome 			    tgts[t] - rm->rm_firstdatacol;
1049*199767f8SToomas Soome 		}
1050*199767f8SToomas Soome 	}
1051*199767f8SToomas Soome 
1052*199767f8SToomas Soome 	/*
1053*199767f8SToomas Soome 	 * Figure out which parity columns to use to help generate the missing
1054*199767f8SToomas Soome 	 * data columns.
1055*199767f8SToomas Soome 	 */
1056*199767f8SToomas Soome 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1057*199767f8SToomas Soome 		ASSERT(tt < ntgts);
1058*199767f8SToomas Soome 		ASSERT(c < rm->rm_firstdatacol);
1059*199767f8SToomas Soome 
1060*199767f8SToomas Soome 		/*
1061*199767f8SToomas Soome 		 * Skip any targeted parity columns.
1062*199767f8SToomas Soome 		 */
1063*199767f8SToomas Soome 		if (c == tgts[tt]) {
1064*199767f8SToomas Soome 			tt++;
1065*199767f8SToomas Soome 			continue;
1066*199767f8SToomas Soome 		}
1067*199767f8SToomas Soome 
1068*199767f8SToomas Soome 		code |= 1 << c;
1069*199767f8SToomas Soome 
1070*199767f8SToomas Soome 		parity_map[i] = c;
1071*199767f8SToomas Soome 		i++;
1072*199767f8SToomas Soome 	}
1073*199767f8SToomas Soome 
1074*199767f8SToomas Soome 	ASSERT(code != 0);
1075*199767f8SToomas Soome 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1076*199767f8SToomas Soome 
1077*199767f8SToomas Soome 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1078*199767f8SToomas Soome 	    nmissing_rows * n + sizeof (used[0]) * n;
1079*199767f8SToomas Soome 	p = kmem_alloc(psize, KM_SLEEP);
1080*199767f8SToomas Soome 
1081*199767f8SToomas Soome 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1082*199767f8SToomas Soome 		rows[i] = pp;
1083*199767f8SToomas Soome 		pp += n;
1084*199767f8SToomas Soome 		invrows[i] = pp;
1085*199767f8SToomas Soome 		pp += n;
1086*199767f8SToomas Soome 	}
1087*199767f8SToomas Soome 	used = pp;
1088*199767f8SToomas Soome 
1089*199767f8SToomas Soome 	for (i = 0; i < nmissing_rows; i++) {
1090*199767f8SToomas Soome 		used[i] = parity_map[i];
1091*199767f8SToomas Soome 	}
1092*199767f8SToomas Soome 
1093*199767f8SToomas Soome 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1094*199767f8SToomas Soome 		if (tt < nmissing_rows &&
1095*199767f8SToomas Soome 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1096*199767f8SToomas Soome 			tt++;
1097*199767f8SToomas Soome 			continue;
1098*199767f8SToomas Soome 		}
1099*199767f8SToomas Soome 
1100*199767f8SToomas Soome 		ASSERT3S(i, <, n);
1101*199767f8SToomas Soome 		used[i] = c;
1102*199767f8SToomas Soome 		i++;
1103*199767f8SToomas Soome 	}
1104*199767f8SToomas Soome 
1105*199767f8SToomas Soome 	/*
1106*199767f8SToomas Soome 	 * Initialize the interesting rows of the matrix.
1107*199767f8SToomas Soome 	 */
1108*199767f8SToomas Soome 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1109*199767f8SToomas Soome 
1110*199767f8SToomas Soome 	/*
1111*199767f8SToomas Soome 	 * Invert the matrix.
1112*199767f8SToomas Soome 	 */
1113*199767f8SToomas Soome 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1114*199767f8SToomas Soome 	    invrows, used);
1115*199767f8SToomas Soome 
1116*199767f8SToomas Soome 	/*
1117*199767f8SToomas Soome 	 * Reconstruct the missing data using the generated matrix.
1118*199767f8SToomas Soome 	 */
1119*199767f8SToomas Soome 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1120*199767f8SToomas Soome 	    invrows, used);
1121*199767f8SToomas Soome 
1122*199767f8SToomas Soome 	kmem_free(p, psize);
1123*199767f8SToomas Soome 
1124*199767f8SToomas Soome 	return (code);
1125*199767f8SToomas Soome }
1126*199767f8SToomas Soome 
1127*199767f8SToomas Soome static int
1128*199767f8SToomas Soome vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1129*199767f8SToomas Soome {
1130*199767f8SToomas Soome 	int tgts[VDEV_RAIDZ_MAXPARITY];
1131*199767f8SToomas Soome 	int ntgts;
1132*199767f8SToomas Soome 	int i, c;
1133*199767f8SToomas Soome 	int code;
1134*199767f8SToomas Soome 	int nbadparity, nbaddata;
1135*199767f8SToomas Soome 
1136*199767f8SToomas Soome 	/*
1137*199767f8SToomas Soome 	 * The tgts list must already be sorted.
1138*199767f8SToomas Soome 	 */
1139*199767f8SToomas Soome 	for (i = 1; i < nt; i++) {
1140*199767f8SToomas Soome 		ASSERT(t[i] > t[i - 1]);
1141*199767f8SToomas Soome 	}
1142*199767f8SToomas Soome 
1143*199767f8SToomas Soome 	nbadparity = rm->rm_firstdatacol;
1144*199767f8SToomas Soome 	nbaddata = rm->rm_cols - nbadparity;
1145*199767f8SToomas Soome 	ntgts = 0;
1146*199767f8SToomas Soome 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1147*199767f8SToomas Soome 		if (i < nt && c == t[i]) {
1148*199767f8SToomas Soome 			tgts[ntgts++] = c;
1149*199767f8SToomas Soome 			i++;
1150*199767f8SToomas Soome 		} else if (rm->rm_col[c].rc_error != 0) {
1151*199767f8SToomas Soome 			tgts[ntgts++] = c;
1152*199767f8SToomas Soome 		} else if (c >= rm->rm_firstdatacol) {
1153*199767f8SToomas Soome 			nbaddata--;
1154*199767f8SToomas Soome 		} else {
1155*199767f8SToomas Soome 			nbadparity--;
1156*199767f8SToomas Soome 		}
1157*199767f8SToomas Soome 	}
1158*199767f8SToomas Soome 
1159*199767f8SToomas Soome 	ASSERT(ntgts >= nt);
1160*199767f8SToomas Soome 	ASSERT(nbaddata >= 0);
1161*199767f8SToomas Soome 	ASSERT(nbaddata + nbadparity == ntgts);
1162*199767f8SToomas Soome 
1163*199767f8SToomas Soome 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1164*199767f8SToomas Soome 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1165*199767f8SToomas Soome 	ASSERT(code > 0);
1166*199767f8SToomas Soome 	return (code);
1167*199767f8SToomas Soome }
1168*199767f8SToomas Soome 
1169*199767f8SToomas Soome static raidz_map_t *
1170*199767f8SToomas Soome vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1171*199767f8SToomas Soome     uint64_t dcols, uint64_t nparity)
1172*199767f8SToomas Soome {
1173*199767f8SToomas Soome 	raidz_map_t *rm;
1174*199767f8SToomas Soome 	uint64_t b = offset >> unit_shift;
1175*199767f8SToomas Soome 	uint64_t s = size >> unit_shift;
1176*199767f8SToomas Soome 	uint64_t f = b % dcols;
1177*199767f8SToomas Soome 	uint64_t o = (b / dcols) << unit_shift;
1178*199767f8SToomas Soome 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1179*199767f8SToomas Soome 
1180*199767f8SToomas Soome 	q = s / (dcols - nparity);
1181*199767f8SToomas Soome 	r = s - q * (dcols - nparity);
1182*199767f8SToomas Soome 	bc = (r == 0 ? 0 : r + nparity);
1183*199767f8SToomas Soome 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1184*199767f8SToomas Soome 
1185*199767f8SToomas Soome 	if (q == 0) {
1186*199767f8SToomas Soome 		acols = bc;
1187*199767f8SToomas Soome 		scols = MIN(dcols, roundup(bc, nparity + 1));
1188*199767f8SToomas Soome 	} else {
1189*199767f8SToomas Soome 		acols = dcols;
1190*199767f8SToomas Soome 		scols = dcols;
1191*199767f8SToomas Soome 	}
1192*199767f8SToomas Soome 
1193*199767f8SToomas Soome 	ASSERT3U(acols, <=, scols);
1194*199767f8SToomas Soome 
1195*199767f8SToomas Soome 	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1196*199767f8SToomas Soome 
1197*199767f8SToomas Soome 	rm->rm_cols = acols;
1198*199767f8SToomas Soome 	rm->rm_scols = scols;
1199*199767f8SToomas Soome 	rm->rm_bigcols = bc;
1200*199767f8SToomas Soome 	rm->rm_skipstart = bc;
1201*199767f8SToomas Soome 	rm->rm_missingdata = 0;
1202*199767f8SToomas Soome 	rm->rm_missingparity = 0;
1203*199767f8SToomas Soome 	rm->rm_firstdatacol = nparity;
1204*199767f8SToomas Soome 	rm->rm_reports = 0;
1205*199767f8SToomas Soome 	rm->rm_freed = 0;
1206*199767f8SToomas Soome 	rm->rm_ecksuminjected = 0;
1207*199767f8SToomas Soome 
1208*199767f8SToomas Soome 	asize = 0;
1209*199767f8SToomas Soome 
1210*199767f8SToomas Soome 	for (c = 0; c < scols; c++) {
1211*199767f8SToomas Soome 		col = f + c;
1212*199767f8SToomas Soome 		coff = o;
1213*199767f8SToomas Soome 		if (col >= dcols) {
1214*199767f8SToomas Soome 			col -= dcols;
1215*199767f8SToomas Soome 			coff += 1ULL << unit_shift;
1216*199767f8SToomas Soome 		}
1217*199767f8SToomas Soome 		rm->rm_col[c].rc_devidx = col;
1218*199767f8SToomas Soome 		rm->rm_col[c].rc_offset = coff;
1219*199767f8SToomas Soome 		rm->rm_col[c].rc_data = NULL;
1220*199767f8SToomas Soome 		rm->rm_col[c].rc_error = 0;
1221*199767f8SToomas Soome 		rm->rm_col[c].rc_tried = 0;
1222*199767f8SToomas Soome 		rm->rm_col[c].rc_skipped = 0;
1223*199767f8SToomas Soome 
1224*199767f8SToomas Soome 		if (c >= acols)
1225*199767f8SToomas Soome 			rm->rm_col[c].rc_size = 0;
1226*199767f8SToomas Soome 		else if (c < bc)
1227*199767f8SToomas Soome 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1228*199767f8SToomas Soome 		else
1229*199767f8SToomas Soome 			rm->rm_col[c].rc_size = q << unit_shift;
1230*199767f8SToomas Soome 
1231*199767f8SToomas Soome 		asize += rm->rm_col[c].rc_size;
1232*199767f8SToomas Soome 	}
1233*199767f8SToomas Soome 
1234*199767f8SToomas Soome 	ASSERT3U(asize, ==, tot << unit_shift);
1235*199767f8SToomas Soome 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1236*199767f8SToomas Soome 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1237*199767f8SToomas Soome 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1238*199767f8SToomas Soome 	ASSERT3U(rm->rm_nskip, <=, nparity);
1239*199767f8SToomas Soome 
1240*199767f8SToomas Soome 	for (c = 0; c < rm->rm_firstdatacol; c++)
1241*199767f8SToomas Soome 		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1242*199767f8SToomas Soome 
1243*199767f8SToomas Soome 	rm->rm_col[c].rc_data = data;
1244*199767f8SToomas Soome 
1245*199767f8SToomas Soome 	for (c = c + 1; c < acols; c++)
1246*199767f8SToomas Soome 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1247*199767f8SToomas Soome 		    rm->rm_col[c - 1].rc_size;
1248*199767f8SToomas Soome 
1249*199767f8SToomas Soome 	/*
1250*199767f8SToomas Soome 	 * If all data stored spans all columns, there's a danger that parity
1251*199767f8SToomas Soome 	 * will always be on the same device and, since parity isn't read
1252*199767f8SToomas Soome 	 * during normal operation, that that device's I/O bandwidth won't be
1253*199767f8SToomas Soome 	 * used effectively. We therefore switch the parity every 1MB.
1254*199767f8SToomas Soome 	 *
1255*199767f8SToomas Soome 	 * ... at least that was, ostensibly, the theory. As a practical
1256*199767f8SToomas Soome 	 * matter unless we juggle the parity between all devices evenly, we
1257*199767f8SToomas Soome 	 * won't see any benefit. Further, occasional writes that aren't a
1258*199767f8SToomas Soome 	 * multiple of the LCM of the number of children and the minimum
1259*199767f8SToomas Soome 	 * stripe width are sufficient to avoid pessimal behavior.
1260*199767f8SToomas Soome 	 * Unfortunately, this decision created an implicit on-disk format
1261*199767f8SToomas Soome 	 * requirement that we need to support for all eternity, but only
1262*199767f8SToomas Soome 	 * for single-parity RAID-Z.
1263*199767f8SToomas Soome 	 *
1264*199767f8SToomas Soome 	 * If we intend to skip a sector in the zeroth column for padding
1265*199767f8SToomas Soome 	 * we must make sure to note this swap. We will never intend to
1266*199767f8SToomas Soome 	 * skip the first column since at least one data and one parity
1267*199767f8SToomas Soome 	 * column must appear in each row.
1268*199767f8SToomas Soome 	 */
1269*199767f8SToomas Soome 	ASSERT(rm->rm_cols >= 2);
1270*199767f8SToomas Soome 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1271*199767f8SToomas Soome 
1272*199767f8SToomas Soome 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1273*199767f8SToomas Soome 		devidx = rm->rm_col[0].rc_devidx;
1274*199767f8SToomas Soome 		o = rm->rm_col[0].rc_offset;
1275*199767f8SToomas Soome 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1276*199767f8SToomas Soome 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1277*199767f8SToomas Soome 		rm->rm_col[1].rc_devidx = devidx;
1278*199767f8SToomas Soome 		rm->rm_col[1].rc_offset = o;
1279*199767f8SToomas Soome 
1280*199767f8SToomas Soome 		if (rm->rm_skipstart == 0)
1281*199767f8SToomas Soome 			rm->rm_skipstart = 1;
1282*199767f8SToomas Soome 	}
1283*199767f8SToomas Soome 
1284*199767f8SToomas Soome 	return (rm);
1285*199767f8SToomas Soome }
1286*199767f8SToomas Soome 
1287*199767f8SToomas Soome static void
1288*199767f8SToomas Soome vdev_raidz_map_free(raidz_map_t *rm)
1289*199767f8SToomas Soome {
1290*199767f8SToomas Soome 	int c;
1291*199767f8SToomas Soome 
1292*199767f8SToomas Soome 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1293*199767f8SToomas Soome 		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1294*199767f8SToomas Soome 
1295*199767f8SToomas Soome 	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1296*199767f8SToomas Soome }
1297*199767f8SToomas Soome 
1298*199767f8SToomas Soome static vdev_t *
1299*199767f8SToomas Soome vdev_child(vdev_t *pvd, uint64_t devidx)
1300*199767f8SToomas Soome {
1301*199767f8SToomas Soome 	vdev_t *cvd;
1302*199767f8SToomas Soome 
1303*199767f8SToomas Soome 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1304*199767f8SToomas Soome 		if (cvd->v_id == devidx)
1305*199767f8SToomas Soome 			break;
1306*199767f8SToomas Soome 	}
1307*199767f8SToomas Soome 
1308*199767f8SToomas Soome 	return (cvd);
1309*199767f8SToomas Soome }
1310*199767f8SToomas Soome 
1311*199767f8SToomas Soome /*
1312*199767f8SToomas Soome  * We keep track of whether or not there were any injected errors, so that
1313*199767f8SToomas Soome  * any ereports we generate can note it.
1314*199767f8SToomas Soome  */
1315*199767f8SToomas Soome static int
1316*199767f8SToomas Soome raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1317*199767f8SToomas Soome {
1318*199767f8SToomas Soome 
1319*199767f8SToomas Soome 	return (zio_checksum_verify(bp, data));
1320*199767f8SToomas Soome }
1321*199767f8SToomas Soome 
1322*199767f8SToomas Soome /*
1323*199767f8SToomas Soome  * Generate the parity from the data columns. If we tried and were able to
1324*199767f8SToomas Soome  * read the parity without error, verify that the generated parity matches the
1325*199767f8SToomas Soome  * data we read. If it doesn't, we fire off a checksum error. Return the
1326*199767f8SToomas Soome  * number such failures.
1327*199767f8SToomas Soome  */
1328*199767f8SToomas Soome static int
1329*199767f8SToomas Soome raidz_parity_verify(raidz_map_t *rm)
1330*199767f8SToomas Soome {
1331*199767f8SToomas Soome 	void *orig[VDEV_RAIDZ_MAXPARITY];
1332*199767f8SToomas Soome 	int c, ret = 0;
1333*199767f8SToomas Soome 	raidz_col_t *rc;
1334*199767f8SToomas Soome 
1335*199767f8SToomas Soome 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1336*199767f8SToomas Soome 		rc = &rm->rm_col[c];
1337*199767f8SToomas Soome 		if (!rc->rc_tried || rc->rc_error != 0)
1338*199767f8SToomas Soome 			continue;
1339*199767f8SToomas Soome 		orig[c] = zfs_alloc(rc->rc_size);
1340*199767f8SToomas Soome 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1341*199767f8SToomas Soome 	}
1342*199767f8SToomas Soome 
1343*199767f8SToomas Soome 	vdev_raidz_generate_parity(rm);
1344*199767f8SToomas Soome 
1345*199767f8SToomas Soome 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1346*199767f8SToomas Soome 		rc = &rm->rm_col[c];
1347*199767f8SToomas Soome 		if (!rc->rc_tried || rc->rc_error != 0)
1348*199767f8SToomas Soome 			continue;
1349*199767f8SToomas Soome 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1350*199767f8SToomas Soome 			rc->rc_error = ECKSUM;
1351*199767f8SToomas Soome 			ret++;
1352*199767f8SToomas Soome 		}
1353*199767f8SToomas Soome 		zfs_free(orig[c], rc->rc_size);
1354*199767f8SToomas Soome 	}
1355*199767f8SToomas Soome 
1356*199767f8SToomas Soome 	return (ret);
1357*199767f8SToomas Soome }
1358*199767f8SToomas Soome 
1359*199767f8SToomas Soome /*
1360*199767f8SToomas Soome  * Iterate over all combinations of bad data and attempt a reconstruction.
1361*199767f8SToomas Soome  * Note that the algorithm below is non-optimal because it doesn't take into
1362*199767f8SToomas Soome  * account how reconstruction is actually performed. For example, with
1363*199767f8SToomas Soome  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1364*199767f8SToomas Soome  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1365*199767f8SToomas Soome  * cases we'd only use parity information in column 0.
1366*199767f8SToomas Soome  */
1367*199767f8SToomas Soome static int
1368*199767f8SToomas Soome vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1369*199767f8SToomas Soome     off_t offset, uint64_t bytes, int total_errors, int data_errors)
1370*199767f8SToomas Soome {
1371*199767f8SToomas Soome 	raidz_col_t *rc;
1372*199767f8SToomas Soome 	void *orig[VDEV_RAIDZ_MAXPARITY];
1373*199767f8SToomas Soome 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1374*199767f8SToomas Soome 	int *tgts = &tstore[1];
1375*199767f8SToomas Soome 	int current, next, i, c, n;
1376*199767f8SToomas Soome 	int code, ret = 0;
1377*199767f8SToomas Soome 
1378*199767f8SToomas Soome 	ASSERT(total_errors < rm->rm_firstdatacol);
1379*199767f8SToomas Soome 
1380*199767f8SToomas Soome 	/*
1381*199767f8SToomas Soome 	 * This simplifies one edge condition.
1382*199767f8SToomas Soome 	 */
1383*199767f8SToomas Soome 	tgts[-1] = -1;
1384*199767f8SToomas Soome 
1385*199767f8SToomas Soome 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1386*199767f8SToomas Soome 		/*
1387*199767f8SToomas Soome 		 * Initialize the targets array by finding the first n columns
1388*199767f8SToomas Soome 		 * that contain no error.
1389*199767f8SToomas Soome 		 *
1390*199767f8SToomas Soome 		 * If there were no data errors, we need to ensure that we're
1391*199767f8SToomas Soome 		 * always explicitly attempting to reconstruct at least one
1392*199767f8SToomas Soome 		 * data column. To do this, we simply push the highest target
1393*199767f8SToomas Soome 		 * up into the data columns.
1394*199767f8SToomas Soome 		 */
1395*199767f8SToomas Soome 		for (c = 0, i = 0; i < n; i++) {
1396*199767f8SToomas Soome 			if (i == n - 1 && data_errors == 0 &&
1397*199767f8SToomas Soome 			    c < rm->rm_firstdatacol) {
1398*199767f8SToomas Soome 				c = rm->rm_firstdatacol;
1399*199767f8SToomas Soome 			}
1400*199767f8SToomas Soome 
1401*199767f8SToomas Soome 			while (rm->rm_col[c].rc_error != 0) {
1402*199767f8SToomas Soome 				c++;
1403*199767f8SToomas Soome 				ASSERT3S(c, <, rm->rm_cols);
1404*199767f8SToomas Soome 			}
1405*199767f8SToomas Soome 
1406*199767f8SToomas Soome 			tgts[i] = c++;
1407*199767f8SToomas Soome 		}
1408*199767f8SToomas Soome 
1409*199767f8SToomas Soome 		/*
1410*199767f8SToomas Soome 		 * Setting tgts[n] simplifies the other edge condition.
1411*199767f8SToomas Soome 		 */
1412*199767f8SToomas Soome 		tgts[n] = rm->rm_cols;
1413*199767f8SToomas Soome 
1414*199767f8SToomas Soome 		/*
1415*199767f8SToomas Soome 		 * These buffers were allocated in previous iterations.
1416*199767f8SToomas Soome 		 */
1417*199767f8SToomas Soome 		for (i = 0; i < n - 1; i++) {
1418*199767f8SToomas Soome 			ASSERT(orig[i] != NULL);
1419*199767f8SToomas Soome 		}
1420*199767f8SToomas Soome 
1421*199767f8SToomas Soome 		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1422*199767f8SToomas Soome 
1423*199767f8SToomas Soome 		current = 0;
1424*199767f8SToomas Soome 		next = tgts[current];
1425*199767f8SToomas Soome 
1426*199767f8SToomas Soome 		while (current != n) {
1427*199767f8SToomas Soome 			tgts[current] = next;
1428*199767f8SToomas Soome 			current = 0;
1429*199767f8SToomas Soome 
1430*199767f8SToomas Soome 			/*
1431*199767f8SToomas Soome 			 * Save off the original data that we're going to
1432*199767f8SToomas Soome 			 * attempt to reconstruct.
1433*199767f8SToomas Soome 			 */
1434*199767f8SToomas Soome 			for (i = 0; i < n; i++) {
1435*199767f8SToomas Soome 				ASSERT(orig[i] != NULL);
1436*199767f8SToomas Soome 				c = tgts[i];
1437*199767f8SToomas Soome 				ASSERT3S(c, >=, 0);
1438*199767f8SToomas Soome 				ASSERT3S(c, <, rm->rm_cols);
1439*199767f8SToomas Soome 				rc = &rm->rm_col[c];
1440*199767f8SToomas Soome 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1441*199767f8SToomas Soome 			}
1442*199767f8SToomas Soome 
1443*199767f8SToomas Soome 			/*
1444*199767f8SToomas Soome 			 * Attempt a reconstruction and exit the outer loop on
1445*199767f8SToomas Soome 			 * success.
1446*199767f8SToomas Soome 			 */
1447*199767f8SToomas Soome 			code = vdev_raidz_reconstruct(rm, tgts, n);
1448*199767f8SToomas Soome 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1449*199767f8SToomas Soome 				for (i = 0; i < n; i++) {
1450*199767f8SToomas Soome 					c = tgts[i];
1451*199767f8SToomas Soome 					rc = &rm->rm_col[c];
1452*199767f8SToomas Soome 					ASSERT(rc->rc_error == 0);
1453*199767f8SToomas Soome 					rc->rc_error = ECKSUM;
1454*199767f8SToomas Soome 				}
1455*199767f8SToomas Soome 
1456*199767f8SToomas Soome 				ret = code;
1457*199767f8SToomas Soome 				goto done;
1458*199767f8SToomas Soome 			}
1459*199767f8SToomas Soome 
1460*199767f8SToomas Soome 			/*
1461*199767f8SToomas Soome 			 * Restore the original data.
1462*199767f8SToomas Soome 			 */
1463*199767f8SToomas Soome 			for (i = 0; i < n; i++) {
1464*199767f8SToomas Soome 				c = tgts[i];
1465*199767f8SToomas Soome 				rc = &rm->rm_col[c];
1466*199767f8SToomas Soome 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1467*199767f8SToomas Soome 			}
1468*199767f8SToomas Soome 
1469*199767f8SToomas Soome 			do {
1470*199767f8SToomas Soome 				/*
1471*199767f8SToomas Soome 				 * Find the next valid column after the current
1472*199767f8SToomas Soome 				 * position..
1473*199767f8SToomas Soome 				 */
1474*199767f8SToomas Soome 				for (next = tgts[current] + 1;
1475*199767f8SToomas Soome 				    next < rm->rm_cols &&
1476*199767f8SToomas Soome 				    rm->rm_col[next].rc_error != 0; next++)
1477*199767f8SToomas Soome 					continue;
1478*199767f8SToomas Soome 
1479*199767f8SToomas Soome 				ASSERT(next <= tgts[current + 1]);
1480*199767f8SToomas Soome 
1481*199767f8SToomas Soome 				/*
1482*199767f8SToomas Soome 				 * If that spot is available, we're done here.
1483*199767f8SToomas Soome 				 */
1484*199767f8SToomas Soome 				if (next != tgts[current + 1])
1485*199767f8SToomas Soome 					break;
1486*199767f8SToomas Soome 
1487*199767f8SToomas Soome 				/*
1488*199767f8SToomas Soome 				 * Otherwise, find the next valid column after
1489*199767f8SToomas Soome 				 * the previous position.
1490*199767f8SToomas Soome 				 */
1491*199767f8SToomas Soome 				for (c = tgts[current - 1] + 1;
1492*199767f8SToomas Soome 				    rm->rm_col[c].rc_error != 0; c++)
1493*199767f8SToomas Soome 					continue;
1494*199767f8SToomas Soome 
1495*199767f8SToomas Soome 				tgts[current] = c;
1496*199767f8SToomas Soome 				current++;
1497*199767f8SToomas Soome 
1498*199767f8SToomas Soome 			} while (current != n);
1499*199767f8SToomas Soome 		}
1500*199767f8SToomas Soome 	}
1501*199767f8SToomas Soome 	n--;
1502*199767f8SToomas Soome done:
1503*199767f8SToomas Soome 	for (i = n - 1; i >= 0; i--) {
1504*199767f8SToomas Soome 		zfs_free(orig[i], rm->rm_col[0].rc_size);
1505*199767f8SToomas Soome 	}
1506*199767f8SToomas Soome 
1507*199767f8SToomas Soome 	return (ret);
1508*199767f8SToomas Soome }
1509*199767f8SToomas Soome 
1510*199767f8SToomas Soome static int
1511*199767f8SToomas Soome vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1512*199767f8SToomas Soome     off_t offset, size_t bytes)
1513*199767f8SToomas Soome {
1514*199767f8SToomas Soome 	vdev_t *tvd = vd->v_top;
1515*199767f8SToomas Soome 	vdev_t *cvd;
1516*199767f8SToomas Soome 	raidz_map_t *rm;
1517*199767f8SToomas Soome 	raidz_col_t *rc;
1518*199767f8SToomas Soome 	int c, error;
1519*199767f8SToomas Soome 	int unexpected_errors;
1520*199767f8SToomas Soome 	int parity_errors;
1521*199767f8SToomas Soome 	int parity_untried;
1522*199767f8SToomas Soome 	int data_errors;
1523*199767f8SToomas Soome 	int total_errors;
1524*199767f8SToomas Soome 	int n;
1525*199767f8SToomas Soome 	int tgts[VDEV_RAIDZ_MAXPARITY];
1526*199767f8SToomas Soome 	int code;
1527*199767f8SToomas Soome 
1528*199767f8SToomas Soome 	rc = NULL;	/* gcc */
1529*199767f8SToomas Soome 	error = 0;
1530*199767f8SToomas Soome 
1531*199767f8SToomas Soome 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1532*199767f8SToomas Soome 	    vd->v_nchildren, vd->v_nparity);
1533*199767f8SToomas Soome 
1534*199767f8SToomas Soome 	/*
1535*199767f8SToomas Soome 	 * Iterate over the columns in reverse order so that we hit the parity
1536*199767f8SToomas Soome 	 * last -- any errors along the way will force us to read the parity.
1537*199767f8SToomas Soome 	 */
1538*199767f8SToomas Soome 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1539*199767f8SToomas Soome 		rc = &rm->rm_col[c];
1540*199767f8SToomas Soome 		cvd = vdev_child(vd, rc->rc_devidx);
1541*199767f8SToomas Soome 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1542*199767f8SToomas Soome 			if (c >= rm->rm_firstdatacol)
1543*199767f8SToomas Soome 				rm->rm_missingdata++;
1544*199767f8SToomas Soome 			else
1545*199767f8SToomas Soome 				rm->rm_missingparity++;
1546*199767f8SToomas Soome 			rc->rc_error = ENXIO;
1547*199767f8SToomas Soome 			rc->rc_tried = 1;	/* don't even try */
1548*199767f8SToomas Soome 			rc->rc_skipped = 1;
1549*199767f8SToomas Soome 			continue;
1550*199767f8SToomas Soome 		}
1551*199767f8SToomas Soome #if 0		/* XXX: Too hard for the boot code. */
1552*199767f8SToomas Soome 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1553*199767f8SToomas Soome 			if (c >= rm->rm_firstdatacol)
1554*199767f8SToomas Soome 				rm->rm_missingdata++;
1555*199767f8SToomas Soome 			else
1556*199767f8SToomas Soome 				rm->rm_missingparity++;
1557*199767f8SToomas Soome 			rc->rc_error = ESTALE;
1558*199767f8SToomas Soome 			rc->rc_skipped = 1;
1559*199767f8SToomas Soome 			continue;
1560*199767f8SToomas Soome 		}
1561*199767f8SToomas Soome #endif
1562*199767f8SToomas Soome 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1563*199767f8SToomas Soome 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1564*199767f8SToomas Soome 			    rc->rc_offset, rc->rc_size);
1565*199767f8SToomas Soome 			rc->rc_tried = 1;
1566*199767f8SToomas Soome 			rc->rc_skipped = 0;
1567*199767f8SToomas Soome 		}
1568*199767f8SToomas Soome 	}
1569*199767f8SToomas Soome 
1570*199767f8SToomas Soome reconstruct:
1571*199767f8SToomas Soome 	unexpected_errors = 0;
1572*199767f8SToomas Soome 	parity_errors = 0;
1573*199767f8SToomas Soome 	parity_untried = 0;
1574*199767f8SToomas Soome 	data_errors = 0;
1575*199767f8SToomas Soome 	total_errors = 0;
1576*199767f8SToomas Soome 
1577*199767f8SToomas Soome 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1578*199767f8SToomas Soome 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1579*199767f8SToomas Soome 
1580*199767f8SToomas Soome 	for (c = 0; c < rm->rm_cols; c++) {
1581*199767f8SToomas Soome 		rc = &rm->rm_col[c];
1582*199767f8SToomas Soome 
1583*199767f8SToomas Soome 		if (rc->rc_error) {
1584*199767f8SToomas Soome 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1585*199767f8SToomas Soome 
1586*199767f8SToomas Soome 			if (c < rm->rm_firstdatacol)
1587*199767f8SToomas Soome 				parity_errors++;
1588*199767f8SToomas Soome 			else
1589*199767f8SToomas Soome 				data_errors++;
1590*199767f8SToomas Soome 
1591*199767f8SToomas Soome 			if (!rc->rc_skipped)
1592*199767f8SToomas Soome 				unexpected_errors++;
1593*199767f8SToomas Soome 
1594*199767f8SToomas Soome 			total_errors++;
1595*199767f8SToomas Soome 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1596*199767f8SToomas Soome 			parity_untried++;
1597*199767f8SToomas Soome 		}
1598*199767f8SToomas Soome 	}
1599*199767f8SToomas Soome 
1600*199767f8SToomas Soome 	/*
1601*199767f8SToomas Soome 	 * There are three potential phases for a read:
1602*199767f8SToomas Soome 	 *	1. produce valid data from the columns read
1603*199767f8SToomas Soome 	 *	2. read all disks and try again
1604*199767f8SToomas Soome 	 *	3. perform combinatorial reconstruction
1605*199767f8SToomas Soome 	 *
1606*199767f8SToomas Soome 	 * Each phase is progressively both more expensive and less likely to
1607*199767f8SToomas Soome 	 * occur. If we encounter more errors than we can repair or all phases
1608*199767f8SToomas Soome 	 * fail, we have no choice but to return an error.
1609*199767f8SToomas Soome 	 */
1610*199767f8SToomas Soome 
1611*199767f8SToomas Soome 	/*
1612*199767f8SToomas Soome 	 * If the number of errors we saw was correctable -- less than or equal
1613*199767f8SToomas Soome 	 * to the number of parity disks read -- attempt to produce data that
1614*199767f8SToomas Soome 	 * has a valid checksum. Naturally, this case applies in the absence of
1615*199767f8SToomas Soome 	 * any errors.
1616*199767f8SToomas Soome 	 */
1617*199767f8SToomas Soome 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1618*199767f8SToomas Soome 		if (data_errors == 0) {
1619*199767f8SToomas Soome 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1620*199767f8SToomas Soome 				/*
1621*199767f8SToomas Soome 				 * If we read parity information (unnecessarily
1622*199767f8SToomas Soome 				 * as it happens since no reconstruction was
1623*199767f8SToomas Soome 				 * needed) regenerate and verify the parity.
1624*199767f8SToomas Soome 				 * We also regenerate parity when resilvering
1625*199767f8SToomas Soome 				 * so we can write it out to the failed device
1626*199767f8SToomas Soome 				 * later.
1627*199767f8SToomas Soome 				 */
1628*199767f8SToomas Soome 				if (parity_errors + parity_untried <
1629*199767f8SToomas Soome 				    rm->rm_firstdatacol) {
1630*199767f8SToomas Soome 					n = raidz_parity_verify(rm);
1631*199767f8SToomas Soome 					unexpected_errors += n;
1632*199767f8SToomas Soome 					ASSERT(parity_errors + n <=
1633*199767f8SToomas Soome 					    rm->rm_firstdatacol);
1634*199767f8SToomas Soome 				}
1635*199767f8SToomas Soome 				goto done;
1636*199767f8SToomas Soome 			}
1637*199767f8SToomas Soome 		} else {
1638*199767f8SToomas Soome 			/*
1639*199767f8SToomas Soome 			 * We either attempt to read all the parity columns or
1640*199767f8SToomas Soome 			 * none of them. If we didn't try to read parity, we
1641*199767f8SToomas Soome 			 * wouldn't be here in the correctable case. There must
1642*199767f8SToomas Soome 			 * also have been fewer parity errors than parity
1643*199767f8SToomas Soome 			 * columns or, again, we wouldn't be in this code path.
1644*199767f8SToomas Soome 			 */
1645*199767f8SToomas Soome 			ASSERT(parity_untried == 0);
1646*199767f8SToomas Soome 			ASSERT(parity_errors < rm->rm_firstdatacol);
1647*199767f8SToomas Soome 
1648*199767f8SToomas Soome 			/*
1649*199767f8SToomas Soome 			 * Identify the data columns that reported an error.
1650*199767f8SToomas Soome 			 */
1651*199767f8SToomas Soome 			n = 0;
1652*199767f8SToomas Soome 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1653*199767f8SToomas Soome 				rc = &rm->rm_col[c];
1654*199767f8SToomas Soome 				if (rc->rc_error != 0) {
1655*199767f8SToomas Soome 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1656*199767f8SToomas Soome 					tgts[n++] = c;
1657*199767f8SToomas Soome 				}
1658*199767f8SToomas Soome 			}
1659*199767f8SToomas Soome 
1660*199767f8SToomas Soome 			ASSERT(rm->rm_firstdatacol >= n);
1661*199767f8SToomas Soome 
1662*199767f8SToomas Soome 			code = vdev_raidz_reconstruct(rm, tgts, n);
1663*199767f8SToomas Soome 
1664*199767f8SToomas Soome 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1665*199767f8SToomas Soome 				/*
1666*199767f8SToomas Soome 				 * If we read more parity disks than were used
1667*199767f8SToomas Soome 				 * for reconstruction, confirm that the other
1668*199767f8SToomas Soome 				 * parity disks produced correct data. This
1669*199767f8SToomas Soome 				 * routine is suboptimal in that it regenerates
1670*199767f8SToomas Soome 				 * the parity that we already used in addition
1671*199767f8SToomas Soome 				 * to the parity that we're attempting to
1672*199767f8SToomas Soome 				 * verify, but this should be a relatively
1673*199767f8SToomas Soome 				 * uncommon case, and can be optimized if it
1674*199767f8SToomas Soome 				 * becomes a problem. Note that we regenerate
1675*199767f8SToomas Soome 				 * parity when resilvering so we can write it
1676*199767f8SToomas Soome 				 * out to failed devices later.
1677*199767f8SToomas Soome 				 */
1678*199767f8SToomas Soome 				if (parity_errors < rm->rm_firstdatacol - n) {
1679*199767f8SToomas Soome 					n = raidz_parity_verify(rm);
1680*199767f8SToomas Soome 					unexpected_errors += n;
1681*199767f8SToomas Soome 					ASSERT(parity_errors + n <=
1682*199767f8SToomas Soome 					    rm->rm_firstdatacol);
1683*199767f8SToomas Soome 				}
1684*199767f8SToomas Soome 
1685*199767f8SToomas Soome 				goto done;
1686*199767f8SToomas Soome 			}
1687*199767f8SToomas Soome 		}
1688*199767f8SToomas Soome 	}
1689*199767f8SToomas Soome 
1690*199767f8SToomas Soome 	/*
1691*199767f8SToomas Soome 	 * This isn't a typical situation -- either we got a read
1692*199767f8SToomas Soome 	 * error or a child silently returned bad data. Read every
1693*199767f8SToomas Soome 	 * block so we can try again with as much data and parity as
1694*199767f8SToomas Soome 	 * we can track down. If we've already been through once
1695*199767f8SToomas Soome 	 * before, all children will be marked as tried so we'll
1696*199767f8SToomas Soome 	 * proceed to combinatorial reconstruction.
1697*199767f8SToomas Soome 	 */
1698*199767f8SToomas Soome 	unexpected_errors = 1;
1699*199767f8SToomas Soome 	rm->rm_missingdata = 0;
1700*199767f8SToomas Soome 	rm->rm_missingparity = 0;
1701*199767f8SToomas Soome 
1702*199767f8SToomas Soome 	n = 0;
1703*199767f8SToomas Soome 	for (c = 0; c < rm->rm_cols; c++) {
1704*199767f8SToomas Soome 		rc = &rm->rm_col[c];
1705*199767f8SToomas Soome 
1706*199767f8SToomas Soome 		if (rc->rc_tried)
1707*199767f8SToomas Soome 			continue;
1708*199767f8SToomas Soome 
1709*199767f8SToomas Soome 		cvd = vdev_child(vd, rc->rc_devidx);
1710*199767f8SToomas Soome 		ASSERT(cvd != NULL);
1711*199767f8SToomas Soome 		rc->rc_error = cvd->v_read(cvd, NULL,
1712*199767f8SToomas Soome 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1713*199767f8SToomas Soome 		if (rc->rc_error == 0)
1714*199767f8SToomas Soome 			n++;
1715*199767f8SToomas Soome 		rc->rc_tried = 1;
1716*199767f8SToomas Soome 		rc->rc_skipped = 0;
1717*199767f8SToomas Soome 	}
1718*199767f8SToomas Soome 	/*
1719*199767f8SToomas Soome 	 * If we managed to read anything more, retry the
1720*199767f8SToomas Soome 	 * reconstruction.
1721*199767f8SToomas Soome 	 */
1722*199767f8SToomas Soome 	if (n > 0)
1723*199767f8SToomas Soome 		goto reconstruct;
1724*199767f8SToomas Soome 
1725*199767f8SToomas Soome 	/*
1726*199767f8SToomas Soome 	 * At this point we've attempted to reconstruct the data given the
1727*199767f8SToomas Soome 	 * errors we detected, and we've attempted to read all columns. There
1728*199767f8SToomas Soome 	 * must, therefore, be one or more additional problems -- silent errors
1729*199767f8SToomas Soome 	 * resulting in invalid data rather than explicit I/O errors resulting
1730*199767f8SToomas Soome 	 * in absent data. We check if there is enough additional data to
1731*199767f8SToomas Soome 	 * possibly reconstruct the data and then perform combinatorial
1732*199767f8SToomas Soome 	 * reconstruction over all possible combinations. If that fails,
1733*199767f8SToomas Soome 	 * we're cooked.
1734*199767f8SToomas Soome 	 */
1735*199767f8SToomas Soome 	if (total_errors > rm->rm_firstdatacol) {
1736*199767f8SToomas Soome 		error = EIO;
1737*199767f8SToomas Soome 	} else if (total_errors < rm->rm_firstdatacol &&
1738*199767f8SToomas Soome 	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1739*199767f8SToomas Soome 	     total_errors, data_errors)) != 0) {
1740*199767f8SToomas Soome 		/*
1741*199767f8SToomas Soome 		 * If we didn't use all the available parity for the
1742*199767f8SToomas Soome 		 * combinatorial reconstruction, verify that the remaining
1743*199767f8SToomas Soome 		 * parity is correct.
1744*199767f8SToomas Soome 		 */
1745*199767f8SToomas Soome 		if (code != (1 << rm->rm_firstdatacol) - 1)
1746*199767f8SToomas Soome 			(void) raidz_parity_verify(rm);
1747*199767f8SToomas Soome 	} else {
1748*199767f8SToomas Soome 		/*
1749*199767f8SToomas Soome 		 * We're here because either:
1750*199767f8SToomas Soome 		 *
1751*199767f8SToomas Soome 		 *	total_errors == rm_firstdatacol, or
1752*199767f8SToomas Soome 		 *	vdev_raidz_combrec() failed
1753*199767f8SToomas Soome 		 *
1754*199767f8SToomas Soome 		 * In either case, there is enough bad data to prevent
1755*199767f8SToomas Soome 		 * reconstruction.
1756*199767f8SToomas Soome 		 *
1757*199767f8SToomas Soome 		 * Start checksum ereports for all children which haven't
1758*199767f8SToomas Soome 		 * failed, and the IO wasn't speculative.
1759*199767f8SToomas Soome 		 */
1760*199767f8SToomas Soome 		error = ECKSUM;
1761*199767f8SToomas Soome 	}
1762*199767f8SToomas Soome 
1763*199767f8SToomas Soome done:
1764*199767f8SToomas Soome 	vdev_raidz_map_free(rm);
1765*199767f8SToomas Soome 
1766*199767f8SToomas Soome 	return (error);
1767*199767f8SToomas Soome }
1768