xref: /illumos-gate/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c (revision 3e8c7f16)
1199767f8SToomas Soome /*
2199767f8SToomas Soome  * CDDL HEADER START
3199767f8SToomas Soome  *
4199767f8SToomas Soome  * The contents of this file are subject to the terms of the
5199767f8SToomas Soome  * Common Development and Distribution License (the "License").
6199767f8SToomas Soome  * You may not use this file except in compliance with the License.
7199767f8SToomas Soome  *
8199767f8SToomas Soome  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9199767f8SToomas Soome  * or http://www.opensolaris.org/os/licensing.
10199767f8SToomas Soome  * See the License for the specific language governing permissions
11199767f8SToomas Soome  * and limitations under the License.
12199767f8SToomas Soome  *
13199767f8SToomas Soome  * When distributing Covered Code, include this CDDL HEADER in each
14199767f8SToomas Soome  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15199767f8SToomas Soome  * If applicable, add the following below this CDDL HEADER, with the
16199767f8SToomas Soome  * fields enclosed by brackets "[]" replaced with your own identifying
17199767f8SToomas Soome  * information: Portions Copyright [yyyy] [name of copyright owner]
18199767f8SToomas Soome  *
19199767f8SToomas Soome  * CDDL HEADER END
20199767f8SToomas Soome  */
21199767f8SToomas Soome /*
22199767f8SToomas Soome  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23199767f8SToomas Soome  * Use is subject to license terms.
24199767f8SToomas Soome  */
25199767f8SToomas Soome 
26199767f8SToomas Soome #include <sys/cdefs.h>
2710ae99eeSToomas Soome #include <lz4.h>
28199767f8SToomas Soome 
/* CRC-64 lookup table; populated on demand by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Local definition of the ECKSUM error number. */
#define	ECKSUM	666

/*
 * Assertions are compiled out in this file: every ASSERT variant expands
 * to a no-op expression, so its arguments are never evaluated.
 */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)
38199767f8SToomas Soome 
39199767f8SToomas Soome static void
zfs_init_crc(void)40199767f8SToomas Soome zfs_init_crc(void)
41199767f8SToomas Soome {
42199767f8SToomas Soome 	int i, j;
43199767f8SToomas Soome 	uint64_t *ct;
44199767f8SToomas Soome 
45199767f8SToomas Soome 	/*
46199767f8SToomas Soome 	 * Calculate the crc64 table (used for the zap hash
47199767f8SToomas Soome 	 * function).
48199767f8SToomas Soome 	 */
49199767f8SToomas Soome 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
507bbcfb41SToomas Soome 		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
517bbcfb41SToomas Soome 		for (i = 0; i < 256; i++) {
527bbcfb41SToomas Soome 			ct = zfs_crc64_table + i;
537bbcfb41SToomas Soome 			for (*ct = i, j = 8; j > 0; j--)
547bbcfb41SToomas Soome 				*ct = (*ct >> 1) ^
557bbcfb41SToomas Soome 				    (-(*ct & 1) & ZFS_CRC64_POLY);
567bbcfb41SToomas Soome 		}
57199767f8SToomas Soome 	}
58199767f8SToomas Soome }
59199767f8SToomas Soome 
60199767f8SToomas Soome static void
zio_checksum_off(const void * buf __unused,uint64_t size __unused,const void * ctx_template __unused,zio_cksum_t * zcp)618eef2ab6SToomas Soome zio_checksum_off(const void *buf __unused, uint64_t size __unused,
628eef2ab6SToomas Soome     const void *ctx_template __unused, zio_cksum_t *zcp)
63199767f8SToomas Soome {
64199767f8SToomas Soome 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
65199767f8SToomas Soome }
66199767f8SToomas Soome 
67199767f8SToomas Soome /*
68199767f8SToomas Soome  * Signature for checksum functions.
69199767f8SToomas Soome  */
70199767f8SToomas Soome typedef void zio_checksum_t(const void *data, uint64_t size,
71199767f8SToomas Soome     const void *ctx_template, zio_cksum_t *zcp);
72199767f8SToomas Soome typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
73199767f8SToomas Soome typedef void zio_checksum_tmpl_free_t(void *ctx_template);
74199767f8SToomas Soome 
/*
 * Per-algorithm property bits stored in zio_checksum_info_t.ci_flags.
 */
typedef enum zio_checksum_flags {
	ZCHECKSUM_FLAG_METADATA = (1 << 1),	/* strong enough for metadata */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),	/* dedup without verification */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),	/* uses salt value */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)	/* strong enough for nopwrite */
} zio_checksum_flags_t;
87199767f8SToomas Soome 
88199767f8SToomas Soome /*
89199767f8SToomas Soome  * Information about each checksum function.
90199767f8SToomas Soome  */
91199767f8SToomas Soome typedef struct zio_checksum_info {
92199767f8SToomas Soome 	/* checksum function for each byteorder */
93199767f8SToomas Soome 	zio_checksum_t			*ci_func[2];
94199767f8SToomas Soome 	zio_checksum_tmpl_init_t	*ci_tmpl_init;
95199767f8SToomas Soome 	zio_checksum_tmpl_free_t	*ci_tmpl_free;
96199767f8SToomas Soome 	zio_checksum_flags_t		ci_flags;
97199767f8SToomas Soome 	const char			*ci_name;	/* descriptive name */
98199767f8SToomas Soome } zio_checksum_info_t;
99199767f8SToomas Soome 
100199767f8SToomas Soome #include "blkptr.c"
101199767f8SToomas Soome 
102199767f8SToomas Soome #include "fletcher.c"
103199767f8SToomas Soome #include "sha256.c"
1044a04e8dbSToomas Soome #include "skein_zfs.c"
1054a04e8dbSToomas Soome #include "edonr_zfs.c"
106199767f8SToomas Soome 
107199767f8SToomas Soome static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
108199767f8SToomas Soome 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
109199767f8SToomas Soome 	{{NULL, NULL}, NULL, NULL, 0, "on"},
110199767f8SToomas Soome 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
111199767f8SToomas Soome 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
112199767f8SToomas Soome 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
113199767f8SToomas Soome 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
114199767f8SToomas Soome 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
115199767f8SToomas Soome 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
116199767f8SToomas Soome 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
117199767f8SToomas Soome 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
118199767f8SToomas Soome 	    0, "fletcher2"},
119199767f8SToomas Soome 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
120199767f8SToomas Soome 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
121199767f8SToomas Soome 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
122199767f8SToomas Soome 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
123199767f8SToomas Soome 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
124199767f8SToomas Soome 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
125199767f8SToomas Soome 	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
126199767f8SToomas Soome 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
127199767f8SToomas Soome 	    0, "noparity"},
128199767f8SToomas Soome 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
129199767f8SToomas Soome 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
130199767f8SToomas Soome 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
131199767f8SToomas Soome 	/* no skein and edonr for now */
1324a04e8dbSToomas Soome 	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
1334a04e8dbSToomas Soome 	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
1344a04e8dbSToomas Soome 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
1354a04e8dbSToomas Soome 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
1364a04e8dbSToomas Soome 	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
1374a04e8dbSToomas Soome 	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
1384a04e8dbSToomas Soome 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
1394a04e8dbSToomas Soome 	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
140199767f8SToomas Soome };
141199767f8SToomas Soome 
/*
 * Shared signatures for zio compress/decompress routines.  The trailing
 * int receives the table entry's ci_level value.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/* gzip decompressor; defined outside this file. */
extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Description of a single compression algorithm.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
161199767f8SToomas Soome 
162199767f8SToomas Soome #include "lzjb.c"
163199767f8SToomas Soome #include "zle.c"
164199767f8SToomas Soome 
165199767f8SToomas Soome /*
166199767f8SToomas Soome  * Compression vectors.
167199767f8SToomas Soome  */
168199767f8SToomas Soome static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
169199767f8SToomas Soome 	{NULL,			NULL,			0,	"inherit"},
170199767f8SToomas Soome 	{NULL,			NULL,			0,	"on"},
171199767f8SToomas Soome 	{NULL,			NULL,			0,	"uncompressed"},
172199767f8SToomas Soome 	{NULL,			lzjb_decompress,	0,	"lzjb"},
173199767f8SToomas Soome 	{NULL,			NULL,			0,	"empty"},
174199767f8SToomas Soome 	{NULL,			gzip_decompress,	1,	"gzip-1"},
175199767f8SToomas Soome 	{NULL,			gzip_decompress,	2,	"gzip-2"},
176199767f8SToomas Soome 	{NULL,			gzip_decompress,	3,	"gzip-3"},
177199767f8SToomas Soome 	{NULL,			gzip_decompress,	4,	"gzip-4"},
178199767f8SToomas Soome 	{NULL,			gzip_decompress,	5,	"gzip-5"},
179199767f8SToomas Soome 	{NULL,			gzip_decompress,	6,	"gzip-6"},
180199767f8SToomas Soome 	{NULL,			gzip_decompress,	7,	"gzip-7"},
181199767f8SToomas Soome 	{NULL,			gzip_decompress,	8,	"gzip-8"},
182199767f8SToomas Soome 	{NULL,			gzip_decompress,	9,	"gzip-9"},
183199767f8SToomas Soome 	{NULL,			zle_decompress,		64,	"zle"},
184199767f8SToomas Soome 	{NULL,			lz4_decompress,		0,	"lz4"},
185199767f8SToomas Soome };
186199767f8SToomas Soome 
/*
 * Byte-swap, in place, every 64-bit word in the buffer.
 *
 * vbuf: buffer to convert; treated as an array of uint64_t.
 * size: buffer length in bytes; expected to be a multiple of 8 (any
 *       trailing partial word is left untouched).
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;

	ASSERT((size & 7) == 0);

	/* Index with size_t so the comparison against count is unsigned. */
	for (size_t i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
199199767f8SToomas Soome 
200199767f8SToomas Soome /*
201199767f8SToomas Soome  * Set the external verifier for a gang block based on <vdev, offset, txg>,
202199767f8SToomas Soome  * a tuple which is guaranteed to be unique for the life of the pool.
203199767f8SToomas Soome  */
204199767f8SToomas Soome static void
zio_checksum_gang_verifier(zio_cksum_t * zcp,const blkptr_t * bp)205199767f8SToomas Soome zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
206199767f8SToomas Soome {
207199767f8SToomas Soome 	const dva_t *dva = BP_IDENTITY(bp);
208199767f8SToomas Soome 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
209199767f8SToomas Soome 
210199767f8SToomas Soome 	ASSERT(BP_IS_GANG(bp));
211199767f8SToomas Soome 
212199767f8SToomas Soome 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
213199767f8SToomas Soome }
214199767f8SToomas Soome 
215199767f8SToomas Soome /*
216199767f8SToomas Soome  * Set the external verifier for a label block based on its offset.
217199767f8SToomas Soome  * The vdev is implicit, and the txg is unknowable at pool open time --
218199767f8SToomas Soome  * hence the logic in vdev_uberblock_load() to find the most recent copy.
219199767f8SToomas Soome  */
220199767f8SToomas Soome static void
zio_checksum_label_verifier(zio_cksum_t * zcp,uint64_t offset)221199767f8SToomas Soome zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
222199767f8SToomas Soome {
223199767f8SToomas Soome 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
224199767f8SToomas Soome }
225199767f8SToomas Soome 
226199767f8SToomas Soome /*
227199767f8SToomas Soome  * Calls the template init function of a checksum which supports context
228199767f8SToomas Soome  * templates and installs the template into the spa_t.
229199767f8SToomas Soome  */
230199767f8SToomas Soome static void
zio_checksum_template_init(enum zio_checksum checksum,spa_t * spa)2314a04e8dbSToomas Soome zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
232199767f8SToomas Soome {
233199767f8SToomas Soome 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
234199767f8SToomas Soome 
235199767f8SToomas Soome 	if (ci->ci_tmpl_init == NULL)
236199767f8SToomas Soome 		return;
2374a04e8dbSToomas Soome 
238199767f8SToomas Soome 	if (spa->spa_cksum_tmpls[checksum] != NULL)
239199767f8SToomas Soome 		return;
240199767f8SToomas Soome 
241199767f8SToomas Soome 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
242199767f8SToomas Soome 		spa->spa_cksum_tmpls[checksum] =
243199767f8SToomas Soome 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
244199767f8SToomas Soome 	}
2454a04e8dbSToomas Soome }
2464a04e8dbSToomas Soome 
2474a04e8dbSToomas Soome /*
2484a04e8dbSToomas Soome  * Called by a spa_t that's about to be deallocated. This steps through
2494a04e8dbSToomas Soome  * all of the checksum context templates and deallocates any that were
2504a04e8dbSToomas Soome  * initialized using the algorithm-specific template init function.
2514a04e8dbSToomas Soome  */
2524a04e8dbSToomas Soome void
zio_checksum_templates_free(spa_t * spa)2534a04e8dbSToomas Soome zio_checksum_templates_free(spa_t *spa)
2544a04e8dbSToomas Soome {
2554a04e8dbSToomas Soome 	for (enum zio_checksum checksum = 0;
2564a04e8dbSToomas Soome 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
2574a04e8dbSToomas Soome 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
2584a04e8dbSToomas Soome 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
2594a04e8dbSToomas Soome 
2604a04e8dbSToomas Soome 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
2614a04e8dbSToomas Soome 			spa->spa_cksum_tmpls[checksum] = NULL;
2624a04e8dbSToomas Soome 		}
2634a04e8dbSToomas Soome 	}
264199767f8SToomas Soome }
265199767f8SToomas Soome 
266199767f8SToomas Soome static int
zio_checksum_verify(const spa_t * spa,const blkptr_t * bp,void * data)2674a04e8dbSToomas Soome zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
268199767f8SToomas Soome {
269199767f8SToomas Soome 	uint64_t size;
270199767f8SToomas Soome 	unsigned int checksum;
271199767f8SToomas Soome 	zio_checksum_info_t *ci;
2724a04e8dbSToomas Soome 	void *ctx = NULL;
273199767f8SToomas Soome 	zio_cksum_t actual_cksum, expected_cksum, verifier;
274199767f8SToomas Soome 	int byteswap;
275199767f8SToomas Soome 
276199767f8SToomas Soome 	checksum = BP_GET_CHECKSUM(bp);
277199767f8SToomas Soome 	size = BP_GET_PSIZE(bp);
278199767f8SToomas Soome 
279199767f8SToomas Soome 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
280199767f8SToomas Soome 		return (EINVAL);
281199767f8SToomas Soome 	ci = &zio_checksum_table[checksum];
282199767f8SToomas Soome 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
283199767f8SToomas Soome 		return (EINVAL);
284199767f8SToomas Soome 
2854a04e8dbSToomas Soome 	if (spa != NULL) {
2867bbcfb41SToomas Soome 		zio_checksum_template_init(checksum, (spa_t *)spa);
2874a04e8dbSToomas Soome 		ctx = spa->spa_cksum_tmpls[checksum];
2884a04e8dbSToomas Soome 	}
2894a04e8dbSToomas Soome 
290199767f8SToomas Soome 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
291199767f8SToomas Soome 		zio_eck_t *eck;
292199767f8SToomas Soome 
293199767f8SToomas Soome 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
294199767f8SToomas Soome 		    checksum == ZIO_CHECKSUM_LABEL);
295199767f8SToomas Soome 
296199767f8SToomas Soome 		eck = (zio_eck_t *)((char *)data + size) - 1;
297199767f8SToomas Soome 
298199767f8SToomas Soome 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
299199767f8SToomas Soome 			zio_checksum_gang_verifier(&verifier, bp);
300199767f8SToomas Soome 		else if (checksum == ZIO_CHECKSUM_LABEL)
301199767f8SToomas Soome 			zio_checksum_label_verifier(&verifier,
302199767f8SToomas Soome 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
303199767f8SToomas Soome 		else
304199767f8SToomas Soome 			verifier = bp->blk_cksum;
305199767f8SToomas Soome 
306199767f8SToomas Soome 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
307199767f8SToomas Soome 
308199767f8SToomas Soome 		if (byteswap)
309199767f8SToomas Soome 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
310199767f8SToomas Soome 
311199767f8SToomas Soome 		expected_cksum = eck->zec_cksum;
312199767f8SToomas Soome 		eck->zec_cksum = verifier;
3134a04e8dbSToomas Soome 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
314199767f8SToomas Soome 		eck->zec_cksum = expected_cksum;
315199767f8SToomas Soome 
316199767f8SToomas Soome 		if (byteswap)
317199767f8SToomas Soome 			byteswap_uint64_array(&expected_cksum,
318199767f8SToomas Soome 			    sizeof (zio_cksum_t));
319199767f8SToomas Soome 	} else {
320ece0bc84SToomas Soome 		byteswap = BP_SHOULD_BYTESWAP(bp);
321199767f8SToomas Soome 		expected_cksum = bp->blk_cksum;
322ece0bc84SToomas Soome 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
323199767f8SToomas Soome 	}
324199767f8SToomas Soome 
325199767f8SToomas Soome 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
3264a04e8dbSToomas Soome 		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
327199767f8SToomas Soome 		return (EIO);
328199767f8SToomas Soome 	}
329199767f8SToomas Soome 
330199767f8SToomas Soome 	return (0);
331199767f8SToomas Soome }
332199767f8SToomas Soome 
333199767f8SToomas Soome static int
zio_decompress_data(int cpfunc,void * src,uint64_t srcsize,void * dest,uint64_t destsize)334199767f8SToomas Soome zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
3357bbcfb41SToomas Soome     void *dest, uint64_t destsize)
336199767f8SToomas Soome {
337199767f8SToomas Soome 	zio_compress_info_t *ci;
338199767f8SToomas Soome 
339199767f8SToomas Soome 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
340199767f8SToomas Soome 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
341199767f8SToomas Soome 		return (EIO);
342199767f8SToomas Soome 	}
343199767f8SToomas Soome 
344199767f8SToomas Soome 	ci = &zio_compress_table[cpfunc];
345199767f8SToomas Soome 	if (!ci->ci_decompress) {
346199767f8SToomas Soome 		printf("ZFS: unsupported compression algorithm %s\n",
347199767f8SToomas Soome 		    ci->ci_name);
348199767f8SToomas Soome 		return (EIO);
349199767f8SToomas Soome 	}
350199767f8SToomas Soome 
351199767f8SToomas Soome 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
352199767f8SToomas Soome }
353199767f8SToomas Soome 
354199767f8SToomas Soome static uint64_t
zap_hash(uint64_t salt,const char * name)355199767f8SToomas Soome zap_hash(uint64_t salt, const char *name)
356199767f8SToomas Soome {
357199767f8SToomas Soome 	const uint8_t *cp;
358199767f8SToomas Soome 	uint8_t c;
359199767f8SToomas Soome 	uint64_t crc = salt;
360199767f8SToomas Soome 
361199767f8SToomas Soome 	ASSERT(crc != 0);
362199767f8SToomas Soome 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
363199767f8SToomas Soome 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
364199767f8SToomas Soome 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
365199767f8SToomas Soome 
366199767f8SToomas Soome 	/*
367199767f8SToomas Soome 	 * Only use 28 bits, since we need 4 bits in the cookie for the
368199767f8SToomas Soome 	 * collision differentiator.  We MUST use the high bits, since
369199767f8SToomas Soome 	 * those are the onces that we first pay attention to when
370199767f8SToomas Soome 	 * chosing the bucket.
371199767f8SToomas Soome 	 */
372199767f8SToomas Soome 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
373199767f8SToomas Soome 
374199767f8SToomas Soome 	return (crc);
375199767f8SToomas Soome }
376199767f8SToomas Soome 
/*
 * Per-column state of a RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Layout of one RAID-Z I/O across its columns.  rm_col is declared with
 * a single element but is used as a variable-length trailing array.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
402199767f8SToomas Soome 
/* Indices of the parity columns within raidz_map_t.rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Field multiplication of a single byte value by 2 and by 4.  The result
 * is not truncated to 8 bits; callers narrow it as needed.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * The same multiplication applied to all eight bytes of a 64-bit value
 * at once rather than a byte at a time.  A mask is derived from the top
 * bit of each byte and used to conditionally apply the XOR of 0x1d to
 * that byte.  Both x and mask are overwritten; mask is scratch space.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/* Multiply by 4: two successive multiplications by 2. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
429199767f8SToomas Soome 
/*
 * Lookup tables for the field used by the RAID-Z parity code.
 * vdev_raidz_pow2[i] is the result of multiplying 1 by 2, i times, with
 * VDEV_RAIDZ_MUL_2 (kept to 8 bits); vdev_raidz_log2[] is the inverse
 * mapping.  Entry 0 of the log table is a placeholder.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};

static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
502199767f8SToomas Soome 
503199767f8SToomas Soome /*
504199767f8SToomas Soome  * Multiply a given number by 2 raised to the given power.
505199767f8SToomas Soome  */
506199767f8SToomas Soome static uint8_t
vdev_raidz_exp2(uint8_t a,int exp)507199767f8SToomas Soome vdev_raidz_exp2(uint8_t a, int exp)
508199767f8SToomas Soome {
509199767f8SToomas Soome 	if (a == 0)
510199767f8SToomas Soome 		return (0);
511199767f8SToomas Soome 
512199767f8SToomas Soome 	ASSERT(exp >= 0);
513199767f8SToomas Soome 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
514199767f8SToomas Soome 
515199767f8SToomas Soome 	exp += vdev_raidz_log2[a];
516199767f8SToomas Soome 	if (exp > 255)
517199767f8SToomas Soome 		exp -= 255;
518199767f8SToomas Soome 
519199767f8SToomas Soome 	return (vdev_raidz_pow2[exp]);
520199767f8SToomas Soome }
521199767f8SToomas Soome 
522199767f8SToomas Soome static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)523199767f8SToomas Soome vdev_raidz_generate_parity_p(raidz_map_t *rm)
524199767f8SToomas Soome {
525199767f8SToomas Soome 	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
526199767f8SToomas Soome 	int c;
527199767f8SToomas Soome 
528199767f8SToomas Soome 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
529199767f8SToomas Soome 
530199767f8SToomas Soome 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
531199767f8SToomas Soome 		src = rm->rm_col[c].rc_data;
532199767f8SToomas Soome 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
533199767f8SToomas Soome 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
534199767f8SToomas Soome 
535199767f8SToomas Soome 		if (c == rm->rm_firstdatacol) {
536199767f8SToomas Soome 			ASSERT(ccount == pcount);
537199767f8SToomas Soome 			for (i = 0; i < ccount; i++, src++, p++) {
538199767f8SToomas Soome 				*p = *src;
539199767f8SToomas Soome 			}
540199767f8SToomas Soome 		} else {
541199767f8SToomas Soome 			ASSERT(ccount <= pcount);
542199767f8SToomas Soome 			for (i = 0; i < ccount; i++, src++, p++) {
543199767f8SToomas Soome 				*p ^= *src;
544199767f8SToomas Soome 			}
545199767f8SToomas Soome 		}
546199767f8SToomas Soome 	}
547199767f8SToomas Soome }
548199767f8SToomas Soome 
549199767f8SToomas Soome static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)550199767f8SToomas Soome vdev_raidz_generate_parity_pq(raidz_map_t *rm)
551199767f8SToomas Soome {
552199767f8SToomas Soome 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
553199767f8SToomas Soome 	int c;
554199767f8SToomas Soome 
555199767f8SToomas Soome 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
556199767f8SToomas Soome 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
557199767f8SToomas Soome 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
558199767f8SToomas Soome 
559199767f8SToomas Soome 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
560199767f8SToomas Soome 		src = rm->rm_col[c].rc_data;
561199767f8SToomas Soome 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
562199767f8SToomas Soome 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
563199767f8SToomas Soome 
564199767f8SToomas Soome 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
565199767f8SToomas Soome 
566199767f8SToomas Soome 		if (c == rm->rm_firstdatacol) {
567199767f8SToomas Soome 			ASSERT(ccnt == pcnt || ccnt == 0);
568199767f8SToomas Soome 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
569199767f8SToomas Soome 				*p = *src;
570199767f8SToomas Soome 				*q = *src;
571199767f8SToomas Soome 			}
572199767f8SToomas Soome 			for (; i < pcnt; i++, src++, p++, q++) {
573199767f8SToomas Soome 				*p = 0;
574199767f8SToomas Soome 				*q = 0;
575199767f8SToomas Soome 			}
576199767f8SToomas Soome 		} else {
577199767f8SToomas Soome 			ASSERT(ccnt <= pcnt);
578199767f8SToomas Soome 
579199767f8SToomas Soome 			/*
580199767f8SToomas Soome 			 * Apply the algorithm described above by multiplying
581199767f8SToomas Soome 			 * the previous result and adding in the new value.
582199767f8SToomas Soome 			 */
583199767f8SToomas Soome 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
584199767f8SToomas Soome 				*p ^= *src;
585199767f8SToomas Soome 
586199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_2(*q, mask);
587199767f8SToomas Soome 				*q ^= *src;
588199767f8SToomas Soome 			}
589199767f8SToomas Soome 
590199767f8SToomas Soome 			/*
591199767f8SToomas Soome 			 * Treat short columns as though they are full of 0s.
592199767f8SToomas Soome 			 * Note that there's therefore nothing needed for P.
593199767f8SToomas Soome 			 */
594199767f8SToomas Soome 			for (; i < pcnt; i++, q++) {
595199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_2(*q, mask);
596199767f8SToomas Soome 			}
597199767f8SToomas Soome 		}
598199767f8SToomas Soome 	}
599199767f8SToomas Soome }
600199767f8SToomas Soome 
601199767f8SToomas Soome static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)602199767f8SToomas Soome vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
603199767f8SToomas Soome {
604199767f8SToomas Soome 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
605199767f8SToomas Soome 	int c;
606199767f8SToomas Soome 
607199767f8SToomas Soome 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
608199767f8SToomas Soome 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
609199767f8SToomas Soome 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
610199767f8SToomas Soome 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
611199767f8SToomas Soome 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
612199767f8SToomas Soome 
613199767f8SToomas Soome 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
614199767f8SToomas Soome 		src = rm->rm_col[c].rc_data;
615199767f8SToomas Soome 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
616199767f8SToomas Soome 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
617199767f8SToomas Soome 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
618199767f8SToomas Soome 
619199767f8SToomas Soome 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
620199767f8SToomas Soome 
621199767f8SToomas Soome 		if (c == rm->rm_firstdatacol) {
622199767f8SToomas Soome 			ASSERT(ccnt == pcnt || ccnt == 0);
623199767f8SToomas Soome 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
624199767f8SToomas Soome 				*p = *src;
625199767f8SToomas Soome 				*q = *src;
626199767f8SToomas Soome 				*r = *src;
627199767f8SToomas Soome 			}
628199767f8SToomas Soome 			for (; i < pcnt; i++, src++, p++, q++, r++) {
629199767f8SToomas Soome 				*p = 0;
630199767f8SToomas Soome 				*q = 0;
631199767f8SToomas Soome 				*r = 0;
632199767f8SToomas Soome 			}
633199767f8SToomas Soome 		} else {
634199767f8SToomas Soome 			ASSERT(ccnt <= pcnt);
635199767f8SToomas Soome 
636199767f8SToomas Soome 			/*
637199767f8SToomas Soome 			 * Apply the algorithm described above by multiplying
638199767f8SToomas Soome 			 * the previous result and adding in the new value.
639199767f8SToomas Soome 			 */
640199767f8SToomas Soome 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
641199767f8SToomas Soome 				*p ^= *src;
642199767f8SToomas Soome 
643199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_2(*q, mask);
644199767f8SToomas Soome 				*q ^= *src;
645199767f8SToomas Soome 
646199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_4(*r, mask);
647199767f8SToomas Soome 				*r ^= *src;
648199767f8SToomas Soome 			}
649199767f8SToomas Soome 
650199767f8SToomas Soome 			/*
651199767f8SToomas Soome 			 * Treat short columns as though they are full of 0s.
652199767f8SToomas Soome 			 * Note that there's therefore nothing needed for P.
653199767f8SToomas Soome 			 */
654199767f8SToomas Soome 			for (; i < pcnt; i++, q++, r++) {
655199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_2(*q, mask);
656199767f8SToomas Soome 				VDEV_RAIDZ_64MUL_4(*r, mask);
657199767f8SToomas Soome 			}
658199767f8SToomas Soome 		}
659199767f8SToomas Soome 	}
660199767f8SToomas Soome }
661199767f8SToomas Soome 
662199767f8SToomas Soome /*
663199767f8SToomas Soome  * Generate RAID parity in the first virtual columns according to the number of
664199767f8SToomas Soome  * parity columns available.
665199767f8SToomas Soome  */
666199767f8SToomas Soome static void
vdev_raidz_generate_parity(raidz_map_t * rm)667199767f8SToomas Soome vdev_raidz_generate_parity(raidz_map_t *rm)
668199767f8SToomas Soome {
669199767f8SToomas Soome 	switch (rm->rm_firstdatacol) {
670199767f8SToomas Soome 	case 1:
671199767f8SToomas Soome 		vdev_raidz_generate_parity_p(rm);
672199767f8SToomas Soome 		break;
673199767f8SToomas Soome 	case 2:
674199767f8SToomas Soome 		vdev_raidz_generate_parity_pq(rm);
675199767f8SToomas Soome 		break;
676199767f8SToomas Soome 	case 3:
677199767f8SToomas Soome 		vdev_raidz_generate_parity_pqr(rm);
678199767f8SToomas Soome 		break;
679199767f8SToomas Soome 	default:
680199767f8SToomas Soome 		panic("invalid RAID-Z configuration");
681199767f8SToomas Soome 	}
682199767f8SToomas Soome }
683199767f8SToomas Soome 
684199767f8SToomas Soome /* BEGIN CSTYLED */
685199767f8SToomas Soome /*
686199767f8SToomas Soome  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
688199767f8SToomas Soome  * the contents of the data and parity disks. This can be expressed with
689199767f8SToomas Soome  * vectors for the original data (D) and the actual data (d) and parity (p)
690199767f8SToomas Soome  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
691199767f8SToomas Soome  *
692199767f8SToomas Soome  *            __   __                     __     __
693199767f8SToomas Soome  *            |     |         __     __   |  p_0  |
694199767f8SToomas Soome  *            |  V  |         |  D_0  |   | p_m-1 |
695199767f8SToomas Soome  *            |     |    x    |   :   | = |  d_0  |
696199767f8SToomas Soome  *            |  I  |         | D_n-1 |   |   :   |
697199767f8SToomas Soome  *            |     |         ~~     ~~   | d_n-1 |
698199767f8SToomas Soome  *            ~~   ~~                     ~~     ~~
699199767f8SToomas Soome  *
700199767f8SToomas Soome  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
702199767f8SToomas Soome  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
703199767f8SToomas Soome  * computation as well as linear separability.
704199767f8SToomas Soome  *
705199767f8SToomas Soome  *      __               __               __     __
706199767f8SToomas Soome  *      |   1   ..  1 1 1 |               |  p_0  |
707199767f8SToomas Soome  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
708199767f8SToomas Soome  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
709199767f8SToomas Soome  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
710199767f8SToomas Soome  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
711199767f8SToomas Soome  *      |   :       : : : |   |   :   |   |  d_2  |
712199767f8SToomas Soome  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
713199767f8SToomas Soome  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
714199767f8SToomas Soome  *      |   0   ..  0 0 1 |               | d_n-1 |
715199767f8SToomas Soome  *      ~~               ~~               ~~     ~~
716199767f8SToomas Soome  *
717199767f8SToomas Soome  * Note that I, V, d, and p are known. To compute D, we must invert the
718199767f8SToomas Soome  * matrix and use the known data and parity values to reconstruct the unknown
719199767f8SToomas Soome  * data values. We begin by removing the rows in V|I and d|p that correspond
720199767f8SToomas Soome  * to failed or missing columns; we then make V|I square (n x n) and d|p
721199767f8SToomas Soome  * sized n by removing rows corresponding to unused parity from the bottom up
722199767f8SToomas Soome  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
723199767f8SToomas Soome  * using Gauss-Jordan elimination. In the example below we use m=3 parity
724199767f8SToomas Soome  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
725199767f8SToomas Soome  *           __                               __
726199767f8SToomas Soome  *           |  1   1   1   1   1   1   1   1  |
727199767f8SToomas Soome  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
728199767f8SToomas Soome  *           |  19 205 116  29  64  16  4   1  |      / /
729199767f8SToomas Soome  *           |  1   0   0   0   0   0   0   0  |     / /
730199767f8SToomas Soome  *           |  0   1   0   0   0   0   0   0  | <--' /
731199767f8SToomas Soome  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
732199767f8SToomas Soome  *           |  0   0   0   1   0   0   0   0  |
733199767f8SToomas Soome  *           |  0   0   0   0   1   0   0   0  |
734199767f8SToomas Soome  *           |  0   0   0   0   0   1   0   0  |
735199767f8SToomas Soome  *           |  0   0   0   0   0   0   1   0  |
736199767f8SToomas Soome  *           |  0   0   0   0   0   0   0   1  |
737199767f8SToomas Soome  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
751199767f8SToomas Soome  *
752199767f8SToomas Soome  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
753199767f8SToomas Soome  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
754199767f8SToomas Soome  * matrix is not singular.
755199767f8SToomas Soome  * __                                                                 __
756199767f8SToomas Soome  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
757199767f8SToomas Soome  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
758199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
759199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
760199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
761199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
762199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
763199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
764199767f8SToomas Soome  * ~~                                                                 ~~
765199767f8SToomas Soome  * __                                                                 __
766199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
767199767f8SToomas Soome  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
768199767f8SToomas Soome  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
769199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
770199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
771199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
772199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
773199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
774199767f8SToomas Soome  * ~~                                                                 ~~
775199767f8SToomas Soome  * __                                                                 __
776199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
777199767f8SToomas Soome  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
778199767f8SToomas Soome  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
779199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
780199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
781199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
782199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
783199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
784199767f8SToomas Soome  * ~~                                                                 ~~
785199767f8SToomas Soome  * __                                                                 __
786199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
787199767f8SToomas Soome  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
788199767f8SToomas Soome  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
789199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
790199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
791199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
792199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
793199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
794199767f8SToomas Soome  * ~~                                                                 ~~
795199767f8SToomas Soome  * __                                                                 __
796199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
797199767f8SToomas Soome  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
798199767f8SToomas Soome  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
799199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
800199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
801199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
802199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
803199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
804199767f8SToomas Soome  * ~~                                                                 ~~
805199767f8SToomas Soome  * __                                                                 __
806199767f8SToomas Soome  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
807199767f8SToomas Soome  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
808199767f8SToomas Soome  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
809199767f8SToomas Soome  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
810199767f8SToomas Soome  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
811199767f8SToomas Soome  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
812199767f8SToomas Soome  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
813199767f8SToomas Soome  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
814199767f8SToomas Soome  * ~~                                                                 ~~
815199767f8SToomas Soome  *                   __                               __
816199767f8SToomas Soome  *                   |  0   0   1   0   0   0   0   0  |
817199767f8SToomas Soome  *                   | 167 100  5   41 159 169 217 208 |
818199767f8SToomas Soome  *                   | 166 100  4   40 158 168 216 209 |
819199767f8SToomas Soome  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
820199767f8SToomas Soome  *                   |  0   0   0   0   1   0   0   0  |
821199767f8SToomas Soome  *                   |  0   0   0   0   0   1   0   0  |
822199767f8SToomas Soome  *                   |  0   0   0   0   0   0   1   0  |
823199767f8SToomas Soome  *                   |  0   0   0   0   0   0   0   1  |
824199767f8SToomas Soome  *                   ~~                               ~~
825199767f8SToomas Soome  *
826199767f8SToomas Soome  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
827199767f8SToomas Soome  * of the missing data.
828199767f8SToomas Soome  *
829199767f8SToomas Soome  * As is apparent from the example above, the only non-trivial rows in the
830199767f8SToomas Soome  * inverse matrix correspond to the data disks that we're trying to
831199767f8SToomas Soome  * reconstruct. Indeed, those are the only rows we need as the others would
832199767f8SToomas Soome  * only be useful for reconstructing data known or assumed to be valid. For
833199767f8SToomas Soome  * that reason, we only build the coefficients in the rows that correspond to
834199767f8SToomas Soome  * targeted columns.
835199767f8SToomas Soome  */
836199767f8SToomas Soome /* END CSTYLED */
837199767f8SToomas Soome 
838199767f8SToomas Soome static void
vdev_raidz_matrix_init(raidz_map_t * rm __unused,int n,int nmap,int * map,uint8_t ** rows)8398eef2ab6SToomas Soome vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
840199767f8SToomas Soome     uint8_t **rows)
841199767f8SToomas Soome {
842199767f8SToomas Soome 	int i, j;
843199767f8SToomas Soome 	int pow;
844199767f8SToomas Soome 
845199767f8SToomas Soome 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
846199767f8SToomas Soome 
847199767f8SToomas Soome 	/*
848199767f8SToomas Soome 	 * Fill in the missing rows of interest.
849199767f8SToomas Soome 	 */
850199767f8SToomas Soome 	for (i = 0; i < nmap; i++) {
851199767f8SToomas Soome 		ASSERT3S(0, <=, map[i]);
852199767f8SToomas Soome 		ASSERT3S(map[i], <=, 2);
853199767f8SToomas Soome 
854199767f8SToomas Soome 		pow = map[i] * n;
855199767f8SToomas Soome 		if (pow > 255)
856199767f8SToomas Soome 			pow -= 255;
857199767f8SToomas Soome 		ASSERT(pow <= 255);
858199767f8SToomas Soome 
859199767f8SToomas Soome 		for (j = 0; j < n; j++) {
860199767f8SToomas Soome 			pow -= map[i];
861199767f8SToomas Soome 			if (pow < 0)
862199767f8SToomas Soome 				pow += 255;
863199767f8SToomas Soome 			rows[i][j] = vdev_raidz_pow2[pow];
864199767f8SToomas Soome 		}
865199767f8SToomas Soome 	}
866199767f8SToomas Soome }
867199767f8SToomas Soome 
868199767f8SToomas Soome static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)869199767f8SToomas Soome vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
870199767f8SToomas Soome     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
871199767f8SToomas Soome {
872199767f8SToomas Soome 	int i, j, ii, jj;
873199767f8SToomas Soome 	uint8_t log;
874199767f8SToomas Soome 
875199767f8SToomas Soome 	/*
876199767f8SToomas Soome 	 * Assert that the first nmissing entries from the array of used
877199767f8SToomas Soome 	 * columns correspond to parity columns and that subsequent entries
878199767f8SToomas Soome 	 * correspond to data columns.
879199767f8SToomas Soome 	 */
880199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
881199767f8SToomas Soome 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
882199767f8SToomas Soome 	}
883199767f8SToomas Soome 	for (; i < n; i++) {
884199767f8SToomas Soome 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
885199767f8SToomas Soome 	}
886199767f8SToomas Soome 
887199767f8SToomas Soome 	/*
888199767f8SToomas Soome 	 * First initialize the storage where we'll compute the inverse rows.
889199767f8SToomas Soome 	 */
890199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
891199767f8SToomas Soome 		for (j = 0; j < n; j++) {
892199767f8SToomas Soome 			invrows[i][j] = (i == j) ? 1 : 0;
893199767f8SToomas Soome 		}
894199767f8SToomas Soome 	}
895199767f8SToomas Soome 
896199767f8SToomas Soome 	/*
897199767f8SToomas Soome 	 * Subtract all trivial rows from the rows of consequence.
898199767f8SToomas Soome 	 */
899199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
900199767f8SToomas Soome 		for (j = nmissing; j < n; j++) {
901199767f8SToomas Soome 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
902199767f8SToomas Soome 			jj = used[j] - rm->rm_firstdatacol;
903199767f8SToomas Soome 			ASSERT3S(jj, <, n);
904199767f8SToomas Soome 			invrows[i][j] = rows[i][jj];
905199767f8SToomas Soome 			rows[i][jj] = 0;
906199767f8SToomas Soome 		}
907199767f8SToomas Soome 	}
908199767f8SToomas Soome 
909199767f8SToomas Soome 	/*
910199767f8SToomas Soome 	 * For each of the rows of interest, we must normalize it and subtract
911199767f8SToomas Soome 	 * a multiple of it from the other rows.
912199767f8SToomas Soome 	 */
913199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
914199767f8SToomas Soome 		for (j = 0; j < missing[i]; j++) {
915199767f8SToomas Soome 			ASSERT3U(rows[i][j], ==, 0);
916199767f8SToomas Soome 		}
917199767f8SToomas Soome 		ASSERT3U(rows[i][missing[i]], !=, 0);
918199767f8SToomas Soome 
919199767f8SToomas Soome 		/*
920199767f8SToomas Soome 		 * Compute the inverse of the first element and multiply each
921199767f8SToomas Soome 		 * element in the row by that value.
922199767f8SToomas Soome 		 */
923199767f8SToomas Soome 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
924199767f8SToomas Soome 
925199767f8SToomas Soome 		for (j = 0; j < n; j++) {
926199767f8SToomas Soome 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
927199767f8SToomas Soome 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
928199767f8SToomas Soome 		}
929199767f8SToomas Soome 
930199767f8SToomas Soome 		for (ii = 0; ii < nmissing; ii++) {
931199767f8SToomas Soome 			if (i == ii)
932199767f8SToomas Soome 				continue;
933199767f8SToomas Soome 
934199767f8SToomas Soome 			ASSERT3U(rows[ii][missing[i]], !=, 0);
935199767f8SToomas Soome 
936199767f8SToomas Soome 			log = vdev_raidz_log2[rows[ii][missing[i]]];
937199767f8SToomas Soome 
938199767f8SToomas Soome 			for (j = 0; j < n; j++) {
939199767f8SToomas Soome 				rows[ii][j] ^=
940199767f8SToomas Soome 				    vdev_raidz_exp2(rows[i][j], log);
941199767f8SToomas Soome 				invrows[ii][j] ^=
942199767f8SToomas Soome 				    vdev_raidz_exp2(invrows[i][j], log);
943199767f8SToomas Soome 			}
944199767f8SToomas Soome 		}
945199767f8SToomas Soome 	}
946199767f8SToomas Soome 
947199767f8SToomas Soome 	/*
948199767f8SToomas Soome 	 * Verify that the data that is left in the rows are properly part of
949199767f8SToomas Soome 	 * an identity matrix.
950199767f8SToomas Soome 	 */
951199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
952199767f8SToomas Soome 		for (j = 0; j < n; j++) {
953199767f8SToomas Soome 			if (j == missing[i]) {
954199767f8SToomas Soome 				ASSERT3U(rows[i][j], ==, 1);
955199767f8SToomas Soome 			} else {
956199767f8SToomas Soome 				ASSERT3U(rows[i][j], ==, 0);
957199767f8SToomas Soome 			}
958199767f8SToomas Soome 		}
959199767f8SToomas Soome 	}
960199767f8SToomas Soome }
961199767f8SToomas Soome 
962199767f8SToomas Soome static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)963199767f8SToomas Soome vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
964199767f8SToomas Soome     int *missing, uint8_t **invrows, const uint8_t *used)
965199767f8SToomas Soome {
966199767f8SToomas Soome 	int i, j, x, cc, c;
967199767f8SToomas Soome 	uint8_t *src;
968199767f8SToomas Soome 	uint64_t ccount;
969199767f8SToomas Soome 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
970199767f8SToomas Soome 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
971199767f8SToomas Soome 	uint8_t log, val;
972199767f8SToomas Soome 	int ll;
973199767f8SToomas Soome 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
974199767f8SToomas Soome 	uint8_t *p, *pp;
975199767f8SToomas Soome 	size_t psize;
976199767f8SToomas Soome 
977199767f8SToomas Soome 	log = 0;	/* gcc */
978199767f8SToomas Soome 	psize = sizeof (invlog[0][0]) * n * nmissing;
979*3e8c7f16SToomas Soome 	p = malloc(psize);
980*3e8c7f16SToomas Soome 	if (p == NULL) {
981*3e8c7f16SToomas Soome 		printf("Out of memory\n");
982*3e8c7f16SToomas Soome 		return;
983*3e8c7f16SToomas Soome 	}
984199767f8SToomas Soome 
985199767f8SToomas Soome 	for (pp = p, i = 0; i < nmissing; i++) {
986199767f8SToomas Soome 		invlog[i] = pp;
987199767f8SToomas Soome 		pp += n;
988199767f8SToomas Soome 	}
989199767f8SToomas Soome 
990199767f8SToomas Soome 	for (i = 0; i < nmissing; i++) {
991199767f8SToomas Soome 		for (j = 0; j < n; j++) {
992199767f8SToomas Soome 			ASSERT3U(invrows[i][j], !=, 0);
993199767f8SToomas Soome 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
994199767f8SToomas Soome 		}
995199767f8SToomas Soome 	}
996199767f8SToomas Soome 
997199767f8SToomas Soome 	for (i = 0; i < n; i++) {
998199767f8SToomas Soome 		c = used[i];
999199767f8SToomas Soome 		ASSERT3U(c, <, rm->rm_cols);
1000199767f8SToomas Soome 
1001199767f8SToomas Soome 		src = rm->rm_col[c].rc_data;
1002199767f8SToomas Soome 		ccount = rm->rm_col[c].rc_size;
1003199767f8SToomas Soome 		for (j = 0; j < nmissing; j++) {
1004199767f8SToomas Soome 			cc = missing[j] + rm->rm_firstdatacol;
1005199767f8SToomas Soome 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1006199767f8SToomas Soome 			ASSERT3U(cc, <, rm->rm_cols);
1007199767f8SToomas Soome 			ASSERT3U(cc, !=, c);
1008199767f8SToomas Soome 
1009199767f8SToomas Soome 			dst[j] = rm->rm_col[cc].rc_data;
1010199767f8SToomas Soome 			dcount[j] = rm->rm_col[cc].rc_size;
1011199767f8SToomas Soome 		}
1012199767f8SToomas Soome 
1013199767f8SToomas Soome 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1014199767f8SToomas Soome 
1015199767f8SToomas Soome 		for (x = 0; x < ccount; x++, src++) {
1016199767f8SToomas Soome 			if (*src != 0)
1017199767f8SToomas Soome 				log = vdev_raidz_log2[*src];
1018199767f8SToomas Soome 
1019199767f8SToomas Soome 			for (cc = 0; cc < nmissing; cc++) {
1020199767f8SToomas Soome 				if (x >= dcount[cc])
1021199767f8SToomas Soome 					continue;
1022199767f8SToomas Soome 
1023199767f8SToomas Soome 				if (*src == 0) {
1024199767f8SToomas Soome 					val = 0;
1025199767f8SToomas Soome 				} else {
1026199767f8SToomas Soome 					if ((ll = log + invlog[cc][i]) >= 255)
1027199767f8SToomas Soome 						ll -= 255;
1028199767f8SToomas Soome 					val = vdev_raidz_pow2[ll];
1029199767f8SToomas Soome 				}
1030199767f8SToomas Soome 
1031199767f8SToomas Soome 				if (i == 0)
1032199767f8SToomas Soome 					dst[cc][x] = val;
1033199767f8SToomas Soome 				else
1034199767f8SToomas Soome 					dst[cc][x] ^= val;
1035199767f8SToomas Soome 			}
1036199767f8SToomas Soome 		}
1037199767f8SToomas Soome 	}
1038199767f8SToomas Soome 
1039*3e8c7f16SToomas Soome 	free(p);
1040199767f8SToomas Soome }
1041199767f8SToomas Soome 
1042199767f8SToomas Soome static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)1043199767f8SToomas Soome vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1044199767f8SToomas Soome {
1045199767f8SToomas Soome 	int n, i, c, t, tt;
1046199767f8SToomas Soome 	int nmissing_rows;
1047199767f8SToomas Soome 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1048199767f8SToomas Soome 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1049199767f8SToomas Soome 
1050199767f8SToomas Soome 	uint8_t *p, *pp;
1051199767f8SToomas Soome 	size_t psize;
1052199767f8SToomas Soome 
1053199767f8SToomas Soome 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1054199767f8SToomas Soome 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1055199767f8SToomas Soome 	uint8_t *used;
1056199767f8SToomas Soome 
1057199767f8SToomas Soome 	int code = 0;
1058199767f8SToomas Soome 
1059199767f8SToomas Soome 
1060199767f8SToomas Soome 	n = rm->rm_cols - rm->rm_firstdatacol;
1061199767f8SToomas Soome 
1062199767f8SToomas Soome 	/*
1063199767f8SToomas Soome 	 * Figure out which data columns are missing.
1064199767f8SToomas Soome 	 */
1065199767f8SToomas Soome 	nmissing_rows = 0;
1066199767f8SToomas Soome 	for (t = 0; t < ntgts; t++) {
1067199767f8SToomas Soome 		if (tgts[t] >= rm->rm_firstdatacol) {
1068199767f8SToomas Soome 			missing_rows[nmissing_rows++] =
1069199767f8SToomas Soome 			    tgts[t] - rm->rm_firstdatacol;
1070199767f8SToomas Soome 		}
1071199767f8SToomas Soome 	}
1072199767f8SToomas Soome 
1073199767f8SToomas Soome 	/*
1074199767f8SToomas Soome 	 * Figure out which parity columns to use to help generate the missing
1075199767f8SToomas Soome 	 * data columns.
1076199767f8SToomas Soome 	 */
1077199767f8SToomas Soome 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1078199767f8SToomas Soome 		ASSERT(tt < ntgts);
1079199767f8SToomas Soome 		ASSERT(c < rm->rm_firstdatacol);
1080199767f8SToomas Soome 
1081199767f8SToomas Soome 		/*
1082199767f8SToomas Soome 		 * Skip any targeted parity columns.
1083199767f8SToomas Soome 		 */
1084199767f8SToomas Soome 		if (c == tgts[tt]) {
1085199767f8SToomas Soome 			tt++;
1086199767f8SToomas Soome 			continue;
1087199767f8SToomas Soome 		}
1088199767f8SToomas Soome 
1089199767f8SToomas Soome 		code |= 1 << c;
1090199767f8SToomas Soome 
1091199767f8SToomas Soome 		parity_map[i] = c;
1092199767f8SToomas Soome 		i++;
1093199767f8SToomas Soome 	}
1094199767f8SToomas Soome 
1095199767f8SToomas Soome 	ASSERT(code != 0);
1096199767f8SToomas Soome 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1097199767f8SToomas Soome 
1098199767f8SToomas Soome 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1099199767f8SToomas Soome 	    nmissing_rows * n + sizeof (used[0]) * n;
1100*3e8c7f16SToomas Soome 	p = malloc(psize);
1101*3e8c7f16SToomas Soome 	if (p == NULL) {
1102*3e8c7f16SToomas Soome 		printf("Out of memory\n");
1103*3e8c7f16SToomas Soome 		return (code);
1104*3e8c7f16SToomas Soome 	}
1105199767f8SToomas Soome 
1106199767f8SToomas Soome 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1107199767f8SToomas Soome 		rows[i] = pp;
1108199767f8SToomas Soome 		pp += n;
1109199767f8SToomas Soome 		invrows[i] = pp;
1110199767f8SToomas Soome 		pp += n;
1111199767f8SToomas Soome 	}
1112199767f8SToomas Soome 	used = pp;
1113199767f8SToomas Soome 
1114199767f8SToomas Soome 	for (i = 0; i < nmissing_rows; i++) {
1115199767f8SToomas Soome 		used[i] = parity_map[i];
1116199767f8SToomas Soome 	}
1117199767f8SToomas Soome 
1118199767f8SToomas Soome 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1119199767f8SToomas Soome 		if (tt < nmissing_rows &&
1120199767f8SToomas Soome 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1121199767f8SToomas Soome 			tt++;
1122199767f8SToomas Soome 			continue;
1123199767f8SToomas Soome 		}
1124199767f8SToomas Soome 
1125199767f8SToomas Soome 		ASSERT3S(i, <, n);
1126199767f8SToomas Soome 		used[i] = c;
1127199767f8SToomas Soome 		i++;
1128199767f8SToomas Soome 	}
1129199767f8SToomas Soome 
1130199767f8SToomas Soome 	/*
1131199767f8SToomas Soome 	 * Initialize the interesting rows of the matrix.
1132199767f8SToomas Soome 	 */
1133199767f8SToomas Soome 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1134199767f8SToomas Soome 
1135199767f8SToomas Soome 	/*
1136199767f8SToomas Soome 	 * Invert the matrix.
1137199767f8SToomas Soome 	 */
1138199767f8SToomas Soome 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1139199767f8SToomas Soome 	    invrows, used);
1140199767f8SToomas Soome 
1141199767f8SToomas Soome 	/*
1142199767f8SToomas Soome 	 * Reconstruct the missing data using the generated matrix.
1143199767f8SToomas Soome 	 */
1144199767f8SToomas Soome 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1145199767f8SToomas Soome 	    invrows, used);
1146199767f8SToomas Soome 
1147*3e8c7f16SToomas Soome 	free(p);
1148199767f8SToomas Soome 
1149199767f8SToomas Soome 	return (code);
1150199767f8SToomas Soome }
1151199767f8SToomas Soome 
1152199767f8SToomas Soome static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1153199767f8SToomas Soome vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1154199767f8SToomas Soome {
1155199767f8SToomas Soome 	int tgts[VDEV_RAIDZ_MAXPARITY];
1156199767f8SToomas Soome 	int ntgts;
1157199767f8SToomas Soome 	int i, c;
1158199767f8SToomas Soome 	int code;
1159199767f8SToomas Soome 	int nbadparity, nbaddata;
1160199767f8SToomas Soome 
1161199767f8SToomas Soome 	/*
1162199767f8SToomas Soome 	 * The tgts list must already be sorted.
1163199767f8SToomas Soome 	 */
1164199767f8SToomas Soome 	for (i = 1; i < nt; i++) {
1165199767f8SToomas Soome 		ASSERT(t[i] > t[i - 1]);
1166199767f8SToomas Soome 	}
1167199767f8SToomas Soome 
1168199767f8SToomas Soome 	nbadparity = rm->rm_firstdatacol;
1169199767f8SToomas Soome 	nbaddata = rm->rm_cols - nbadparity;
1170199767f8SToomas Soome 	ntgts = 0;
1171199767f8SToomas Soome 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1172199767f8SToomas Soome 		if (i < nt && c == t[i]) {
1173199767f8SToomas Soome 			tgts[ntgts++] = c;
1174199767f8SToomas Soome 			i++;
1175199767f8SToomas Soome 		} else if (rm->rm_col[c].rc_error != 0) {
1176199767f8SToomas Soome 			tgts[ntgts++] = c;
1177199767f8SToomas Soome 		} else if (c >= rm->rm_firstdatacol) {
1178199767f8SToomas Soome 			nbaddata--;
1179199767f8SToomas Soome 		} else {
1180199767f8SToomas Soome 			nbadparity--;
1181199767f8SToomas Soome 		}
1182199767f8SToomas Soome 	}
1183199767f8SToomas Soome 
1184199767f8SToomas Soome 	ASSERT(ntgts >= nt);
1185199767f8SToomas Soome 	ASSERT(nbaddata >= 0);
1186199767f8SToomas Soome 	ASSERT(nbaddata + nbadparity == ntgts);
1187199767f8SToomas Soome 
1188199767f8SToomas Soome 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1189199767f8SToomas Soome 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1190199767f8SToomas Soome 	ASSERT(code > 0);
1191199767f8SToomas Soome 	return (code);
1192199767f8SToomas Soome }
1193199767f8SToomas Soome 
1194199767f8SToomas Soome static raidz_map_t *
vdev_raidz_map_alloc(void * data,off_t offset,size_t size,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)1195199767f8SToomas Soome vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1196199767f8SToomas Soome     uint64_t dcols, uint64_t nparity)
1197199767f8SToomas Soome {
1198199767f8SToomas Soome 	raidz_map_t *rm;
1199199767f8SToomas Soome 	uint64_t b = offset >> unit_shift;
1200199767f8SToomas Soome 	uint64_t s = size >> unit_shift;
1201199767f8SToomas Soome 	uint64_t f = b % dcols;
1202199767f8SToomas Soome 	uint64_t o = (b / dcols) << unit_shift;
1203199767f8SToomas Soome 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1204199767f8SToomas Soome 
1205199767f8SToomas Soome 	q = s / (dcols - nparity);
1206199767f8SToomas Soome 	r = s - q * (dcols - nparity);
1207199767f8SToomas Soome 	bc = (r == 0 ? 0 : r + nparity);
1208199767f8SToomas Soome 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1209199767f8SToomas Soome 
1210199767f8SToomas Soome 	if (q == 0) {
1211199767f8SToomas Soome 		acols = bc;
1212199767f8SToomas Soome 		scols = MIN(dcols, roundup(bc, nparity + 1));
1213199767f8SToomas Soome 	} else {
1214199767f8SToomas Soome 		acols = dcols;
1215199767f8SToomas Soome 		scols = dcols;
1216199767f8SToomas Soome 	}
1217199767f8SToomas Soome 
1218199767f8SToomas Soome 	ASSERT3U(acols, <=, scols);
1219199767f8SToomas Soome 
1220*3e8c7f16SToomas Soome 	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1221*3e8c7f16SToomas Soome 	if (rm == NULL)
1222*3e8c7f16SToomas Soome 		return (rm);
1223199767f8SToomas Soome 
1224199767f8SToomas Soome 	rm->rm_cols = acols;
1225199767f8SToomas Soome 	rm->rm_scols = scols;
1226199767f8SToomas Soome 	rm->rm_bigcols = bc;
1227199767f8SToomas Soome 	rm->rm_skipstart = bc;
1228199767f8SToomas Soome 	rm->rm_missingdata = 0;
1229199767f8SToomas Soome 	rm->rm_missingparity = 0;
1230199767f8SToomas Soome 	rm->rm_firstdatacol = nparity;
1231199767f8SToomas Soome 	rm->rm_reports = 0;
1232199767f8SToomas Soome 	rm->rm_freed = 0;
1233199767f8SToomas Soome 	rm->rm_ecksuminjected = 0;
1234199767f8SToomas Soome 
1235199767f8SToomas Soome 	asize = 0;
1236199767f8SToomas Soome 
1237199767f8SToomas Soome 	for (c = 0; c < scols; c++) {
1238199767f8SToomas Soome 		col = f + c;
1239199767f8SToomas Soome 		coff = o;
1240199767f8SToomas Soome 		if (col >= dcols) {
1241199767f8SToomas Soome 			col -= dcols;
1242199767f8SToomas Soome 			coff += 1ULL << unit_shift;
1243199767f8SToomas Soome 		}
1244199767f8SToomas Soome 		rm->rm_col[c].rc_devidx = col;
1245199767f8SToomas Soome 		rm->rm_col[c].rc_offset = coff;
1246199767f8SToomas Soome 		rm->rm_col[c].rc_data = NULL;
1247199767f8SToomas Soome 		rm->rm_col[c].rc_error = 0;
1248199767f8SToomas Soome 		rm->rm_col[c].rc_tried = 0;
1249199767f8SToomas Soome 		rm->rm_col[c].rc_skipped = 0;
1250199767f8SToomas Soome 
1251199767f8SToomas Soome 		if (c >= acols)
1252199767f8SToomas Soome 			rm->rm_col[c].rc_size = 0;
1253199767f8SToomas Soome 		else if (c < bc)
1254199767f8SToomas Soome 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1255199767f8SToomas Soome 		else
1256199767f8SToomas Soome 			rm->rm_col[c].rc_size = q << unit_shift;
1257199767f8SToomas Soome 
1258199767f8SToomas Soome 		asize += rm->rm_col[c].rc_size;
1259199767f8SToomas Soome 	}
1260199767f8SToomas Soome 
1261199767f8SToomas Soome 	ASSERT3U(asize, ==, tot << unit_shift);
1262199767f8SToomas Soome 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1263199767f8SToomas Soome 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1264199767f8SToomas Soome 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1265199767f8SToomas Soome 	ASSERT3U(rm->rm_nskip, <=, nparity);
1266199767f8SToomas Soome 
1267*3e8c7f16SToomas Soome 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1268*3e8c7f16SToomas Soome 		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1269*3e8c7f16SToomas Soome 		if (rm->rm_col[c].rc_data == NULL) {
1270*3e8c7f16SToomas Soome 			c++;
1271*3e8c7f16SToomas Soome 			while (c != 0)
1272*3e8c7f16SToomas Soome 				free(rm->rm_col[--c].rc_data);
1273*3e8c7f16SToomas Soome 			free(rm);
1274*3e8c7f16SToomas Soome 			return (NULL);
1275*3e8c7f16SToomas Soome 		}
1276*3e8c7f16SToomas Soome 	}
1277199767f8SToomas Soome 
1278199767f8SToomas Soome 	rm->rm_col[c].rc_data = data;
1279199767f8SToomas Soome 
1280199767f8SToomas Soome 	for (c = c + 1; c < acols; c++)
1281199767f8SToomas Soome 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1282199767f8SToomas Soome 		    rm->rm_col[c - 1].rc_size;
1283199767f8SToomas Soome 
1284199767f8SToomas Soome 	/*
1285199767f8SToomas Soome 	 * If all data stored spans all columns, there's a danger that parity
1286199767f8SToomas Soome 	 * will always be on the same device and, since parity isn't read
1287199767f8SToomas Soome 	 * during normal operation, that that device's I/O bandwidth won't be
1288199767f8SToomas Soome 	 * used effectively. We therefore switch the parity every 1MB.
1289199767f8SToomas Soome 	 *
1290199767f8SToomas Soome 	 * ... at least that was, ostensibly, the theory. As a practical
1291199767f8SToomas Soome 	 * matter unless we juggle the parity between all devices evenly, we
1292199767f8SToomas Soome 	 * won't see any benefit. Further, occasional writes that aren't a
1293199767f8SToomas Soome 	 * multiple of the LCM of the number of children and the minimum
1294199767f8SToomas Soome 	 * stripe width are sufficient to avoid pessimal behavior.
1295199767f8SToomas Soome 	 * Unfortunately, this decision created an implicit on-disk format
1296199767f8SToomas Soome 	 * requirement that we need to support for all eternity, but only
1297199767f8SToomas Soome 	 * for single-parity RAID-Z.
1298199767f8SToomas Soome 	 *
1299199767f8SToomas Soome 	 * If we intend to skip a sector in the zeroth column for padding
1300199767f8SToomas Soome 	 * we must make sure to note this swap. We will never intend to
1301199767f8SToomas Soome 	 * skip the first column since at least one data and one parity
1302199767f8SToomas Soome 	 * column must appear in each row.
1303199767f8SToomas Soome 	 */
1304199767f8SToomas Soome 	ASSERT(rm->rm_cols >= 2);
1305199767f8SToomas Soome 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1306199767f8SToomas Soome 
1307199767f8SToomas Soome 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1308199767f8SToomas Soome 		devidx = rm->rm_col[0].rc_devidx;
1309199767f8SToomas Soome 		o = rm->rm_col[0].rc_offset;
1310199767f8SToomas Soome 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1311199767f8SToomas Soome 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1312199767f8SToomas Soome 		rm->rm_col[1].rc_devidx = devidx;
1313199767f8SToomas Soome 		rm->rm_col[1].rc_offset = o;
1314199767f8SToomas Soome 
1315199767f8SToomas Soome 		if (rm->rm_skipstart == 0)
1316199767f8SToomas Soome 			rm->rm_skipstart = 1;
1317199767f8SToomas Soome 	}
1318199767f8SToomas Soome 
1319199767f8SToomas Soome 	return (rm);
1320199767f8SToomas Soome }
1321199767f8SToomas Soome 
1322199767f8SToomas Soome static void
vdev_raidz_map_free(raidz_map_t * rm)1323199767f8SToomas Soome vdev_raidz_map_free(raidz_map_t *rm)
1324199767f8SToomas Soome {
1325199767f8SToomas Soome 	int c;
1326199767f8SToomas Soome 
1327199767f8SToomas Soome 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1328*3e8c7f16SToomas Soome 		free(rm->rm_col[c].rc_data);
1329199767f8SToomas Soome 
1330*3e8c7f16SToomas Soome 	free(rm);
1331199767f8SToomas Soome }
1332199767f8SToomas Soome 
1333199767f8SToomas Soome static vdev_t *
vdev_child(vdev_t * pvd,uint64_t devidx)1334199767f8SToomas Soome vdev_child(vdev_t *pvd, uint64_t devidx)
1335199767f8SToomas Soome {
1336199767f8SToomas Soome 	vdev_t *cvd;
1337199767f8SToomas Soome 
1338199767f8SToomas Soome 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1339199767f8SToomas Soome 		if (cvd->v_id == devidx)
1340199767f8SToomas Soome 			break;
1341199767f8SToomas Soome 	}
1342199767f8SToomas Soome 
1343199767f8SToomas Soome 	return (cvd);
1344199767f8SToomas Soome }
1345199767f8SToomas Soome 
1346199767f8SToomas Soome /*
1347199767f8SToomas Soome  * We keep track of whether or not there were any injected errors, so that
1348199767f8SToomas Soome  * any ereports we generate can note it.
1349199767f8SToomas Soome  */
1350199767f8SToomas Soome static int
raidz_checksum_verify(const spa_t * spa,const blkptr_t * bp,void * data,uint64_t size __unused)13514a04e8dbSToomas Soome raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
13528eef2ab6SToomas Soome     uint64_t size __unused)
1353199767f8SToomas Soome {
1354199767f8SToomas Soome 
13554a04e8dbSToomas Soome 	return (zio_checksum_verify(spa, bp, data));
1356199767f8SToomas Soome }
1357199767f8SToomas Soome 
1358199767f8SToomas Soome /*
1359199767f8SToomas Soome  * Generate the parity from the data columns. If we tried and were able to
1360199767f8SToomas Soome  * read the parity without error, verify that the generated parity matches the
1361199767f8SToomas Soome  * data we read. If it doesn't, we fire off a checksum error. Return the
1362199767f8SToomas Soome  * number such failures.
1363199767f8SToomas Soome  */
1364199767f8SToomas Soome static int
raidz_parity_verify(raidz_map_t * rm)1365199767f8SToomas Soome raidz_parity_verify(raidz_map_t *rm)
1366199767f8SToomas Soome {
1367199767f8SToomas Soome 	void *orig[VDEV_RAIDZ_MAXPARITY];
1368199767f8SToomas Soome 	int c, ret = 0;
1369199767f8SToomas Soome 	raidz_col_t *rc;
1370199767f8SToomas Soome 
1371199767f8SToomas Soome 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1372199767f8SToomas Soome 		rc = &rm->rm_col[c];
1373199767f8SToomas Soome 		if (!rc->rc_tried || rc->rc_error != 0)
1374199767f8SToomas Soome 			continue;
1375*3e8c7f16SToomas Soome 		orig[c] = malloc(rc->rc_size);
1376*3e8c7f16SToomas Soome 		if (orig[c] != NULL) {
1377*3e8c7f16SToomas Soome 			bcopy(rc->rc_data, orig[c], rc->rc_size);
1378*3e8c7f16SToomas Soome 		} else {
1379*3e8c7f16SToomas Soome 			printf("Out of memory\n");
1380*3e8c7f16SToomas Soome 		}
1381199767f8SToomas Soome 	}
1382199767f8SToomas Soome 
1383199767f8SToomas Soome 	vdev_raidz_generate_parity(rm);
1384199767f8SToomas Soome 
1385199767f8SToomas Soome 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1386199767f8SToomas Soome 		rc = &rm->rm_col[c];
1387199767f8SToomas Soome 		if (!rc->rc_tried || rc->rc_error != 0)
1388199767f8SToomas Soome 			continue;
1389*3e8c7f16SToomas Soome 		if (orig[c] == NULL ||
1390*3e8c7f16SToomas Soome 		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1391199767f8SToomas Soome 			rc->rc_error = ECKSUM;
1392199767f8SToomas Soome 			ret++;
1393199767f8SToomas Soome 		}
1394*3e8c7f16SToomas Soome 		free(orig[c]);
1395199767f8SToomas Soome 	}
1396199767f8SToomas Soome 
1397199767f8SToomas Soome 	return (ret);
1398199767f8SToomas Soome }
1399199767f8SToomas Soome 
1400199767f8SToomas Soome /*
1401199767f8SToomas Soome  * Iterate over all combinations of bad data and attempt a reconstruction.
1402199767f8SToomas Soome  * Note that the algorithm below is non-optimal because it doesn't take into
1403199767f8SToomas Soome  * account how reconstruction is actually performed. For example, with
1404199767f8SToomas Soome  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1405199767f8SToomas Soome  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1406199767f8SToomas Soome  * cases we'd only use parity information in column 0.
1407199767f8SToomas Soome  */
1408199767f8SToomas Soome static int
vdev_raidz_combrec(const spa_t * spa,raidz_map_t * rm,const blkptr_t * bp,void * data,off_t offset __unused,uint64_t bytes,int total_errors,int data_errors)14094a04e8dbSToomas Soome vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
14108eef2ab6SToomas Soome     void *data, off_t offset __unused, uint64_t bytes, int total_errors,
14118eef2ab6SToomas Soome     int data_errors)
1412199767f8SToomas Soome {
1413199767f8SToomas Soome 	raidz_col_t *rc;
1414199767f8SToomas Soome 	void *orig[VDEV_RAIDZ_MAXPARITY];
1415199767f8SToomas Soome 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1416199767f8SToomas Soome 	int *tgts = &tstore[1];
1417199767f8SToomas Soome 	int current, next, i, c, n;
1418199767f8SToomas Soome 	int code, ret = 0;
1419199767f8SToomas Soome 
1420199767f8SToomas Soome 	ASSERT(total_errors < rm->rm_firstdatacol);
1421199767f8SToomas Soome 
1422199767f8SToomas Soome 	/*
1423199767f8SToomas Soome 	 * This simplifies one edge condition.
1424199767f8SToomas Soome 	 */
1425199767f8SToomas Soome 	tgts[-1] = -1;
1426199767f8SToomas Soome 
1427199767f8SToomas Soome 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1428199767f8SToomas Soome 		/*
1429199767f8SToomas Soome 		 * Initialize the targets array by finding the first n columns
1430199767f8SToomas Soome 		 * that contain no error.
1431199767f8SToomas Soome 		 *
1432199767f8SToomas Soome 		 * If there were no data errors, we need to ensure that we're
1433199767f8SToomas Soome 		 * always explicitly attempting to reconstruct at least one
1434199767f8SToomas Soome 		 * data column. To do this, we simply push the highest target
1435199767f8SToomas Soome 		 * up into the data columns.
1436199767f8SToomas Soome 		 */
1437199767f8SToomas Soome 		for (c = 0, i = 0; i < n; i++) {
1438199767f8SToomas Soome 			if (i == n - 1 && data_errors == 0 &&
1439199767f8SToomas Soome 			    c < rm->rm_firstdatacol) {
1440199767f8SToomas Soome 				c = rm->rm_firstdatacol;
1441199767f8SToomas Soome 			}
1442199767f8SToomas Soome 
1443199767f8SToomas Soome 			while (rm->rm_col[c].rc_error != 0) {
1444199767f8SToomas Soome 				c++;
1445199767f8SToomas Soome 				ASSERT3S(c, <, rm->rm_cols);
1446199767f8SToomas Soome 			}
1447199767f8SToomas Soome 
1448199767f8SToomas Soome 			tgts[i] = c++;
1449199767f8SToomas Soome 		}
1450199767f8SToomas Soome 
1451199767f8SToomas Soome 		/*
1452199767f8SToomas Soome 		 * Setting tgts[n] simplifies the other edge condition.
1453199767f8SToomas Soome 		 */
1454199767f8SToomas Soome 		tgts[n] = rm->rm_cols;
1455199767f8SToomas Soome 
1456199767f8SToomas Soome 		/*
1457199767f8SToomas Soome 		 * These buffers were allocated in previous iterations.
1458199767f8SToomas Soome 		 */
1459199767f8SToomas Soome 		for (i = 0; i < n - 1; i++) {
1460199767f8SToomas Soome 			ASSERT(orig[i] != NULL);
1461199767f8SToomas Soome 		}
1462199767f8SToomas Soome 
1463*3e8c7f16SToomas Soome 		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1464*3e8c7f16SToomas Soome 		if (orig[n - 1] == NULL) {
1465*3e8c7f16SToomas Soome 			ret = ENOMEM;
1466*3e8c7f16SToomas Soome 			goto done;
1467*3e8c7f16SToomas Soome 		}
1468199767f8SToomas Soome 
1469199767f8SToomas Soome 		current = 0;
1470199767f8SToomas Soome 		next = tgts[current];
1471199767f8SToomas Soome 
1472199767f8SToomas Soome 		while (current != n) {
1473199767f8SToomas Soome 			tgts[current] = next;
1474199767f8SToomas Soome 			current = 0;
1475199767f8SToomas Soome 
1476199767f8SToomas Soome 			/*
1477199767f8SToomas Soome 			 * Save off the original data that we're going to
1478199767f8SToomas Soome 			 * attempt to reconstruct.
1479199767f8SToomas Soome 			 */
1480199767f8SToomas Soome 			for (i = 0; i < n; i++) {
1481199767f8SToomas Soome 				ASSERT(orig[i] != NULL);
1482199767f8SToomas Soome 				c = tgts[i];
1483199767f8SToomas Soome 				ASSERT3S(c, >=, 0);
1484199767f8SToomas Soome 				ASSERT3S(c, <, rm->rm_cols);
1485199767f8SToomas Soome 				rc = &rm->rm_col[c];
1486199767f8SToomas Soome 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1487199767f8SToomas Soome 			}
1488199767f8SToomas Soome 
1489199767f8SToomas Soome 			/*
1490199767f8SToomas Soome 			 * Attempt a reconstruction and exit the outer loop on
1491199767f8SToomas Soome 			 * success.
1492199767f8SToomas Soome 			 */
1493199767f8SToomas Soome 			code = vdev_raidz_reconstruct(rm, tgts, n);
14944a04e8dbSToomas Soome 			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1495199767f8SToomas Soome 				for (i = 0; i < n; i++) {
1496199767f8SToomas Soome 					c = tgts[i];
1497199767f8SToomas Soome 					rc = &rm->rm_col[c];
1498199767f8SToomas Soome 					ASSERT(rc->rc_error == 0);
1499199767f8SToomas Soome 					rc->rc_error = ECKSUM;
1500199767f8SToomas Soome 				}
1501199767f8SToomas Soome 
1502199767f8SToomas Soome 				ret = code;
1503199767f8SToomas Soome 				goto done;
1504199767f8SToomas Soome 			}
1505199767f8SToomas Soome 
1506199767f8SToomas Soome 			/*
1507199767f8SToomas Soome 			 * Restore the original data.
1508199767f8SToomas Soome 			 */
1509199767f8SToomas Soome 			for (i = 0; i < n; i++) {
1510199767f8SToomas Soome 				c = tgts[i];
1511199767f8SToomas Soome 				rc = &rm->rm_col[c];
1512199767f8SToomas Soome 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1513199767f8SToomas Soome 			}
1514199767f8SToomas Soome 
1515199767f8SToomas Soome 			do {
1516199767f8SToomas Soome 				/*
1517199767f8SToomas Soome 				 * Find the next valid column after the current
1518199767f8SToomas Soome 				 * position..
1519199767f8SToomas Soome 				 */
1520199767f8SToomas Soome 				for (next = tgts[current] + 1;
1521199767f8SToomas Soome 				    next < rm->rm_cols &&
1522199767f8SToomas Soome 				    rm->rm_col[next].rc_error != 0; next++)
1523199767f8SToomas Soome 					continue;
1524199767f8SToomas Soome 
1525199767f8SToomas Soome 				ASSERT(next <= tgts[current + 1]);
1526199767f8SToomas Soome 
1527199767f8SToomas Soome 				/*
1528199767f8SToomas Soome 				 * If that spot is available, we're done here.
1529199767f8SToomas Soome 				 */
1530199767f8SToomas Soome 				if (next != tgts[current + 1])
1531199767f8SToomas Soome 					break;
1532199767f8SToomas Soome 
1533199767f8SToomas Soome 				/*
1534199767f8SToomas Soome 				 * Otherwise, find the next valid column after
1535199767f8SToomas Soome 				 * the previous position.
1536199767f8SToomas Soome 				 */
1537199767f8SToomas Soome 				for (c = tgts[current - 1] + 1;
1538199767f8SToomas Soome 				    rm->rm_col[c].rc_error != 0; c++)
1539199767f8SToomas Soome 					continue;
1540199767f8SToomas Soome 
1541199767f8SToomas Soome 				tgts[current] = c;
1542199767f8SToomas Soome 				current++;
1543199767f8SToomas Soome 
1544199767f8SToomas Soome 			} while (current != n);
1545199767f8SToomas Soome 		}
1546199767f8SToomas Soome 	}
1547199767f8SToomas Soome 	n--;
1548199767f8SToomas Soome done:
1549199767f8SToomas Soome 	for (i = n - 1; i >= 0; i--) {
1550*3e8c7f16SToomas Soome 		free(orig[i]);
1551199767f8SToomas Soome 	}
1552199767f8SToomas Soome 
1553199767f8SToomas Soome 	return (ret);
1554199767f8SToomas Soome }
1555199767f8SToomas Soome 
1556199767f8SToomas Soome static int
vdev_raidz_read(vdev_t * vd,const blkptr_t * bp,void * data,off_t offset,size_t bytes)1557199767f8SToomas Soome vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1558199767f8SToomas Soome     off_t offset, size_t bytes)
1559199767f8SToomas Soome {
1560199767f8SToomas Soome 	vdev_t *tvd = vd->v_top;
1561199767f8SToomas Soome 	vdev_t *cvd;
1562199767f8SToomas Soome 	raidz_map_t *rm;
1563199767f8SToomas Soome 	raidz_col_t *rc;
1564199767f8SToomas Soome 	int c, error;
1565199767f8SToomas Soome 	int unexpected_errors;
1566199767f8SToomas Soome 	int parity_errors;
1567199767f8SToomas Soome 	int parity_untried;
1568199767f8SToomas Soome 	int data_errors;
1569199767f8SToomas Soome 	int total_errors;
1570199767f8SToomas Soome 	int n;
1571199767f8SToomas Soome 	int tgts[VDEV_RAIDZ_MAXPARITY];
1572199767f8SToomas Soome 	int code;
1573199767f8SToomas Soome 
1574199767f8SToomas Soome 	rc = NULL;	/* gcc */
1575199767f8SToomas Soome 	error = 0;
1576199767f8SToomas Soome 
1577199767f8SToomas Soome 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1578199767f8SToomas Soome 	    vd->v_nchildren, vd->v_nparity);
1579*3e8c7f16SToomas Soome 	if (rm == NULL)
1580*3e8c7f16SToomas Soome 		return (ENOMEM);
1581199767f8SToomas Soome 
1582199767f8SToomas Soome 	/*
1583199767f8SToomas Soome 	 * Iterate over the columns in reverse order so that we hit the parity
1584199767f8SToomas Soome 	 * last -- any errors along the way will force us to read the parity.
1585199767f8SToomas Soome 	 */
1586199767f8SToomas Soome 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1587199767f8SToomas Soome 		rc = &rm->rm_col[c];
1588199767f8SToomas Soome 		cvd = vdev_child(vd, rc->rc_devidx);
1589199767f8SToomas Soome 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1590199767f8SToomas Soome 			if (c >= rm->rm_firstdatacol)
1591199767f8SToomas Soome 				rm->rm_missingdata++;
1592199767f8SToomas Soome 			else
1593199767f8SToomas Soome 				rm->rm_missingparity++;
1594199767f8SToomas Soome 			rc->rc_error = ENXIO;
1595199767f8SToomas Soome 			rc->rc_tried = 1;	/* don't even try */
1596199767f8SToomas Soome 			rc->rc_skipped = 1;
1597199767f8SToomas Soome 			continue;
1598199767f8SToomas Soome 		}
1599199767f8SToomas Soome #if 0		/* XXX: Too hard for the boot code. */
1600199767f8SToomas Soome 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1601199767f8SToomas Soome 			if (c >= rm->rm_firstdatacol)
1602199767f8SToomas Soome 				rm->rm_missingdata++;
1603199767f8SToomas Soome 			else
1604199767f8SToomas Soome 				rm->rm_missingparity++;
1605199767f8SToomas Soome 			rc->rc_error = ESTALE;
1606199767f8SToomas Soome 			rc->rc_skipped = 1;
1607199767f8SToomas Soome 			continue;
1608199767f8SToomas Soome 		}
1609199767f8SToomas Soome #endif
1610199767f8SToomas Soome 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1611199767f8SToomas Soome 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1612199767f8SToomas Soome 			    rc->rc_offset, rc->rc_size);
1613199767f8SToomas Soome 			rc->rc_tried = 1;
1614199767f8SToomas Soome 			rc->rc_skipped = 0;
1615199767f8SToomas Soome 		}
1616199767f8SToomas Soome 	}
1617199767f8SToomas Soome 
1618199767f8SToomas Soome reconstruct:
1619199767f8SToomas Soome 	unexpected_errors = 0;
1620199767f8SToomas Soome 	parity_errors = 0;
1621199767f8SToomas Soome 	parity_untried = 0;
1622199767f8SToomas Soome 	data_errors = 0;
1623199767f8SToomas Soome 	total_errors = 0;
1624199767f8SToomas Soome 
1625199767f8SToomas Soome 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1626199767f8SToomas Soome 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1627199767f8SToomas Soome 
1628199767f8SToomas Soome 	for (c = 0; c < rm->rm_cols; c++) {
1629199767f8SToomas Soome 		rc = &rm->rm_col[c];
1630199767f8SToomas Soome 
1631199767f8SToomas Soome 		if (rc->rc_error) {
1632199767f8SToomas Soome 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1633199767f8SToomas Soome 
1634199767f8SToomas Soome 			if (c < rm->rm_firstdatacol)
1635199767f8SToomas Soome 				parity_errors++;
1636199767f8SToomas Soome 			else
1637199767f8SToomas Soome 				data_errors++;
1638199767f8SToomas Soome 
1639199767f8SToomas Soome 			if (!rc->rc_skipped)
1640199767f8SToomas Soome 				unexpected_errors++;
1641199767f8SToomas Soome 
1642199767f8SToomas Soome 			total_errors++;
1643199767f8SToomas Soome 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1644199767f8SToomas Soome 			parity_untried++;
1645199767f8SToomas Soome 		}
1646199767f8SToomas Soome 	}
1647199767f8SToomas Soome 
1648199767f8SToomas Soome 	/*
1649199767f8SToomas Soome 	 * There are three potential phases for a read:
1650199767f8SToomas Soome 	 *	1. produce valid data from the columns read
1651199767f8SToomas Soome 	 *	2. read all disks and try again
1652199767f8SToomas Soome 	 *	3. perform combinatorial reconstruction
1653199767f8SToomas Soome 	 *
1654199767f8SToomas Soome 	 * Each phase is progressively both more expensive and less likely to
1655199767f8SToomas Soome 	 * occur. If we encounter more errors than we can repair or all phases
1656199767f8SToomas Soome 	 * fail, we have no choice but to return an error.
1657199767f8SToomas Soome 	 */
1658199767f8SToomas Soome 
1659199767f8SToomas Soome 	/*
1660199767f8SToomas Soome 	 * If the number of errors we saw was correctable -- less than or equal
1661199767f8SToomas Soome 	 * to the number of parity disks read -- attempt to produce data that
1662199767f8SToomas Soome 	 * has a valid checksum. Naturally, this case applies in the absence of
1663199767f8SToomas Soome 	 * any errors.
1664199767f8SToomas Soome 	 */
1665199767f8SToomas Soome 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
16667bbcfb41SToomas Soome 		int rv;
16677bbcfb41SToomas Soome 
1668199767f8SToomas Soome 		if (data_errors == 0) {
1669da9bf005SToomas Soome 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
16707bbcfb41SToomas Soome 			if (rv == 0) {
1671199767f8SToomas Soome 				/*
1672199767f8SToomas Soome 				 * If we read parity information (unnecessarily
1673199767f8SToomas Soome 				 * as it happens since no reconstruction was
1674199767f8SToomas Soome 				 * needed) regenerate and verify the parity.
1675199767f8SToomas Soome 				 * We also regenerate parity when resilvering
1676199767f8SToomas Soome 				 * so we can write it out to the failed device
1677199767f8SToomas Soome 				 * later.
1678199767f8SToomas Soome 				 */
1679199767f8SToomas Soome 				if (parity_errors + parity_untried <
1680199767f8SToomas Soome 				    rm->rm_firstdatacol) {
1681199767f8SToomas Soome 					n = raidz_parity_verify(rm);
1682199767f8SToomas Soome 					unexpected_errors += n;
1683199767f8SToomas Soome 					ASSERT(parity_errors + n <=
1684199767f8SToomas Soome 					    rm->rm_firstdatacol);
1685199767f8SToomas Soome 				}
1686199767f8SToomas Soome 				goto done;
1687199767f8SToomas Soome 			}
1688199767f8SToomas Soome 		} else {
1689199767f8SToomas Soome 			/*
1690199767f8SToomas Soome 			 * We either attempt to read all the parity columns or
1691199767f8SToomas Soome 			 * none of them. If we didn't try to read parity, we
1692199767f8SToomas Soome 			 * wouldn't be here in the correctable case. There must
1693199767f8SToomas Soome 			 * also have been fewer parity errors than parity
1694199767f8SToomas Soome 			 * columns or, again, we wouldn't be in this code path.
1695199767f8SToomas Soome 			 */
1696199767f8SToomas Soome 			ASSERT(parity_untried == 0);
1697199767f8SToomas Soome 			ASSERT(parity_errors < rm->rm_firstdatacol);
1698199767f8SToomas Soome 
1699199767f8SToomas Soome 			/*
1700199767f8SToomas Soome 			 * Identify the data columns that reported an error.
1701199767f8SToomas Soome 			 */
1702199767f8SToomas Soome 			n = 0;
1703199767f8SToomas Soome 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1704199767f8SToomas Soome 				rc = &rm->rm_col[c];
1705199767f8SToomas Soome 				if (rc->rc_error != 0) {
1706199767f8SToomas Soome 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1707199767f8SToomas Soome 					tgts[n++] = c;
1708199767f8SToomas Soome 				}
1709199767f8SToomas Soome 			}
1710199767f8SToomas Soome 
1711199767f8SToomas Soome 			ASSERT(rm->rm_firstdatacol >= n);
1712199767f8SToomas Soome 
1713199767f8SToomas Soome 			code = vdev_raidz_reconstruct(rm, tgts, n);
1714199767f8SToomas Soome 
1715da9bf005SToomas Soome 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
17167bbcfb41SToomas Soome 			if (rv == 0) {
1717199767f8SToomas Soome 				/*
1718199767f8SToomas Soome 				 * If we read more parity disks than were used
1719199767f8SToomas Soome 				 * for reconstruction, confirm that the other
1720199767f8SToomas Soome 				 * parity disks produced correct data. This
1721199767f8SToomas Soome 				 * routine is suboptimal in that it regenerates
1722199767f8SToomas Soome 				 * the parity that we already used in addition
1723199767f8SToomas Soome 				 * to the parity that we're attempting to
1724199767f8SToomas Soome 				 * verify, but this should be a relatively
1725199767f8SToomas Soome 				 * uncommon case, and can be optimized if it
1726199767f8SToomas Soome 				 * becomes a problem. Note that we regenerate
1727199767f8SToomas Soome 				 * parity when resilvering so we can write it
1728199767f8SToomas Soome 				 * out to failed devices later.
1729199767f8SToomas Soome 				 */
1730199767f8SToomas Soome 				if (parity_errors < rm->rm_firstdatacol - n) {
1731199767f8SToomas Soome 					n = raidz_parity_verify(rm);
1732199767f8SToomas Soome 					unexpected_errors += n;
1733199767f8SToomas Soome 					ASSERT(parity_errors + n <=
1734199767f8SToomas Soome 					    rm->rm_firstdatacol);
1735199767f8SToomas Soome 				}
1736199767f8SToomas Soome 
1737199767f8SToomas Soome 				goto done;
1738199767f8SToomas Soome 			}
1739199767f8SToomas Soome 		}
1740199767f8SToomas Soome 	}
1741199767f8SToomas Soome 
1742199767f8SToomas Soome 	/*
1743199767f8SToomas Soome 	 * This isn't a typical situation -- either we got a read
1744199767f8SToomas Soome 	 * error or a child silently returned bad data. Read every
1745199767f8SToomas Soome 	 * block so we can try again with as much data and parity as
1746199767f8SToomas Soome 	 * we can track down. If we've already been through once
1747199767f8SToomas Soome 	 * before, all children will be marked as tried so we'll
1748199767f8SToomas Soome 	 * proceed to combinatorial reconstruction.
1749199767f8SToomas Soome 	 */
1750199767f8SToomas Soome 	unexpected_errors = 1;
1751199767f8SToomas Soome 	rm->rm_missingdata = 0;
1752199767f8SToomas Soome 	rm->rm_missingparity = 0;
1753199767f8SToomas Soome 
1754199767f8SToomas Soome 	n = 0;
1755199767f8SToomas Soome 	for (c = 0; c < rm->rm_cols; c++) {
1756199767f8SToomas Soome 		rc = &rm->rm_col[c];
1757199767f8SToomas Soome 
1758199767f8SToomas Soome 		if (rc->rc_tried)
1759199767f8SToomas Soome 			continue;
1760199767f8SToomas Soome 
1761199767f8SToomas Soome 		cvd = vdev_child(vd, rc->rc_devidx);
1762199767f8SToomas Soome 		ASSERT(cvd != NULL);
1763199767f8SToomas Soome 		rc->rc_error = cvd->v_read(cvd, NULL,
1764199767f8SToomas Soome 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1765199767f8SToomas Soome 		if (rc->rc_error == 0)
1766199767f8SToomas Soome 			n++;
1767199767f8SToomas Soome 		rc->rc_tried = 1;
1768199767f8SToomas Soome 		rc->rc_skipped = 0;
1769199767f8SToomas Soome 	}
1770199767f8SToomas Soome 	/*
1771199767f8SToomas Soome 	 * If we managed to read anything more, retry the
1772199767f8SToomas Soome 	 * reconstruction.
1773199767f8SToomas Soome 	 */
1774199767f8SToomas Soome 	if (n > 0)
1775199767f8SToomas Soome 		goto reconstruct;
1776199767f8SToomas Soome 
1777199767f8SToomas Soome 	/*
1778199767f8SToomas Soome 	 * At this point we've attempted to reconstruct the data given the
1779199767f8SToomas Soome 	 * errors we detected, and we've attempted to read all columns. There
1780199767f8SToomas Soome 	 * must, therefore, be one or more additional problems -- silent errors
1781199767f8SToomas Soome 	 * resulting in invalid data rather than explicit I/O errors resulting
1782199767f8SToomas Soome 	 * in absent data. We check if there is enough additional data to
1783199767f8SToomas Soome 	 * possibly reconstruct the data and then perform combinatorial
1784199767f8SToomas Soome 	 * reconstruction over all possible combinations. If that fails,
1785199767f8SToomas Soome 	 * we're cooked.
1786199767f8SToomas Soome 	 */
1787199767f8SToomas Soome 	if (total_errors > rm->rm_firstdatacol) {
1788199767f8SToomas Soome 		error = EIO;
1789199767f8SToomas Soome 	} else if (total_errors < rm->rm_firstdatacol &&
1790da9bf005SToomas Soome 	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
17917bbcfb41SToomas Soome 	    total_errors, data_errors)) != 0) {
1792199767f8SToomas Soome 		/*
1793199767f8SToomas Soome 		 * If we didn't use all the available parity for the
1794199767f8SToomas Soome 		 * combinatorial reconstruction, verify that the remaining
1795199767f8SToomas Soome 		 * parity is correct.
1796199767f8SToomas Soome 		 */
1797199767f8SToomas Soome 		if (code != (1 << rm->rm_firstdatacol) - 1)
1798199767f8SToomas Soome 			(void) raidz_parity_verify(rm);
1799199767f8SToomas Soome 	} else {
1800199767f8SToomas Soome 		/*
1801199767f8SToomas Soome 		 * We're here because either:
1802199767f8SToomas Soome 		 *
1803199767f8SToomas Soome 		 *	total_errors == rm_firstdatacol, or
1804199767f8SToomas Soome 		 *	vdev_raidz_combrec() failed
1805199767f8SToomas Soome 		 *
1806199767f8SToomas Soome 		 * In either case, there is enough bad data to prevent
1807199767f8SToomas Soome 		 * reconstruction.
1808199767f8SToomas Soome 		 *
1809199767f8SToomas Soome 		 * No checksum ereports are started on this path; the failure
1810199767f8SToomas Soome 		 * is only reported to the caller as ECKSUM.
1811199767f8SToomas Soome 		 */
1812199767f8SToomas Soome 		error = ECKSUM;
1813199767f8SToomas Soome 	}
1814199767f8SToomas Soome 
1815199767f8SToomas Soome done:
1816199767f8SToomas Soome 	vdev_raidz_map_free(rm);
1817199767f8SToomas Soome 
1818199767f8SToomas Soome 	return (error);
1819199767f8SToomas Soome }
1820