/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
		for (i = 0; i < 256; i++) {
			ct = zfs_crc64_table + i;
			for (*ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^
				    (-(*ct & 1) & ZFS_CRC64_POLY);
		}
	}
}
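
/*
 * Illustrative sketch (not part of the original source): once zfs_init_crc()
 * has populated the table above, a reflected CRC-64 is computed by folding
 * in one byte at a time, exactly as zap_hash() does further below.  The
 * helper name is hypothetical.
 */
static uint64_t __unused
zfs_crc64_example(const void *buf, size_t len, uint64_t crc)
{
	const uint8_t *p = buf;

	while (len-- != 0)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ *p++) & 0xFF];
	return (crc);
}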

static void
zio_checksum_off(const void *buf __unused, uint64_t size __unused,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"
#include "edonr_zfs.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
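
/*
 * Illustrative sketch (not part of the original source): one way to resolve
 * a checksum algorithm from its descriptive name is to scan the table above.
 * This assumes strcmp() is visible to whatever file includes this one; the
 * helper name is hypothetical.
 */
static int __unused
zio_checksum_by_name_example(const char *name)
{
	for (int i = 0; i < ZIO_CHECKSUM_FUNCTIONS; i++) {
		if (zio_checksum_table[i].ci_name != NULL &&
		    strcmp(zio_checksum_table[i].ci_name, name) == 0)
			return (i);
	}
	return (-1);
}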

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			gzip_decompress,	1,	"gzip-1"},
	{NULL,			gzip_decompress,	2,	"gzip-2"},
	{NULL,			gzip_decompress,	3,	"gzip-3"},
	{NULL,			gzip_decompress,	4,	"gzip-4"},
	{NULL,			gzip_decompress,	5,	"gzip-5"},
	{NULL,			gzip_decompress,	6,	"gzip-6"},
	{NULL,			gzip_decompress,	7,	"gzip-7"},
	{NULL,			gzip_decompress,	8,	"gzip-8"},
	{NULL,			gzip_decompress,	9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	spa->spa_cksum_tmpls[checksum] =
	    ci->ci_tmpl_init(&spa->spa_cksum_salt);
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
void
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, (spa_t *)spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}
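
/*
 * Illustrative sketch (not part of the original source): a typical caller
 * reads the physical bytes of a block and then asks zio_checksum_verify()
 * whether they match the checksum recorded in the block pointer, falling
 * back to another DVA copy on failure.  The helper name is hypothetical.
 */
static int __unused
example_read_and_verify(const spa_t *spa, const blkptr_t *bp, void *buf)
{
	/* buf must already hold BP_GET_PSIZE(bp) bytes read from disk. */
	if (zio_checksum_verify(spa, bp, buf) != 0)
		return (EIO);
	return (0);
}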

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
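
/*
 * Illustrative sketch (not part of the original source): the compression
 * function index and both sizes come straight from the block pointer, so a
 * caller typically decompresses a PSIZE-byte physical buffer into an
 * LSIZE-byte logical buffer.  BP_GET_COMPRESS() and BP_GET_LSIZE() are
 * assumed to be provided by the surrounding headers; the helper name is
 * hypothetical.
 */
static int __unused
example_decompress_block(const blkptr_t *bp, void *pbuf, void *lbuf)
{
	return (zio_decompress_data(BP_GET_COMPRESS(bp), pbuf,
	    BP_GET_PSIZE(bp), lbuf, BP_GET_LSIZE(bp)));
}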

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
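
/*
 * Illustrative sketch (not part of the original source): because zap_hash()
 * keeps only the high-order bits, a consumer selects a hash-table bucket by
 * shifting the hash down to the table's current prefix length.  Both the
 * helper name and the prefix_len parameter are hypothetical.
 */
static uint64_t __unused
example_zap_bucket(uint64_t salt, const char *name, int prefix_len)
{
	uint64_t h = zap_hash(salt, name);

	/* The high bits choose the bucket, as the comment above requires. */
	return (h >> (64 - prefix_len));
}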

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
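
/*
 * Illustrative sketch (not part of the original source): the 64-bit macro
 * must agree with applying VDEV_RAIDZ_MUL_2 to each of the eight packed
 * bytes independently.  A self-check with a hypothetical name:
 */
static int __unused
example_64mul_selfcheck(uint64_t x)
{
	uint64_t packed = x, mask, bytewise = 0;
	int i;

	VDEV_RAIDZ_64MUL_2(packed, mask);
	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (i * 8)) & 0xff;
		bytewise |= (uint64_t)(VDEV_RAIDZ_MUL_2(b) & 0xff) << (i * 8);
	}
	return (packed == bytewise);	/* expected: always 1 */
}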

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
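
/*
 * Illustrative sketch (not part of the original source): general GF(2^8)
 * multiplication follows directly from the tables above, since
 * a * b = 2^(log2(a) + log2(b)) and vdev_raidz_exp2() reduces the exponent
 * sum modulo 255.  The helper name is hypothetical.
 */
static uint8_t __unused
example_gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
}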

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
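
/*
 * Illustrative sketch (not part of the original source): the streaming
 * update above is Horner's rule for
 * Q = 2^(k-1)*d_0 + ... + 2*d_(k-2) + d_(k-1) over GF(2^8).  A bytewise
 * reference version, with a hypothetical name:
 */
static uint8_t __unused
example_raidz_q_byte(const uint8_t *d, int k)
{
	uint8_t q = 0;
	int i;

	for (i = 0; i < k; i++)
		q = VDEV_RAIDZ_MUL_2(q) ^ d[i];	/* q = 2*q + d_i */
	return (q);
}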

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and speed
 * of computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   1   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
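
/*
 * Illustrative note (not part of the original source): the loop above yields
 * rows[i][j] = 2^(map[i] * (n - 1 - j)) in GF(2^8), i.e. the Vandermonde
 * rows from the comment block above.  For example, with map[i] = 1 (the Q
 * row) and n = 4, the row is { 8, 4, 2, 1 }.
 */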

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
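
/*
 * Illustrative note (not part of the original source): a worked example of
 * the geometry above.  For a three-sector write to a 5-wide RAIDZ1 with
 * 512-byte sectors (unit_shift = 9): s = 3, q = 3 / 4 = 0, r = 3, so
 * bc = r + nparity = 4 and tot = s + nparity = 4.  Since q == 0, acols =
 * scols = 4: one 512-byte parity column plus three 512-byte data columns,
 * asize = tot << 9 = 2048 bytes, and rm_nskip = 0.
 */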

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size __unused)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset __unused, uint64_t bytes, int total_errors,
    int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed and whose I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}