/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
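		/*
		 * Standard bit-reflected CRC table generation: for each
		 * byte value, shift right eight times, XORing in the
		 * polynomial whenever a one bit falls off the end.  The
		 * expression -(*ct & 1) widens the low bit into an
		 * all-ones mask, which makes the conditional XOR
		 * branch-free.
		 */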
		for (i = 0; i < 256; i++) {
			ct = zfs_crc64_table + i;
			for (*ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^
				    (-(*ct & 1) & ZFS_CRC64_POLY);
		}
	}
}

static void
zio_checksum_off(const void *buf __unused, uint64_t size __unused,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"
#include "edonr_zfs.c"

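/*
 * This table is indexed by the checksum value taken from the block
 * pointer (BP_GET_CHECKSUM), so the order of the entries must match
 * the on-disk enum zio_checksum numbering.
 */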
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			gzip_decompress,	1,	"gzip-1"},
	{NULL,			gzip_decompress,	2,	"gzip-2"},
	{NULL,			gzip_decompress,	3,	"gzip-3"},
	{NULL,			gzip_decompress,	4,	"gzip-4"},
	{NULL,			gzip_decompress,	5,	"gzip-5"},
	{NULL,			gzip_decompress,	6,	"gzip-6"},
	{NULL,			gzip_decompress,	7,	"gzip-7"},
	{NULL,			gzip_decompress,	8,	"gzip-8"},
	{NULL,			gzip_decompress,	9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	spa->spa_cksum_tmpls[checksum] =
	    ci->ci_tmpl_init(&spa->spa_cksum_salt);
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
void
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, (spa_t *)spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
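
/*
 * For example, multiplying the single byte 0x80 by 2 shifts out the
 * high bit, so the result is 0x00 ^ 0x1d = 0x1d.  In the 64-bit macro
 * above, the mask picks up the top bit of each byte, and the expression
 * ((mask) << 1) - ((mask) >> 7) widens each such bit to 0xff, selecting
 * exactly those bytes that need 0x1d XORed in after the byte-wise left
 * shift.
 */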

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
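 * For example, vdev_raidz_exp2(0x02, 3) = pow2[log2[0x02] + 3] =
 * pow2[4] = 0x10.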
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

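/*
 * Generate single parity: P is simply the XOR of all the data columns.
 */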
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

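/*
 * Generate double parity.  P is the XOR of the data columns; Q is a
 * Horner-style evaluation in GF(2^8): the running sum is multiplied by
 * 2 before each new column is XORed in, which weights each column by a
 * distinct power of 2.
 */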
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

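/*
 * Generate triple parity.  As above, but the R column multiplies the
 * running sum by 4 (two doublings) per step, weighting each column by
 * a distinct power of 4.
 */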
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
 * computation, and for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

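/*
 * Fill in the rows of the dispersal matrix that correspond to the parity
 * columns listed in map[]: row i holds the successive powers of 2^map[i],
 * from (2^map[i])^(n-1) down to (2^map[i])^0, looked up in vdev_raidz_pow2
 * with the exponents reduced modulo 255.
 */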
static void
vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows forms a proper identity
	 * matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

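/*
 * Reconstruct the missing columns: for each surviving column used in the
 * inversion, multiply its data by the matching entries of the inverse
 * rows (working in log space via invlog) and accumulate the products
 * into the target columns with XOR.
 */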
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

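/*
 * General reconstruction: choose one surviving parity column for each
 * missing data column, build the corresponding rows of the dispersal
 * matrix, invert them with Gauss-Jordan elimination, and multiply
 * through to regenerate the missing data.  The return value is a
 * bitmask of the parity columns that were used.
 */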
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

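	/*
	 * q is the number of sectors each column receives from the full
	 * rows, r is the number of leftover data sectors, bc counts the
	 * "big" columns that hold one extra sector (the parity columns
	 * plus the first r data columns), and tot is the total sector
	 * count including parity.
	 */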
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * Unlike the kernel's version of this function, the boot code has no
 * injected errors to keep track of; it simply verifies the checksum of
 * the data reassembled from the columns.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size __unused)
{

	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset __unused, uint64_t bytes, int total_errors,
    int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the
				 * current position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.  The kernel would start checksum ereports
		 * for all children which haven't failed here; the boot code
		 * simply returns the error.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
