1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/cdefs.h>
27#include <lz4.h>
28
/*
 * CRC64 lookup table for the ZAP hash function; filled in lazily by
 * zfs_init_crc().  Entry 128 (== ZFS_CRC64_POLY once built) doubles as
 * the "already initialized" sentinel.
 */
static uint64_t zfs_crc64_table[256];

/* Local stand-in for the kernel's ECKSUM errno value (checksum error). */
#define	ECKSUM	666

/* The boot code compiles out all ZFS assertion macros. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)
38
39static void
40zfs_init_crc(void)
41{
42	int i, j;
43	uint64_t *ct;
44
45	/*
46	 * Calculate the crc64 table (used for the zap hash
47	 * function).
48	 */
49	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
50		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
51		for (i = 0; i < 256; i++) {
52			ct = zfs_crc64_table + i;
53			for (*ct = i, j = 8; j > 0; j--)
54				*ct = (*ct >> 1) ^
55				    (-(*ct & 1) & ZFS_CRC64_POLY);
56		}
57	}
58}
59
/*
 * Checksum vector for "off"/"noparity": produces an all-zero checksum
 * regardless of the buffer contents.
 */
static void
zio_checksum_off(const void *buf __unused, uint64_t size __unused,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
66
67/*
68 * Signature for checksum functions.
69 */
70typedef void zio_checksum_t(const void *data, uint64_t size,
71    const void *ctx_template, zio_cksum_t *zcp);
72typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
73typedef void zio_checksum_tmpl_free_t(void *ctx_template);
74
75typedef enum zio_checksum_flags {
76	/* Strong enough for metadata? */
77	ZCHECKSUM_FLAG_METADATA = (1 << 1),
78	/* ZIO embedded checksum */
79	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
80	/* Strong enough for dedup (without verification)? */
81	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
82	/* Uses salt value */
83	ZCHECKSUM_FLAG_SALTED = (1 << 4),
84	/* Strong enough for nopwrite? */
85	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
86} zio_checksum_flags_t;
87
88/*
89 * Information about each checksum function.
90 */
91typedef struct zio_checksum_info {
92	/* checksum function for each byteorder */
93	zio_checksum_t			*ci_func[2];
94	zio_checksum_tmpl_init_t	*ci_tmpl_init;
95	zio_checksum_tmpl_free_t	*ci_tmpl_free;
96	zio_checksum_flags_t		ci_flags;
97	const char			*ci_name;	/* descriptive name */
98} zio_checksum_info_t;
99
100#include "blkptr.c"
101
102#include "fletcher.c"
103#include "sha256.c"
104#include "skein_zfs.c"
105#include "edonr_zfs.c"
106
/*
 * Checksum vectors, indexed by the on-disk ZIO_CHECKSUM_* value; entry
 * order must therefore match the enum zio_checksum numbering.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	/* Salted algorithms: these require a per-pool context template. */
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
141
142/*
143 * Common signature for all zio compress/decompress functions.
144 */
145typedef size_t zio_compress_func_t(void *src, void *dst,
146    size_t s_len, size_t d_len, int);
147typedef int zio_decompress_func_t(void *src, void *dst,
148    size_t s_len, size_t d_len, int);
149
150extern int gzip_decompress(void *src, void *dst,
151    size_t s_len, size_t d_len, int);
152/*
153 * Information about each compression function.
154 */
155typedef struct zio_compress_info {
156	zio_compress_func_t	*ci_compress;	/* compression function */
157	zio_decompress_func_t	*ci_decompress;	/* decompression function */
158	int			ci_level;	/* level parameter */
159	const char		*ci_name;	/* algorithm name */
160} zio_compress_info_t;
161
162#include "lzjb.c"
163#include "zle.c"
164
165/*
166 * Compression vectors.
167 */
168static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
169	{NULL,			NULL,			0,	"inherit"},
170	{NULL,			NULL,			0,	"on"},
171	{NULL,			NULL,			0,	"uncompressed"},
172	{NULL,			lzjb_decompress,	0,	"lzjb"},
173	{NULL,			NULL,			0,	"empty"},
174	{NULL,			gzip_decompress,	1,	"gzip-1"},
175	{NULL,			gzip_decompress,	2,	"gzip-2"},
176	{NULL,			gzip_decompress,	3,	"gzip-3"},
177	{NULL,			gzip_decompress,	4,	"gzip-4"},
178	{NULL,			gzip_decompress,	5,	"gzip-5"},
179	{NULL,			gzip_decompress,	6,	"gzip-6"},
180	{NULL,			gzip_decompress,	7,	"gzip-7"},
181	{NULL,			gzip_decompress,	8,	"gzip-8"},
182	{NULL,			gzip_decompress,	9,	"gzip-9"},
183	{NULL,			zle_decompress,		64,	"zle"},
184	{NULL,			lz4_decompress,		0,	"lz4"},
185};
186
/*
 * Byte-swap an array of 64-bit words in place.
 *
 * vbuf/size: buffer and its length in bytes; size must be a multiple
 * of 8 (asserted, but assertions compile to no-ops in the boot code).
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	/*
	 * Fix: the index was an int compared against a size_t bound
	 * (signed/unsigned mismatch), which would overflow for buffers
	 * larger than 16 GiB worth of words.
	 */
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
199
200/*
201 * Set the external verifier for a gang block based on <vdev, offset, txg>,
202 * a tuple which is guaranteed to be unique for the life of the pool.
203 */
204static void
205zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
206{
207	const dva_t *dva = BP_IDENTITY(bp);
208	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
209
210	ASSERT(BP_IS_GANG(bp));
211
212	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
213}
214
215/*
216 * Set the external verifier for a label block based on its offset.
217 * The vdev is implicit, and the txg is unknowable at pool open time --
218 * hence the logic in vdev_uberblock_load() to find the most recent copy.
219 */
220static void
221zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
222{
223	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
224}
225
226/*
227 * Calls the template init function of a checksum which supports context
228 * templates and installs the template into the spa_t.
229 */
230static void
231zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
232{
233	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
234
235	if (ci->ci_tmpl_init == NULL)
236		return;
237
238	if (spa->spa_cksum_tmpls[checksum] != NULL)
239		return;
240
241	if (spa->spa_cksum_tmpls[checksum] == NULL) {
242		spa->spa_cksum_tmpls[checksum] =
243		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
244	}
245}
246
247/*
248 * Called by a spa_t that's about to be deallocated. This steps through
249 * all of the checksum context templates and deallocates any that were
250 * initialized using the algorithm-specific template init function.
251 */
252void
253zio_checksum_templates_free(spa_t *spa)
254{
255	for (enum zio_checksum checksum = 0;
256	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
257		if (spa->spa_cksum_tmpls[checksum] != NULL) {
258			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
259
260			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
261			spa->spa_cksum_tmpls[checksum] = NULL;
262		}
263	}
264}
265
/*
 * Verify the checksum of the block described by bp against the buffer
 * 'data'.  Returns 0 on match, EINVAL for an unknown or unimplemented
 * checksum algorithm, EIO on checksum mismatch.
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	/* Reject algorithms we do not know or cannot compute. */
	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		/* Lazily instantiate the salted context template, if any. */
		zio_checksum_template_init(checksum, (spa_t *)spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		/*
		 * Embedded checksum: the checksum lives in a zio_eck_t
		 * trailer at the end of the block itself.
		 */
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		/*
		 * Compute the external verifier.  The final else is
		 * unreachable if the assertion above holds; it mirrors
		 * the in-kernel code's defensive fallback.
		 */
		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* A byteswapped trailer magic marks a foreign-endian block. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily replace the stored checksum with the verifier
		 * so the buffer hashes exactly as it did when the checksum
		 * was generated, then restore the stored value.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Ordinary case: the expected checksum is in the blkptr. */
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}
332
333static int
334zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
335    void *dest, uint64_t destsize)
336{
337	zio_compress_info_t *ci;
338
339	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
340		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
341		return (EIO);
342	}
343
344	ci = &zio_compress_table[cpfunc];
345	if (!ci->ci_decompress) {
346		printf("ZFS: unsupported compression algorithm %s\n",
347		    ci->ci_name);
348		return (EIO);
349	}
350
351	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
352}
353
/*
 * ZAP hash: CRC64 of 'name' seeded with the per-object 'salt', with
 * only the high ZAP_HASHBITS bits of the result retained.  Requires
 * zfs_init_crc() to have populated zfs_crc64_table (asserted).
 */
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
376
/* One per-device column of a RAID-Z I/O. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/* Geometry of one logical I/O mapped onto a RAID-Z vdev. */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
402
/* Indices of the parity columns within a raidz_map_t. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one byte by 2 (resp. 4) in GF(2^8): shift left, and if the
 * high bit was set, reduce by XORing with 0x1d.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
429
430/*
431 * These two tables represent powers and logs of 2 in the Galois field defined
432 * above. These values were computed by repeatedly multiplying by 2 as above.
433 */
434static const uint8_t vdev_raidz_pow2[256] = {
435	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
436	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
437	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
438	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
439	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
440	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
441	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
442	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
443	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
444	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
445	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
446	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
447	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
448	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
449	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
450	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
451	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
452	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
453	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
454	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
455	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
456	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
457	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
458	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
459	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
460	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
461	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
462	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
463	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
464	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
465	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
466	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
467};
/*
 * log2[x] is the discrete logarithm of x base 2 in GF(2^8).  log2[0]
 * is a placeholder (zero has no logarithm); vdev_raidz_exp2 special-
 * cases a == 0 before consulting this table.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
502
503/*
504 * Multiply a given number by 2 raised to the given power.
505 */
506static uint8_t
507vdev_raidz_exp2(uint8_t a, int exp)
508{
509	if (a == 0)
510		return (0);
511
512	ASSERT(exp >= 0);
513	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
514
515	exp += vdev_raidz_log2[a];
516	if (exp > 255)
517		exp -= 255;
518
519	return (vdev_raidz_pow2[exp]);
520}
521
522static void
523vdev_raidz_generate_parity_p(raidz_map_t *rm)
524{
525	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
526	int c;
527
528	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
529
530	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
531		src = rm->rm_col[c].rc_data;
532		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
533		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
534
535		if (c == rm->rm_firstdatacol) {
536			ASSERT(ccount == pcount);
537			for (i = 0; i < ccount; i++, src++, p++) {
538				*p = *src;
539			}
540		} else {
541			ASSERT(ccount <= pcount);
542			for (i = 0; i < ccount; i++, src++, p++) {
543				*p ^= *src;
544			}
545		}
546	}
547}
548
/*
 * Compute the P (XOR) and Q (GF(2^8)) parity columns.  Q is evaluated
 * Horner-style: at each column the running Q is multiplied by 2 and the
 * new data is XORed in.  Short columns are treated as zero-filled.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
600
/*
 * Compute the P (XOR), Q (multiply by 2) and R (multiply by 4) parity
 * columns, each evaluated Horner-style as in the PQ case above.  Short
 * columns are treated as zero-filled.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes all three parities. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
661
662/*
663 * Generate RAID parity in the first virtual columns according to the number of
664 * parity columns available.
665 */
666static void
667vdev_raidz_generate_parity(raidz_map_t *rm)
668{
669	switch (rm->rm_firstdatacol) {
670	case 1:
671		vdev_raidz_generate_parity_p(rm);
672		break;
673	case 2:
674		vdev_raidz_generate_parity_pq(rm);
675		break;
676	case 3:
677		vdev_raidz_generate_parity_pqr(rm);
678		break;
679	default:
680		panic("invalid RAID-Z configuration");
681	}
682}
683
684/* BEGIN CSTYLED */
685/*
686 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
688 * the contents of the data and parity disks. This can be expressed with
689 * vectors for the original data (D) and the actual data (d) and parity (p)
690 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
691 *
692 *            __   __                     __     __
693 *            |     |         __     __   |  p_0  |
694 *            |  V  |         |  D_0  |   | p_m-1 |
695 *            |     |    x    |   :   | = |  d_0  |
696 *            |  I  |         | D_n-1 |   |   :   |
697 *            |     |         ~~     ~~   | d_n-1 |
698 *            ~~   ~~                     ~~     ~~
699 *
700 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
702 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
703 * computation as well as linear separability.
704 *
705 *      __               __               __     __
706 *      |   1   ..  1 1 1 |               |  p_0  |
707 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
708 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
709 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
710 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
711 *      |   :       : : : |   |   :   |   |  d_2  |
712 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
713 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
714 *      |   0   ..  0 0 1 |               | d_n-1 |
715 *      ~~               ~~               ~~     ~~
716 *
717 * Note that I, V, d, and p are known. To compute D, we must invert the
718 * matrix and use the known data and parity values to reconstruct the unknown
719 * data values. We begin by removing the rows in V|I and d|p that correspond
720 * to failed or missing columns; we then make V|I square (n x n) and d|p
721 * sized n by removing rows corresponding to unused parity from the bottom up
722 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
723 * using Gauss-Jordan elimination. In the example below we use m=3 parity
724 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
725 *           __                               __
726 *           |  1   1   1   1   1   1   1   1  |
727 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
728 *           |  19 205 116  29  64  16  4   1  |      / /
729 *           |  1   0   0   0   0   0   0   0  |     / /
730 *           |  0   1   0   0   0   0   0   0  | <--' /
731 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
732 *           |  0   0   0   1   0   0   0   0  |
733 *           |  0   0   0   0   1   0   0   0  |
734 *           |  0   0   0   0   0   1   0   0  |
735 *           |  0   0   0   0   0   0   1   0  |
736 *           |  0   0   0   0   0   0   0   1  |
737 *           ~~                               ~~
738 *           __                               __
739 *           |  1   1   1   1   1   1   1   1  |
740 *           | 128  64  32  16  8   4   2   1  |
741 *           |  19 205 116  29  64  16  4   1  |
742 *           |  1   0   0   0   0   0   0   0  |
743 *           |  0   1   0   0   0   0   0   0  |
744 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
745 *           |  0   0   0   1   0   0   0   0  |
746 *           |  0   0   0   0   1   0   0   0  |
747 *           |  0   0   0   0   0   1   0   0  |
748 *           |  0   0   0   0   0   0   1   0  |
749 *           |  0   0   0   0   0   0   0   1  |
750 *           ~~                               ~~
751 *
752 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
753 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
754 * matrix is not singular.
755 * __                                                                 __
756 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
757 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
758 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
759 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
760 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
761 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
762 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
763 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
764 * ~~                                                                 ~~
765 * __                                                                 __
766 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
767 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
768 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
769 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
770 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
771 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
772 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
773 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
774 * ~~                                                                 ~~
775 * __                                                                 __
776 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
777 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
778 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
779 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
780 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
781 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
782 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
783 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
784 * ~~                                                                 ~~
785 * __                                                                 __
786 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
787 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
788 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
789 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
790 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
791 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
792 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
793 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
794 * ~~                                                                 ~~
795 * __                                                                 __
796 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
797 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
798 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
799 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
800 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
801 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
802 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
803 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
804 * ~~                                                                 ~~
805 * __                                                                 __
806 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
807 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
808 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
809 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
810 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
811 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
812 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
813 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
814 * ~~                                                                 ~~
815 *                   __                               __
816 *                   |  0   0   1   0   0   0   0   0  |
817 *                   | 167 100  5   41 159 169 217 208 |
818 *                   | 166 100  4   40 158 168 216 209 |
819 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
820 *                   |  0   0   0   0   1   0   0   0  |
821 *                   |  0   0   0   0   0   1   0   0  |
822 *                   |  0   0   0   0   0   0   1   0  |
823 *                   |  0   0   0   0   0   0   0   1  |
824 *                   ~~                               ~~
825 *
826 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
827 * of the missing data.
828 *
829 * As is apparent from the example above, the only non-trivial rows in the
830 * inverse matrix correspond to the data disks that we're trying to
831 * reconstruct. Indeed, those are the only rows we need as the others would
832 * only be useful for reconstructing data known or assumed to be valid. For
833 * that reason, we only build the coefficients in the rows that correspond to
834 * targeted columns.
835 */
836/* END CSTYLED */
837
/*
 * Build the dispersal-matrix rows needed for reconstruction.  map[i]
 * selects a parity flavor (0 = P, 1 = Q, 2 = R) and row i is filled
 * with rows[i][j] = 2^(map[i] * (n - 1 - j)) in GF(2^8), computed
 * right-to-left via the pow2 table with exponents kept mod 255.
 * rm is only referenced by a compiled-out assertion, hence __unused.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start the exponent at map[i] * n, reduced mod 255. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
867
/*
 * Invert (the interesting rows of) the reconstruction matrix in place
 * using Gauss-Jordan elimination over GF(2^8).  Multiplication is
 * performed via the vdev_raidz_log2/vdev_raidz_exp2 tables (adding
 * discrete logs modulo 255); addition and subtraction are both XOR.
 * On return 'invrows' holds the rows of the inverse corresponding to
 * the missing data columns and 'rows' has been reduced to rows of the
 * identity matrix (verified at the end).
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			/* Start from the identity matrix. */
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			/*
			 * Subtraction is XOR, so "subtracting" an identity
			 * row simply moves the coefficient over to the
			 * inverse and zeroes it here.
			 */
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		/* log of the multiplicative inverse: 2^255 == 1 in GF(2^8) */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
961
/*
 * Regenerate the missing data columns.  For each surviving column i (in
 * the order given by 'used': parity first, then data) and each missing
 * column cc, accumulate invrows[cc][i] * src[x] into the destination.
 * Multiplication in GF(2^8) is a sum of discrete logs modulo 255 followed
 * by a vdev_raidz_pow2 lookup; accumulation is XOR.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One allocation backs all nmissing rows of coefficient logs. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/*
	 * Precompute the log of every inverse-matrix coefficient so the
	 * inner loop multiplies with a single table lookup.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		/* Cache the destination buffers and their lengths. */
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Skip bytes past a shorter column's end. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					/* Zero times anything is zero. */
					val = 0;
				} else {
					/* Multiply: add logs modulo 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * destination; the rest accumulate via XOR.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
1041
/*
 * Reconstruct the data columns named in tgts[0..ntgts-1] via general
 * matrix inversion over GF(2^8): pick enough non-targeted parity columns,
 * build the corresponding rows, invert them, and apply the inverse to the
 * surviving columns.  Returns a bitmask ('code') of the parity columns
 * that were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* Matrix dimension: one column per data column in the map. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		/* Record this parity column in the returned bitmask. */
		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One allocation backs the row and inverse-row scratch matrices as
	 * well as the 'used' column map.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first nmissing_rows entries of 'used' are parity columns... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ...followed by the surviving (non-missing) data columns. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}
1151
1152static int
1153vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1154{
1155	int tgts[VDEV_RAIDZ_MAXPARITY];
1156	int ntgts;
1157	int i, c;
1158	int code;
1159	int nbadparity, nbaddata;
1160
1161	/*
1162	 * The tgts list must already be sorted.
1163	 */
1164	for (i = 1; i < nt; i++) {
1165		ASSERT(t[i] > t[i - 1]);
1166	}
1167
1168	nbadparity = rm->rm_firstdatacol;
1169	nbaddata = rm->rm_cols - nbadparity;
1170	ntgts = 0;
1171	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1172		if (i < nt && c == t[i]) {
1173			tgts[ntgts++] = c;
1174			i++;
1175		} else if (rm->rm_col[c].rc_error != 0) {
1176			tgts[ntgts++] = c;
1177		} else if (c >= rm->rm_firstdatacol) {
1178			nbaddata--;
1179		} else {
1180			nbadparity--;
1181		}
1182	}
1183
1184	ASSERT(ntgts >= nt);
1185	ASSERT(nbaddata >= 0);
1186	ASSERT(nbaddata + nbadparity == ntgts);
1187
1188	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1189	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1190	ASSERT(code > 0);
1191	return (code);
1192}
1193
/*
 * Map a logical I/O (offset, size) onto the columns of a RAID-Z vdev
 * with 'dcols' children and 'nparity' parity columns, allocating the
 * raidz_map_t and per-parity-column buffers.  Data columns alias the
 * caller's 'data' buffer.  Returns NULL on allocation failure.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The I/O size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": the number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);
	/*
	 * "Remainder": the number of partial stripe data sectors in this I/O.
	 */
	r = s - q * (dcols - nparity);
	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);
	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: the columns that will be accessed.
	 * scols: the columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrap around into the next row of the stripe. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			/* Big columns carry one extra (remainder) sector. */
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/*
	 * Parity columns get their own buffers; on failure unwind whatever
	 * has already been allocated.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	/* The first data column aliases the caller's buffer directly... */
	rm->rm_col[c].rc_data = data;

	/* ...and subsequent data columns point into it back-to-back. */
	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		/* Swap the device/offset of columns 0 and 1. */
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1321
1322static void
1323vdev_raidz_map_free(raidz_map_t *rm)
1324{
1325	int c;
1326
1327	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1328		free(rm->rm_col[c].rc_data);
1329
1330	free(rm);
1331}
1332
1333static vdev_t *
1334vdev_child(vdev_t *pvd, uint64_t devidx)
1335{
1336	vdev_t *cvd;
1337
1338	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1339		if (cvd->v_id == devidx)
1340			break;
1341	}
1342
1343	return (cvd);
1344}
1345
1346/*
1347 * We keep track of whether or not there were any injected errors, so that
1348 * any ereports we generate can note it.
1349 */
1350static int
1351raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1352    uint64_t size __unused)
1353{
1354
1355	return (zio_checksum_verify(spa, bp, data));
1356}
1357
1358/*
1359 * Generate the parity from the data columns. If we tried and were able to
1360 * read the parity without error, verify that the generated parity matches the
1361 * data we read. If it doesn't, we fire off a checksum error. Return the
1362 * number such failures.
1363 */
1364static int
1365raidz_parity_verify(raidz_map_t *rm)
1366{
1367	void *orig[VDEV_RAIDZ_MAXPARITY];
1368	int c, ret = 0;
1369	raidz_col_t *rc;
1370
1371	for (c = 0; c < rm->rm_firstdatacol; c++) {
1372		rc = &rm->rm_col[c];
1373		if (!rc->rc_tried || rc->rc_error != 0)
1374			continue;
1375		orig[c] = malloc(rc->rc_size);
1376		if (orig[c] != NULL) {
1377			bcopy(rc->rc_data, orig[c], rc->rc_size);
1378		} else {
1379			printf("Out of memory\n");
1380		}
1381	}
1382
1383	vdev_raidz_generate_parity(rm);
1384
1385	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1386		rc = &rm->rm_col[c];
1387		if (!rc->rc_tried || rc->rc_error != 0)
1388			continue;
1389		if (orig[c] == NULL ||
1390		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1391			rc->rc_error = ECKSUM;
1392			ret++;
1393		}
1394		free(orig[c]);
1395	}
1396
1397	return (ret);
1398}
1399
1400/*
1401 * Iterate over all combinations of bad data and attempt a reconstruction.
1402 * Note that the algorithm below is non-optimal because it doesn't take into
1403 * account how reconstruction is actually performed. For example, with
1404 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1405 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1406 * cases we'd only use parity information in column 0.
1407 */
/*
 * Combinatorial reconstruction: for each count n of presumed-bad columns
 * (from 1 up to the spare parity available), enumerate every combination
 * of n error-free columns, treat them as bad, reconstruct, and test the
 * result against the block-pointer checksum.  Returns the parity bitmask
 * from the successful reconstruction, ENOMEM on allocation failure, or 0
 * if no combination verified.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset __unused, uint64_t bytes, int total_errors,
    int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tgts points one past tstore[0] so tgts[-1] is valid storage. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				/* Mark the reconstructed columns as bad. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/*
			 * Advance to the next combination of n target
			 * columns, odometer-style.
			 */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* Fell out of the loop: undo the final increment of n. */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
1555
/*
 * Read 'bytes' at 'offset' from a RAID-Z vdev into 'data', reconstructing
 * missing or corrupt columns as needed.  Reconstruction proceeds in three
 * progressively more expensive phases (see comment in the body); success
 * is judged by the block-pointer checksum.  Returns 0 on success, or
 * ENOMEM/EIO/ECKSUM on failure.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			/* Child is absent or unhealthy: don't issue I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Read all data columns; read parity as well once any data
		 * column is known to be missing.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

	/* Retry point after re-reading previously skipped columns. */
reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally the error state of every column. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1820