1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/cdefs.h>
27 #include <lz4.h>
28
/* CRC-64 lookup table for the ZAP hash; filled lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Local stand-in for the kernel's checksum-error errno value. */
#define ECKSUM 666

/* Assertions are compiled out in this standalone (boot) environment. */
#define ASSERT3S(x, y, z) ((void)0)
#define ASSERT3U(x, y, z) ((void)0)
#define ASSERT3P(x, y, z) ((void)0)
#define ASSERT0(x) ((void)0)
#define ASSERT(x) ((void)0)
38
39 static void
zfs_init_crc(void)40 zfs_init_crc(void)
41 {
42 int i, j;
43 uint64_t *ct;
44
45 /*
46 * Calculate the crc64 table (used for the zap hash
47 * function).
48 */
49 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
50 memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
51 for (i = 0; i < 256; i++) {
52 ct = zfs_crc64_table + i;
53 for (*ct = i, j = 8; j > 0; j--)
54 *ct = (*ct >> 1) ^
55 (-(*ct & 1) & ZFS_CRC64_POLY);
56 }
57 }
58 }
59
/*
 * "off" checksum vector: unconditionally produce an all-zero checksum.
 * Parameters other than zcp are ignored.
 */
static void
zio_checksum_off(const void *buf __unused, uint64_t size __unused,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
66
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Allocate a per-pool context template from the pool's checksum salt. */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
/* Release a context template produced by the matching init function. */
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t *ci_func[2];
	zio_checksum_tmpl_init_t *ci_tmpl_init;	/* optional template ctor */
	zio_checksum_tmpl_free_t *ci_tmpl_free;	/* optional template dtor */
	zio_checksum_flags_t ci_flags;
	const char *ci_name;	/* descriptive name */
} zio_checksum_info_t;
99
100 #include "blkptr.c"
101
102 #include "fletcher.c"
103 #include "sha256.c"
104 #include "skein_zfs.c"
105 #include "edonr_zfs.c"
106
/*
 * Checksum vectors, indexed by the on-disk ZIO_CHECKSUM_* value.  Each
 * entry supplies a native and a byteswapped implementation plus optional
 * context-template hooks for salted algorithms.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	/* skein and edonr are salted: they carry template init/free hooks */
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
141
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/* gzip decompression lives in a separate translation unit. */
extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t *ci_compress;	/* compression function */
	zio_decompress_func_t *ci_decompress;	/* decompression function */
	int ci_level;				/* level parameter */
	const char *ci_name;			/* algorithm name */
} zio_compress_info_t;
161
162 #include "lzjb.c"
163 #include "zle.c"
164
/*
 * Compression vectors.  The boot code only ever reads blocks, so every
 * ci_compress slot in this table is NULL; only decompressors are wired.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, gzip_decompress, 1, "gzip-1"},
	{NULL, gzip_decompress, 2, "gzip-2"},
	{NULL, gzip_decompress, 3, "gzip-3"},
	{NULL, gzip_decompress, 4, "gzip-4"},
	{NULL, gzip_decompress, 5, "gzip-5"},
	{NULL, gzip_decompress, 6, "gzip-6"},
	{NULL, gzip_decompress, 7, "gzip-7"},
	{NULL, gzip_decompress, 8, "gzip-8"},
	{NULL, gzip_decompress, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};
186
/*
 * Byte-swap an array of 64-bit words in place.
 *
 * vbuf/size: buffer and its length in bytes; size must be a multiple of
 * eight (asserted, though assertions compile to no-ops here).
 *
 * Fix: the loop index was a signed int compared against a size_t count,
 * a signed/unsigned mismatch that would truncate on buffers larger than
 * INT_MAX words; use size_t for the index as well.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
199
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	/* Verifier words: <vdev, offset, txg, 0>. */
	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
214
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	/* Verifier words: <offset, 0, 0, 0>. */
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
225
226 /*
227 * Calls the template init function of a checksum which supports context
228 * templates and installs the template into the spa_t.
229 */
230 static void
zio_checksum_template_init(enum zio_checksum checksum,spa_t * spa)231 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
232 {
233 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
234
235 if (ci->ci_tmpl_init == NULL)
236 return;
237
238 if (spa->spa_cksum_tmpls[checksum] != NULL)
239 return;
240
241 if (spa->spa_cksum_tmpls[checksum] == NULL) {
242 spa->spa_cksum_tmpls[checksum] =
243 ci->ci_tmpl_init(&spa->spa_cksum_salt);
244 }
245 }
246
247 /*
248 * Called by a spa_t that's about to be deallocated. This steps through
249 * all of the checksum context templates and deallocates any that were
250 * initialized using the algorithm-specific template init function.
251 */
252 void
zio_checksum_templates_free(spa_t * spa)253 zio_checksum_templates_free(spa_t *spa)
254 {
255 for (enum zio_checksum checksum = 0;
256 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
257 if (spa->spa_cksum_tmpls[checksum] != NULL) {
258 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
259
260 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
261 spa->spa_cksum_tmpls[checksum] = NULL;
262 }
263 }
264 }
265
/*
 * Verify the checksum of the block bp's payload held in data.
 *
 * Returns 0 on a matching checksum, EINVAL when the algorithm encoded
 * in bp is out of range or has no implementation, and EIO on mismatch.
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		/* Lazily build the pool's context template, if any. */
		zio_checksum_template_init(checksum, (spa_t *)spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		/* The embedded checksum trailer sits at the end of the buffer. */
		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* The trailer magic tells us the writer's byte order. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored
		 * checksum so we hash the same bytes the writer hashed,
		 * then restore the original trailer contents.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}
332
333 static int
zio_decompress_data(int cpfunc,void * src,uint64_t srcsize,void * dest,uint64_t destsize)334 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
335 void *dest, uint64_t destsize)
336 {
337 zio_compress_info_t *ci;
338
339 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
340 printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
341 return (EIO);
342 }
343
344 ci = &zio_compress_table[cpfunc];
345 if (!ci->ci_decompress) {
346 printf("ZFS: unsupported compression algorithm %s\n",
347 ci->ci_name);
348 return (EIO);
349 }
350
351 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
352 }
353
354 static uint64_t
zap_hash(uint64_t salt,const char * name)355 zap_hash(uint64_t salt, const char *name)
356 {
357 const uint8_t *cp;
358 uint8_t c;
359 uint64_t crc = salt;
360
361 ASSERT(crc != 0);
362 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
363 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
364 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
365
366 /*
367 * Only use 28 bits, since we need 4 bits in the cookie for the
368 * collision differentiator. We MUST use the high bits, since
369 * those are the onces that we first pay attention to when
370 * chosing the bucket.
371 */
372 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
373
374 return (crc);
375 }
376
/* One device-level column (stripe unit) of a RAID-Z I/O. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry of one logical RAID-Z I/O split across child devices.
 * Columns [0, rm_firstdatacol) hold parity; the rest hold data.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
402
/* Parity column indices within a raidz map. */
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
#define VDEV_RAIDZ_R 2

/* Multiply one byte by 2 (and by 4) in GF(2^8) with polynomial 0x11d. */
#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Fix: the multi-statement bodies are wrapped in do { } while (0) so the
 * macros behave as single statements even in unbraced if/else bodies
 * (CERT PRE10-C); previously a bare { } block was used.
 */
#define VDEV_RAIDZ_64MUL_2(x, mask) \
	do { \
		(mask) = (x) & 0x8080808080808080ULL; \
		(mask) = ((mask) << 1) - ((mask) >> 7); \
		(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
		    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
	} while (0)

#define VDEV_RAIDZ_64MUL_4(x, mask) \
	do { \
		VDEV_RAIDZ_64MUL_2((x), mask); \
		VDEV_RAIDZ_64MUL_2((x), mask); \
	} while (0)
429
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 * vdev_raidz_pow2[i] == 2^i in GF(2^8); vdev_raidz_log2 is its inverse
 * (entry 0 is unused since 0 has no discrete log).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power, i.e. compute
 * a * 2^exp in GF(2^8) by adding discrete logs and reducing mod 255.
 * exp is expected to be non-negative; zero has no log and maps to zero.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	int e;

	if (a == 0)
		return (0);

	e = exp + vdev_raidz_log2[a];
	if (e > 255)
		e -= 255;

	return (vdev_raidz_pow2[e]);
}
521
522 static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)523 vdev_raidz_generate_parity_p(raidz_map_t *rm)
524 {
525 uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
526 int c;
527
528 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
529
530 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
531 src = rm->rm_col[c].rc_data;
532 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
533 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
534
535 if (c == rm->rm_firstdatacol) {
536 ASSERT(ccount == pcount);
537 for (i = 0; i < ccount; i++, src++, p++) {
538 *p = *src;
539 }
540 } else {
541 ASSERT(ccount <= pcount);
542 for (i = 0; i < ccount; i++, src++, p++) {
543 *p ^= *src;
544 }
545 }
546 }
547 }
548
/*
 * Compute double parity: P is the plain XOR of the data columns, Q is a
 * GF(2^8) multiply-accumulate (each step multiplies the running Q by 2
 * via VDEV_RAIDZ_64MUL_2, then XORs in the new column).
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			/* Seed both parity columns from the first column. */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill parity past a short (empty) first column. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
600
/*
 * Compute triple parity: P is the plain XOR, Q multiplies the running
 * value by 2 per step, and R multiplies by 4, giving three linearly
 * independent parity equations over GF(2^8).
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			/* Seed all three parity columns from the first column. */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill parity past a short (empty) first column. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
661
662 /*
663 * Generate RAID parity in the first virtual columns according to the number of
664 * parity columns available.
665 */
666 static void
vdev_raidz_generate_parity(raidz_map_t * rm)667 vdev_raidz_generate_parity(raidz_map_t *rm)
668 {
669 switch (rm->rm_firstdatacol) {
670 case 1:
671 vdev_raidz_generate_parity_p(rm);
672 break;
673 case 2:
674 vdev_raidz_generate_parity_pq(rm);
675 break;
676 case 3:
677 vdev_raidz_generate_parity_pqr(rm);
678 break;
679 default:
680 panic("invalid RAID-Z configuration");
681 }
682 }
683
684 /* BEGIN CSTYLED */
685 /*
686 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
688 * the contents of the data and parity disks. This can be expressed with
689 * vectors for the original data (D) and the actual data (d) and parity (p)
690 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
691 *
692 * __ __ __ __
693 * | | __ __ | p_0 |
694 * | V | | D_0 | | p_m-1 |
695 * | | x | : | = | d_0 |
696 * | I | | D_n-1 | | : |
697 * | | ~~ ~~ | d_n-1 |
698 * ~~ ~~ ~~ ~~
699 *
700 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
702 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
703 * computation as well as linear separability.
704 *
705 * __ __ __ __
706 * | 1 .. 1 1 1 | | p_0 |
707 * | 2^n-1 .. 4 2 1 | __ __ | : |
708 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
709 * | 1 .. 0 0 0 | | D_1 | | d_0 |
710 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
711 * | : : : : | | : | | d_2 |
712 * | 0 .. 1 0 0 | | D_n-1 | | : |
713 * | 0 .. 0 1 0 | ~~ ~~ | : |
714 * | 0 .. 0 0 1 | | d_n-1 |
715 * ~~ ~~ ~~ ~~
716 *
717 * Note that I, V, d, and p are known. To compute D, we must invert the
718 * matrix and use the known data and parity values to reconstruct the unknown
719 * data values. We begin by removing the rows in V|I and d|p that correspond
720 * to failed or missing columns; we then make V|I square (n x n) and d|p
721 * sized n by removing rows corresponding to unused parity from the bottom up
722 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
723 * using Gauss-Jordan elimination. In the example below we use m=3 parity
724 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
725 * __ __
726 * | 1 1 1 1 1 1 1 1 |
727 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
728 * | 19 205 116 29 64 16 4 1 | / /
729 * | 1 0 0 0 0 0 0 0 | / /
730 * | 0 1 0 0 0 0 0 0 | <--' /
731 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
732 * | 0 0 0 1 0 0 0 0 |
733 * | 0 0 0 0 1 0 0 0 |
734 * | 0 0 0 0 0 1 0 0 |
735 * | 0 0 0 0 0 0 1 0 |
736 * | 0 0 0 0 0 0 0 1 |
737 * ~~ ~~
738 * __ __
739 * | 1 1 1 1 1 1 1 1 |
740 * | 128 64 32 16 8 4 2 1 |
741 * | 19 205 116 29 64 16 4 1 |
742 * | 1 0 0 0 0 0 0 0 |
743 * | 0 1 0 0 0 0 0 0 |
744 * (V|I)' = | 0 0 1 0 0 0 0 0 |
745 * | 0 0 0 1 0 0 0 0 |
746 * | 0 0 0 0 1 0 0 0 |
747 * | 0 0 0 0 0 1 0 0 |
748 * | 0 0 0 0 0 0 1 0 |
749 * | 0 0 0 0 0 0 0 1 |
750 * ~~ ~~
751 *
752 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
753 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
754 * matrix is not singular.
755 * __ __
756 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
757 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
758 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
759 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
760 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
761 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
762 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
763 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
764 * ~~ ~~
765 * __ __
766 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
767 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
768 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
769 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
770 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
771 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
772 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
773 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
774 * ~~ ~~
775 * __ __
776 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
777 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
778 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
779 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
780 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
781 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
782 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
783 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
784 * ~~ ~~
785 * __ __
786 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
787 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
788 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
789 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
790 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
791 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
792 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
793 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
794 * ~~ ~~
795 * __ __
796 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
797 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
798 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
799 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
800 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
801 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
802 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
803 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
804 * ~~ ~~
805 * __ __
806 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
807 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
808 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
809 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
810 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
811 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
812 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
813 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
814 * ~~ ~~
815 * __ __
816 * | 0 0 1 0 0 0 0 0 |
817 * | 167 100 5 41 159 169 217 208 |
818 * | 166 100 4 40 158 168 216 209 |
819 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
820 * | 0 0 0 0 1 0 0 0 |
821 * | 0 0 0 0 0 1 0 0 |
822 * | 0 0 0 0 0 0 1 0 |
823 * | 0 0 0 0 0 0 0 1 |
824 * ~~ ~~
825 *
826 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
827 * of the missing data.
828 *
829 * As is apparent from the example above, the only non-trivial rows in the
830 * inverse matrix correspond to the data disks that we're trying to
831 * reconstruct. Indeed, those are the only rows we need as the others would
832 * only be useful for reconstructing data known or assumed to be valid. For
833 * that reason, we only build the coefficients in the rows that correspond to
834 * targeted columns.
835 */
836 /* END CSTYLED */
837
/*
 * Fill in the nmap rows of the dispersal matrix that correspond to the
 * parity columns listed in map[] (0 = P, 1 = Q, 2 = R).  Exponent
 * arithmetic is done mod 255, the multiplicative order of GF(2^8), so
 * row entries appear to be 2^(map[i] * k) for descending k — a
 * Vandermonde-style row per parity function (see the comment above).
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start from 2^(map[i]*n), reduced mod 255. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Step the exponent down by map[i] per column. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
867
/*
 * Gauss-Jordan elimination: compute, in invrows, the rows of the inverse
 * of (V|I)' that correspond to the nmissing missing data columns.  rows
 * holds the matching rows of (V|I)' on entry and is reduced to identity
 * rows on return; used[] lists the surviving columns (parity first).
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
961
/*
 * Regenerate the nmissing missing data columns by multiplying the
 * surviving columns (listed in used[]) by the corresponding rows of the
 * inverse matrix.  GF(2^8) multiplies are done in the log domain: the
 * inverse-row entries are converted to logs once up front, then each
 * product is a log add plus a pow2 table lookup.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One scratch row of n log values per missing column. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	/* Carve the single allocation into nmissing row pointers. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of the inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into every target column. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				/* val = src-byte * invrows[cc][i] in GF(2^8) */
				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column initializes, the rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
1041
/*
 * Reconstruct any combination of failed columns using matrix inversion over
 * GF(2^8).  Works for any parity count, unlike the specialized P/Q/PQ paths.
 *
 * rm    - the raidz map to repair
 * tgts  - sorted list of column indices to reconstruct
 * ntgts - number of entries in tgts
 *
 * Returns a bitmask identifying which parity columns were used (bit c set
 * means parity column c participated).
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	/* n is the number of data columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		/* Record this parity column in the returned bitmask. */
		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Single allocation holding the matrix rows, the inverse rows, and
	 * the used-column list, carved up below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The chosen parity columns come first in the used list... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * ... followed by the surviving data columns.  Note that i carries
	 * over from the loop above (== nmissing_rows) and keeps counting.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}
1151
1152 static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1153 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1154 {
1155 int tgts[VDEV_RAIDZ_MAXPARITY];
1156 int ntgts;
1157 int i, c;
1158 int code;
1159 int nbadparity, nbaddata;
1160
1161 /*
1162 * The tgts list must already be sorted.
1163 */
1164 for (i = 1; i < nt; i++) {
1165 ASSERT(t[i] > t[i - 1]);
1166 }
1167
1168 nbadparity = rm->rm_firstdatacol;
1169 nbaddata = rm->rm_cols - nbadparity;
1170 ntgts = 0;
1171 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1172 if (i < nt && c == t[i]) {
1173 tgts[ntgts++] = c;
1174 i++;
1175 } else if (rm->rm_col[c].rc_error != 0) {
1176 tgts[ntgts++] = c;
1177 } else if (c >= rm->rm_firstdatacol) {
1178 nbaddata--;
1179 } else {
1180 nbadparity--;
1181 }
1182 }
1183
1184 ASSERT(ntgts >= nt);
1185 ASSERT(nbaddata >= 0);
1186 ASSERT(nbaddata + nbadparity == ntgts);
1187
1188 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1189 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1190 ASSERT(code > 0);
1191 return (code);
1192 }
1193
/*
 * Build the logical-to-physical layout map for a RAID-Z I/O.
 *
 * data       - the caller's buffer holding (or to receive) the data columns
 * offset     - byte offset of the I/O on the top-level vdev
 * size       - byte length of the I/O
 * unit_shift - log2 of the device sector size (ashift)
 * dcols      - number of child devices (data + parity)
 * nparity    - number of parity columns (1-3)
 *
 * Returns a malloc'd raidz_map_t with parity column buffers allocated and
 * data column pointers aimed into the caller's buffer, or NULL on
 * allocation failure.  Free with vdev_raidz_map_free().
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* Full rows of data sectors per device, and the leftover. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	/* bc counts the columns that get one extra sector (plus parity). */
	bc = (r == 0 ? 0 : r + nparity);
	/* Total sectors consumed including parity. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		/* Less than one full row: only bc columns are used. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap to the next stripe row past the last device. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			/* The first bc columns carry one extra sector. */
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Allocate scratch buffers for the parity columns. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			/*
			 * Unwind: free columns 0..c (the failed slot is
			 * NULL, which free() tolerates) and the map.
			 */
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	/* Data columns point directly into the caller's buffer. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1321
1322 static void
vdev_raidz_map_free(raidz_map_t * rm)1323 vdev_raidz_map_free(raidz_map_t *rm)
1324 {
1325 int c;
1326
1327 for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1328 free(rm->rm_col[c].rc_data);
1329
1330 free(rm);
1331 }
1332
1333 static vdev_t *
vdev_child(vdev_t * pvd,uint64_t devidx)1334 vdev_child(vdev_t *pvd, uint64_t devidx)
1335 {
1336 vdev_t *cvd;
1337
1338 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1339 if (cvd->v_id == devidx)
1340 break;
1341 }
1342
1343 return (cvd);
1344 }
1345
1346 /*
1347 * We keep track of whether or not there were any injected errors, so that
1348 * any ereports we generate can note it.
1349 */
1350 static int
raidz_checksum_verify(const spa_t * spa,const blkptr_t * bp,void * data,uint64_t size __unused)1351 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1352 uint64_t size __unused)
1353 {
1354
1355 return (zio_checksum_verify(spa, bp, data));
1356 }
1357
1358 /*
1359 * Generate the parity from the data columns. If we tried and were able to
1360 * read the parity without error, verify that the generated parity matches the
1361 * data we read. If it doesn't, we fire off a checksum error. Return the
1362 * number such failures.
1363 */
1364 static int
raidz_parity_verify(raidz_map_t * rm)1365 raidz_parity_verify(raidz_map_t *rm)
1366 {
1367 void *orig[VDEV_RAIDZ_MAXPARITY];
1368 int c, ret = 0;
1369 raidz_col_t *rc;
1370
1371 for (c = 0; c < rm->rm_firstdatacol; c++) {
1372 rc = &rm->rm_col[c];
1373 if (!rc->rc_tried || rc->rc_error != 0)
1374 continue;
1375 orig[c] = malloc(rc->rc_size);
1376 if (orig[c] != NULL) {
1377 bcopy(rc->rc_data, orig[c], rc->rc_size);
1378 } else {
1379 printf("Out of memory\n");
1380 }
1381 }
1382
1383 vdev_raidz_generate_parity(rm);
1384
1385 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1386 rc = &rm->rm_col[c];
1387 if (!rc->rc_tried || rc->rc_error != 0)
1388 continue;
1389 if (orig[c] == NULL ||
1390 bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1391 rc->rc_error = ECKSUM;
1392 ret++;
1393 }
1394 free(orig[c]);
1395 }
1396
1397 return (ret);
1398 }
1399
1400 /*
1401 * Iterate over all combinations of bad data and attempt a reconstruction.
1402 * Note that the algorithm below is non-optimal because it doesn't take into
1403 * account how reconstruction is actually performed. For example, with
1404 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1405 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1406 * cases we'd only use parity information in column 0.
1407 */
1408 static int
vdev_raidz_combrec(const spa_t * spa,raidz_map_t * rm,const blkptr_t * bp,void * data,off_t offset __unused,uint64_t bytes,int total_errors,int data_errors)1409 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1410 void *data, off_t offset __unused, uint64_t bytes, int total_errors,
1411 int data_errors)
1412 {
1413 raidz_col_t *rc;
1414 void *orig[VDEV_RAIDZ_MAXPARITY];
1415 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1416 int *tgts = &tstore[1];
1417 int current, next, i, c, n;
1418 int code, ret = 0;
1419
1420 ASSERT(total_errors < rm->rm_firstdatacol);
1421
1422 /*
1423 * This simplifies one edge condition.
1424 */
1425 tgts[-1] = -1;
1426
1427 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1428 /*
1429 * Initialize the targets array by finding the first n columns
1430 * that contain no error.
1431 *
1432 * If there were no data errors, we need to ensure that we're
1433 * always explicitly attempting to reconstruct at least one
1434 * data column. To do this, we simply push the highest target
1435 * up into the data columns.
1436 */
1437 for (c = 0, i = 0; i < n; i++) {
1438 if (i == n - 1 && data_errors == 0 &&
1439 c < rm->rm_firstdatacol) {
1440 c = rm->rm_firstdatacol;
1441 }
1442
1443 while (rm->rm_col[c].rc_error != 0) {
1444 c++;
1445 ASSERT3S(c, <, rm->rm_cols);
1446 }
1447
1448 tgts[i] = c++;
1449 }
1450
1451 /*
1452 * Setting tgts[n] simplifies the other edge condition.
1453 */
1454 tgts[n] = rm->rm_cols;
1455
1456 /*
1457 * These buffers were allocated in previous iterations.
1458 */
1459 for (i = 0; i < n - 1; i++) {
1460 ASSERT(orig[i] != NULL);
1461 }
1462
1463 orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1464 if (orig[n - 1] == NULL) {
1465 ret = ENOMEM;
1466 goto done;
1467 }
1468
1469 current = 0;
1470 next = tgts[current];
1471
1472 while (current != n) {
1473 tgts[current] = next;
1474 current = 0;
1475
1476 /*
1477 * Save off the original data that we're going to
1478 * attempt to reconstruct.
1479 */
1480 for (i = 0; i < n; i++) {
1481 ASSERT(orig[i] != NULL);
1482 c = tgts[i];
1483 ASSERT3S(c, >=, 0);
1484 ASSERT3S(c, <, rm->rm_cols);
1485 rc = &rm->rm_col[c];
1486 bcopy(rc->rc_data, orig[i], rc->rc_size);
1487 }
1488
1489 /*
1490 * Attempt a reconstruction and exit the outer loop on
1491 * success.
1492 */
1493 code = vdev_raidz_reconstruct(rm, tgts, n);
1494 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1495 for (i = 0; i < n; i++) {
1496 c = tgts[i];
1497 rc = &rm->rm_col[c];
1498 ASSERT(rc->rc_error == 0);
1499 rc->rc_error = ECKSUM;
1500 }
1501
1502 ret = code;
1503 goto done;
1504 }
1505
1506 /*
1507 * Restore the original data.
1508 */
1509 for (i = 0; i < n; i++) {
1510 c = tgts[i];
1511 rc = &rm->rm_col[c];
1512 bcopy(orig[i], rc->rc_data, rc->rc_size);
1513 }
1514
1515 do {
1516 /*
1517 * Find the next valid column after the current
1518 * position..
1519 */
1520 for (next = tgts[current] + 1;
1521 next < rm->rm_cols &&
1522 rm->rm_col[next].rc_error != 0; next++)
1523 continue;
1524
1525 ASSERT(next <= tgts[current + 1]);
1526
1527 /*
1528 * If that spot is available, we're done here.
1529 */
1530 if (next != tgts[current + 1])
1531 break;
1532
1533 /*
1534 * Otherwise, find the next valid column after
1535 * the previous position.
1536 */
1537 for (c = tgts[current - 1] + 1;
1538 rm->rm_col[c].rc_error != 0; c++)
1539 continue;
1540
1541 tgts[current] = c;
1542 current++;
1543
1544 } while (current != n);
1545 }
1546 }
1547 n--;
1548 done:
1549 for (i = n - 1; i >= 0; i--) {
1550 free(orig[i]);
1551 }
1552
1553 return (ret);
1554 }
1555
/*
 * Read and, if necessary, reconstruct a block from a RAID-Z vdev.
 *
 * vd     - the RAID-Z (interior) vdev
 * bp     - block pointer carrying the expected checksum
 * data   - destination buffer for the assembled data
 * offset - byte offset of the block on the top-level vdev
 * bytes  - byte length of the block
 *
 * Returns 0 on success, ENOMEM on allocation failure, EIO when more
 * columns failed than parity can cover, or ECKSUM when no reconstruction
 * produced data matching the checksum.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Read data columns always; read parity only if some data
		 * is already known missing.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Re-tally column state; we may come back here after re-reads. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1820