1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/cdefs.h> 27 28 static uint64_t zfs_crc64_table[256]; 29 30 #define ECKSUM 666 31 32 #define ASSERT3S(x, y, z) ((void)0) 33 #define ASSERT3U(x, y, z) ((void)0) 34 #define ASSERT3P(x, y, z) ((void)0) 35 #define ASSERT0(x) ((void)0) 36 #define ASSERT(x) ((void)0) 37 38 #define panic(...) do { \ 39 printf(__VA_ARGS__); \ 40 for (;;) ; \ 41 } while (0) 42 43 #define kmem_alloc(size, flag) zfs_alloc((size)) 44 #define kmem_free(ptr, size) zfs_free((ptr), (size)) 45 46 static void 47 zfs_init_crc(void) 48 { 49 int i, j; 50 uint64_t *ct; 51 52 /* 53 * Calculate the crc64 table (used for the zap hash 54 * function). 
55 */ 56 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 57 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); 58 for (i = 0; i < 256; i++) 59 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 60 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 61 } 62 } 63 64 static void 65 zio_checksum_off(const void *buf, uint64_t size, 66 const void *ctx_template, zio_cksum_t *zcp) 67 { 68 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 69 } 70 71 /* 72 * Signature for checksum functions. 73 */ 74 typedef void zio_checksum_t(const void *data, uint64_t size, 75 const void *ctx_template, zio_cksum_t *zcp); 76 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); 77 typedef void zio_checksum_tmpl_free_t(void *ctx_template); 78 79 typedef enum zio_checksum_flags { 80 /* Strong enough for metadata? */ 81 ZCHECKSUM_FLAG_METADATA = (1 << 1), 82 /* ZIO embedded checksum */ 83 ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), 84 /* Strong enough for dedup (without verification)? */ 85 ZCHECKSUM_FLAG_DEDUP = (1 << 3), 86 /* Uses salt value */ 87 ZCHECKSUM_FLAG_SALTED = (1 << 4), 88 /* Strong enough for nopwrite? */ 89 ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) 90 } zio_checksum_flags_t; 91 92 /* 93 * Information about each checksum function. 
94 */ 95 typedef struct zio_checksum_info { 96 /* checksum function for each byteorder */ 97 zio_checksum_t *ci_func[2]; 98 zio_checksum_tmpl_init_t *ci_tmpl_init; 99 zio_checksum_tmpl_free_t *ci_tmpl_free; 100 zio_checksum_flags_t ci_flags; 101 const char *ci_name; /* descriptive name */ 102 } zio_checksum_info_t; 103 104 #include "blkptr.c" 105 106 #include "fletcher.c" 107 #include "sha256.c" 108 #include "skein_zfs.c" 109 #include "edonr_zfs.c" 110 111 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 112 {{NULL, NULL}, NULL, NULL, 0, "inherit"}, 113 {{NULL, NULL}, NULL, NULL, 0, "on"}, 114 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"}, 115 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 116 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, 117 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 118 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, 119 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 120 ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, 121 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 122 0, "fletcher2"}, 123 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 124 ZCHECKSUM_FLAG_METADATA, "fletcher4"}, 125 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 126 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 127 ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, 128 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 129 ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, 130 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 131 0, "noparity"}, 132 {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, 133 NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 134 ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, 135 /* no skein and edonr for now */ 136 {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, 137 zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, 138 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 139 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, 
"skein"}, 140 {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, 141 zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, 142 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | 143 ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, 144 }; 145 146 /* 147 * Common signature for all zio compress/decompress functions. 148 */ 149 typedef size_t zio_compress_func_t(void *src, void *dst, 150 size_t s_len, size_t d_len, int); 151 typedef int zio_decompress_func_t(void *src, void *dst, 152 size_t s_len, size_t d_len, int); 153 154 extern int gzip_decompress(void *src, void *dst, 155 size_t s_len, size_t d_len, int); 156 /* 157 * Information about each compression function. 158 */ 159 typedef struct zio_compress_info { 160 zio_compress_func_t *ci_compress; /* compression function */ 161 zio_decompress_func_t *ci_decompress; /* decompression function */ 162 int ci_level; /* level parameter */ 163 const char *ci_name; /* algorithm name */ 164 } zio_compress_info_t; 165 166 #include "lzjb.c" 167 #include "zle.c" 168 #include "lz4.c" 169 170 /* 171 * Compression vectors. 
172 */ 173 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 174 {NULL, NULL, 0, "inherit"}, 175 {NULL, NULL, 0, "on"}, 176 {NULL, NULL, 0, "uncompressed"}, 177 {NULL, lzjb_decompress, 0, "lzjb"}, 178 {NULL, NULL, 0, "empty"}, 179 {NULL, gzip_decompress, 1, "gzip-1"}, 180 {NULL, gzip_decompress, 2, "gzip-2"}, 181 {NULL, gzip_decompress, 3, "gzip-3"}, 182 {NULL, gzip_decompress, 4, "gzip-4"}, 183 {NULL, gzip_decompress, 5, "gzip-5"}, 184 {NULL, gzip_decompress, 6, "gzip-6"}, 185 {NULL, gzip_decompress, 7, "gzip-7"}, 186 {NULL, gzip_decompress, 8, "gzip-8"}, 187 {NULL, gzip_decompress, 9, "gzip-9"}, 188 {NULL, zle_decompress, 64, "zle"}, 189 {NULL, lz4_decompress, 0, "lz4"}, 190 }; 191 192 static void 193 byteswap_uint64_array(void *vbuf, size_t size) 194 { 195 uint64_t *buf = vbuf; 196 size_t count = size >> 3; 197 int i; 198 199 ASSERT((size & 7) == 0); 200 201 for (i = 0; i < count; i++) 202 buf[i] = BSWAP_64(buf[i]); 203 } 204 205 /* 206 * Set the external verifier for a gang block based on <vdev, offset, txg>, 207 * a tuple which is guaranteed to be unique for the life of the pool. 208 */ 209 static void 210 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 211 { 212 const dva_t *dva = BP_IDENTITY(bp); 213 uint64_t txg = BP_PHYSICAL_BIRTH(bp); 214 215 ASSERT(BP_IS_GANG(bp)); 216 217 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 218 } 219 220 /* 221 * Set the external verifier for a label block based on its offset. 222 * The vdev is implicit, and the txg is unknowable at pool open time -- 223 * hence the logic in vdev_uberblock_load() to find the most recent copy. 224 */ 225 static void 226 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 227 { 228 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 229 } 230 231 /* 232 * Calls the template init function of a checksum which supports context 233 * templates and installs the template into the spa_t. 
234 */ 235 static void 236 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) 237 { 238 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 239 240 if (ci->ci_tmpl_init == NULL) 241 return; 242 243 if (spa->spa_cksum_tmpls[checksum] != NULL) 244 return; 245 246 if (spa->spa_cksum_tmpls[checksum] == NULL) { 247 spa->spa_cksum_tmpls[checksum] = 248 ci->ci_tmpl_init(&spa->spa_cksum_salt); 249 } 250 } 251 252 /* 253 * Called by a spa_t that's about to be deallocated. This steps through 254 * all of the checksum context templates and deallocates any that were 255 * initialized using the algorithm-specific template init function. 256 */ 257 void 258 zio_checksum_templates_free(spa_t *spa) 259 { 260 for (enum zio_checksum checksum = 0; 261 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { 262 if (spa->spa_cksum_tmpls[checksum] != NULL) { 263 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 264 265 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); 266 spa->spa_cksum_tmpls[checksum] = NULL; 267 } 268 } 269 } 270 271 static int 272 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) 273 { 274 uint64_t size; 275 unsigned int checksum; 276 zio_checksum_info_t *ci; 277 void *ctx = NULL; 278 zio_cksum_t actual_cksum, expected_cksum, verifier; 279 int byteswap; 280 281 checksum = BP_GET_CHECKSUM(bp); 282 size = BP_GET_PSIZE(bp); 283 284 if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 285 return (EINVAL); 286 ci = &zio_checksum_table[checksum]; 287 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 288 return (EINVAL); 289 290 if (spa != NULL) { 291 zio_checksum_template_init(checksum, (spa_t *) spa); 292 ctx = spa->spa_cksum_tmpls[checksum]; 293 } 294 295 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 296 zio_eck_t *eck; 297 298 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 299 checksum == ZIO_CHECKSUM_LABEL); 300 301 eck = (zio_eck_t *)((char *)data + size) - 1; 302 303 if (checksum == ZIO_CHECKSUM_GANG_HEADER) 304 
zio_checksum_gang_verifier(&verifier, bp); 305 else if (checksum == ZIO_CHECKSUM_LABEL) 306 zio_checksum_label_verifier(&verifier, 307 DVA_GET_OFFSET(BP_IDENTITY(bp))); 308 else 309 verifier = bp->blk_cksum; 310 311 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 312 313 if (byteswap) 314 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 315 316 expected_cksum = eck->zec_cksum; 317 eck->zec_cksum = verifier; 318 ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 319 eck->zec_cksum = expected_cksum; 320 321 if (byteswap) 322 byteswap_uint64_array(&expected_cksum, 323 sizeof (zio_cksum_t)); 324 } else { 325 expected_cksum = bp->blk_cksum; 326 ci->ci_func[0](data, size, ctx, &actual_cksum); 327 } 328 329 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 330 /* printf("ZFS: read checksum %s failed\n", ci->ci_name); */ 331 return (EIO); 332 } 333 334 return (0); 335 } 336 337 static int 338 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 339 void *dest, uint64_t destsize) 340 { 341 zio_compress_info_t *ci; 342 343 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 344 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 345 return (EIO); 346 } 347 348 ci = &zio_compress_table[cpfunc]; 349 if (!ci->ci_decompress) { 350 printf("ZFS: unsupported compression algorithm %s\n", 351 ci->ci_name); 352 return (EIO); 353 } 354 355 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 356 } 357 358 static uint64_t 359 zap_hash(uint64_t salt, const char *name) 360 { 361 const uint8_t *cp; 362 uint8_t c; 363 uint64_t crc = salt; 364 365 ASSERT(crc != 0); 366 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 367 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 368 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 369 370 /* 371 * Only use 28 bits, since we need 4 bits in the cookie for the 372 * collision differentiator. 
We MUST use the high bits, since 373 * those are the onces that we first pay attention to when 374 * chosing the bucket. 375 */ 376 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 377 378 return (crc); 379 } 380 381 static void *zfs_alloc(size_t size); 382 static void zfs_free(void *ptr, size_t size); 383 384 typedef struct raidz_col { 385 uint64_t rc_devidx; /* child device index for I/O */ 386 uint64_t rc_offset; /* device offset */ 387 uint64_t rc_size; /* I/O size */ 388 void *rc_data; /* I/O data */ 389 int rc_error; /* I/O error for this device */ 390 uint8_t rc_tried; /* Did we attempt this I/O column? */ 391 uint8_t rc_skipped; /* Did we skip this I/O column? */ 392 } raidz_col_t; 393 394 typedef struct raidz_map { 395 uint64_t rm_cols; /* Regular column count */ 396 uint64_t rm_scols; /* Count including skipped columns */ 397 uint64_t rm_bigcols; /* Number of oversized columns */ 398 uint64_t rm_asize; /* Actual total I/O size */ 399 uint64_t rm_missingdata; /* Count of missing data devices */ 400 uint64_t rm_missingparity; /* Count of missing parity devices */ 401 uint64_t rm_firstdatacol; /* First data column/parity count */ 402 uint64_t rm_nskip; /* Skipped sectors for padding */ 403 uint64_t rm_skipstart; /* Column index of padding start */ 404 uintptr_t rm_reports; /* # of referencing checksum reports */ 405 uint8_t rm_freed; /* map no longer has referencing ZIO */ 406 uint8_t rm_ecksuminjected; /* checksum error was injected */ 407 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 408 } raidz_map_t; 409 410 #define VDEV_RAIDZ_P 0 411 #define VDEV_RAIDZ_Q 1 412 #define VDEV_RAIDZ_R 2 413 414 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 415 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 416 417 /* 418 * We provide a mechanism to perform the field multiplication operation on a 419 * 64-bit value all at once rather than a byte at a time. 
This works by 420 * creating a mask from the top bit in each byte and using that to 421 * conditionally apply the XOR of 0x1d. 422 */ 423 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 424 { \ 425 (mask) = (x) & 0x8080808080808080ULL; \ 426 (mask) = ((mask) << 1) - ((mask) >> 7); \ 427 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 428 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 429 } 430 431 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 432 { \ 433 VDEV_RAIDZ_64MUL_2((x), mask); \ 434 VDEV_RAIDZ_64MUL_2((x), mask); \ 435 } 436 437 /* 438 * These two tables represent powers and logs of 2 in the Galois field defined 439 * above. These values were computed by repeatedly multiplying by 2 as above. 440 */ 441 static const uint8_t vdev_raidz_pow2[256] = { 442 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 443 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 444 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 445 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 446 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 447 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 448 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 449 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 450 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 451 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 452 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 453 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 454 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 455 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 456 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 457 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 458 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 459 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 460 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 461 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 462 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 463 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 464 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 465 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 466 0x82, 0x19, 
0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 467 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 468 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 469 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 470 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 471 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 472 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 473 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 474 }; 475 static const uint8_t vdev_raidz_log2[256] = { 476 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 477 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 478 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 479 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 480 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 481 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 482 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 483 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 484 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 485 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 486 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 487 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 488 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 489 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 490 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 491 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 492 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 493 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 494 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 495 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 496 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 497 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 498 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 499 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 500 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 501 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 502 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 503 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 504 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 505 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 
0x75, 0x2c, 0xd7, 506 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 507 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 508 }; 509 510 /* 511 * Multiply a given number by 2 raised to the given power. 512 */ 513 static uint8_t 514 vdev_raidz_exp2(uint8_t a, int exp) 515 { 516 if (a == 0) 517 return (0); 518 519 ASSERT(exp >= 0); 520 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 521 522 exp += vdev_raidz_log2[a]; 523 if (exp > 255) 524 exp -= 255; 525 526 return (vdev_raidz_pow2[exp]); 527 } 528 529 static void 530 vdev_raidz_generate_parity_p(raidz_map_t *rm) 531 { 532 uint64_t *p, *src, pcount __attribute__((unused)), ccount, i; 533 int c; 534 535 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 536 537 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 538 src = rm->rm_col[c].rc_data; 539 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 540 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 541 542 if (c == rm->rm_firstdatacol) { 543 ASSERT(ccount == pcount); 544 for (i = 0; i < ccount; i++, src++, p++) { 545 *p = *src; 546 } 547 } else { 548 ASSERT(ccount <= pcount); 549 for (i = 0; i < ccount; i++, src++, p++) { 550 *p ^= *src; 551 } 552 } 553 } 554 } 555 556 static void 557 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 558 { 559 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 560 int c; 561 562 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 563 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 564 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 565 566 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 567 src = rm->rm_col[c].rc_data; 568 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 569 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 570 571 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 572 573 if (c == rm->rm_firstdatacol) { 574 ASSERT(ccnt == pcnt || ccnt == 0); 575 for (i = 0; i < ccnt; i++, src++, p++, q++) { 576 *p = *src; 577 *q = *src; 578 } 579 for (; i < pcnt; i++, src++, p++, q++) { 580 *p = 0; 581 *q = 0; 582 } 583 } else { 584 ASSERT(ccnt <= pcnt); 585 586 /* 587 * Apply 
the algorithm described above by multiplying 588 * the previous result and adding in the new value. 589 */ 590 for (i = 0; i < ccnt; i++, src++, p++, q++) { 591 *p ^= *src; 592 593 VDEV_RAIDZ_64MUL_2(*q, mask); 594 *q ^= *src; 595 } 596 597 /* 598 * Treat short columns as though they are full of 0s. 599 * Note that there's therefore nothing needed for P. 600 */ 601 for (; i < pcnt; i++, q++) { 602 VDEV_RAIDZ_64MUL_2(*q, mask); 603 } 604 } 605 } 606 } 607 608 static void 609 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 610 { 611 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 612 int c; 613 614 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 615 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 616 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 617 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 618 rm->rm_col[VDEV_RAIDZ_R].rc_size); 619 620 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 621 src = rm->rm_col[c].rc_data; 622 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 623 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 624 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 625 626 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 627 628 if (c == rm->rm_firstdatacol) { 629 ASSERT(ccnt == pcnt || ccnt == 0); 630 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 631 *p = *src; 632 *q = *src; 633 *r = *src; 634 } 635 for (; i < pcnt; i++, src++, p++, q++, r++) { 636 *p = 0; 637 *q = 0; 638 *r = 0; 639 } 640 } else { 641 ASSERT(ccnt <= pcnt); 642 643 /* 644 * Apply the algorithm described above by multiplying 645 * the previous result and adding in the new value. 646 */ 647 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 648 *p ^= *src; 649 650 VDEV_RAIDZ_64MUL_2(*q, mask); 651 *q ^= *src; 652 653 VDEV_RAIDZ_64MUL_4(*r, mask); 654 *r ^= *src; 655 } 656 657 /* 658 * Treat short columns as though they are full of 0s. 659 * Note that there's therefore nothing needed for P. 
660 */ 661 for (; i < pcnt; i++, q++, r++) { 662 VDEV_RAIDZ_64MUL_2(*q, mask); 663 VDEV_RAIDZ_64MUL_4(*r, mask); 664 } 665 } 666 } 667 } 668 669 /* 670 * Generate RAID parity in the first virtual columns according to the number of 671 * parity columns available. 672 */ 673 static void 674 vdev_raidz_generate_parity(raidz_map_t *rm) 675 { 676 switch (rm->rm_firstdatacol) { 677 case 1: 678 vdev_raidz_generate_parity_p(rm); 679 break; 680 case 2: 681 vdev_raidz_generate_parity_pq(rm); 682 break; 683 case 3: 684 vdev_raidz_generate_parity_pqr(rm); 685 break; 686 default: 687 panic("invalid RAID-Z configuration"); 688 } 689 } 690 691 /* BEGIN CSTYLED */ 692 /* 693 * In the general case of reconstruction, we must solve the system of linear 694 * equations defined by the coeffecients used to generate parity as well as 695 * the contents of the data and parity disks. This can be expressed with 696 * vectors for the original data (D) and the actual data (d) and parity (p) 697 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 698 * 699 * __ __ __ __ 700 * | | __ __ | p_0 | 701 * | V | | D_0 | | p_m-1 | 702 * | | x | : | = | d_0 | 703 * | I | | D_n-1 | | : | 704 * | | ~~ ~~ | d_n-1 | 705 * ~~ ~~ ~~ ~~ 706 * 707 * I is simply a square identity matrix of size n, and V is a vandermonde 708 * matrix defined by the coeffecients we chose for the various parity columns 709 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 710 * computation as well as linear separability. 711 * 712 * __ __ __ __ 713 * | 1 .. 1 1 1 | | p_0 | 714 * | 2^n-1 .. 4 2 1 | __ __ | : | 715 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 716 * | 1 .. 0 0 0 | | D_1 | | d_0 | 717 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 718 * | : : : : | | : | | d_2 | 719 * | 0 .. 1 0 0 | | D_n-1 | | : | 720 * | 0 .. 0 1 0 | ~~ ~~ | : | 721 * | 0 .. 0 0 1 | | d_n-1 | 722 * ~~ ~~ ~~ ~~ 723 * 724 * Note that I, V, d, and p are known. 
To compute D, we must invert the 725 * matrix and use the known data and parity values to reconstruct the unknown 726 * data values. We begin by removing the rows in V|I and d|p that correspond 727 * to failed or missing columns; we then make V|I square (n x n) and d|p 728 * sized n by removing rows corresponding to unused parity from the bottom up 729 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 730 * using Gauss-Jordan elimination. In the example below we use m=3 parity 731 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 732 * __ __ 733 * | 1 1 1 1 1 1 1 1 | 734 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 735 * | 19 205 116 29 64 16 4 1 | / / 736 * | 1 0 0 0 0 0 0 0 | / / 737 * | 0 1 0 0 0 0 0 0 | <--' / 738 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 739 * | 0 0 0 1 0 0 0 0 | 740 * | 0 0 0 0 1 0 0 0 | 741 * | 0 0 0 0 0 1 0 0 | 742 * | 0 0 0 0 0 0 1 0 | 743 * | 0 0 0 0 0 0 0 1 | 744 * ~~ ~~ 745 * __ __ 746 * | 1 1 1 1 1 1 1 1 | 747 * | 128 64 32 16 8 4 2 1 | 748 * | 19 205 116 29 64 16 4 1 | 749 * | 1 0 0 0 0 0 0 0 | 750 * | 0 1 0 0 0 0 0 0 | 751 * (V|I)' = | 0 0 1 0 0 0 0 0 | 752 * | 0 0 0 1 0 0 0 0 | 753 * | 0 0 0 0 1 0 0 0 | 754 * | 0 0 0 0 0 1 0 0 | 755 * | 0 0 0 0 0 0 1 0 | 756 * | 0 0 0 0 0 0 0 1 | 757 * ~~ ~~ 758 * 759 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 760 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 761 * matrix is not singular. 
762 * __ __ 763 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 764 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 765 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 766 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 767 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 768 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 769 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 770 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 771 * ~~ ~~ 772 * __ __ 773 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 774 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 775 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 776 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 777 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 778 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 779 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 780 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 781 * ~~ ~~ 782 * __ __ 783 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 784 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 785 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 786 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 787 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 788 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 789 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 790 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 791 * ~~ ~~ 792 * __ __ 793 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 794 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 795 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 796 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 797 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 798 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 799 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 800 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 801 * ~~ ~~ 802 * __ __ 803 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 804 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 805 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 806 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 807 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 808 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 809 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 810 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 811 * ~~ ~~ 812 * __ __ 813 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 814 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 815 * | 0 0 1 0 0 0 0 0 166 100 4 40 
158 168 216 209 | 816 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 817 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 818 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 819 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 820 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 821 * ~~ ~~ 822 * __ __ 823 * | 0 0 1 0 0 0 0 0 | 824 * | 167 100 5 41 159 169 217 208 | 825 * | 166 100 4 40 158 168 216 209 | 826 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 827 * | 0 0 0 0 1 0 0 0 | 828 * | 0 0 0 0 0 1 0 0 | 829 * | 0 0 0 0 0 0 1 0 | 830 * | 0 0 0 0 0 0 0 1 | 831 * ~~ ~~ 832 * 833 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 834 * of the missing data. 835 * 836 * As is apparent from the example above, the only non-trivial rows in the 837 * inverse matrix correspond to the data disks that we're trying to 838 * reconstruct. Indeed, those are the only rows we need as the others would 839 * only be useful for reconstructing data known or assumed to be valid. For 840 * that reason, we only build the coefficients in the rows that correspond to 841 * targeted columns. 842 */ 843 /* END CSTYLED */ 844 845 static void 846 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 847 uint8_t **rows) 848 { 849 int i, j; 850 int pow; 851 852 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 853 854 /* 855 * Fill in the missing rows of interest. 
856 */ 857 for (i = 0; i < nmap; i++) { 858 ASSERT3S(0, <=, map[i]); 859 ASSERT3S(map[i], <=, 2); 860 861 pow = map[i] * n; 862 if (pow > 255) 863 pow -= 255; 864 ASSERT(pow <= 255); 865 866 for (j = 0; j < n; j++) { 867 pow -= map[i]; 868 if (pow < 0) 869 pow += 255; 870 rows[i][j] = vdev_raidz_pow2[pow]; 871 } 872 } 873 } 874 875 static void 876 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 877 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 878 { 879 int i, j, ii, jj; 880 uint8_t log; 881 882 /* 883 * Assert that the first nmissing entries from the array of used 884 * columns correspond to parity columns and that subsequent entries 885 * correspond to data columns. 886 */ 887 for (i = 0; i < nmissing; i++) { 888 ASSERT3S(used[i], <, rm->rm_firstdatacol); 889 } 890 for (; i < n; i++) { 891 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 892 } 893 894 /* 895 * First initialize the storage where we'll compute the inverse rows. 896 */ 897 for (i = 0; i < nmissing; i++) { 898 for (j = 0; j < n; j++) { 899 invrows[i][j] = (i == j) ? 1 : 0; 900 } 901 } 902 903 /* 904 * Subtract all trivial rows from the rows of consequence. 905 */ 906 for (i = 0; i < nmissing; i++) { 907 for (j = nmissing; j < n; j++) { 908 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 909 jj = used[j] - rm->rm_firstdatacol; 910 ASSERT3S(jj, <, n); 911 invrows[i][j] = rows[i][jj]; 912 rows[i][jj] = 0; 913 } 914 } 915 916 /* 917 * For each of the rows of interest, we must normalize it and subtract 918 * a multiple of it from the other rows. 919 */ 920 for (i = 0; i < nmissing; i++) { 921 for (j = 0; j < missing[i]; j++) { 922 ASSERT3U(rows[i][j], ==, 0); 923 } 924 ASSERT3U(rows[i][missing[i]], !=, 0); 925 926 /* 927 * Compute the inverse of the first element and multiply each 928 * element in the row by that value. 
	 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Subtract (XOR in GF(2^8)) this row from every other row to
		 * zero out its pivot column elsewhere in the matrix.
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

/*
 * Regenerate the 'nmissing' missing data columns listed in 'missing' by
 * multiplying the surviving columns (indexed by 'used') through the
 * inverted matrix rows 'invrows'.  All arithmetic is in GF(2^8) using the
 * log/exp tables; logs of the inverse-matrix entries are precomputed into
 * a scratch buffer to avoid a table lookup in the inner byte loop.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One row of n log values per missing column. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	/* Carve the scratch buffer into per-row pointers. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute logs of the inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/*
	 * Accumulate each surviving column into every destination column:
	 * dst[cc] ^= invrows[cc][i] * src (GF multiply via log/exp).
	 */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Short column: nothing to write here. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* GF multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First contribution assigns; rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

/*
 * General matrix-based reconstruction: rebuild the data columns listed in
 * 'tgts' (ntgts entries, sorted) using whichever parity columns were not
 * themselves targeted.  Returns a bitmask ('code') of the parity columns
 * used, which callers use to decide whether untouched parity still needs
 * verification.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Single allocation holds the rows, the inverse rows, and the
	 * 'used' column-index array, carved up below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first entries of 'used' are the parity columns chosen above. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* The rest are the surviving (non-missing) data columns. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

/*
 * Build the complete, sorted target list -- the caller-requested columns in
 * 't' plus every column that already recorded an error -- and hand it to
 * the general reconstruction routine.  Returns its nonzero parity-usage
 * code.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

/*
 * Lay out a logical I/O of 'size' bytes at 'offset' across the RAID-Z
 * children: compute each column's device index, device offset, and size,
 * point the data columns into 'data', and allocate scratch buffers for the
 * parity columns.  'unit_shift' is the device ashift, 'dcols' the total
 * child count, 'nparity' the parity level (1-3).
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* Full rows per column, and the remainder spread over 'bc' cols. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap to the next row once we pass the last child. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get private buffers; data columns alias 'data'. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

/*
 * Release a map from vdev_raidz_map_alloc(): free the parity buffers
 * (the data columns point into caller-owned memory) and the map itself.
 */
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

/*
 * Look up the child vdev of 'pvd' with id 'devidx'.  Returns NULL if no
 * child matches (STAILQ_FOREACH leaves cvd NULL at list end).
 */
static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx) {
			break;
		}
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 *
 * NOTE(review): in this boot-code version the error-injection bookkeeping
 * is gone and 'size' is unused; this is a thin wrapper over
 * zio_checksum_verify().
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{

	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save the parity we actually read before regenerating over it. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	/* Compare regenerated parity against what was read. */
	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];	/* leaves room for tgts[-1] below */
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try 1 target, then 2, ... up to the parity budget remaining. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every n-combination of error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				/* Mark the columns we rewrote as bad. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	/* Free the per-target scratch buffers allocated above. */
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Read and, if necessary, reconstruct a RAID-Z block for the boot code.
 * Maps the request across the children, reads data (and parity on
 * error), then walks the three recovery phases described below.
 * Returns 0 on success, EIO/ECKSUM on unrecoverable failure.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/* Read data columns always; parity only if data is missing. */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally per-category error counts for the phase decisions below. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}