1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/cdefs.h> 27 #include <lz4.h> 28 29 static uint64_t zfs_crc64_table[256]; 30 31 #define ECKSUM 666 32 33 #define ASSERT3S(x, y, z) ((void)0) 34 #define ASSERT3U(x, y, z) ((void)0) 35 #define ASSERT3P(x, y, z) ((void)0) 36 #define ASSERT0(x) ((void)0) 37 #define ASSERT(x) ((void)0) 38 39 #define kmem_alloc(size, flag) zfs_alloc((size)) 40 #define kmem_free(ptr, size) zfs_free((ptr), (size)) 41 42 static void 43 zfs_init_crc(void) 44 { 45 int i, j; 46 uint64_t *ct; 47 48 /* 49 * Calculate the crc64 table (used for the zap hash 50 * function). 
51 */ 52 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 53 memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table)); 54 for (i = 0; i < 256; i++) { 55 ct = zfs_crc64_table + i; 56 for (*ct = i, j = 8; j > 0; j--) 57 *ct = (*ct >> 1) ^ 58 (-(*ct & 1) & ZFS_CRC64_POLY); 59 } 60 } 61 } 62 63 static void 64 zio_checksum_off(const void *buf __unused, uint64_t size __unused, 65 const void *ctx_template __unused, zio_cksum_t *zcp) 66 { 67 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 68 } 69 70 /* 71 * Signature for checksum functions. 72 */ 73 typedef void zio_checksum_t(const void *data, uint64_t size, 74 const void *ctx_template, zio_cksum_t *zcp); 75 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); 76 typedef void zio_checksum_tmpl_free_t(void *ctx_template); 77 78 typedef enum zio_checksum_flags { 79 /* Strong enough for metadata? */ 80 ZCHECKSUM_FLAG_METADATA = (1 << 1), 81 /* ZIO embedded checksum */ 82 ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), 83 /* Strong enough for dedup (without verification)? */ 84 ZCHECKSUM_FLAG_DEDUP = (1 << 3), 85 /* Uses salt value */ 86 ZCHECKSUM_FLAG_SALTED = (1 << 4), 87 /* Strong enough for nopwrite? */ 88 ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) 89 } zio_checksum_flags_t; 90 91 /* 92 * Information about each checksum function. 
93 */ 94 typedef struct zio_checksum_info { 95 /* checksum function for each byteorder */ 96 zio_checksum_t *ci_func[2]; 97 zio_checksum_tmpl_init_t *ci_tmpl_init; 98 zio_checksum_tmpl_free_t *ci_tmpl_free; 99 zio_checksum_flags_t ci_flags; 100 const char *ci_name; /* descriptive name */ 101 } zio_checksum_info_t; 102 103 #include "blkptr.c" 104 105 #include "fletcher.c" 106 #include "sha256.c" 107 #include "skein_zfs.c" 108 #include "edonr_zfs.c" 109 110 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 111 {{NULL, NULL}, NULL, NULL, 0, "inherit"}, 112 {{NULL, NULL}, NULL, NULL, 0, "on"}, 113 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"}, 114 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 115 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, 116 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 117 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, 118 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 119 ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, 120 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 121 0, "fletcher2"}, 122 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 123 ZCHECKSUM_FLAG_METADATA, "fletcher4"}, 124 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 125 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 126 ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, 127 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 128 ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, 129 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 130 0, "noparity"}, 131 {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, 132 NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 133 ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, 134 /* no skein and edonr for now */ 135 {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, 136 zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, 137 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 138 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, 
"skein"}, 139 {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, 140 zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, 141 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | 142 ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, 143 }; 144 145 /* 146 * Common signature for all zio compress/decompress functions. 147 */ 148 typedef size_t zio_compress_func_t(void *src, void *dst, 149 size_t s_len, size_t d_len, int); 150 typedef int zio_decompress_func_t(void *src, void *dst, 151 size_t s_len, size_t d_len, int); 152 153 extern int gzip_decompress(void *src, void *dst, 154 size_t s_len, size_t d_len, int); 155 /* 156 * Information about each compression function. 157 */ 158 typedef struct zio_compress_info { 159 zio_compress_func_t *ci_compress; /* compression function */ 160 zio_decompress_func_t *ci_decompress; /* decompression function */ 161 int ci_level; /* level parameter */ 162 const char *ci_name; /* algorithm name */ 163 } zio_compress_info_t; 164 165 #include "lzjb.c" 166 #include "zle.c" 167 168 /* 169 * Compression vectors. 
170 */ 171 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 172 {NULL, NULL, 0, "inherit"}, 173 {NULL, NULL, 0, "on"}, 174 {NULL, NULL, 0, "uncompressed"}, 175 {NULL, lzjb_decompress, 0, "lzjb"}, 176 {NULL, NULL, 0, "empty"}, 177 {NULL, gzip_decompress, 1, "gzip-1"}, 178 {NULL, gzip_decompress, 2, "gzip-2"}, 179 {NULL, gzip_decompress, 3, "gzip-3"}, 180 {NULL, gzip_decompress, 4, "gzip-4"}, 181 {NULL, gzip_decompress, 5, "gzip-5"}, 182 {NULL, gzip_decompress, 6, "gzip-6"}, 183 {NULL, gzip_decompress, 7, "gzip-7"}, 184 {NULL, gzip_decompress, 8, "gzip-8"}, 185 {NULL, gzip_decompress, 9, "gzip-9"}, 186 {NULL, zle_decompress, 64, "zle"}, 187 {NULL, lz4_decompress, 0, "lz4"}, 188 }; 189 190 static void 191 byteswap_uint64_array(void *vbuf, size_t size) 192 { 193 uint64_t *buf = vbuf; 194 size_t count = size >> 3; 195 int i; 196 197 ASSERT((size & 7) == 0); 198 199 for (i = 0; i < count; i++) 200 buf[i] = BSWAP_64(buf[i]); 201 } 202 203 /* 204 * Set the external verifier for a gang block based on <vdev, offset, txg>, 205 * a tuple which is guaranteed to be unique for the life of the pool. 206 */ 207 static void 208 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 209 { 210 const dva_t *dva = BP_IDENTITY(bp); 211 uint64_t txg = BP_PHYSICAL_BIRTH(bp); 212 213 ASSERT(BP_IS_GANG(bp)); 214 215 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 216 } 217 218 /* 219 * Set the external verifier for a label block based on its offset. 220 * The vdev is implicit, and the txg is unknowable at pool open time -- 221 * hence the logic in vdev_uberblock_load() to find the most recent copy. 222 */ 223 static void 224 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 225 { 226 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 227 } 228 229 /* 230 * Calls the template init function of a checksum which supports context 231 * templates and installs the template into the spa_t. 
232 */ 233 static void 234 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) 235 { 236 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 237 238 if (ci->ci_tmpl_init == NULL) 239 return; 240 241 if (spa->spa_cksum_tmpls[checksum] != NULL) 242 return; 243 244 if (spa->spa_cksum_tmpls[checksum] == NULL) { 245 spa->spa_cksum_tmpls[checksum] = 246 ci->ci_tmpl_init(&spa->spa_cksum_salt); 247 } 248 } 249 250 /* 251 * Called by a spa_t that's about to be deallocated. This steps through 252 * all of the checksum context templates and deallocates any that were 253 * initialized using the algorithm-specific template init function. 254 */ 255 void 256 zio_checksum_templates_free(spa_t *spa) 257 { 258 for (enum zio_checksum checksum = 0; 259 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { 260 if (spa->spa_cksum_tmpls[checksum] != NULL) { 261 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 262 263 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); 264 spa->spa_cksum_tmpls[checksum] = NULL; 265 } 266 } 267 } 268 269 static int 270 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) 271 { 272 uint64_t size; 273 unsigned int checksum; 274 zio_checksum_info_t *ci; 275 void *ctx = NULL; 276 zio_cksum_t actual_cksum, expected_cksum, verifier; 277 int byteswap; 278 279 checksum = BP_GET_CHECKSUM(bp); 280 size = BP_GET_PSIZE(bp); 281 282 if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 283 return (EINVAL); 284 ci = &zio_checksum_table[checksum]; 285 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 286 return (EINVAL); 287 288 if (spa != NULL) { 289 zio_checksum_template_init(checksum, (spa_t *)spa); 290 ctx = spa->spa_cksum_tmpls[checksum]; 291 } 292 293 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 294 zio_eck_t *eck; 295 296 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 297 checksum == ZIO_CHECKSUM_LABEL); 298 299 eck = (zio_eck_t *)((char *)data + size) - 1; 300 301 if (checksum == ZIO_CHECKSUM_GANG_HEADER) 302 
zio_checksum_gang_verifier(&verifier, bp); 303 else if (checksum == ZIO_CHECKSUM_LABEL) 304 zio_checksum_label_verifier(&verifier, 305 DVA_GET_OFFSET(BP_IDENTITY(bp))); 306 else 307 verifier = bp->blk_cksum; 308 309 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 310 311 if (byteswap) 312 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 313 314 expected_cksum = eck->zec_cksum; 315 eck->zec_cksum = verifier; 316 ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 317 eck->zec_cksum = expected_cksum; 318 319 if (byteswap) 320 byteswap_uint64_array(&expected_cksum, 321 sizeof (zio_cksum_t)); 322 } else { 323 expected_cksum = bp->blk_cksum; 324 ci->ci_func[0](data, size, ctx, &actual_cksum); 325 } 326 327 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 328 /* printf("ZFS: read checksum %s failed\n", ci->ci_name); */ 329 return (EIO); 330 } 331 332 return (0); 333 } 334 335 static int 336 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 337 void *dest, uint64_t destsize) 338 { 339 zio_compress_info_t *ci; 340 341 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 342 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 343 return (EIO); 344 } 345 346 ci = &zio_compress_table[cpfunc]; 347 if (!ci->ci_decompress) { 348 printf("ZFS: unsupported compression algorithm %s\n", 349 ci->ci_name); 350 return (EIO); 351 } 352 353 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 354 } 355 356 static uint64_t 357 zap_hash(uint64_t salt, const char *name) 358 { 359 const uint8_t *cp; 360 uint8_t c; 361 uint64_t crc = salt; 362 363 ASSERT(crc != 0); 364 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 365 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 366 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 367 368 /* 369 * Only use 28 bits, since we need 4 bits in the cookie for the 370 * collision differentiator. 
We MUST use the high bits, since 371 * those are the onces that we first pay attention to when 372 * chosing the bucket. 373 */ 374 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 375 376 return (crc); 377 } 378 379 static void *zfs_alloc(size_t size); 380 static void zfs_free(void *ptr, size_t size); 381 382 typedef struct raidz_col { 383 uint64_t rc_devidx; /* child device index for I/O */ 384 uint64_t rc_offset; /* device offset */ 385 uint64_t rc_size; /* I/O size */ 386 void *rc_data; /* I/O data */ 387 int rc_error; /* I/O error for this device */ 388 uint8_t rc_tried; /* Did we attempt this I/O column? */ 389 uint8_t rc_skipped; /* Did we skip this I/O column? */ 390 } raidz_col_t; 391 392 typedef struct raidz_map { 393 uint64_t rm_cols; /* Regular column count */ 394 uint64_t rm_scols; /* Count including skipped columns */ 395 uint64_t rm_bigcols; /* Number of oversized columns */ 396 uint64_t rm_asize; /* Actual total I/O size */ 397 uint64_t rm_missingdata; /* Count of missing data devices */ 398 uint64_t rm_missingparity; /* Count of missing parity devices */ 399 uint64_t rm_firstdatacol; /* First data column/parity count */ 400 uint64_t rm_nskip; /* Skipped sectors for padding */ 401 uint64_t rm_skipstart; /* Column index of padding start */ 402 uintptr_t rm_reports; /* # of referencing checksum reports */ 403 uint8_t rm_freed; /* map no longer has referencing ZIO */ 404 uint8_t rm_ecksuminjected; /* checksum error was injected */ 405 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 406 } raidz_map_t; 407 408 #define VDEV_RAIDZ_P 0 409 #define VDEV_RAIDZ_Q 1 410 #define VDEV_RAIDZ_R 2 411 412 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 413 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 414 415 /* 416 * We provide a mechanism to perform the field multiplication operation on a 417 * 64-bit value all at once rather than a byte at a time. 
This works by 418 * creating a mask from the top bit in each byte and using that to 419 * conditionally apply the XOR of 0x1d. 420 */ 421 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 422 { \ 423 (mask) = (x) & 0x8080808080808080ULL; \ 424 (mask) = ((mask) << 1) - ((mask) >> 7); \ 425 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 426 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 427 } 428 429 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 430 { \ 431 VDEV_RAIDZ_64MUL_2((x), mask); \ 432 VDEV_RAIDZ_64MUL_2((x), mask); \ 433 } 434 435 /* 436 * These two tables represent powers and logs of 2 in the Galois field defined 437 * above. These values were computed by repeatedly multiplying by 2 as above. 438 */ 439 static const uint8_t vdev_raidz_pow2[256] = { 440 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 441 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 442 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 443 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 444 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 445 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 446 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 447 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 448 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 449 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 450 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 451 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 452 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 453 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 454 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 455 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 456 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 457 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 458 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 459 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 460 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 461 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 462 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 463 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 464 0x82, 0x19, 
0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 465 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 466 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 467 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 468 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 469 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 470 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 471 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 472 }; 473 static const uint8_t vdev_raidz_log2[256] = { 474 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 475 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 476 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 477 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 478 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 479 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 480 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 481 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 482 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 483 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 484 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 485 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 486 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 487 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 488 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 489 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 490 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 491 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 492 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 493 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 494 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 495 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 496 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 497 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 498 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 499 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 500 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 501 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 502 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 503 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 
0x75, 0x2c, 0xd7, 504 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 505 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 506 }; 507 508 /* 509 * Multiply a given number by 2 raised to the given power. 510 */ 511 static uint8_t 512 vdev_raidz_exp2(uint8_t a, int exp) 513 { 514 if (a == 0) 515 return (0); 516 517 ASSERT(exp >= 0); 518 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 519 520 exp += vdev_raidz_log2[a]; 521 if (exp > 255) 522 exp -= 255; 523 524 return (vdev_raidz_pow2[exp]); 525 } 526 527 static void 528 vdev_raidz_generate_parity_p(raidz_map_t *rm) 529 { 530 uint64_t *p, *src, pcount __attribute__((unused)), ccount, i; 531 int c; 532 533 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 534 535 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 536 src = rm->rm_col[c].rc_data; 537 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 538 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 539 540 if (c == rm->rm_firstdatacol) { 541 ASSERT(ccount == pcount); 542 for (i = 0; i < ccount; i++, src++, p++) { 543 *p = *src; 544 } 545 } else { 546 ASSERT(ccount <= pcount); 547 for (i = 0; i < ccount; i++, src++, p++) { 548 *p ^= *src; 549 } 550 } 551 } 552 } 553 554 static void 555 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 556 { 557 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 558 int c; 559 560 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 561 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 562 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 563 564 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 565 src = rm->rm_col[c].rc_data; 566 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 567 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 568 569 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 570 571 if (c == rm->rm_firstdatacol) { 572 ASSERT(ccnt == pcnt || ccnt == 0); 573 for (i = 0; i < ccnt; i++, src++, p++, q++) { 574 *p = *src; 575 *q = *src; 576 } 577 for (; i < pcnt; i++, src++, p++, q++) { 578 *p = 0; 579 *q = 0; 580 } 581 } else { 582 ASSERT(ccnt <= pcnt); 583 584 /* 585 * Apply 
the algorithm described above by multiplying 586 * the previous result and adding in the new value. 587 */ 588 for (i = 0; i < ccnt; i++, src++, p++, q++) { 589 *p ^= *src; 590 591 VDEV_RAIDZ_64MUL_2(*q, mask); 592 *q ^= *src; 593 } 594 595 /* 596 * Treat short columns as though they are full of 0s. 597 * Note that there's therefore nothing needed for P. 598 */ 599 for (; i < pcnt; i++, q++) { 600 VDEV_RAIDZ_64MUL_2(*q, mask); 601 } 602 } 603 } 604 } 605 606 static void 607 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 608 { 609 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 610 int c; 611 612 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 613 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 614 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 615 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 616 rm->rm_col[VDEV_RAIDZ_R].rc_size); 617 618 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 619 src = rm->rm_col[c].rc_data; 620 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 621 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 622 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 623 624 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 625 626 if (c == rm->rm_firstdatacol) { 627 ASSERT(ccnt == pcnt || ccnt == 0); 628 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 629 *p = *src; 630 *q = *src; 631 *r = *src; 632 } 633 for (; i < pcnt; i++, src++, p++, q++, r++) { 634 *p = 0; 635 *q = 0; 636 *r = 0; 637 } 638 } else { 639 ASSERT(ccnt <= pcnt); 640 641 /* 642 * Apply the algorithm described above by multiplying 643 * the previous result and adding in the new value. 644 */ 645 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 646 *p ^= *src; 647 648 VDEV_RAIDZ_64MUL_2(*q, mask); 649 *q ^= *src; 650 651 VDEV_RAIDZ_64MUL_4(*r, mask); 652 *r ^= *src; 653 } 654 655 /* 656 * Treat short columns as though they are full of 0s. 657 * Note that there's therefore nothing needed for P. 
658 */ 659 for (; i < pcnt; i++, q++, r++) { 660 VDEV_RAIDZ_64MUL_2(*q, mask); 661 VDEV_RAIDZ_64MUL_4(*r, mask); 662 } 663 } 664 } 665 } 666 667 /* 668 * Generate RAID parity in the first virtual columns according to the number of 669 * parity columns available. 670 */ 671 static void 672 vdev_raidz_generate_parity(raidz_map_t *rm) 673 { 674 switch (rm->rm_firstdatacol) { 675 case 1: 676 vdev_raidz_generate_parity_p(rm); 677 break; 678 case 2: 679 vdev_raidz_generate_parity_pq(rm); 680 break; 681 case 3: 682 vdev_raidz_generate_parity_pqr(rm); 683 break; 684 default: 685 panic("invalid RAID-Z configuration"); 686 } 687 } 688 689 /* BEGIN CSTYLED */ 690 /* 691 * In the general case of reconstruction, we must solve the system of linear 692 * equations defined by the coeffecients used to generate parity as well as 693 * the contents of the data and parity disks. This can be expressed with 694 * vectors for the original data (D) and the actual data (d) and parity (p) 695 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 696 * 697 * __ __ __ __ 698 * | | __ __ | p_0 | 699 * | V | | D_0 | | p_m-1 | 700 * | | x | : | = | d_0 | 701 * | I | | D_n-1 | | : | 702 * | | ~~ ~~ | d_n-1 | 703 * ~~ ~~ ~~ ~~ 704 * 705 * I is simply a square identity matrix of size n, and V is a vandermonde 706 * matrix defined by the coeffecients we chose for the various parity columns 707 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 708 * computation as well as linear separability. 709 * 710 * __ __ __ __ 711 * | 1 .. 1 1 1 | | p_0 | 712 * | 2^n-1 .. 4 2 1 | __ __ | : | 713 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 714 * | 1 .. 0 0 0 | | D_1 | | d_0 | 715 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 716 * | : : : : | | : | | d_2 | 717 * | 0 .. 1 0 0 | | D_n-1 | | : | 718 * | 0 .. 0 1 0 | ~~ ~~ | : | 719 * | 0 .. 0 0 1 | | d_n-1 | 720 * ~~ ~~ ~~ ~~ 721 * 722 * Note that I, V, d, and p are known. 
To compute D, we must invert the 723 * matrix and use the known data and parity values to reconstruct the unknown 724 * data values. We begin by removing the rows in V|I and d|p that correspond 725 * to failed or missing columns; we then make V|I square (n x n) and d|p 726 * sized n by removing rows corresponding to unused parity from the bottom up 727 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 728 * using Gauss-Jordan elimination. In the example below we use m=3 parity 729 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 730 * __ __ 731 * | 1 1 1 1 1 1 1 1 | 732 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 733 * | 19 205 116 29 64 16 4 1 | / / 734 * | 1 0 0 0 0 0 0 0 | / / 735 * | 0 1 0 0 0 0 0 0 | <--' / 736 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 737 * | 0 0 0 1 0 0 0 0 | 738 * | 0 0 0 0 1 0 0 0 | 739 * | 0 0 0 0 0 1 0 0 | 740 * | 0 0 0 0 0 0 1 0 | 741 * | 0 0 0 0 0 0 0 1 | 742 * ~~ ~~ 743 * __ __ 744 * | 1 1 1 1 1 1 1 1 | 745 * | 128 64 32 16 8 4 2 1 | 746 * | 19 205 116 29 64 16 4 1 | 747 * | 1 0 0 0 0 0 0 0 | 748 * | 0 1 0 0 0 0 0 0 | 749 * (V|I)' = | 0 0 1 0 0 0 0 0 | 750 * | 0 0 0 1 0 0 0 0 | 751 * | 0 0 0 0 1 0 0 0 | 752 * | 0 0 0 0 0 1 0 0 | 753 * | 0 0 0 0 0 0 1 0 | 754 * | 0 0 0 0 0 0 0 1 | 755 * ~~ ~~ 756 * 757 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 758 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 759 * matrix is not singular. 
760 * __ __ 761 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 762 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 763 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 764 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 765 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 766 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 767 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 768 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 769 * ~~ ~~ 770 * __ __ 771 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 772 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 773 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 774 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 775 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 776 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 777 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 778 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 779 * ~~ ~~ 780 * __ __ 781 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 782 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 783 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 784 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 785 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 786 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 787 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 788 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 789 * ~~ ~~ 790 * __ __ 791 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 792 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 793 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 794 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 795 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 796 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 797 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 798 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 799 * ~~ ~~ 800 * __ __ 801 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 802 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 803 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 804 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 805 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 806 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 807 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 808 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 809 * ~~ ~~ 810 * __ __ 811 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 812 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 813 * | 0 0 1 0 0 0 0 0 166 100 4 40 
158 168 216 209 | 814 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 815 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 816 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 817 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 818 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 819 * ~~ ~~ 820 * __ __ 821 * | 0 0 1 0 0 0 0 0 | 822 * | 167 100 5 41 159 169 217 208 | 823 * | 166 100 4 40 158 168 216 209 | 824 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 825 * | 0 0 0 0 1 0 0 0 | 826 * | 0 0 0 0 0 1 0 0 | 827 * | 0 0 0 0 0 0 1 0 | 828 * | 0 0 0 0 0 0 0 1 | 829 * ~~ ~~ 830 * 831 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 832 * of the missing data. 833 * 834 * As is apparent from the example above, the only non-trivial rows in the 835 * inverse matrix correspond to the data disks that we're trying to 836 * reconstruct. Indeed, those are the only rows we need as the others would 837 * only be useful for reconstructing data known or assumed to be valid. For 838 * that reason, we only build the coefficients in the rows that correspond to 839 * targeted columns. 840 */ 841 /* END CSTYLED */ 842 843 static void 844 vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map, 845 uint8_t **rows) 846 { 847 int i, j; 848 int pow; 849 850 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 851 852 /* 853 * Fill in the missing rows of interest. 
854 */ 855 for (i = 0; i < nmap; i++) { 856 ASSERT3S(0, <=, map[i]); 857 ASSERT3S(map[i], <=, 2); 858 859 pow = map[i] * n; 860 if (pow > 255) 861 pow -= 255; 862 ASSERT(pow <= 255); 863 864 for (j = 0; j < n; j++) { 865 pow -= map[i]; 866 if (pow < 0) 867 pow += 255; 868 rows[i][j] = vdev_raidz_pow2[pow]; 869 } 870 } 871 } 872 873 static void 874 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 875 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 876 { 877 int i, j, ii, jj; 878 uint8_t log; 879 880 /* 881 * Assert that the first nmissing entries from the array of used 882 * columns correspond to parity columns and that subsequent entries 883 * correspond to data columns. 884 */ 885 for (i = 0; i < nmissing; i++) { 886 ASSERT3S(used[i], <, rm->rm_firstdatacol); 887 } 888 for (; i < n; i++) { 889 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 890 } 891 892 /* 893 * First initialize the storage where we'll compute the inverse rows. 894 */ 895 for (i = 0; i < nmissing; i++) { 896 for (j = 0; j < n; j++) { 897 invrows[i][j] = (i == j) ? 1 : 0; 898 } 899 } 900 901 /* 902 * Subtract all trivial rows from the rows of consequence. 903 */ 904 for (i = 0; i < nmissing; i++) { 905 for (j = nmissing; j < n; j++) { 906 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 907 jj = used[j] - rm->rm_firstdatacol; 908 ASSERT3S(jj, <, n); 909 invrows[i][j] = rows[i][jj]; 910 rows[i][jj] = 0; 911 } 912 } 913 914 /* 915 * For each of the rows of interest, we must normalize it and subtract 916 * a multiple of it from the other rows. 917 */ 918 for (i = 0; i < nmissing; i++) { 919 for (j = 0; j < missing[i]; j++) { 920 ASSERT3U(rows[i][j], ==, 0); 921 } 922 ASSERT3U(rows[i][missing[i]], !=, 0); 923 924 /* 925 * Compute the inverse of the first element and multiply each 926 * element in the row by that value. 
927 */ 928 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 929 930 for (j = 0; j < n; j++) { 931 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 932 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 933 } 934 935 for (ii = 0; ii < nmissing; ii++) { 936 if (i == ii) 937 continue; 938 939 ASSERT3U(rows[ii][missing[i]], !=, 0); 940 941 log = vdev_raidz_log2[rows[ii][missing[i]]]; 942 943 for (j = 0; j < n; j++) { 944 rows[ii][j] ^= 945 vdev_raidz_exp2(rows[i][j], log); 946 invrows[ii][j] ^= 947 vdev_raidz_exp2(invrows[i][j], log); 948 } 949 } 950 } 951 952 /* 953 * Verify that the data that is left in the rows are properly part of 954 * an identity matrix. 955 */ 956 for (i = 0; i < nmissing; i++) { 957 for (j = 0; j < n; j++) { 958 if (j == missing[i]) { 959 ASSERT3U(rows[i][j], ==, 1); 960 } else { 961 ASSERT3U(rows[i][j], ==, 0); 962 } 963 } 964 } 965 } 966 967 static void 968 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 969 int *missing, uint8_t **invrows, const uint8_t *used) 970 { 971 int i, j, x, cc, c; 972 uint8_t *src; 973 uint64_t ccount; 974 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 975 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 976 uint8_t log, val; 977 int ll; 978 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 979 uint8_t *p, *pp; 980 size_t psize; 981 982 log = 0; /* gcc */ 983 psize = sizeof (invlog[0][0]) * n * nmissing; 984 p = zfs_alloc(psize); 985 986 for (pp = p, i = 0; i < nmissing; i++) { 987 invlog[i] = pp; 988 pp += n; 989 } 990 991 for (i = 0; i < nmissing; i++) { 992 for (j = 0; j < n; j++) { 993 ASSERT3U(invrows[i][j], !=, 0); 994 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 995 } 996 } 997 998 for (i = 0; i < n; i++) { 999 c = used[i]; 1000 ASSERT3U(c, <, rm->rm_cols); 1001 1002 src = rm->rm_col[c].rc_data; 1003 ccount = rm->rm_col[c].rc_size; 1004 for (j = 0; j < nmissing; j++) { 1005 cc = missing[j] + rm->rm_firstdatacol; 1006 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1007 ASSERT3U(cc, <, rm->rm_cols); 1008 ASSERT3U(cc, !=, c); 
			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log is only consumed below when *src != 0. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Short columns end before the big ones. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/*
					 * val = *src * invrows[cc][i],
					 * multiplied via the log/exp
					 * tables (exponents add mod 255).
					 */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first surviving column initializes
				 * the destination; later columns
				 * XOR-accumulate into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

/*
 * General reconstruction for an arbitrary set of targeted columns:
 * pick one surviving parity column per missing data column, build the
 * corresponding matrix rows, invert them, and regenerate the data.
 * Returns a bitmask ("code") with one bit set per parity column used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Carve rows, invrows, and the "used" column list out of a
	 * single allocation.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* "used" lists the parity columns first... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ...then every surviving (non-targeted) data column. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

/*
 * Reconstruct the columns listed in t (sorted, nt entries) together
 * with any other columns already marked in error.  Returns the
 * parity-usage bitmask from vdev_raidz_reconstruct_general().
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	/*
	 * Merge the explicit targets with every column that reported an
	 * error, and count how many parity/data columns remain good.
	 */
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

/*
 * Map a logical I/O (offset/size, in bytes) onto the RAID-Z columns:
 * compute each column's device index, device offset and size, allocate
 * scratch buffers for the parity columns, and point the data columns
 * into the caller's buffer.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;	/* offset, in units */
	uint64_t s = size >> unit_shift;	/* size, in units */
	uint64_t f = b % dcols;			/* first column of the span */
	uint64_t o = (b / dcols) << unit_shift;	/* byte offset on each dev */
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * q: units per data column in full rows; r: leftover units;
	 * bc: leading "big" columns that carry one extra unit;
	 * tot: total units including parity.
	 */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Columns past the last device wrap around, one unit in. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get scratch buffers... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	/* ...data columns are laid out back-to-back in the caller's buffer. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

/*
 * Free a map: release the parity scratch buffers (the data columns
 * point into the caller's buffer) and then the map itself.
 */
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

/*
 * Look up a child vdev by device index; returns NULL if no child with
 * that id exists.
 */
static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we
 * generate can note it.
 */
/*
 * Boot-code simplification: just verify the checksum of the assembled
 * data against the block pointer; "size" is unused here.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size __unused)
{

	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/*
	 * Save a copy of each parity column that was read successfully;
	 * vdev_raidz_generate_parity() regenerates parity into the live
	 * column buffers below.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	/*
	 * The same rc_tried/rc_error test keys which orig[] slots were
	 * filled above, so each saved buffer is compared and freed
	 * exactly once.
	 */
	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset __unused, uint64_t bytes, int total_errors,
    int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/*
	 * tgts points one element into tstore so that both tgts[-1] and
	 * tgts[n] can hold sentinel values.
	 */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try ever-larger combinations of n presumed-bad columns. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				/* Mark the targeted columns as bad. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/* Advance to the next combination of targets. */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The for loop left n one past the last allocated orig[] slot;
	 * pull it back so the cleanup below frees exactly slots
	 * 0 .. n-1 on both the success path and the exhausted path.
	 */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Read and, if necessary, reconstruct a block from a RAID-Z vdev.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code.
 */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Read every data column and, if any data is missing,
		 * the parity columns as well.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally the per-column outcomes of the reads above. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}