/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
24199767f8SToomas Soome */ 25199767f8SToomas Soome 26199767f8SToomas Soome #include <sys/cdefs.h> 2710ae99eeSToomas Soome #include <lz4.h> 28199767f8SToomas Soome 29199767f8SToomas Soome static uint64_t zfs_crc64_table[256]; 30199767f8SToomas Soome 31199767f8SToomas Soome #define ECKSUM 666 32199767f8SToomas Soome 33199767f8SToomas Soome #define ASSERT3S(x, y, z) ((void)0) 34199767f8SToomas Soome #define ASSERT3U(x, y, z) ((void)0) 35199767f8SToomas Soome #define ASSERT3P(x, y, z) ((void)0) 36199767f8SToomas Soome #define ASSERT0(x) ((void)0) 37199767f8SToomas Soome #define ASSERT(x) ((void)0) 38199767f8SToomas Soome 39199767f8SToomas Soome #define kmem_alloc(size, flag) zfs_alloc((size)) 40199767f8SToomas Soome #define kmem_free(ptr, size) zfs_free((ptr), (size)) 41199767f8SToomas Soome 42199767f8SToomas Soome static void 43199767f8SToomas Soome zfs_init_crc(void) 44199767f8SToomas Soome { 45199767f8SToomas Soome int i, j; 46199767f8SToomas Soome uint64_t *ct; 47199767f8SToomas Soome 48199767f8SToomas Soome /* 49199767f8SToomas Soome * Calculate the crc64 table (used for the zap hash 50199767f8SToomas Soome * function). 
51199767f8SToomas Soome */ 52199767f8SToomas Soome if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 537bbcfb41SToomas Soome memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table)); 547bbcfb41SToomas Soome for (i = 0; i < 256; i++) { 557bbcfb41SToomas Soome ct = zfs_crc64_table + i; 567bbcfb41SToomas Soome for (*ct = i, j = 8; j > 0; j--) 577bbcfb41SToomas Soome *ct = (*ct >> 1) ^ 587bbcfb41SToomas Soome (-(*ct & 1) & ZFS_CRC64_POLY); 597bbcfb41SToomas Soome } 60199767f8SToomas Soome } 61199767f8SToomas Soome } 62199767f8SToomas Soome 63199767f8SToomas Soome static void 64*8eef2ab6SToomas Soome zio_checksum_off(const void *buf __unused, uint64_t size __unused, 65*8eef2ab6SToomas Soome const void *ctx_template __unused, zio_cksum_t *zcp) 66199767f8SToomas Soome { 67199767f8SToomas Soome ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 68199767f8SToomas Soome } 69199767f8SToomas Soome 70199767f8SToomas Soome /* 71199767f8SToomas Soome * Signature for checksum functions. 72199767f8SToomas Soome */ 73199767f8SToomas Soome typedef void zio_checksum_t(const void *data, uint64_t size, 74199767f8SToomas Soome const void *ctx_template, zio_cksum_t *zcp); 75199767f8SToomas Soome typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); 76199767f8SToomas Soome typedef void zio_checksum_tmpl_free_t(void *ctx_template); 77199767f8SToomas Soome 78199767f8SToomas Soome typedef enum zio_checksum_flags { 79199767f8SToomas Soome /* Strong enough for metadata? */ 80199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA = (1 << 1), 81199767f8SToomas Soome /* ZIO embedded checksum */ 82199767f8SToomas Soome ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), 83199767f8SToomas Soome /* Strong enough for dedup (without verification)? */ 84199767f8SToomas Soome ZCHECKSUM_FLAG_DEDUP = (1 << 3), 85199767f8SToomas Soome /* Uses salt value */ 86199767f8SToomas Soome ZCHECKSUM_FLAG_SALTED = (1 << 4), 87199767f8SToomas Soome /* Strong enough for nopwrite? 
*/ 88199767f8SToomas Soome ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) 89199767f8SToomas Soome } zio_checksum_flags_t; 90199767f8SToomas Soome 91199767f8SToomas Soome /* 92199767f8SToomas Soome * Information about each checksum function. 93199767f8SToomas Soome */ 94199767f8SToomas Soome typedef struct zio_checksum_info { 95199767f8SToomas Soome /* checksum function for each byteorder */ 96199767f8SToomas Soome zio_checksum_t *ci_func[2]; 97199767f8SToomas Soome zio_checksum_tmpl_init_t *ci_tmpl_init; 98199767f8SToomas Soome zio_checksum_tmpl_free_t *ci_tmpl_free; 99199767f8SToomas Soome zio_checksum_flags_t ci_flags; 100199767f8SToomas Soome const char *ci_name; /* descriptive name */ 101199767f8SToomas Soome } zio_checksum_info_t; 102199767f8SToomas Soome 103199767f8SToomas Soome #include "blkptr.c" 104199767f8SToomas Soome 105199767f8SToomas Soome #include "fletcher.c" 106199767f8SToomas Soome #include "sha256.c" 1074a04e8dbSToomas Soome #include "skein_zfs.c" 1084a04e8dbSToomas Soome #include "edonr_zfs.c" 109199767f8SToomas Soome 110199767f8SToomas Soome static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 111199767f8SToomas Soome {{NULL, NULL}, NULL, NULL, 0, "inherit"}, 112199767f8SToomas Soome {{NULL, NULL}, NULL, NULL, 0, "on"}, 113199767f8SToomas Soome {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"}, 114199767f8SToomas Soome {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 115199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, 116199767f8SToomas Soome {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 117199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, 118199767f8SToomas Soome {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 119199767f8SToomas Soome ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, 120199767f8SToomas Soome {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 121199767f8SToomas Soome 0, "fletcher2"}, 122199767f8SToomas Soome 
{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 123199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA, "fletcher4"}, 124199767f8SToomas Soome {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 125199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 126199767f8SToomas Soome ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, 127199767f8SToomas Soome {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 128199767f8SToomas Soome ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, 129199767f8SToomas Soome {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 130199767f8SToomas Soome 0, "noparity"}, 131199767f8SToomas Soome {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, 132199767f8SToomas Soome NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 133199767f8SToomas Soome ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, 134199767f8SToomas Soome /* no skein and edonr for now */ 1354a04e8dbSToomas Soome {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, 1364a04e8dbSToomas Soome zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, 1374a04e8dbSToomas Soome ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 1384a04e8dbSToomas Soome ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, 1394a04e8dbSToomas Soome {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, 1404a04e8dbSToomas Soome zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, 1414a04e8dbSToomas Soome ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | 1424a04e8dbSToomas Soome ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, 143199767f8SToomas Soome }; 144199767f8SToomas Soome 145199767f8SToomas Soome /* 146199767f8SToomas Soome * Common signature for all zio compress/decompress functions. 
147199767f8SToomas Soome */ 148199767f8SToomas Soome typedef size_t zio_compress_func_t(void *src, void *dst, 149199767f8SToomas Soome size_t s_len, size_t d_len, int); 150199767f8SToomas Soome typedef int zio_decompress_func_t(void *src, void *dst, 151199767f8SToomas Soome size_t s_len, size_t d_len, int); 152199767f8SToomas Soome 153199767f8SToomas Soome extern int gzip_decompress(void *src, void *dst, 154199767f8SToomas Soome size_t s_len, size_t d_len, int); 155199767f8SToomas Soome /* 156199767f8SToomas Soome * Information about each compression function. 157199767f8SToomas Soome */ 158199767f8SToomas Soome typedef struct zio_compress_info { 159199767f8SToomas Soome zio_compress_func_t *ci_compress; /* compression function */ 160199767f8SToomas Soome zio_decompress_func_t *ci_decompress; /* decompression function */ 161199767f8SToomas Soome int ci_level; /* level parameter */ 162199767f8SToomas Soome const char *ci_name; /* algorithm name */ 163199767f8SToomas Soome } zio_compress_info_t; 164199767f8SToomas Soome 165199767f8SToomas Soome #include "lzjb.c" 166199767f8SToomas Soome #include "zle.c" 167199767f8SToomas Soome 168199767f8SToomas Soome /* 169199767f8SToomas Soome * Compression vectors. 
170199767f8SToomas Soome */ 171199767f8SToomas Soome static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 172199767f8SToomas Soome {NULL, NULL, 0, "inherit"}, 173199767f8SToomas Soome {NULL, NULL, 0, "on"}, 174199767f8SToomas Soome {NULL, NULL, 0, "uncompressed"}, 175199767f8SToomas Soome {NULL, lzjb_decompress, 0, "lzjb"}, 176199767f8SToomas Soome {NULL, NULL, 0, "empty"}, 177199767f8SToomas Soome {NULL, gzip_decompress, 1, "gzip-1"}, 178199767f8SToomas Soome {NULL, gzip_decompress, 2, "gzip-2"}, 179199767f8SToomas Soome {NULL, gzip_decompress, 3, "gzip-3"}, 180199767f8SToomas Soome {NULL, gzip_decompress, 4, "gzip-4"}, 181199767f8SToomas Soome {NULL, gzip_decompress, 5, "gzip-5"}, 182199767f8SToomas Soome {NULL, gzip_decompress, 6, "gzip-6"}, 183199767f8SToomas Soome {NULL, gzip_decompress, 7, "gzip-7"}, 184199767f8SToomas Soome {NULL, gzip_decompress, 8, "gzip-8"}, 185199767f8SToomas Soome {NULL, gzip_decompress, 9, "gzip-9"}, 186199767f8SToomas Soome {NULL, zle_decompress, 64, "zle"}, 187199767f8SToomas Soome {NULL, lz4_decompress, 0, "lz4"}, 188199767f8SToomas Soome }; 189199767f8SToomas Soome 190199767f8SToomas Soome static void 191199767f8SToomas Soome byteswap_uint64_array(void *vbuf, size_t size) 192199767f8SToomas Soome { 193199767f8SToomas Soome uint64_t *buf = vbuf; 194199767f8SToomas Soome size_t count = size >> 3; 195199767f8SToomas Soome int i; 196199767f8SToomas Soome 197199767f8SToomas Soome ASSERT((size & 7) == 0); 198199767f8SToomas Soome 199199767f8SToomas Soome for (i = 0; i < count; i++) 200199767f8SToomas Soome buf[i] = BSWAP_64(buf[i]); 201199767f8SToomas Soome } 202199767f8SToomas Soome 203199767f8SToomas Soome /* 204199767f8SToomas Soome * Set the external verifier for a gang block based on <vdev, offset, txg>, 205199767f8SToomas Soome * a tuple which is guaranteed to be unique for the life of the pool. 
206199767f8SToomas Soome */ 207199767f8SToomas Soome static void 208199767f8SToomas Soome zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 209199767f8SToomas Soome { 210199767f8SToomas Soome const dva_t *dva = BP_IDENTITY(bp); 211199767f8SToomas Soome uint64_t txg = BP_PHYSICAL_BIRTH(bp); 212199767f8SToomas Soome 213199767f8SToomas Soome ASSERT(BP_IS_GANG(bp)); 214199767f8SToomas Soome 215199767f8SToomas Soome ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 216199767f8SToomas Soome } 217199767f8SToomas Soome 218199767f8SToomas Soome /* 219199767f8SToomas Soome * Set the external verifier for a label block based on its offset. 220199767f8SToomas Soome * The vdev is implicit, and the txg is unknowable at pool open time -- 221199767f8SToomas Soome * hence the logic in vdev_uberblock_load() to find the most recent copy. 222199767f8SToomas Soome */ 223199767f8SToomas Soome static void 224199767f8SToomas Soome zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 225199767f8SToomas Soome { 226199767f8SToomas Soome ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 227199767f8SToomas Soome } 228199767f8SToomas Soome 229199767f8SToomas Soome /* 230199767f8SToomas Soome * Calls the template init function of a checksum which supports context 231199767f8SToomas Soome * templates and installs the template into the spa_t. 
232199767f8SToomas Soome */ 233199767f8SToomas Soome static void 2344a04e8dbSToomas Soome zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) 235199767f8SToomas Soome { 236199767f8SToomas Soome zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 237199767f8SToomas Soome 238199767f8SToomas Soome if (ci->ci_tmpl_init == NULL) 239199767f8SToomas Soome return; 2404a04e8dbSToomas Soome 241199767f8SToomas Soome if (spa->spa_cksum_tmpls[checksum] != NULL) 242199767f8SToomas Soome return; 243199767f8SToomas Soome 244199767f8SToomas Soome if (spa->spa_cksum_tmpls[checksum] == NULL) { 245199767f8SToomas Soome spa->spa_cksum_tmpls[checksum] = 246199767f8SToomas Soome ci->ci_tmpl_init(&spa->spa_cksum_salt); 247199767f8SToomas Soome } 2484a04e8dbSToomas Soome } 2494a04e8dbSToomas Soome 2504a04e8dbSToomas Soome /* 2514a04e8dbSToomas Soome * Called by a spa_t that's about to be deallocated. This steps through 2524a04e8dbSToomas Soome * all of the checksum context templates and deallocates any that were 2534a04e8dbSToomas Soome * initialized using the algorithm-specific template init function. 
2544a04e8dbSToomas Soome */ 2554a04e8dbSToomas Soome void 2564a04e8dbSToomas Soome zio_checksum_templates_free(spa_t *spa) 2574a04e8dbSToomas Soome { 2584a04e8dbSToomas Soome for (enum zio_checksum checksum = 0; 2594a04e8dbSToomas Soome checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { 2604a04e8dbSToomas Soome if (spa->spa_cksum_tmpls[checksum] != NULL) { 2614a04e8dbSToomas Soome zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 2624a04e8dbSToomas Soome 2634a04e8dbSToomas Soome ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); 2644a04e8dbSToomas Soome spa->spa_cksum_tmpls[checksum] = NULL; 2654a04e8dbSToomas Soome } 2664a04e8dbSToomas Soome } 267199767f8SToomas Soome } 268199767f8SToomas Soome 269199767f8SToomas Soome static int 2704a04e8dbSToomas Soome zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) 271199767f8SToomas Soome { 272199767f8SToomas Soome uint64_t size; 273199767f8SToomas Soome unsigned int checksum; 274199767f8SToomas Soome zio_checksum_info_t *ci; 2754a04e8dbSToomas Soome void *ctx = NULL; 276199767f8SToomas Soome zio_cksum_t actual_cksum, expected_cksum, verifier; 277199767f8SToomas Soome int byteswap; 278199767f8SToomas Soome 279199767f8SToomas Soome checksum = BP_GET_CHECKSUM(bp); 280199767f8SToomas Soome size = BP_GET_PSIZE(bp); 281199767f8SToomas Soome 282199767f8SToomas Soome if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 283199767f8SToomas Soome return (EINVAL); 284199767f8SToomas Soome ci = &zio_checksum_table[checksum]; 285199767f8SToomas Soome if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 286199767f8SToomas Soome return (EINVAL); 287199767f8SToomas Soome 2884a04e8dbSToomas Soome if (spa != NULL) { 2897bbcfb41SToomas Soome zio_checksum_template_init(checksum, (spa_t *)spa); 2904a04e8dbSToomas Soome ctx = spa->spa_cksum_tmpls[checksum]; 2914a04e8dbSToomas Soome } 2924a04e8dbSToomas Soome 293199767f8SToomas Soome if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 294199767f8SToomas Soome zio_eck_t *eck; 
295199767f8SToomas Soome 296199767f8SToomas Soome ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 297199767f8SToomas Soome checksum == ZIO_CHECKSUM_LABEL); 298199767f8SToomas Soome 299199767f8SToomas Soome eck = (zio_eck_t *)((char *)data + size) - 1; 300199767f8SToomas Soome 301199767f8SToomas Soome if (checksum == ZIO_CHECKSUM_GANG_HEADER) 302199767f8SToomas Soome zio_checksum_gang_verifier(&verifier, bp); 303199767f8SToomas Soome else if (checksum == ZIO_CHECKSUM_LABEL) 304199767f8SToomas Soome zio_checksum_label_verifier(&verifier, 305199767f8SToomas Soome DVA_GET_OFFSET(BP_IDENTITY(bp))); 306199767f8SToomas Soome else 307199767f8SToomas Soome verifier = bp->blk_cksum; 308199767f8SToomas Soome 309199767f8SToomas Soome byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 310199767f8SToomas Soome 311199767f8SToomas Soome if (byteswap) 312199767f8SToomas Soome byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 313199767f8SToomas Soome 314199767f8SToomas Soome expected_cksum = eck->zec_cksum; 315199767f8SToomas Soome eck->zec_cksum = verifier; 3164a04e8dbSToomas Soome ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 317199767f8SToomas Soome eck->zec_cksum = expected_cksum; 318199767f8SToomas Soome 319199767f8SToomas Soome if (byteswap) 320199767f8SToomas Soome byteswap_uint64_array(&expected_cksum, 321199767f8SToomas Soome sizeof (zio_cksum_t)); 322199767f8SToomas Soome } else { 323199767f8SToomas Soome expected_cksum = bp->blk_cksum; 3244a04e8dbSToomas Soome ci->ci_func[0](data, size, ctx, &actual_cksum); 325199767f8SToomas Soome } 326199767f8SToomas Soome 327199767f8SToomas Soome if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 3284a04e8dbSToomas Soome /* printf("ZFS: read checksum %s failed\n", ci->ci_name); */ 329199767f8SToomas Soome return (EIO); 330199767f8SToomas Soome } 331199767f8SToomas Soome 332199767f8SToomas Soome return (0); 333199767f8SToomas Soome } 334199767f8SToomas Soome 335199767f8SToomas Soome static int 336199767f8SToomas 
Soome zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 3377bbcfb41SToomas Soome void *dest, uint64_t destsize) 338199767f8SToomas Soome { 339199767f8SToomas Soome zio_compress_info_t *ci; 340199767f8SToomas Soome 341199767f8SToomas Soome if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 342199767f8SToomas Soome printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 343199767f8SToomas Soome return (EIO); 344199767f8SToomas Soome } 345199767f8SToomas Soome 346199767f8SToomas Soome ci = &zio_compress_table[cpfunc]; 347199767f8SToomas Soome if (!ci->ci_decompress) { 348199767f8SToomas Soome printf("ZFS: unsupported compression algorithm %s\n", 349199767f8SToomas Soome ci->ci_name); 350199767f8SToomas Soome return (EIO); 351199767f8SToomas Soome } 352199767f8SToomas Soome 353199767f8SToomas Soome return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 354199767f8SToomas Soome } 355199767f8SToomas Soome 356199767f8SToomas Soome static uint64_t 357199767f8SToomas Soome zap_hash(uint64_t salt, const char *name) 358199767f8SToomas Soome { 359199767f8SToomas Soome const uint8_t *cp; 360199767f8SToomas Soome uint8_t c; 361199767f8SToomas Soome uint64_t crc = salt; 362199767f8SToomas Soome 363199767f8SToomas Soome ASSERT(crc != 0); 364199767f8SToomas Soome ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 365199767f8SToomas Soome for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 366199767f8SToomas Soome crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 367199767f8SToomas Soome 368199767f8SToomas Soome /* 369199767f8SToomas Soome * Only use 28 bits, since we need 4 bits in the cookie for the 370199767f8SToomas Soome * collision differentiator. We MUST use the high bits, since 371199767f8SToomas Soome * those are the onces that we first pay attention to when 372199767f8SToomas Soome * chosing the bucket. 
373199767f8SToomas Soome */ 374199767f8SToomas Soome crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 375199767f8SToomas Soome 376199767f8SToomas Soome return (crc); 377199767f8SToomas Soome } 378199767f8SToomas Soome 379199767f8SToomas Soome static void *zfs_alloc(size_t size); 380199767f8SToomas Soome static void zfs_free(void *ptr, size_t size); 381199767f8SToomas Soome 382199767f8SToomas Soome typedef struct raidz_col { 383199767f8SToomas Soome uint64_t rc_devidx; /* child device index for I/O */ 384199767f8SToomas Soome uint64_t rc_offset; /* device offset */ 385199767f8SToomas Soome uint64_t rc_size; /* I/O size */ 386199767f8SToomas Soome void *rc_data; /* I/O data */ 387199767f8SToomas Soome int rc_error; /* I/O error for this device */ 388199767f8SToomas Soome uint8_t rc_tried; /* Did we attempt this I/O column? */ 389199767f8SToomas Soome uint8_t rc_skipped; /* Did we skip this I/O column? */ 390199767f8SToomas Soome } raidz_col_t; 391199767f8SToomas Soome 392199767f8SToomas Soome typedef struct raidz_map { 393199767f8SToomas Soome uint64_t rm_cols; /* Regular column count */ 394199767f8SToomas Soome uint64_t rm_scols; /* Count including skipped columns */ 395199767f8SToomas Soome uint64_t rm_bigcols; /* Number of oversized columns */ 396199767f8SToomas Soome uint64_t rm_asize; /* Actual total I/O size */ 397199767f8SToomas Soome uint64_t rm_missingdata; /* Count of missing data devices */ 398199767f8SToomas Soome uint64_t rm_missingparity; /* Count of missing parity devices */ 399199767f8SToomas Soome uint64_t rm_firstdatacol; /* First data column/parity count */ 400199767f8SToomas Soome uint64_t rm_nskip; /* Skipped sectors for padding */ 401199767f8SToomas Soome uint64_t rm_skipstart; /* Column index of padding start */ 402199767f8SToomas Soome uintptr_t rm_reports; /* # of referencing checksum reports */ 403199767f8SToomas Soome uint8_t rm_freed; /* map no longer has referencing ZIO */ 404199767f8SToomas Soome uint8_t rm_ecksuminjected; /* checksum error 
was injected */ 405199767f8SToomas Soome raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 406199767f8SToomas Soome } raidz_map_t; 407199767f8SToomas Soome 408199767f8SToomas Soome #define VDEV_RAIDZ_P 0 409199767f8SToomas Soome #define VDEV_RAIDZ_Q 1 410199767f8SToomas Soome #define VDEV_RAIDZ_R 2 411199767f8SToomas Soome 412199767f8SToomas Soome #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 413199767f8SToomas Soome #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 414199767f8SToomas Soome 415199767f8SToomas Soome /* 416199767f8SToomas Soome * We provide a mechanism to perform the field multiplication operation on a 417199767f8SToomas Soome * 64-bit value all at once rather than a byte at a time. This works by 418199767f8SToomas Soome * creating a mask from the top bit in each byte and using that to 419199767f8SToomas Soome * conditionally apply the XOR of 0x1d. 420199767f8SToomas Soome */ 421199767f8SToomas Soome #define VDEV_RAIDZ_64MUL_2(x, mask) \ 422199767f8SToomas Soome { \ 423199767f8SToomas Soome (mask) = (x) & 0x8080808080808080ULL; \ 424199767f8SToomas Soome (mask) = ((mask) << 1) - ((mask) >> 7); \ 425199767f8SToomas Soome (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 426199767f8SToomas Soome ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 427199767f8SToomas Soome } 428199767f8SToomas Soome 429199767f8SToomas Soome #define VDEV_RAIDZ_64MUL_4(x, mask) \ 430199767f8SToomas Soome { \ 431199767f8SToomas Soome VDEV_RAIDZ_64MUL_2((x), mask); \ 432199767f8SToomas Soome VDEV_RAIDZ_64MUL_2((x), mask); \ 433199767f8SToomas Soome } 434199767f8SToomas Soome 435199767f8SToomas Soome /* 436199767f8SToomas Soome * These two tables represent powers and logs of 2 in the Galois field defined 437199767f8SToomas Soome * above. These values were computed by repeatedly multiplying by 2 as above. 
438199767f8SToomas Soome */ 439199767f8SToomas Soome static const uint8_t vdev_raidz_pow2[256] = { 440199767f8SToomas Soome 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 441199767f8SToomas Soome 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 442199767f8SToomas Soome 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 443199767f8SToomas Soome 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 444199767f8SToomas Soome 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 445199767f8SToomas Soome 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 446199767f8SToomas Soome 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 447199767f8SToomas Soome 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 448199767f8SToomas Soome 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 449199767f8SToomas Soome 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 450199767f8SToomas Soome 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 451199767f8SToomas Soome 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 452199767f8SToomas Soome 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 453199767f8SToomas Soome 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 454199767f8SToomas Soome 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 455199767f8SToomas Soome 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 456199767f8SToomas Soome 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 457199767f8SToomas Soome 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 458199767f8SToomas Soome 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 459199767f8SToomas Soome 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 460199767f8SToomas Soome 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 461199767f8SToomas Soome 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 462199767f8SToomas Soome 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 463199767f8SToomas Soome 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 464199767f8SToomas Soome 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 465199767f8SToomas Soome 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 
466199767f8SToomas Soome 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 467199767f8SToomas Soome 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 468199767f8SToomas Soome 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 469199767f8SToomas Soome 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 470199767f8SToomas Soome 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 471199767f8SToomas Soome 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 472199767f8SToomas Soome }; 473199767f8SToomas Soome static const uint8_t vdev_raidz_log2[256] = { 474199767f8SToomas Soome 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 475199767f8SToomas Soome 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 476199767f8SToomas Soome 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 477199767f8SToomas Soome 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 478199767f8SToomas Soome 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 479199767f8SToomas Soome 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 480199767f8SToomas Soome 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 481199767f8SToomas Soome 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 482199767f8SToomas Soome 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 483199767f8SToomas Soome 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 484199767f8SToomas Soome 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 485199767f8SToomas Soome 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 486199767f8SToomas Soome 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 487199767f8SToomas Soome 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 488199767f8SToomas Soome 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 489199767f8SToomas Soome 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 490199767f8SToomas Soome 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 491199767f8SToomas Soome 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 492199767f8SToomas Soome 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 493199767f8SToomas Soome 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 
494199767f8SToomas Soome 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 495199767f8SToomas Soome 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 496199767f8SToomas Soome 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 497199767f8SToomas Soome 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 498199767f8SToomas Soome 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 499199767f8SToomas Soome 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 500199767f8SToomas Soome 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 501199767f8SToomas Soome 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 502199767f8SToomas Soome 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 503199767f8SToomas Soome 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 504199767f8SToomas Soome 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 505199767f8SToomas Soome 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 506199767f8SToomas Soome }; 507199767f8SToomas Soome 508199767f8SToomas Soome /* 509199767f8SToomas Soome * Multiply a given number by 2 raised to the given power. 
510199767f8SToomas Soome */ 511199767f8SToomas Soome static uint8_t 512199767f8SToomas Soome vdev_raidz_exp2(uint8_t a, int exp) 513199767f8SToomas Soome { 514199767f8SToomas Soome if (a == 0) 515199767f8SToomas Soome return (0); 516199767f8SToomas Soome 517199767f8SToomas Soome ASSERT(exp >= 0); 518199767f8SToomas Soome ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 519199767f8SToomas Soome 520199767f8SToomas Soome exp += vdev_raidz_log2[a]; 521199767f8SToomas Soome if (exp > 255) 522199767f8SToomas Soome exp -= 255; 523199767f8SToomas Soome 524199767f8SToomas Soome return (vdev_raidz_pow2[exp]); 525199767f8SToomas Soome } 526199767f8SToomas Soome 527199767f8SToomas Soome static void 528199767f8SToomas Soome vdev_raidz_generate_parity_p(raidz_map_t *rm) 529199767f8SToomas Soome { 530199767f8SToomas Soome uint64_t *p, *src, pcount __attribute__((unused)), ccount, i; 531199767f8SToomas Soome int c; 532199767f8SToomas Soome 533199767f8SToomas Soome pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 534199767f8SToomas Soome 535199767f8SToomas Soome for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 536199767f8SToomas Soome src = rm->rm_col[c].rc_data; 537199767f8SToomas Soome p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 538199767f8SToomas Soome ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 539199767f8SToomas Soome 540199767f8SToomas Soome if (c == rm->rm_firstdatacol) { 541199767f8SToomas Soome ASSERT(ccount == pcount); 542199767f8SToomas Soome for (i = 0; i < ccount; i++, src++, p++) { 543199767f8SToomas Soome *p = *src; 544199767f8SToomas Soome } 545199767f8SToomas Soome } else { 546199767f8SToomas Soome ASSERT(ccount <= pcount); 547199767f8SToomas Soome for (i = 0; i < ccount; i++, src++, p++) { 548199767f8SToomas Soome *p ^= *src; 549199767f8SToomas Soome } 550199767f8SToomas Soome } 551199767f8SToomas Soome } 552199767f8SToomas Soome } 553199767f8SToomas Soome 554199767f8SToomas Soome static void 555199767f8SToomas Soome 
vdev_raidz_generate_parity_pq(raidz_map_t *rm) 556199767f8SToomas Soome { 557199767f8SToomas Soome uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 558199767f8SToomas Soome int c; 559199767f8SToomas Soome 560199767f8SToomas Soome pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 561199767f8SToomas Soome ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 562199767f8SToomas Soome rm->rm_col[VDEV_RAIDZ_Q].rc_size); 563199767f8SToomas Soome 564199767f8SToomas Soome for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 565199767f8SToomas Soome src = rm->rm_col[c].rc_data; 566199767f8SToomas Soome p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 567199767f8SToomas Soome q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 568199767f8SToomas Soome 569199767f8SToomas Soome ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 570199767f8SToomas Soome 571199767f8SToomas Soome if (c == rm->rm_firstdatacol) { 572199767f8SToomas Soome ASSERT(ccnt == pcnt || ccnt == 0); 573199767f8SToomas Soome for (i = 0; i < ccnt; i++, src++, p++, q++) { 574199767f8SToomas Soome *p = *src; 575199767f8SToomas Soome *q = *src; 576199767f8SToomas Soome } 577199767f8SToomas Soome for (; i < pcnt; i++, src++, p++, q++) { 578199767f8SToomas Soome *p = 0; 579199767f8SToomas Soome *q = 0; 580199767f8SToomas Soome } 581199767f8SToomas Soome } else { 582199767f8SToomas Soome ASSERT(ccnt <= pcnt); 583199767f8SToomas Soome 584199767f8SToomas Soome /* 585199767f8SToomas Soome * Apply the algorithm described above by multiplying 586199767f8SToomas Soome * the previous result and adding in the new value. 587199767f8SToomas Soome */ 588199767f8SToomas Soome for (i = 0; i < ccnt; i++, src++, p++, q++) { 589199767f8SToomas Soome *p ^= *src; 590199767f8SToomas Soome 591199767f8SToomas Soome VDEV_RAIDZ_64MUL_2(*q, mask); 592199767f8SToomas Soome *q ^= *src; 593199767f8SToomas Soome } 594199767f8SToomas Soome 595199767f8SToomas Soome /* 596199767f8SToomas Soome * Treat short columns as though they are full of 0s. 
597199767f8SToomas Soome * Note that there's therefore nothing needed for P. 598199767f8SToomas Soome */ 599199767f8SToomas Soome for (; i < pcnt; i++, q++) { 600199767f8SToomas Soome VDEV_RAIDZ_64MUL_2(*q, mask); 601199767f8SToomas Soome } 602199767f8SToomas Soome } 603199767f8SToomas Soome } 604199767f8SToomas Soome } 605199767f8SToomas Soome 606199767f8SToomas Soome static void 607199767f8SToomas Soome vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 608199767f8SToomas Soome { 609199767f8SToomas Soome uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 610199767f8SToomas Soome int c; 611199767f8SToomas Soome 612199767f8SToomas Soome pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 613199767f8SToomas Soome ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 614199767f8SToomas Soome rm->rm_col[VDEV_RAIDZ_Q].rc_size); 615199767f8SToomas Soome ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 616199767f8SToomas Soome rm->rm_col[VDEV_RAIDZ_R].rc_size); 617199767f8SToomas Soome 618199767f8SToomas Soome for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 619199767f8SToomas Soome src = rm->rm_col[c].rc_data; 620199767f8SToomas Soome p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 621199767f8SToomas Soome q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 622199767f8SToomas Soome r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 623199767f8SToomas Soome 624199767f8SToomas Soome ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 625199767f8SToomas Soome 626199767f8SToomas Soome if (c == rm->rm_firstdatacol) { 627199767f8SToomas Soome ASSERT(ccnt == pcnt || ccnt == 0); 628199767f8SToomas Soome for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 629199767f8SToomas Soome *p = *src; 630199767f8SToomas Soome *q = *src; 631199767f8SToomas Soome *r = *src; 632199767f8SToomas Soome } 633199767f8SToomas Soome for (; i < pcnt; i++, src++, p++, q++, r++) { 634199767f8SToomas Soome *p = 0; 635199767f8SToomas Soome *q = 0; 636199767f8SToomas Soome *r = 0; 637199767f8SToomas Soome } 638199767f8SToomas Soome } else { 
639199767f8SToomas Soome ASSERT(ccnt <= pcnt); 640199767f8SToomas Soome 641199767f8SToomas Soome /* 642199767f8SToomas Soome * Apply the algorithm described above by multiplying 643199767f8SToomas Soome * the previous result and adding in the new value. 644199767f8SToomas Soome */ 645199767f8SToomas Soome for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 646199767f8SToomas Soome *p ^= *src; 647199767f8SToomas Soome 648199767f8SToomas Soome VDEV_RAIDZ_64MUL_2(*q, mask); 649199767f8SToomas Soome *q ^= *src; 650199767f8SToomas Soome 651199767f8SToomas Soome VDEV_RAIDZ_64MUL_4(*r, mask); 652199767f8SToomas Soome *r ^= *src; 653199767f8SToomas Soome } 654199767f8SToomas Soome 655199767f8SToomas Soome /* 656199767f8SToomas Soome * Treat short columns as though they are full of 0s. 657199767f8SToomas Soome * Note that there's therefore nothing needed for P. 658199767f8SToomas Soome */ 659199767f8SToomas Soome for (; i < pcnt; i++, q++, r++) { 660199767f8SToomas Soome VDEV_RAIDZ_64MUL_2(*q, mask); 661199767f8SToomas Soome VDEV_RAIDZ_64MUL_4(*r, mask); 662199767f8SToomas Soome } 663199767f8SToomas Soome } 664199767f8SToomas Soome } 665199767f8SToomas Soome } 666199767f8SToomas Soome 667199767f8SToomas Soome /* 668199767f8SToomas Soome * Generate RAID parity in the first virtual columns according to the number of 669199767f8SToomas Soome * parity columns available. 
670199767f8SToomas Soome */ 671199767f8SToomas Soome static void 672199767f8SToomas Soome vdev_raidz_generate_parity(raidz_map_t *rm) 673199767f8SToomas Soome { 674199767f8SToomas Soome switch (rm->rm_firstdatacol) { 675199767f8SToomas Soome case 1: 676199767f8SToomas Soome vdev_raidz_generate_parity_p(rm); 677199767f8SToomas Soome break; 678199767f8SToomas Soome case 2: 679199767f8SToomas Soome vdev_raidz_generate_parity_pq(rm); 680199767f8SToomas Soome break; 681199767f8SToomas Soome case 3: 682199767f8SToomas Soome vdev_raidz_generate_parity_pqr(rm); 683199767f8SToomas Soome break; 684199767f8SToomas Soome default: 685199767f8SToomas Soome panic("invalid RAID-Z configuration"); 686199767f8SToomas Soome } 687199767f8SToomas Soome } 688199767f8SToomas Soome 689199767f8SToomas Soome /* BEGIN CSTYLED */ 690199767f8SToomas Soome /* 691199767f8SToomas Soome * In the general case of reconstruction, we must solve the system of linear 692199767f8SToomas Soome * equations defined by the coeffecients used to generate parity as well as 693199767f8SToomas Soome * the contents of the data and parity disks. This can be expressed with 694199767f8SToomas Soome * vectors for the original data (D) and the actual data (d) and parity (p) 695199767f8SToomas Soome * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 696199767f8SToomas Soome * 697199767f8SToomas Soome * __ __ __ __ 698199767f8SToomas Soome * | | __ __ | p_0 | 699199767f8SToomas Soome * | V | | D_0 | | p_m-1 | 700199767f8SToomas Soome * | | x | : | = | d_0 | 701199767f8SToomas Soome * | I | | D_n-1 | | : | 702199767f8SToomas Soome * | | ~~ ~~ | d_n-1 | 703199767f8SToomas Soome * ~~ ~~ ~~ ~~ 704199767f8SToomas Soome * 705199767f8SToomas Soome * I is simply a square identity matrix of size n, and V is a vandermonde 706199767f8SToomas Soome * matrix defined by the coeffecients we chose for the various parity columns 707199767f8SToomas Soome * (1, 2, 4). 
Note that these values were chosen both for simplicity, speedy 708199767f8SToomas Soome * computation as well as linear separability. 709199767f8SToomas Soome * 710199767f8SToomas Soome * __ __ __ __ 711199767f8SToomas Soome * | 1 .. 1 1 1 | | p_0 | 712199767f8SToomas Soome * | 2^n-1 .. 4 2 1 | __ __ | : | 713199767f8SToomas Soome * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 714199767f8SToomas Soome * | 1 .. 0 0 0 | | D_1 | | d_0 | 715199767f8SToomas Soome * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 716199767f8SToomas Soome * | : : : : | | : | | d_2 | 717199767f8SToomas Soome * | 0 .. 1 0 0 | | D_n-1 | | : | 718199767f8SToomas Soome * | 0 .. 0 1 0 | ~~ ~~ | : | 719199767f8SToomas Soome * | 0 .. 0 0 1 | | d_n-1 | 720199767f8SToomas Soome * ~~ ~~ ~~ ~~ 721199767f8SToomas Soome * 722199767f8SToomas Soome * Note that I, V, d, and p are known. To compute D, we must invert the 723199767f8SToomas Soome * matrix and use the known data and parity values to reconstruct the unknown 724199767f8SToomas Soome * data values. We begin by removing the rows in V|I and d|p that correspond 725199767f8SToomas Soome * to failed or missing columns; we then make V|I square (n x n) and d|p 726199767f8SToomas Soome * sized n by removing rows corresponding to unused parity from the bottom up 727199767f8SToomas Soome * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 728199767f8SToomas Soome * using Gauss-Jordan elimination. 
In the example below we use m=3 parity 729199767f8SToomas Soome * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 730199767f8SToomas Soome * __ __ 731199767f8SToomas Soome * | 1 1 1 1 1 1 1 1 | 732199767f8SToomas Soome * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 733199767f8SToomas Soome * | 19 205 116 29 64 16 4 1 | / / 734199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 | / / 735199767f8SToomas Soome * | 0 1 0 0 0 0 0 0 | <--' / 736199767f8SToomas Soome * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 737199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 | 738199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 | 739199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 | 740199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 | 741199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 | 742199767f8SToomas Soome * ~~ ~~ 743199767f8SToomas Soome * __ __ 744199767f8SToomas Soome * | 1 1 1 1 1 1 1 1 | 745199767f8SToomas Soome * | 128 64 32 16 8 4 2 1 | 746199767f8SToomas Soome * | 19 205 116 29 64 16 4 1 | 747199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 | 748199767f8SToomas Soome * | 0 1 0 0 0 0 0 0 | 749199767f8SToomas Soome * (V|I)' = | 0 0 1 0 0 0 0 0 | 750199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 | 751199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 | 752199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 | 753199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 | 754199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 | 755199767f8SToomas Soome * ~~ ~~ 756199767f8SToomas Soome * 757199767f8SToomas Soome * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 758199767f8SToomas Soome * have carefully chosen the seed values 1, 2, and 4 to ensure that this 759199767f8SToomas Soome * matrix is not singular. 
760199767f8SToomas Soome * __ __ 761199767f8SToomas Soome * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 762199767f8SToomas Soome * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 763199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 764199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 765199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 766199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 767199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 768199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 769199767f8SToomas Soome * ~~ ~~ 770199767f8SToomas Soome * __ __ 771199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 772199767f8SToomas Soome * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 773199767f8SToomas Soome * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 774199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 775199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 776199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 777199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 778199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 779199767f8SToomas Soome * ~~ ~~ 780199767f8SToomas Soome * __ __ 781199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 782199767f8SToomas Soome * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 783199767f8SToomas Soome * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 784199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 785199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 786199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 787199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 788199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 789199767f8SToomas Soome * ~~ ~~ 790199767f8SToomas Soome * __ __ 791199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 792199767f8SToomas Soome * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 793199767f8SToomas Soome * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 794199767f8SToomas Soome * 
| 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 795199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 796199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 797199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 798199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 799199767f8SToomas Soome * ~~ ~~ 800199767f8SToomas Soome * __ __ 801199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 802199767f8SToomas Soome * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 803199767f8SToomas Soome * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 804199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 805199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 806199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 807199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 808199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 809199767f8SToomas Soome * ~~ ~~ 810199767f8SToomas Soome * __ __ 811199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 812199767f8SToomas Soome * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 813199767f8SToomas Soome * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 814199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 815199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 816199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 817199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 818199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 819199767f8SToomas Soome * ~~ ~~ 820199767f8SToomas Soome * __ __ 821199767f8SToomas Soome * | 0 0 1 0 0 0 0 0 | 822199767f8SToomas Soome * | 167 100 5 41 159 169 217 208 | 823199767f8SToomas Soome * | 166 100 4 40 158 168 216 209 | 824199767f8SToomas Soome * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 825199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 | 826199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 | 827199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 | 828199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 | 829199767f8SToomas Soome * ~~ ~~ 830199767f8SToomas 
Soome * 831199767f8SToomas Soome * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 832199767f8SToomas Soome * of the missing data. 833199767f8SToomas Soome * 834199767f8SToomas Soome * As is apparent from the example above, the only non-trivial rows in the 835199767f8SToomas Soome * inverse matrix correspond to the data disks that we're trying to 836199767f8SToomas Soome * reconstruct. Indeed, those are the only rows we need as the others would 837199767f8SToomas Soome * only be useful for reconstructing data known or assumed to be valid. For 838199767f8SToomas Soome * that reason, we only build the coefficients in the rows that correspond to 839199767f8SToomas Soome * targeted columns. 840199767f8SToomas Soome */ 841199767f8SToomas Soome /* END CSTYLED */ 842199767f8SToomas Soome 843199767f8SToomas Soome static void 844*8eef2ab6SToomas Soome vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map, 845199767f8SToomas Soome uint8_t **rows) 846199767f8SToomas Soome { 847199767f8SToomas Soome int i, j; 848199767f8SToomas Soome int pow; 849199767f8SToomas Soome 850199767f8SToomas Soome ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 851199767f8SToomas Soome 852199767f8SToomas Soome /* 853199767f8SToomas Soome * Fill in the missing rows of interest. 
854199767f8SToomas Soome */ 855199767f8SToomas Soome for (i = 0; i < nmap; i++) { 856199767f8SToomas Soome ASSERT3S(0, <=, map[i]); 857199767f8SToomas Soome ASSERT3S(map[i], <=, 2); 858199767f8SToomas Soome 859199767f8SToomas Soome pow = map[i] * n; 860199767f8SToomas Soome if (pow > 255) 861199767f8SToomas Soome pow -= 255; 862199767f8SToomas Soome ASSERT(pow <= 255); 863199767f8SToomas Soome 864199767f8SToomas Soome for (j = 0; j < n; j++) { 865199767f8SToomas Soome pow -= map[i]; 866199767f8SToomas Soome if (pow < 0) 867199767f8SToomas Soome pow += 255; 868199767f8SToomas Soome rows[i][j] = vdev_raidz_pow2[pow]; 869199767f8SToomas Soome } 870199767f8SToomas Soome } 871199767f8SToomas Soome } 872199767f8SToomas Soome 873199767f8SToomas Soome static void 874199767f8SToomas Soome vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 875199767f8SToomas Soome uint8_t **rows, uint8_t **invrows, const uint8_t *used) 876199767f8SToomas Soome { 877199767f8SToomas Soome int i, j, ii, jj; 878199767f8SToomas Soome uint8_t log; 879199767f8SToomas Soome 880199767f8SToomas Soome /* 881199767f8SToomas Soome * Assert that the first nmissing entries from the array of used 882199767f8SToomas Soome * columns correspond to parity columns and that subsequent entries 883199767f8SToomas Soome * correspond to data columns. 884199767f8SToomas Soome */ 885199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 886199767f8SToomas Soome ASSERT3S(used[i], <, rm->rm_firstdatacol); 887199767f8SToomas Soome } 888199767f8SToomas Soome for (; i < n; i++) { 889199767f8SToomas Soome ASSERT3S(used[i], >=, rm->rm_firstdatacol); 890199767f8SToomas Soome } 891199767f8SToomas Soome 892199767f8SToomas Soome /* 893199767f8SToomas Soome * First initialize the storage where we'll compute the inverse rows. 
894199767f8SToomas Soome */ 895199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 896199767f8SToomas Soome for (j = 0; j < n; j++) { 897199767f8SToomas Soome invrows[i][j] = (i == j) ? 1 : 0; 898199767f8SToomas Soome } 899199767f8SToomas Soome } 900199767f8SToomas Soome 901199767f8SToomas Soome /* 902199767f8SToomas Soome * Subtract all trivial rows from the rows of consequence. 903199767f8SToomas Soome */ 904199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 905199767f8SToomas Soome for (j = nmissing; j < n; j++) { 906199767f8SToomas Soome ASSERT3U(used[j], >=, rm->rm_firstdatacol); 907199767f8SToomas Soome jj = used[j] - rm->rm_firstdatacol; 908199767f8SToomas Soome ASSERT3S(jj, <, n); 909199767f8SToomas Soome invrows[i][j] = rows[i][jj]; 910199767f8SToomas Soome rows[i][jj] = 0; 911199767f8SToomas Soome } 912199767f8SToomas Soome } 913199767f8SToomas Soome 914199767f8SToomas Soome /* 915199767f8SToomas Soome * For each of the rows of interest, we must normalize it and subtract 916199767f8SToomas Soome * a multiple of it from the other rows. 917199767f8SToomas Soome */ 918199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 919199767f8SToomas Soome for (j = 0; j < missing[i]; j++) { 920199767f8SToomas Soome ASSERT3U(rows[i][j], ==, 0); 921199767f8SToomas Soome } 922199767f8SToomas Soome ASSERT3U(rows[i][missing[i]], !=, 0); 923199767f8SToomas Soome 924199767f8SToomas Soome /* 925199767f8SToomas Soome * Compute the inverse of the first element and multiply each 926199767f8SToomas Soome * element in the row by that value. 
927199767f8SToomas Soome */ 928199767f8SToomas Soome log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 929199767f8SToomas Soome 930199767f8SToomas Soome for (j = 0; j < n; j++) { 931199767f8SToomas Soome rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 932199767f8SToomas Soome invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 933199767f8SToomas Soome } 934199767f8SToomas Soome 935199767f8SToomas Soome for (ii = 0; ii < nmissing; ii++) { 936199767f8SToomas Soome if (i == ii) 937199767f8SToomas Soome continue; 938199767f8SToomas Soome 939199767f8SToomas Soome ASSERT3U(rows[ii][missing[i]], !=, 0); 940199767f8SToomas Soome 941199767f8SToomas Soome log = vdev_raidz_log2[rows[ii][missing[i]]]; 942199767f8SToomas Soome 943199767f8SToomas Soome for (j = 0; j < n; j++) { 944199767f8SToomas Soome rows[ii][j] ^= 945199767f8SToomas Soome vdev_raidz_exp2(rows[i][j], log); 946199767f8SToomas Soome invrows[ii][j] ^= 947199767f8SToomas Soome vdev_raidz_exp2(invrows[i][j], log); 948199767f8SToomas Soome } 949199767f8SToomas Soome } 950199767f8SToomas Soome } 951199767f8SToomas Soome 952199767f8SToomas Soome /* 953199767f8SToomas Soome * Verify that the data that is left in the rows are properly part of 954199767f8SToomas Soome * an identity matrix. 
955199767f8SToomas Soome */ 956199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 957199767f8SToomas Soome for (j = 0; j < n; j++) { 958199767f8SToomas Soome if (j == missing[i]) { 959199767f8SToomas Soome ASSERT3U(rows[i][j], ==, 1); 960199767f8SToomas Soome } else { 961199767f8SToomas Soome ASSERT3U(rows[i][j], ==, 0); 962199767f8SToomas Soome } 963199767f8SToomas Soome } 964199767f8SToomas Soome } 965199767f8SToomas Soome } 966199767f8SToomas Soome 967199767f8SToomas Soome static void 968199767f8SToomas Soome vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 969199767f8SToomas Soome int *missing, uint8_t **invrows, const uint8_t *used) 970199767f8SToomas Soome { 971199767f8SToomas Soome int i, j, x, cc, c; 972199767f8SToomas Soome uint8_t *src; 973199767f8SToomas Soome uint64_t ccount; 974199767f8SToomas Soome uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 975199767f8SToomas Soome uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 976199767f8SToomas Soome uint8_t log, val; 977199767f8SToomas Soome int ll; 978199767f8SToomas Soome uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 979199767f8SToomas Soome uint8_t *p, *pp; 980199767f8SToomas Soome size_t psize; 981199767f8SToomas Soome 982199767f8SToomas Soome log = 0; /* gcc */ 983199767f8SToomas Soome psize = sizeof (invlog[0][0]) * n * nmissing; 984199767f8SToomas Soome p = zfs_alloc(psize); 985199767f8SToomas Soome 986199767f8SToomas Soome for (pp = p, i = 0; i < nmissing; i++) { 987199767f8SToomas Soome invlog[i] = pp; 988199767f8SToomas Soome pp += n; 989199767f8SToomas Soome } 990199767f8SToomas Soome 991199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 992199767f8SToomas Soome for (j = 0; j < n; j++) { 993199767f8SToomas Soome ASSERT3U(invrows[i][j], !=, 0); 994199767f8SToomas Soome invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 995199767f8SToomas Soome } 996199767f8SToomas Soome } 997199767f8SToomas Soome 998199767f8SToomas Soome for (i = 0; i < n; i++) { 999199767f8SToomas Soome c = used[i]; 1000199767f8SToomas 
Soome ASSERT3U(c, <, rm->rm_cols); 1001199767f8SToomas Soome 1002199767f8SToomas Soome src = rm->rm_col[c].rc_data; 1003199767f8SToomas Soome ccount = rm->rm_col[c].rc_size; 1004199767f8SToomas Soome for (j = 0; j < nmissing; j++) { 1005199767f8SToomas Soome cc = missing[j] + rm->rm_firstdatacol; 1006199767f8SToomas Soome ASSERT3U(cc, >=, rm->rm_firstdatacol); 1007199767f8SToomas Soome ASSERT3U(cc, <, rm->rm_cols); 1008199767f8SToomas Soome ASSERT3U(cc, !=, c); 1009199767f8SToomas Soome 1010199767f8SToomas Soome dst[j] = rm->rm_col[cc].rc_data; 1011199767f8SToomas Soome dcount[j] = rm->rm_col[cc].rc_size; 1012199767f8SToomas Soome } 1013199767f8SToomas Soome 1014199767f8SToomas Soome ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1015199767f8SToomas Soome 1016199767f8SToomas Soome for (x = 0; x < ccount; x++, src++) { 1017199767f8SToomas Soome if (*src != 0) 1018199767f8SToomas Soome log = vdev_raidz_log2[*src]; 1019199767f8SToomas Soome 1020199767f8SToomas Soome for (cc = 0; cc < nmissing; cc++) { 1021199767f8SToomas Soome if (x >= dcount[cc]) 1022199767f8SToomas Soome continue; 1023199767f8SToomas Soome 1024199767f8SToomas Soome if (*src == 0) { 1025199767f8SToomas Soome val = 0; 1026199767f8SToomas Soome } else { 1027199767f8SToomas Soome if ((ll = log + invlog[cc][i]) >= 255) 1028199767f8SToomas Soome ll -= 255; 1029199767f8SToomas Soome val = vdev_raidz_pow2[ll]; 1030199767f8SToomas Soome } 1031199767f8SToomas Soome 1032199767f8SToomas Soome if (i == 0) 1033199767f8SToomas Soome dst[cc][x] = val; 1034199767f8SToomas Soome else 1035199767f8SToomas Soome dst[cc][x] ^= val; 1036199767f8SToomas Soome } 1037199767f8SToomas Soome } 1038199767f8SToomas Soome } 1039199767f8SToomas Soome 1040199767f8SToomas Soome zfs_free(p, psize); 1041199767f8SToomas Soome } 1042199767f8SToomas Soome 1043199767f8SToomas Soome static int 1044199767f8SToomas Soome vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1045199767f8SToomas Soome { 
1046199767f8SToomas Soome int n, i, c, t, tt; 1047199767f8SToomas Soome int nmissing_rows; 1048199767f8SToomas Soome int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1049199767f8SToomas Soome int parity_map[VDEV_RAIDZ_MAXPARITY]; 1050199767f8SToomas Soome 1051199767f8SToomas Soome uint8_t *p, *pp; 1052199767f8SToomas Soome size_t psize; 1053199767f8SToomas Soome 1054199767f8SToomas Soome uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1055199767f8SToomas Soome uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1056199767f8SToomas Soome uint8_t *used; 1057199767f8SToomas Soome 1058199767f8SToomas Soome int code = 0; 1059199767f8SToomas Soome 1060199767f8SToomas Soome 1061199767f8SToomas Soome n = rm->rm_cols - rm->rm_firstdatacol; 1062199767f8SToomas Soome 1063199767f8SToomas Soome /* 1064199767f8SToomas Soome * Figure out which data columns are missing. 1065199767f8SToomas Soome */ 1066199767f8SToomas Soome nmissing_rows = 0; 1067199767f8SToomas Soome for (t = 0; t < ntgts; t++) { 1068199767f8SToomas Soome if (tgts[t] >= rm->rm_firstdatacol) { 1069199767f8SToomas Soome missing_rows[nmissing_rows++] = 1070199767f8SToomas Soome tgts[t] - rm->rm_firstdatacol; 1071199767f8SToomas Soome } 1072199767f8SToomas Soome } 1073199767f8SToomas Soome 1074199767f8SToomas Soome /* 1075199767f8SToomas Soome * Figure out which parity columns to use to help generate the missing 1076199767f8SToomas Soome * data columns. 1077199767f8SToomas Soome */ 1078199767f8SToomas Soome for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1079199767f8SToomas Soome ASSERT(tt < ntgts); 1080199767f8SToomas Soome ASSERT(c < rm->rm_firstdatacol); 1081199767f8SToomas Soome 1082199767f8SToomas Soome /* 1083199767f8SToomas Soome * Skip any targeted parity columns. 
1084199767f8SToomas Soome */ 1085199767f8SToomas Soome if (c == tgts[tt]) { 1086199767f8SToomas Soome tt++; 1087199767f8SToomas Soome continue; 1088199767f8SToomas Soome } 1089199767f8SToomas Soome 1090199767f8SToomas Soome code |= 1 << c; 1091199767f8SToomas Soome 1092199767f8SToomas Soome parity_map[i] = c; 1093199767f8SToomas Soome i++; 1094199767f8SToomas Soome } 1095199767f8SToomas Soome 1096199767f8SToomas Soome ASSERT(code != 0); 1097199767f8SToomas Soome ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1098199767f8SToomas Soome 1099199767f8SToomas Soome psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1100199767f8SToomas Soome nmissing_rows * n + sizeof (used[0]) * n; 1101199767f8SToomas Soome p = kmem_alloc(psize, KM_SLEEP); 1102199767f8SToomas Soome 1103199767f8SToomas Soome for (pp = p, i = 0; i < nmissing_rows; i++) { 1104199767f8SToomas Soome rows[i] = pp; 1105199767f8SToomas Soome pp += n; 1106199767f8SToomas Soome invrows[i] = pp; 1107199767f8SToomas Soome pp += n; 1108199767f8SToomas Soome } 1109199767f8SToomas Soome used = pp; 1110199767f8SToomas Soome 1111199767f8SToomas Soome for (i = 0; i < nmissing_rows; i++) { 1112199767f8SToomas Soome used[i] = parity_map[i]; 1113199767f8SToomas Soome } 1114199767f8SToomas Soome 1115199767f8SToomas Soome for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1116199767f8SToomas Soome if (tt < nmissing_rows && 1117199767f8SToomas Soome c == missing_rows[tt] + rm->rm_firstdatacol) { 1118199767f8SToomas Soome tt++; 1119199767f8SToomas Soome continue; 1120199767f8SToomas Soome } 1121199767f8SToomas Soome 1122199767f8SToomas Soome ASSERT3S(i, <, n); 1123199767f8SToomas Soome used[i] = c; 1124199767f8SToomas Soome i++; 1125199767f8SToomas Soome } 1126199767f8SToomas Soome 1127199767f8SToomas Soome /* 1128199767f8SToomas Soome * Initialize the interesting rows of the matrix. 
1129199767f8SToomas Soome */ 1130199767f8SToomas Soome vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1131199767f8SToomas Soome 1132199767f8SToomas Soome /* 1133199767f8SToomas Soome * Invert the matrix. 1134199767f8SToomas Soome */ 1135199767f8SToomas Soome vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1136199767f8SToomas Soome invrows, used); 1137199767f8SToomas Soome 1138199767f8SToomas Soome /* 1139199767f8SToomas Soome * Reconstruct the missing data using the generated matrix. 1140199767f8SToomas Soome */ 1141199767f8SToomas Soome vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1142199767f8SToomas Soome invrows, used); 1143199767f8SToomas Soome 1144199767f8SToomas Soome kmem_free(p, psize); 1145199767f8SToomas Soome 1146199767f8SToomas Soome return (code); 1147199767f8SToomas Soome } 1148199767f8SToomas Soome 1149199767f8SToomas Soome static int 1150199767f8SToomas Soome vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1151199767f8SToomas Soome { 1152199767f8SToomas Soome int tgts[VDEV_RAIDZ_MAXPARITY]; 1153199767f8SToomas Soome int ntgts; 1154199767f8SToomas Soome int i, c; 1155199767f8SToomas Soome int code; 1156199767f8SToomas Soome int nbadparity, nbaddata; 1157199767f8SToomas Soome 1158199767f8SToomas Soome /* 1159199767f8SToomas Soome * The tgts list must already be sorted. 
1160199767f8SToomas Soome */ 1161199767f8SToomas Soome for (i = 1; i < nt; i++) { 1162199767f8SToomas Soome ASSERT(t[i] > t[i - 1]); 1163199767f8SToomas Soome } 1164199767f8SToomas Soome 1165199767f8SToomas Soome nbadparity = rm->rm_firstdatacol; 1166199767f8SToomas Soome nbaddata = rm->rm_cols - nbadparity; 1167199767f8SToomas Soome ntgts = 0; 1168199767f8SToomas Soome for (i = 0, c = 0; c < rm->rm_cols; c++) { 1169199767f8SToomas Soome if (i < nt && c == t[i]) { 1170199767f8SToomas Soome tgts[ntgts++] = c; 1171199767f8SToomas Soome i++; 1172199767f8SToomas Soome } else if (rm->rm_col[c].rc_error != 0) { 1173199767f8SToomas Soome tgts[ntgts++] = c; 1174199767f8SToomas Soome } else if (c >= rm->rm_firstdatacol) { 1175199767f8SToomas Soome nbaddata--; 1176199767f8SToomas Soome } else { 1177199767f8SToomas Soome nbadparity--; 1178199767f8SToomas Soome } 1179199767f8SToomas Soome } 1180199767f8SToomas Soome 1181199767f8SToomas Soome ASSERT(ntgts >= nt); 1182199767f8SToomas Soome ASSERT(nbaddata >= 0); 1183199767f8SToomas Soome ASSERT(nbaddata + nbadparity == ntgts); 1184199767f8SToomas Soome 1185199767f8SToomas Soome code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1186199767f8SToomas Soome ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1187199767f8SToomas Soome ASSERT(code > 0); 1188199767f8SToomas Soome return (code); 1189199767f8SToomas Soome } 1190199767f8SToomas Soome 1191199767f8SToomas Soome static raidz_map_t * 1192199767f8SToomas Soome vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1193199767f8SToomas Soome uint64_t dcols, uint64_t nparity) 1194199767f8SToomas Soome { 1195199767f8SToomas Soome raidz_map_t *rm; 1196199767f8SToomas Soome uint64_t b = offset >> unit_shift; 1197199767f8SToomas Soome uint64_t s = size >> unit_shift; 1198199767f8SToomas Soome uint64_t f = b % dcols; 1199199767f8SToomas Soome uint64_t o = (b / dcols) << unit_shift; 1200199767f8SToomas Soome uint64_t q, r, c, bc, col, acols, scols, coff, devidx, 
asize, tot; 1201199767f8SToomas Soome 1202199767f8SToomas Soome q = s / (dcols - nparity); 1203199767f8SToomas Soome r = s - q * (dcols - nparity); 1204199767f8SToomas Soome bc = (r == 0 ? 0 : r + nparity); 1205199767f8SToomas Soome tot = s + nparity * (q + (r == 0 ? 0 : 1)); 1206199767f8SToomas Soome 1207199767f8SToomas Soome if (q == 0) { 1208199767f8SToomas Soome acols = bc; 1209199767f8SToomas Soome scols = MIN(dcols, roundup(bc, nparity + 1)); 1210199767f8SToomas Soome } else { 1211199767f8SToomas Soome acols = dcols; 1212199767f8SToomas Soome scols = dcols; 1213199767f8SToomas Soome } 1214199767f8SToomas Soome 1215199767f8SToomas Soome ASSERT3U(acols, <=, scols); 1216199767f8SToomas Soome 1217199767f8SToomas Soome rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols])); 1218199767f8SToomas Soome 1219199767f8SToomas Soome rm->rm_cols = acols; 1220199767f8SToomas Soome rm->rm_scols = scols; 1221199767f8SToomas Soome rm->rm_bigcols = bc; 1222199767f8SToomas Soome rm->rm_skipstart = bc; 1223199767f8SToomas Soome rm->rm_missingdata = 0; 1224199767f8SToomas Soome rm->rm_missingparity = 0; 1225199767f8SToomas Soome rm->rm_firstdatacol = nparity; 1226199767f8SToomas Soome rm->rm_reports = 0; 1227199767f8SToomas Soome rm->rm_freed = 0; 1228199767f8SToomas Soome rm->rm_ecksuminjected = 0; 1229199767f8SToomas Soome 1230199767f8SToomas Soome asize = 0; 1231199767f8SToomas Soome 1232199767f8SToomas Soome for (c = 0; c < scols; c++) { 1233199767f8SToomas Soome col = f + c; 1234199767f8SToomas Soome coff = o; 1235199767f8SToomas Soome if (col >= dcols) { 1236199767f8SToomas Soome col -= dcols; 1237199767f8SToomas Soome coff += 1ULL << unit_shift; 1238199767f8SToomas Soome } 1239199767f8SToomas Soome rm->rm_col[c].rc_devidx = col; 1240199767f8SToomas Soome rm->rm_col[c].rc_offset = coff; 1241199767f8SToomas Soome rm->rm_col[c].rc_data = NULL; 1242199767f8SToomas Soome rm->rm_col[c].rc_error = 0; 1243199767f8SToomas Soome rm->rm_col[c].rc_tried = 0; 1244199767f8SToomas Soome 
rm->rm_col[c].rc_skipped = 0; 1245199767f8SToomas Soome 1246199767f8SToomas Soome if (c >= acols) 1247199767f8SToomas Soome rm->rm_col[c].rc_size = 0; 1248199767f8SToomas Soome else if (c < bc) 1249199767f8SToomas Soome rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1250199767f8SToomas Soome else 1251199767f8SToomas Soome rm->rm_col[c].rc_size = q << unit_shift; 1252199767f8SToomas Soome 1253199767f8SToomas Soome asize += rm->rm_col[c].rc_size; 1254199767f8SToomas Soome } 1255199767f8SToomas Soome 1256199767f8SToomas Soome ASSERT3U(asize, ==, tot << unit_shift); 1257199767f8SToomas Soome rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1258199767f8SToomas Soome rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1259199767f8SToomas Soome ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1260199767f8SToomas Soome ASSERT3U(rm->rm_nskip, <=, nparity); 1261199767f8SToomas Soome 1262199767f8SToomas Soome for (c = 0; c < rm->rm_firstdatacol; c++) 1263199767f8SToomas Soome rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size); 1264199767f8SToomas Soome 1265199767f8SToomas Soome rm->rm_col[c].rc_data = data; 1266199767f8SToomas Soome 1267199767f8SToomas Soome for (c = c + 1; c < acols; c++) 1268199767f8SToomas Soome rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1269199767f8SToomas Soome rm->rm_col[c - 1].rc_size; 1270199767f8SToomas Soome 1271199767f8SToomas Soome /* 1272199767f8SToomas Soome * If all data stored spans all columns, there's a danger that parity 1273199767f8SToomas Soome * will always be on the same device and, since parity isn't read 1274199767f8SToomas Soome * during normal operation, that that device's I/O bandwidth won't be 1275199767f8SToomas Soome * used effectively. We therefore switch the parity every 1MB. 1276199767f8SToomas Soome * 1277199767f8SToomas Soome * ... at least that was, ostensibly, the theory. 
As a practical 1278199767f8SToomas Soome * matter unless we juggle the parity between all devices evenly, we 1279199767f8SToomas Soome * won't see any benefit. Further, occasional writes that aren't a 1280199767f8SToomas Soome * multiple of the LCM of the number of children and the minimum 1281199767f8SToomas Soome * stripe width are sufficient to avoid pessimal behavior. 1282199767f8SToomas Soome * Unfortunately, this decision created an implicit on-disk format 1283199767f8SToomas Soome * requirement that we need to support for all eternity, but only 1284199767f8SToomas Soome * for single-parity RAID-Z. 1285199767f8SToomas Soome * 1286199767f8SToomas Soome * If we intend to skip a sector in the zeroth column for padding 1287199767f8SToomas Soome * we must make sure to note this swap. We will never intend to 1288199767f8SToomas Soome * skip the first column since at least one data and one parity 1289199767f8SToomas Soome * column must appear in each row. 1290199767f8SToomas Soome */ 1291199767f8SToomas Soome ASSERT(rm->rm_cols >= 2); 1292199767f8SToomas Soome ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1293199767f8SToomas Soome 1294199767f8SToomas Soome if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1295199767f8SToomas Soome devidx = rm->rm_col[0].rc_devidx; 1296199767f8SToomas Soome o = rm->rm_col[0].rc_offset; 1297199767f8SToomas Soome rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1298199767f8SToomas Soome rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1299199767f8SToomas Soome rm->rm_col[1].rc_devidx = devidx; 1300199767f8SToomas Soome rm->rm_col[1].rc_offset = o; 1301199767f8SToomas Soome 1302199767f8SToomas Soome if (rm->rm_skipstart == 0) 1303199767f8SToomas Soome rm->rm_skipstart = 1; 1304199767f8SToomas Soome } 1305199767f8SToomas Soome 1306199767f8SToomas Soome return (rm); 1307199767f8SToomas Soome } 1308199767f8SToomas Soome 1309199767f8SToomas Soome static void 1310199767f8SToomas Soome vdev_raidz_map_free(raidz_map_t 
*rm) 1311199767f8SToomas Soome { 1312199767f8SToomas Soome int c; 1313199767f8SToomas Soome 1314199767f8SToomas Soome for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1315199767f8SToomas Soome zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 1316199767f8SToomas Soome 1317199767f8SToomas Soome zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 1318199767f8SToomas Soome } 1319199767f8SToomas Soome 1320199767f8SToomas Soome static vdev_t * 1321199767f8SToomas Soome vdev_child(vdev_t *pvd, uint64_t devidx) 1322199767f8SToomas Soome { 1323199767f8SToomas Soome vdev_t *cvd; 1324199767f8SToomas Soome 1325199767f8SToomas Soome STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1326199767f8SToomas Soome if (cvd->v_id == devidx) 1327199767f8SToomas Soome break; 1328199767f8SToomas Soome } 1329199767f8SToomas Soome 1330199767f8SToomas Soome return (cvd); 1331199767f8SToomas Soome } 1332199767f8SToomas Soome 1333199767f8SToomas Soome /* 1334199767f8SToomas Soome * We keep track of whether or not there were any injected errors, so that 1335199767f8SToomas Soome * any ereports we generate can note it. 1336199767f8SToomas Soome */ 1337199767f8SToomas Soome static int 13384a04e8dbSToomas Soome raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data, 1339*8eef2ab6SToomas Soome uint64_t size __unused) 1340199767f8SToomas Soome { 1341199767f8SToomas Soome 13424a04e8dbSToomas Soome return (zio_checksum_verify(spa, bp, data)); 1343199767f8SToomas Soome } 1344199767f8SToomas Soome 1345199767f8SToomas Soome /* 1346199767f8SToomas Soome * Generate the parity from the data columns. If we tried and were able to 1347199767f8SToomas Soome * read the parity without error, verify that the generated parity matches the 1348199767f8SToomas Soome * data we read. If it doesn't, we fire off a checksum error. Return the 1349199767f8SToomas Soome * number such failures. 
1350199767f8SToomas Soome */ 1351199767f8SToomas Soome static int 1352199767f8SToomas Soome raidz_parity_verify(raidz_map_t *rm) 1353199767f8SToomas Soome { 1354199767f8SToomas Soome void *orig[VDEV_RAIDZ_MAXPARITY]; 1355199767f8SToomas Soome int c, ret = 0; 1356199767f8SToomas Soome raidz_col_t *rc; 1357199767f8SToomas Soome 1358199767f8SToomas Soome for (c = 0; c < rm->rm_firstdatacol; c++) { 1359199767f8SToomas Soome rc = &rm->rm_col[c]; 1360199767f8SToomas Soome if (!rc->rc_tried || rc->rc_error != 0) 1361199767f8SToomas Soome continue; 1362199767f8SToomas Soome orig[c] = zfs_alloc(rc->rc_size); 1363199767f8SToomas Soome bcopy(rc->rc_data, orig[c], rc->rc_size); 1364199767f8SToomas Soome } 1365199767f8SToomas Soome 1366199767f8SToomas Soome vdev_raidz_generate_parity(rm); 1367199767f8SToomas Soome 1368199767f8SToomas Soome for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { 1369199767f8SToomas Soome rc = &rm->rm_col[c]; 1370199767f8SToomas Soome if (!rc->rc_tried || rc->rc_error != 0) 1371199767f8SToomas Soome continue; 1372199767f8SToomas Soome if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1373199767f8SToomas Soome rc->rc_error = ECKSUM; 1374199767f8SToomas Soome ret++; 1375199767f8SToomas Soome } 1376199767f8SToomas Soome zfs_free(orig[c], rc->rc_size); 1377199767f8SToomas Soome } 1378199767f8SToomas Soome 1379199767f8SToomas Soome return (ret); 1380199767f8SToomas Soome } 1381199767f8SToomas Soome 1382199767f8SToomas Soome /* 1383199767f8SToomas Soome * Iterate over all combinations of bad data and attempt a reconstruction. 1384199767f8SToomas Soome * Note that the algorithm below is non-optimal because it doesn't take into 1385199767f8SToomas Soome * account how reconstruction is actually performed. 
For example, with 1386199767f8SToomas Soome * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1387199767f8SToomas Soome * is targeted as invalid as if columns 1 and 4 are targeted since in both 1388199767f8SToomas Soome * cases we'd only use parity information in column 0. 1389199767f8SToomas Soome */ 1390199767f8SToomas Soome static int 13914a04e8dbSToomas Soome vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, 1392*8eef2ab6SToomas Soome void *data, off_t offset __unused, uint64_t bytes, int total_errors, 1393*8eef2ab6SToomas Soome int data_errors) 1394199767f8SToomas Soome { 1395199767f8SToomas Soome raidz_col_t *rc; 1396199767f8SToomas Soome void *orig[VDEV_RAIDZ_MAXPARITY]; 1397199767f8SToomas Soome int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1398199767f8SToomas Soome int *tgts = &tstore[1]; 1399199767f8SToomas Soome int current, next, i, c, n; 1400199767f8SToomas Soome int code, ret = 0; 1401199767f8SToomas Soome 1402199767f8SToomas Soome ASSERT(total_errors < rm->rm_firstdatacol); 1403199767f8SToomas Soome 1404199767f8SToomas Soome /* 1405199767f8SToomas Soome * This simplifies one edge condition. 1406199767f8SToomas Soome */ 1407199767f8SToomas Soome tgts[-1] = -1; 1408199767f8SToomas Soome 1409199767f8SToomas Soome for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1410199767f8SToomas Soome /* 1411199767f8SToomas Soome * Initialize the targets array by finding the first n columns 1412199767f8SToomas Soome * that contain no error. 1413199767f8SToomas Soome * 1414199767f8SToomas Soome * If there were no data errors, we need to ensure that we're 1415199767f8SToomas Soome * always explicitly attempting to reconstruct at least one 1416199767f8SToomas Soome * data column. To do this, we simply push the highest target 1417199767f8SToomas Soome * up into the data columns. 
1418199767f8SToomas Soome */ 1419199767f8SToomas Soome for (c = 0, i = 0; i < n; i++) { 1420199767f8SToomas Soome if (i == n - 1 && data_errors == 0 && 1421199767f8SToomas Soome c < rm->rm_firstdatacol) { 1422199767f8SToomas Soome c = rm->rm_firstdatacol; 1423199767f8SToomas Soome } 1424199767f8SToomas Soome 1425199767f8SToomas Soome while (rm->rm_col[c].rc_error != 0) { 1426199767f8SToomas Soome c++; 1427199767f8SToomas Soome ASSERT3S(c, <, rm->rm_cols); 1428199767f8SToomas Soome } 1429199767f8SToomas Soome 1430199767f8SToomas Soome tgts[i] = c++; 1431199767f8SToomas Soome } 1432199767f8SToomas Soome 1433199767f8SToomas Soome /* 1434199767f8SToomas Soome * Setting tgts[n] simplifies the other edge condition. 1435199767f8SToomas Soome */ 1436199767f8SToomas Soome tgts[n] = rm->rm_cols; 1437199767f8SToomas Soome 1438199767f8SToomas Soome /* 1439199767f8SToomas Soome * These buffers were allocated in previous iterations. 1440199767f8SToomas Soome */ 1441199767f8SToomas Soome for (i = 0; i < n - 1; i++) { 1442199767f8SToomas Soome ASSERT(orig[i] != NULL); 1443199767f8SToomas Soome } 1444199767f8SToomas Soome 1445199767f8SToomas Soome orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size); 1446199767f8SToomas Soome 1447199767f8SToomas Soome current = 0; 1448199767f8SToomas Soome next = tgts[current]; 1449199767f8SToomas Soome 1450199767f8SToomas Soome while (current != n) { 1451199767f8SToomas Soome tgts[current] = next; 1452199767f8SToomas Soome current = 0; 1453199767f8SToomas Soome 1454199767f8SToomas Soome /* 1455199767f8SToomas Soome * Save off the original data that we're going to 1456199767f8SToomas Soome * attempt to reconstruct. 
1457199767f8SToomas Soome */ 1458199767f8SToomas Soome for (i = 0; i < n; i++) { 1459199767f8SToomas Soome ASSERT(orig[i] != NULL); 1460199767f8SToomas Soome c = tgts[i]; 1461199767f8SToomas Soome ASSERT3S(c, >=, 0); 1462199767f8SToomas Soome ASSERT3S(c, <, rm->rm_cols); 1463199767f8SToomas Soome rc = &rm->rm_col[c]; 1464199767f8SToomas Soome bcopy(rc->rc_data, orig[i], rc->rc_size); 1465199767f8SToomas Soome } 1466199767f8SToomas Soome 1467199767f8SToomas Soome /* 1468199767f8SToomas Soome * Attempt a reconstruction and exit the outer loop on 1469199767f8SToomas Soome * success. 1470199767f8SToomas Soome */ 1471199767f8SToomas Soome code = vdev_raidz_reconstruct(rm, tgts, n); 14724a04e8dbSToomas Soome if (raidz_checksum_verify(spa, bp, data, bytes) == 0) { 1473199767f8SToomas Soome for (i = 0; i < n; i++) { 1474199767f8SToomas Soome c = tgts[i]; 1475199767f8SToomas Soome rc = &rm->rm_col[c]; 1476199767f8SToomas Soome ASSERT(rc->rc_error == 0); 1477199767f8SToomas Soome rc->rc_error = ECKSUM; 1478199767f8SToomas Soome } 1479199767f8SToomas Soome 1480199767f8SToomas Soome ret = code; 1481199767f8SToomas Soome goto done; 1482199767f8SToomas Soome } 1483199767f8SToomas Soome 1484199767f8SToomas Soome /* 1485199767f8SToomas Soome * Restore the original data. 1486199767f8SToomas Soome */ 1487199767f8SToomas Soome for (i = 0; i < n; i++) { 1488199767f8SToomas Soome c = tgts[i]; 1489199767f8SToomas Soome rc = &rm->rm_col[c]; 1490199767f8SToomas Soome bcopy(orig[i], rc->rc_data, rc->rc_size); 1491199767f8SToomas Soome } 1492199767f8SToomas Soome 1493199767f8SToomas Soome do { 1494199767f8SToomas Soome /* 1495199767f8SToomas Soome * Find the next valid column after the current 1496199767f8SToomas Soome * position.. 
1497199767f8SToomas Soome */ 1498199767f8SToomas Soome for (next = tgts[current] + 1; 1499199767f8SToomas Soome next < rm->rm_cols && 1500199767f8SToomas Soome rm->rm_col[next].rc_error != 0; next++) 1501199767f8SToomas Soome continue; 1502199767f8SToomas Soome 1503199767f8SToomas Soome ASSERT(next <= tgts[current + 1]); 1504199767f8SToomas Soome 1505199767f8SToomas Soome /* 1506199767f8SToomas Soome * If that spot is available, we're done here. 1507199767f8SToomas Soome */ 1508199767f8SToomas Soome if (next != tgts[current + 1]) 1509199767f8SToomas Soome break; 1510199767f8SToomas Soome 1511199767f8SToomas Soome /* 1512199767f8SToomas Soome * Otherwise, find the next valid column after 1513199767f8SToomas Soome * the previous position. 1514199767f8SToomas Soome */ 1515199767f8SToomas Soome for (c = tgts[current - 1] + 1; 1516199767f8SToomas Soome rm->rm_col[c].rc_error != 0; c++) 1517199767f8SToomas Soome continue; 1518199767f8SToomas Soome 1519199767f8SToomas Soome tgts[current] = c; 1520199767f8SToomas Soome current++; 1521199767f8SToomas Soome 1522199767f8SToomas Soome } while (current != n); 1523199767f8SToomas Soome } 1524199767f8SToomas Soome } 1525199767f8SToomas Soome n--; 1526199767f8SToomas Soome done: 1527199767f8SToomas Soome for (i = n - 1; i >= 0; i--) { 1528199767f8SToomas Soome zfs_free(orig[i], rm->rm_col[0].rc_size); 1529199767f8SToomas Soome } 1530199767f8SToomas Soome 1531199767f8SToomas Soome return (ret); 1532199767f8SToomas Soome } 1533199767f8SToomas Soome 1534199767f8SToomas Soome static int 1535199767f8SToomas Soome vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1536199767f8SToomas Soome off_t offset, size_t bytes) 1537199767f8SToomas Soome { 1538199767f8SToomas Soome vdev_t *tvd = vd->v_top; 1539199767f8SToomas Soome vdev_t *cvd; 1540199767f8SToomas Soome raidz_map_t *rm; 1541199767f8SToomas Soome raidz_col_t *rc; 1542199767f8SToomas Soome int c, error; 1543199767f8SToomas Soome int unexpected_errors; 1544199767f8SToomas 
Soome int parity_errors; 1545199767f8SToomas Soome int parity_untried; 1546199767f8SToomas Soome int data_errors; 1547199767f8SToomas Soome int total_errors; 1548199767f8SToomas Soome int n; 1549199767f8SToomas Soome int tgts[VDEV_RAIDZ_MAXPARITY]; 1550199767f8SToomas Soome int code; 1551199767f8SToomas Soome 1552199767f8SToomas Soome rc = NULL; /* gcc */ 1553199767f8SToomas Soome error = 0; 1554199767f8SToomas Soome 1555199767f8SToomas Soome rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1556199767f8SToomas Soome vd->v_nchildren, vd->v_nparity); 1557199767f8SToomas Soome 1558199767f8SToomas Soome /* 1559199767f8SToomas Soome * Iterate over the columns in reverse order so that we hit the parity 1560199767f8SToomas Soome * last -- any errors along the way will force us to read the parity. 1561199767f8SToomas Soome */ 1562199767f8SToomas Soome for (c = rm->rm_cols - 1; c >= 0; c--) { 1563199767f8SToomas Soome rc = &rm->rm_col[c]; 1564199767f8SToomas Soome cvd = vdev_child(vd, rc->rc_devidx); 1565199767f8SToomas Soome if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1566199767f8SToomas Soome if (c >= rm->rm_firstdatacol) 1567199767f8SToomas Soome rm->rm_missingdata++; 1568199767f8SToomas Soome else 1569199767f8SToomas Soome rm->rm_missingparity++; 1570199767f8SToomas Soome rc->rc_error = ENXIO; 1571199767f8SToomas Soome rc->rc_tried = 1; /* don't even try */ 1572199767f8SToomas Soome rc->rc_skipped = 1; 1573199767f8SToomas Soome continue; 1574199767f8SToomas Soome } 1575199767f8SToomas Soome #if 0 /* XXX: Too hard for the boot code. 
*/ 1576199767f8SToomas Soome if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1577199767f8SToomas Soome if (c >= rm->rm_firstdatacol) 1578199767f8SToomas Soome rm->rm_missingdata++; 1579199767f8SToomas Soome else 1580199767f8SToomas Soome rm->rm_missingparity++; 1581199767f8SToomas Soome rc->rc_error = ESTALE; 1582199767f8SToomas Soome rc->rc_skipped = 1; 1583199767f8SToomas Soome continue; 1584199767f8SToomas Soome } 1585199767f8SToomas Soome #endif 1586199767f8SToomas Soome if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1587199767f8SToomas Soome rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1588199767f8SToomas Soome rc->rc_offset, rc->rc_size); 1589199767f8SToomas Soome rc->rc_tried = 1; 1590199767f8SToomas Soome rc->rc_skipped = 0; 1591199767f8SToomas Soome } 1592199767f8SToomas Soome } 1593199767f8SToomas Soome 1594199767f8SToomas Soome reconstruct: 1595199767f8SToomas Soome unexpected_errors = 0; 1596199767f8SToomas Soome parity_errors = 0; 1597199767f8SToomas Soome parity_untried = 0; 1598199767f8SToomas Soome data_errors = 0; 1599199767f8SToomas Soome total_errors = 0; 1600199767f8SToomas Soome 1601199767f8SToomas Soome ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1602199767f8SToomas Soome ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1603199767f8SToomas Soome 1604199767f8SToomas Soome for (c = 0; c < rm->rm_cols; c++) { 1605199767f8SToomas Soome rc = &rm->rm_col[c]; 1606199767f8SToomas Soome 1607199767f8SToomas Soome if (rc->rc_error) { 1608199767f8SToomas Soome ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1609199767f8SToomas Soome 1610199767f8SToomas Soome if (c < rm->rm_firstdatacol) 1611199767f8SToomas Soome parity_errors++; 1612199767f8SToomas Soome else 1613199767f8SToomas Soome data_errors++; 1614199767f8SToomas Soome 1615199767f8SToomas Soome if (!rc->rc_skipped) 1616199767f8SToomas Soome unexpected_errors++; 1617199767f8SToomas Soome 1618199767f8SToomas Soome total_errors++; 
1619199767f8SToomas Soome } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1620199767f8SToomas Soome parity_untried++; 1621199767f8SToomas Soome } 1622199767f8SToomas Soome } 1623199767f8SToomas Soome 1624199767f8SToomas Soome /* 1625199767f8SToomas Soome * There are three potential phases for a read: 1626199767f8SToomas Soome * 1. produce valid data from the columns read 1627199767f8SToomas Soome * 2. read all disks and try again 1628199767f8SToomas Soome * 3. perform combinatorial reconstruction 1629199767f8SToomas Soome * 1630199767f8SToomas Soome * Each phase is progressively both more expensive and less likely to 1631199767f8SToomas Soome * occur. If we encounter more errors than we can repair or all phases 1632199767f8SToomas Soome * fail, we have no choice but to return an error. 1633199767f8SToomas Soome */ 1634199767f8SToomas Soome 1635199767f8SToomas Soome /* 1636199767f8SToomas Soome * If the number of errors we saw was correctable -- less than or equal 1637199767f8SToomas Soome * to the number of parity disks read -- attempt to produce data that 1638199767f8SToomas Soome * has a valid checksum. Naturally, this case applies in the absence of 1639199767f8SToomas Soome * any errors. 1640199767f8SToomas Soome */ 1641199767f8SToomas Soome if (total_errors <= rm->rm_firstdatacol - parity_untried) { 16427bbcfb41SToomas Soome int rv; 16437bbcfb41SToomas Soome 1644199767f8SToomas Soome if (data_errors == 0) { 16457bbcfb41SToomas Soome rv = raidz_checksum_verify(vd->spa, bp, data, bytes); 16467bbcfb41SToomas Soome if (rv == 0) { 1647199767f8SToomas Soome /* 1648199767f8SToomas Soome * If we read parity information (unnecessarily 1649199767f8SToomas Soome * as it happens since no reconstruction was 1650199767f8SToomas Soome * needed) regenerate and verify the parity. 1651199767f8SToomas Soome * We also regenerate parity when resilvering 1652199767f8SToomas Soome * so we can write it out to the failed device 1653199767f8SToomas Soome * later. 
1654199767f8SToomas Soome */ 1655199767f8SToomas Soome if (parity_errors + parity_untried < 1656199767f8SToomas Soome rm->rm_firstdatacol) { 1657199767f8SToomas Soome n = raidz_parity_verify(rm); 1658199767f8SToomas Soome unexpected_errors += n; 1659199767f8SToomas Soome ASSERT(parity_errors + n <= 1660199767f8SToomas Soome rm->rm_firstdatacol); 1661199767f8SToomas Soome } 1662199767f8SToomas Soome goto done; 1663199767f8SToomas Soome } 1664199767f8SToomas Soome } else { 1665199767f8SToomas Soome /* 1666199767f8SToomas Soome * We either attempt to read all the parity columns or 1667199767f8SToomas Soome * none of them. If we didn't try to read parity, we 1668199767f8SToomas Soome * wouldn't be here in the correctable case. There must 1669199767f8SToomas Soome * also have been fewer parity errors than parity 1670199767f8SToomas Soome * columns or, again, we wouldn't be in this code path. 1671199767f8SToomas Soome */ 1672199767f8SToomas Soome ASSERT(parity_untried == 0); 1673199767f8SToomas Soome ASSERT(parity_errors < rm->rm_firstdatacol); 1674199767f8SToomas Soome 1675199767f8SToomas Soome /* 1676199767f8SToomas Soome * Identify the data columns that reported an error. 
1677199767f8SToomas Soome */ 1678199767f8SToomas Soome n = 0; 1679199767f8SToomas Soome for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1680199767f8SToomas Soome rc = &rm->rm_col[c]; 1681199767f8SToomas Soome if (rc->rc_error != 0) { 1682199767f8SToomas Soome ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1683199767f8SToomas Soome tgts[n++] = c; 1684199767f8SToomas Soome } 1685199767f8SToomas Soome } 1686199767f8SToomas Soome 1687199767f8SToomas Soome ASSERT(rm->rm_firstdatacol >= n); 1688199767f8SToomas Soome 1689199767f8SToomas Soome code = vdev_raidz_reconstruct(rm, tgts, n); 1690199767f8SToomas Soome 16917bbcfb41SToomas Soome rv = raidz_checksum_verify(vd->spa, bp, data, bytes); 16927bbcfb41SToomas Soome if (rv == 0) { 1693199767f8SToomas Soome /* 1694199767f8SToomas Soome * If we read more parity disks than were used 1695199767f8SToomas Soome * for reconstruction, confirm that the other 1696199767f8SToomas Soome * parity disks produced correct data. This 1697199767f8SToomas Soome * routine is suboptimal in that it regenerates 1698199767f8SToomas Soome * the parity that we already used in addition 1699199767f8SToomas Soome * to the parity that we're attempting to 1700199767f8SToomas Soome * verify, but this should be a relatively 1701199767f8SToomas Soome * uncommon case, and can be optimized if it 1702199767f8SToomas Soome * becomes a problem. Note that we regenerate 1703199767f8SToomas Soome * parity when resilvering so we can write it 1704199767f8SToomas Soome * out to failed devices later. 
1705199767f8SToomas Soome */ 1706199767f8SToomas Soome if (parity_errors < rm->rm_firstdatacol - n) { 1707199767f8SToomas Soome n = raidz_parity_verify(rm); 1708199767f8SToomas Soome unexpected_errors += n; 1709199767f8SToomas Soome ASSERT(parity_errors + n <= 1710199767f8SToomas Soome rm->rm_firstdatacol); 1711199767f8SToomas Soome } 1712199767f8SToomas Soome 1713199767f8SToomas Soome goto done; 1714199767f8SToomas Soome } 1715199767f8SToomas Soome } 1716199767f8SToomas Soome } 1717199767f8SToomas Soome 1718199767f8SToomas Soome /* 1719199767f8SToomas Soome * This isn't a typical situation -- either we got a read 1720199767f8SToomas Soome * error or a child silently returned bad data. Read every 1721199767f8SToomas Soome * block so we can try again with as much data and parity as 1722199767f8SToomas Soome * we can track down. If we've already been through once 1723199767f8SToomas Soome * before, all children will be marked as tried so we'll 1724199767f8SToomas Soome * proceed to combinatorial reconstruction. 
1725199767f8SToomas Soome */ 1726199767f8SToomas Soome unexpected_errors = 1; 1727199767f8SToomas Soome rm->rm_missingdata = 0; 1728199767f8SToomas Soome rm->rm_missingparity = 0; 1729199767f8SToomas Soome 1730199767f8SToomas Soome n = 0; 1731199767f8SToomas Soome for (c = 0; c < rm->rm_cols; c++) { 1732199767f8SToomas Soome rc = &rm->rm_col[c]; 1733199767f8SToomas Soome 1734199767f8SToomas Soome if (rc->rc_tried) 1735199767f8SToomas Soome continue; 1736199767f8SToomas Soome 1737199767f8SToomas Soome cvd = vdev_child(vd, rc->rc_devidx); 1738199767f8SToomas Soome ASSERT(cvd != NULL); 1739199767f8SToomas Soome rc->rc_error = cvd->v_read(cvd, NULL, 1740199767f8SToomas Soome rc->rc_data, rc->rc_offset, rc->rc_size); 1741199767f8SToomas Soome if (rc->rc_error == 0) 1742199767f8SToomas Soome n++; 1743199767f8SToomas Soome rc->rc_tried = 1; 1744199767f8SToomas Soome rc->rc_skipped = 0; 1745199767f8SToomas Soome } 1746199767f8SToomas Soome /* 1747199767f8SToomas Soome * If we managed to read anything more, retry the 1748199767f8SToomas Soome * reconstruction. 1749199767f8SToomas Soome */ 1750199767f8SToomas Soome if (n > 0) 1751199767f8SToomas Soome goto reconstruct; 1752199767f8SToomas Soome 1753199767f8SToomas Soome /* 1754199767f8SToomas Soome * At this point we've attempted to reconstruct the data given the 1755199767f8SToomas Soome * errors we detected, and we've attempted to read all columns. There 1756199767f8SToomas Soome * must, therefore, be one or more additional problems -- silent errors 1757199767f8SToomas Soome * resulting in invalid data rather than explicit I/O errors resulting 1758199767f8SToomas Soome * in absent data. We check if there is enough additional data to 1759199767f8SToomas Soome * possibly reconstruct the data and then perform combinatorial 1760199767f8SToomas Soome * reconstruction over all possible combinations. If that fails, 1761199767f8SToomas Soome * we're cooked. 
1762199767f8SToomas Soome */ 1763199767f8SToomas Soome if (total_errors > rm->rm_firstdatacol) { 1764199767f8SToomas Soome error = EIO; 1765199767f8SToomas Soome } else if (total_errors < rm->rm_firstdatacol && 17664a04e8dbSToomas Soome (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes, 17677bbcfb41SToomas Soome total_errors, data_errors)) != 0) { 1768199767f8SToomas Soome /* 1769199767f8SToomas Soome * If we didn't use all the available parity for the 1770199767f8SToomas Soome * combinatorial reconstruction, verify that the remaining 1771199767f8SToomas Soome * parity is correct. 1772199767f8SToomas Soome */ 1773199767f8SToomas Soome if (code != (1 << rm->rm_firstdatacol) - 1) 1774199767f8SToomas Soome (void) raidz_parity_verify(rm); 1775199767f8SToomas Soome } else { 1776199767f8SToomas Soome /* 1777199767f8SToomas Soome * We're here because either: 1778199767f8SToomas Soome * 1779199767f8SToomas Soome * total_errors == rm_first_datacol, or 1780199767f8SToomas Soome * vdev_raidz_combrec() failed 1781199767f8SToomas Soome * 1782199767f8SToomas Soome * In either case, there is enough bad data to prevent 1783199767f8SToomas Soome * reconstruction. 1784199767f8SToomas Soome * 1785199767f8SToomas Soome * Start checksum ereports for all children which haven't 1786199767f8SToomas Soome * failed, and the IO wasn't speculative. 1787199767f8SToomas Soome */ 1788199767f8SToomas Soome error = ECKSUM; 1789199767f8SToomas Soome } 1790199767f8SToomas Soome 1791199767f8SToomas Soome done: 1792199767f8SToomas Soome vdev_raidz_map_free(rm); 1793199767f8SToomas Soome 1794199767f8SToomas Soome return (error); 1795199767f8SToomas Soome } 1796