1*199767f8SToomas Soome /* 2*199767f8SToomas Soome * CDDL HEADER START 3*199767f8SToomas Soome * 4*199767f8SToomas Soome * The contents of this file are subject to the terms of the 5*199767f8SToomas Soome * Common Development and Distribution License (the "License"). 6*199767f8SToomas Soome * You may not use this file except in compliance with the License. 7*199767f8SToomas Soome * 8*199767f8SToomas Soome * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*199767f8SToomas Soome * or http://www.opensolaris.org/os/licensing. 10*199767f8SToomas Soome * See the License for the specific language governing permissions 11*199767f8SToomas Soome * and limitations under the License. 12*199767f8SToomas Soome * 13*199767f8SToomas Soome * When distributing Covered Code, include this CDDL HEADER in each 14*199767f8SToomas Soome * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*199767f8SToomas Soome * If applicable, add the following below this CDDL HEADER, with the 16*199767f8SToomas Soome * fields enclosed by brackets "[]" replaced with your own identifying 17*199767f8SToomas Soome * information: Portions Copyright [yyyy] [name of copyright owner] 18*199767f8SToomas Soome * 19*199767f8SToomas Soome * CDDL HEADER END 20*199767f8SToomas Soome */ 21*199767f8SToomas Soome /* 22*199767f8SToomas Soome * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23*199767f8SToomas Soome * Use is subject to license terms. 
24*199767f8SToomas Soome */ 25*199767f8SToomas Soome 26*199767f8SToomas Soome #include <sys/cdefs.h> 27*199767f8SToomas Soome 28*199767f8SToomas Soome static uint64_t zfs_crc64_table[256]; 29*199767f8SToomas Soome 30*199767f8SToomas Soome #define ECKSUM 666 31*199767f8SToomas Soome 32*199767f8SToomas Soome #define ASSERT3S(x, y, z) ((void)0) 33*199767f8SToomas Soome #define ASSERT3U(x, y, z) ((void)0) 34*199767f8SToomas Soome #define ASSERT3P(x, y, z) ((void)0) 35*199767f8SToomas Soome #define ASSERT0(x) ((void)0) 36*199767f8SToomas Soome #define ASSERT(x) ((void)0) 37*199767f8SToomas Soome 38*199767f8SToomas Soome #define panic(...) do { \ 39*199767f8SToomas Soome printf(__VA_ARGS__); \ 40*199767f8SToomas Soome for (;;) ; \ 41*199767f8SToomas Soome } while (0) 42*199767f8SToomas Soome 43*199767f8SToomas Soome #define kmem_alloc(size, flag) zfs_alloc((size)) 44*199767f8SToomas Soome #define kmem_free(ptr, size) zfs_free((ptr), (size)) 45*199767f8SToomas Soome 46*199767f8SToomas Soome static void 47*199767f8SToomas Soome zfs_init_crc(void) 48*199767f8SToomas Soome { 49*199767f8SToomas Soome int i, j; 50*199767f8SToomas Soome uint64_t *ct; 51*199767f8SToomas Soome 52*199767f8SToomas Soome /* 53*199767f8SToomas Soome * Calculate the crc64 table (used for the zap hash 54*199767f8SToomas Soome * function). 
55*199767f8SToomas Soome */ 56*199767f8SToomas Soome if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 57*199767f8SToomas Soome memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); 58*199767f8SToomas Soome for (i = 0; i < 256; i++) 59*199767f8SToomas Soome for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 60*199767f8SToomas Soome *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 61*199767f8SToomas Soome } 62*199767f8SToomas Soome } 63*199767f8SToomas Soome 64*199767f8SToomas Soome static void 65*199767f8SToomas Soome zio_checksum_off(const void *buf, uint64_t size, 66*199767f8SToomas Soome const void *ctx_template, zio_cksum_t *zcp) 67*199767f8SToomas Soome { 68*199767f8SToomas Soome ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 69*199767f8SToomas Soome } 70*199767f8SToomas Soome 71*199767f8SToomas Soome /* 72*199767f8SToomas Soome * Signature for checksum functions. 73*199767f8SToomas Soome */ 74*199767f8SToomas Soome typedef void zio_checksum_t(const void *data, uint64_t size, 75*199767f8SToomas Soome const void *ctx_template, zio_cksum_t *zcp); 76*199767f8SToomas Soome typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); 77*199767f8SToomas Soome typedef void zio_checksum_tmpl_free_t(void *ctx_template); 78*199767f8SToomas Soome 79*199767f8SToomas Soome typedef enum zio_checksum_flags { 80*199767f8SToomas Soome /* Strong enough for metadata? */ 81*199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA = (1 << 1), 82*199767f8SToomas Soome /* ZIO embedded checksum */ 83*199767f8SToomas Soome ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), 84*199767f8SToomas Soome /* Strong enough for dedup (without verification)? */ 85*199767f8SToomas Soome ZCHECKSUM_FLAG_DEDUP = (1 << 3), 86*199767f8SToomas Soome /* Uses salt value */ 87*199767f8SToomas Soome ZCHECKSUM_FLAG_SALTED = (1 << 4), 88*199767f8SToomas Soome /* Strong enough for nopwrite? 
*/ 89*199767f8SToomas Soome ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) 90*199767f8SToomas Soome } zio_checksum_flags_t; 91*199767f8SToomas Soome 92*199767f8SToomas Soome /* 93*199767f8SToomas Soome * Information about each checksum function. 94*199767f8SToomas Soome */ 95*199767f8SToomas Soome typedef struct zio_checksum_info { 96*199767f8SToomas Soome /* checksum function for each byteorder */ 97*199767f8SToomas Soome zio_checksum_t *ci_func[2]; 98*199767f8SToomas Soome zio_checksum_tmpl_init_t *ci_tmpl_init; 99*199767f8SToomas Soome zio_checksum_tmpl_free_t *ci_tmpl_free; 100*199767f8SToomas Soome zio_checksum_flags_t ci_flags; 101*199767f8SToomas Soome const char *ci_name; /* descriptive name */ 102*199767f8SToomas Soome } zio_checksum_info_t; 103*199767f8SToomas Soome 104*199767f8SToomas Soome #include "blkptr.c" 105*199767f8SToomas Soome 106*199767f8SToomas Soome #include "fletcher.c" 107*199767f8SToomas Soome #include "sha256.c" 108*199767f8SToomas Soome 109*199767f8SToomas Soome static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 110*199767f8SToomas Soome {{NULL, NULL}, NULL, NULL, 0, "inherit"}, 111*199767f8SToomas Soome {{NULL, NULL}, NULL, NULL, 0, "on"}, 112*199767f8SToomas Soome {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"}, 113*199767f8SToomas Soome {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 114*199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, 115*199767f8SToomas Soome {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 116*199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, 117*199767f8SToomas Soome {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 118*199767f8SToomas Soome ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, 119*199767f8SToomas Soome {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 120*199767f8SToomas Soome 0, "fletcher2"}, 121*199767f8SToomas Soome {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 122*199767f8SToomas 
Soome ZCHECKSUM_FLAG_METADATA, "fletcher4"}, 123*199767f8SToomas Soome {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 124*199767f8SToomas Soome ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 125*199767f8SToomas Soome ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, 126*199767f8SToomas Soome {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 127*199767f8SToomas Soome ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, 128*199767f8SToomas Soome {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 129*199767f8SToomas Soome 0, "noparity"}, 130*199767f8SToomas Soome {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, 131*199767f8SToomas Soome NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 132*199767f8SToomas Soome ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, 133*199767f8SToomas Soome /* no skein and edonr for now */ 134*199767f8SToomas Soome {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | 135*199767f8SToomas Soome ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | 136*199767f8SToomas Soome ZCHECKSUM_FLAG_NOPWRITE, "skein"}, 137*199767f8SToomas Soome {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | 138*199767f8SToomas Soome ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, 139*199767f8SToomas Soome }; 140*199767f8SToomas Soome 141*199767f8SToomas Soome /* 142*199767f8SToomas Soome * Common signature for all zio compress/decompress functions. 143*199767f8SToomas Soome */ 144*199767f8SToomas Soome typedef size_t zio_compress_func_t(void *src, void *dst, 145*199767f8SToomas Soome size_t s_len, size_t d_len, int); 146*199767f8SToomas Soome typedef int zio_decompress_func_t(void *src, void *dst, 147*199767f8SToomas Soome size_t s_len, size_t d_len, int); 148*199767f8SToomas Soome 149*199767f8SToomas Soome extern int gzip_decompress(void *src, void *dst, 150*199767f8SToomas Soome size_t s_len, size_t d_len, int); 151*199767f8SToomas Soome /* 152*199767f8SToomas Soome * Information about each compression function. 
153*199767f8SToomas Soome */ 154*199767f8SToomas Soome typedef struct zio_compress_info { 155*199767f8SToomas Soome zio_compress_func_t *ci_compress; /* compression function */ 156*199767f8SToomas Soome zio_decompress_func_t *ci_decompress; /* decompression function */ 157*199767f8SToomas Soome int ci_level; /* level parameter */ 158*199767f8SToomas Soome const char *ci_name; /* algorithm name */ 159*199767f8SToomas Soome } zio_compress_info_t; 160*199767f8SToomas Soome 161*199767f8SToomas Soome #include "lzjb.c" 162*199767f8SToomas Soome #include "zle.c" 163*199767f8SToomas Soome #include "lz4.c" 164*199767f8SToomas Soome 165*199767f8SToomas Soome /* 166*199767f8SToomas Soome * Compression vectors. 167*199767f8SToomas Soome */ 168*199767f8SToomas Soome static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 169*199767f8SToomas Soome {NULL, NULL, 0, "inherit"}, 170*199767f8SToomas Soome {NULL, NULL, 0, "on"}, 171*199767f8SToomas Soome {NULL, NULL, 0, "uncompressed"}, 172*199767f8SToomas Soome {NULL, lzjb_decompress, 0, "lzjb"}, 173*199767f8SToomas Soome {NULL, NULL, 0, "empty"}, 174*199767f8SToomas Soome {NULL, gzip_decompress, 1, "gzip-1"}, 175*199767f8SToomas Soome {NULL, gzip_decompress, 2, "gzip-2"}, 176*199767f8SToomas Soome {NULL, gzip_decompress, 3, "gzip-3"}, 177*199767f8SToomas Soome {NULL, gzip_decompress, 4, "gzip-4"}, 178*199767f8SToomas Soome {NULL, gzip_decompress, 5, "gzip-5"}, 179*199767f8SToomas Soome {NULL, gzip_decompress, 6, "gzip-6"}, 180*199767f8SToomas Soome {NULL, gzip_decompress, 7, "gzip-7"}, 181*199767f8SToomas Soome {NULL, gzip_decompress, 8, "gzip-8"}, 182*199767f8SToomas Soome {NULL, gzip_decompress, 9, "gzip-9"}, 183*199767f8SToomas Soome {NULL, zle_decompress, 64, "zle"}, 184*199767f8SToomas Soome {NULL, lz4_decompress, 0, "lz4"}, 185*199767f8SToomas Soome }; 186*199767f8SToomas Soome 187*199767f8SToomas Soome static void 188*199767f8SToomas Soome byteswap_uint64_array(void *vbuf, size_t size) 189*199767f8SToomas 
Soome { 190*199767f8SToomas Soome uint64_t *buf = vbuf; 191*199767f8SToomas Soome size_t count = size >> 3; 192*199767f8SToomas Soome int i; 193*199767f8SToomas Soome 194*199767f8SToomas Soome ASSERT((size & 7) == 0); 195*199767f8SToomas Soome 196*199767f8SToomas Soome for (i = 0; i < count; i++) 197*199767f8SToomas Soome buf[i] = BSWAP_64(buf[i]); 198*199767f8SToomas Soome } 199*199767f8SToomas Soome 200*199767f8SToomas Soome /* 201*199767f8SToomas Soome * Set the external verifier for a gang block based on <vdev, offset, txg>, 202*199767f8SToomas Soome * a tuple which is guaranteed to be unique for the life of the pool. 203*199767f8SToomas Soome */ 204*199767f8SToomas Soome static void 205*199767f8SToomas Soome zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 206*199767f8SToomas Soome { 207*199767f8SToomas Soome const dva_t *dva = BP_IDENTITY(bp); 208*199767f8SToomas Soome uint64_t txg = BP_PHYSICAL_BIRTH(bp); 209*199767f8SToomas Soome 210*199767f8SToomas Soome ASSERT(BP_IS_GANG(bp)); 211*199767f8SToomas Soome 212*199767f8SToomas Soome ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 213*199767f8SToomas Soome } 214*199767f8SToomas Soome 215*199767f8SToomas Soome /* 216*199767f8SToomas Soome * Set the external verifier for a label block based on its offset. 217*199767f8SToomas Soome * The vdev is implicit, and the txg is unknowable at pool open time -- 218*199767f8SToomas Soome * hence the logic in vdev_uberblock_load() to find the most recent copy. 
219*199767f8SToomas Soome */ 220*199767f8SToomas Soome static void 221*199767f8SToomas Soome zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 222*199767f8SToomas Soome { 223*199767f8SToomas Soome ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 224*199767f8SToomas Soome } 225*199767f8SToomas Soome 226*199767f8SToomas Soome /* 227*199767f8SToomas Soome * Calls the template init function of a checksum which supports context 228*199767f8SToomas Soome * templates and installs the template into the spa_t. 229*199767f8SToomas Soome */ 230*199767f8SToomas Soome static void 231*199767f8SToomas Soome zio_checksum_template_init(enum zio_checksum checksum, const spa_t *spa) 232*199767f8SToomas Soome { 233*199767f8SToomas Soome zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 234*199767f8SToomas Soome 235*199767f8SToomas Soome if (ci->ci_tmpl_init == NULL) 236*199767f8SToomas Soome return; 237*199767f8SToomas Soome #if 0 /* for now we dont have anything here */ 238*199767f8SToomas Soome if (spa->spa_cksum_tmpls[checksum] != NULL) 239*199767f8SToomas Soome return; 240*199767f8SToomas Soome 241*199767f8SToomas Soome VERIFY(ci->ci_tmpl_free != NULL); 242*199767f8SToomas Soome mutex_enter(&spa->spa_cksum_tmpls_lock); 243*199767f8SToomas Soome if (spa->spa_cksum_tmpls[checksum] == NULL) { 244*199767f8SToomas Soome spa->spa_cksum_tmpls[checksum] = 245*199767f8SToomas Soome ci->ci_tmpl_init(&spa->spa_cksum_salt); 246*199767f8SToomas Soome VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); 247*199767f8SToomas Soome } 248*199767f8SToomas Soome mutex_exit(&spa->spa_cksum_tmpls_lock); 249*199767f8SToomas Soome #endif 250*199767f8SToomas Soome } 251*199767f8SToomas Soome 252*199767f8SToomas Soome static int 253*199767f8SToomas Soome zio_checksum_verify(const blkptr_t *bp, void *data) 254*199767f8SToomas Soome { 255*199767f8SToomas Soome uint64_t size; 256*199767f8SToomas Soome unsigned int checksum; 257*199767f8SToomas Soome zio_checksum_info_t *ci; 258*199767f8SToomas Soome 
zio_cksum_t actual_cksum, expected_cksum, verifier; 259*199767f8SToomas Soome int byteswap; 260*199767f8SToomas Soome 261*199767f8SToomas Soome checksum = BP_GET_CHECKSUM(bp); 262*199767f8SToomas Soome size = BP_GET_PSIZE(bp); 263*199767f8SToomas Soome 264*199767f8SToomas Soome if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 265*199767f8SToomas Soome return (EINVAL); 266*199767f8SToomas Soome ci = &zio_checksum_table[checksum]; 267*199767f8SToomas Soome if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 268*199767f8SToomas Soome return (EINVAL); 269*199767f8SToomas Soome 270*199767f8SToomas Soome zio_checksum_template_init(checksum, NULL); 271*199767f8SToomas Soome if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 272*199767f8SToomas Soome zio_eck_t *eck; 273*199767f8SToomas Soome 274*199767f8SToomas Soome ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 275*199767f8SToomas Soome checksum == ZIO_CHECKSUM_LABEL); 276*199767f8SToomas Soome 277*199767f8SToomas Soome eck = (zio_eck_t *)((char *)data + size) - 1; 278*199767f8SToomas Soome 279*199767f8SToomas Soome if (checksum == ZIO_CHECKSUM_GANG_HEADER) 280*199767f8SToomas Soome zio_checksum_gang_verifier(&verifier, bp); 281*199767f8SToomas Soome else if (checksum == ZIO_CHECKSUM_LABEL) 282*199767f8SToomas Soome zio_checksum_label_verifier(&verifier, 283*199767f8SToomas Soome DVA_GET_OFFSET(BP_IDENTITY(bp))); 284*199767f8SToomas Soome else 285*199767f8SToomas Soome verifier = bp->blk_cksum; 286*199767f8SToomas Soome 287*199767f8SToomas Soome byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 288*199767f8SToomas Soome 289*199767f8SToomas Soome if (byteswap) 290*199767f8SToomas Soome byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 291*199767f8SToomas Soome 292*199767f8SToomas Soome expected_cksum = eck->zec_cksum; 293*199767f8SToomas Soome eck->zec_cksum = verifier; 294*199767f8SToomas Soome ci->ci_func[byteswap](data, size, NULL, &actual_cksum); 295*199767f8SToomas Soome eck->zec_cksum = expected_cksum; 
296*199767f8SToomas Soome 297*199767f8SToomas Soome if (byteswap) 298*199767f8SToomas Soome byteswap_uint64_array(&expected_cksum, 299*199767f8SToomas Soome sizeof (zio_cksum_t)); 300*199767f8SToomas Soome } else { 301*199767f8SToomas Soome expected_cksum = bp->blk_cksum; 302*199767f8SToomas Soome ci->ci_func[0](data, size, NULL, &actual_cksum); 303*199767f8SToomas Soome } 304*199767f8SToomas Soome 305*199767f8SToomas Soome if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 306*199767f8SToomas Soome /*printf("ZFS: read checksum failed\n");*/ 307*199767f8SToomas Soome return (EIO); 308*199767f8SToomas Soome } 309*199767f8SToomas Soome 310*199767f8SToomas Soome return (0); 311*199767f8SToomas Soome } 312*199767f8SToomas Soome 313*199767f8SToomas Soome static int 314*199767f8SToomas Soome zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 315*199767f8SToomas Soome void *dest, uint64_t destsize) 316*199767f8SToomas Soome { 317*199767f8SToomas Soome zio_compress_info_t *ci; 318*199767f8SToomas Soome 319*199767f8SToomas Soome if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 320*199767f8SToomas Soome printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 321*199767f8SToomas Soome return (EIO); 322*199767f8SToomas Soome } 323*199767f8SToomas Soome 324*199767f8SToomas Soome ci = &zio_compress_table[cpfunc]; 325*199767f8SToomas Soome if (!ci->ci_decompress) { 326*199767f8SToomas Soome printf("ZFS: unsupported compression algorithm %s\n", 327*199767f8SToomas Soome ci->ci_name); 328*199767f8SToomas Soome return (EIO); 329*199767f8SToomas Soome } 330*199767f8SToomas Soome 331*199767f8SToomas Soome return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 332*199767f8SToomas Soome } 333*199767f8SToomas Soome 334*199767f8SToomas Soome static uint64_t 335*199767f8SToomas Soome zap_hash(uint64_t salt, const char *name) 336*199767f8SToomas Soome { 337*199767f8SToomas Soome const uint8_t *cp; 338*199767f8SToomas Soome uint8_t c; 339*199767f8SToomas Soome 
uint64_t crc = salt; 340*199767f8SToomas Soome 341*199767f8SToomas Soome ASSERT(crc != 0); 342*199767f8SToomas Soome ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 343*199767f8SToomas Soome for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 344*199767f8SToomas Soome crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 345*199767f8SToomas Soome 346*199767f8SToomas Soome /* 347*199767f8SToomas Soome * Only use 28 bits, since we need 4 bits in the cookie for the 348*199767f8SToomas Soome * collision differentiator. We MUST use the high bits, since 349*199767f8SToomas Soome * those are the onces that we first pay attention to when 350*199767f8SToomas Soome * chosing the bucket. 351*199767f8SToomas Soome */ 352*199767f8SToomas Soome crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 353*199767f8SToomas Soome 354*199767f8SToomas Soome return (crc); 355*199767f8SToomas Soome } 356*199767f8SToomas Soome 357*199767f8SToomas Soome static void *zfs_alloc(size_t size); 358*199767f8SToomas Soome static void zfs_free(void *ptr, size_t size); 359*199767f8SToomas Soome 360*199767f8SToomas Soome typedef struct raidz_col { 361*199767f8SToomas Soome uint64_t rc_devidx; /* child device index for I/O */ 362*199767f8SToomas Soome uint64_t rc_offset; /* device offset */ 363*199767f8SToomas Soome uint64_t rc_size; /* I/O size */ 364*199767f8SToomas Soome void *rc_data; /* I/O data */ 365*199767f8SToomas Soome int rc_error; /* I/O error for this device */ 366*199767f8SToomas Soome uint8_t rc_tried; /* Did we attempt this I/O column? */ 367*199767f8SToomas Soome uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ 368*199767f8SToomas Soome } raidz_col_t; 369*199767f8SToomas Soome 370*199767f8SToomas Soome typedef struct raidz_map { 371*199767f8SToomas Soome uint64_t rm_cols; /* Regular column count */ 372*199767f8SToomas Soome uint64_t rm_scols; /* Count including skipped columns */ 373*199767f8SToomas Soome uint64_t rm_bigcols; /* Number of oversized columns */ 374*199767f8SToomas Soome uint64_t rm_asize; /* Actual total I/O size */ 375*199767f8SToomas Soome uint64_t rm_missingdata; /* Count of missing data devices */ 376*199767f8SToomas Soome uint64_t rm_missingparity; /* Count of missing parity devices */ 377*199767f8SToomas Soome uint64_t rm_firstdatacol; /* First data column/parity count */ 378*199767f8SToomas Soome uint64_t rm_nskip; /* Skipped sectors for padding */ 379*199767f8SToomas Soome uint64_t rm_skipstart; /* Column index of padding start */ 380*199767f8SToomas Soome uintptr_t rm_reports; /* # of referencing checksum reports */ 381*199767f8SToomas Soome uint8_t rm_freed; /* map no longer has referencing ZIO */ 382*199767f8SToomas Soome uint8_t rm_ecksuminjected; /* checksum error was injected */ 383*199767f8SToomas Soome raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 384*199767f8SToomas Soome } raidz_map_t; 385*199767f8SToomas Soome 386*199767f8SToomas Soome #define VDEV_RAIDZ_P 0 387*199767f8SToomas Soome #define VDEV_RAIDZ_Q 1 388*199767f8SToomas Soome #define VDEV_RAIDZ_R 2 389*199767f8SToomas Soome 390*199767f8SToomas Soome #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 391*199767f8SToomas Soome #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 392*199767f8SToomas Soome 393*199767f8SToomas Soome /* 394*199767f8SToomas Soome * We provide a mechanism to perform the field multiplication operation on a 395*199767f8SToomas Soome * 64-bit value all at once rather than a byte at a time. 
This works by 396*199767f8SToomas Soome * creating a mask from the top bit in each byte and using that to 397*199767f8SToomas Soome * conditionally apply the XOR of 0x1d. 398*199767f8SToomas Soome */ 399*199767f8SToomas Soome #define VDEV_RAIDZ_64MUL_2(x, mask) \ 400*199767f8SToomas Soome { \ 401*199767f8SToomas Soome (mask) = (x) & 0x8080808080808080ULL; \ 402*199767f8SToomas Soome (mask) = ((mask) << 1) - ((mask) >> 7); \ 403*199767f8SToomas Soome (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 404*199767f8SToomas Soome ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 405*199767f8SToomas Soome } 406*199767f8SToomas Soome 407*199767f8SToomas Soome #define VDEV_RAIDZ_64MUL_4(x, mask) \ 408*199767f8SToomas Soome { \ 409*199767f8SToomas Soome VDEV_RAIDZ_64MUL_2((x), mask); \ 410*199767f8SToomas Soome VDEV_RAIDZ_64MUL_2((x), mask); \ 411*199767f8SToomas Soome } 412*199767f8SToomas Soome 413*199767f8SToomas Soome /* 414*199767f8SToomas Soome * These two tables represent powers and logs of 2 in the Galois field defined 415*199767f8SToomas Soome * above. These values were computed by repeatedly multiplying by 2 as above. 
 */
/* vdev_raidz_pow2[i] is 2 raised to the power i; entries 0 and 255 are both 1. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[v] is the exponent i for which vdev_raidz_pow2[i] == v.
 * Entry 0 is a placeholder; vdev_raidz_exp2() never looks it up because it
 * returns early when its operand is 0.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
488*199767f8SToomas Soome */ 489*199767f8SToomas Soome static uint8_t 490*199767f8SToomas Soome vdev_raidz_exp2(uint8_t a, int exp) 491*199767f8SToomas Soome { 492*199767f8SToomas Soome if (a == 0) 493*199767f8SToomas Soome return (0); 494*199767f8SToomas Soome 495*199767f8SToomas Soome ASSERT(exp >= 0); 496*199767f8SToomas Soome ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 497*199767f8SToomas Soome 498*199767f8SToomas Soome exp += vdev_raidz_log2[a]; 499*199767f8SToomas Soome if (exp > 255) 500*199767f8SToomas Soome exp -= 255; 501*199767f8SToomas Soome 502*199767f8SToomas Soome return (vdev_raidz_pow2[exp]); 503*199767f8SToomas Soome } 504*199767f8SToomas Soome 505*199767f8SToomas Soome static void 506*199767f8SToomas Soome vdev_raidz_generate_parity_p(raidz_map_t *rm) 507*199767f8SToomas Soome { 508*199767f8SToomas Soome uint64_t *p, *src, pcount __attribute__((unused)), ccount, i; 509*199767f8SToomas Soome int c; 510*199767f8SToomas Soome 511*199767f8SToomas Soome pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 512*199767f8SToomas Soome 513*199767f8SToomas Soome for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 514*199767f8SToomas Soome src = rm->rm_col[c].rc_data; 515*199767f8SToomas Soome p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 516*199767f8SToomas Soome ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 517*199767f8SToomas Soome 518*199767f8SToomas Soome if (c == rm->rm_firstdatacol) { 519*199767f8SToomas Soome ASSERT(ccount == pcount); 520*199767f8SToomas Soome for (i = 0; i < ccount; i++, src++, p++) { 521*199767f8SToomas Soome *p = *src; 522*199767f8SToomas Soome } 523*199767f8SToomas Soome } else { 524*199767f8SToomas Soome ASSERT(ccount <= pcount); 525*199767f8SToomas Soome for (i = 0; i < ccount; i++, src++, p++) { 526*199767f8SToomas Soome *p ^= *src; 527*199767f8SToomas Soome } 528*199767f8SToomas Soome } 529*199767f8SToomas Soome } 530*199767f8SToomas Soome } 531*199767f8SToomas Soome 532*199767f8SToomas Soome static void 
533*199767f8SToomas Soome vdev_raidz_generate_parity_pq(raidz_map_t *rm) 534*199767f8SToomas Soome { 535*199767f8SToomas Soome uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 536*199767f8SToomas Soome int c; 537*199767f8SToomas Soome 538*199767f8SToomas Soome pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 539*199767f8SToomas Soome ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 540*199767f8SToomas Soome rm->rm_col[VDEV_RAIDZ_Q].rc_size); 541*199767f8SToomas Soome 542*199767f8SToomas Soome for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 543*199767f8SToomas Soome src = rm->rm_col[c].rc_data; 544*199767f8SToomas Soome p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 545*199767f8SToomas Soome q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 546*199767f8SToomas Soome 547*199767f8SToomas Soome ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 548*199767f8SToomas Soome 549*199767f8SToomas Soome if (c == rm->rm_firstdatacol) { 550*199767f8SToomas Soome ASSERT(ccnt == pcnt || ccnt == 0); 551*199767f8SToomas Soome for (i = 0; i < ccnt; i++, src++, p++, q++) { 552*199767f8SToomas Soome *p = *src; 553*199767f8SToomas Soome *q = *src; 554*199767f8SToomas Soome } 555*199767f8SToomas Soome for (; i < pcnt; i++, src++, p++, q++) { 556*199767f8SToomas Soome *p = 0; 557*199767f8SToomas Soome *q = 0; 558*199767f8SToomas Soome } 559*199767f8SToomas Soome } else { 560*199767f8SToomas Soome ASSERT(ccnt <= pcnt); 561*199767f8SToomas Soome 562*199767f8SToomas Soome /* 563*199767f8SToomas Soome * Apply the algorithm described above by multiplying 564*199767f8SToomas Soome * the previous result and adding in the new value. 
565*199767f8SToomas Soome */ 566*199767f8SToomas Soome for (i = 0; i < ccnt; i++, src++, p++, q++) { 567*199767f8SToomas Soome *p ^= *src; 568*199767f8SToomas Soome 569*199767f8SToomas Soome VDEV_RAIDZ_64MUL_2(*q, mask); 570*199767f8SToomas Soome *q ^= *src; 571*199767f8SToomas Soome } 572*199767f8SToomas Soome 573*199767f8SToomas Soome /* 574*199767f8SToomas Soome * Treat short columns as though they are full of 0s. 575*199767f8SToomas Soome * Note that there's therefore nothing needed for P. 576*199767f8SToomas Soome */ 577*199767f8SToomas Soome for (; i < pcnt; i++, q++) { 578*199767f8SToomas Soome VDEV_RAIDZ_64MUL_2(*q, mask); 579*199767f8SToomas Soome } 580*199767f8SToomas Soome } 581*199767f8SToomas Soome } 582*199767f8SToomas Soome } 583*199767f8SToomas Soome 584*199767f8SToomas Soome static void 585*199767f8SToomas Soome vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 586*199767f8SToomas Soome { 587*199767f8SToomas Soome uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 588*199767f8SToomas Soome int c; 589*199767f8SToomas Soome 590*199767f8SToomas Soome pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 591*199767f8SToomas Soome ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 592*199767f8SToomas Soome rm->rm_col[VDEV_RAIDZ_Q].rc_size); 593*199767f8SToomas Soome ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 594*199767f8SToomas Soome rm->rm_col[VDEV_RAIDZ_R].rc_size); 595*199767f8SToomas Soome 596*199767f8SToomas Soome for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 597*199767f8SToomas Soome src = rm->rm_col[c].rc_data; 598*199767f8SToomas Soome p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 599*199767f8SToomas Soome q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 600*199767f8SToomas Soome r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 601*199767f8SToomas Soome 602*199767f8SToomas Soome ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 603*199767f8SToomas Soome 604*199767f8SToomas Soome if (c == rm->rm_firstdatacol) { 605*199767f8SToomas Soome ASSERT(ccnt == pcnt || ccnt == 0); 
606*199767f8SToomas Soome for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 607*199767f8SToomas Soome *p = *src; 608*199767f8SToomas Soome *q = *src; 609*199767f8SToomas Soome *r = *src; 610*199767f8SToomas Soome } 611*199767f8SToomas Soome for (; i < pcnt; i++, src++, p++, q++, r++) { 612*199767f8SToomas Soome *p = 0; 613*199767f8SToomas Soome *q = 0; 614*199767f8SToomas Soome *r = 0; 615*199767f8SToomas Soome } 616*199767f8SToomas Soome } else { 617*199767f8SToomas Soome ASSERT(ccnt <= pcnt); 618*199767f8SToomas Soome 619*199767f8SToomas Soome /* 620*199767f8SToomas Soome * Apply the algorithm described above by multiplying 621*199767f8SToomas Soome * the previous result and adding in the new value. 622*199767f8SToomas Soome */ 623*199767f8SToomas Soome for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 624*199767f8SToomas Soome *p ^= *src; 625*199767f8SToomas Soome 626*199767f8SToomas Soome VDEV_RAIDZ_64MUL_2(*q, mask); 627*199767f8SToomas Soome *q ^= *src; 628*199767f8SToomas Soome 629*199767f8SToomas Soome VDEV_RAIDZ_64MUL_4(*r, mask); 630*199767f8SToomas Soome *r ^= *src; 631*199767f8SToomas Soome } 632*199767f8SToomas Soome 633*199767f8SToomas Soome /* 634*199767f8SToomas Soome * Treat short columns as though they are full of 0s. 635*199767f8SToomas Soome * Note that there's therefore nothing needed for P. 636*199767f8SToomas Soome */ 637*199767f8SToomas Soome for (; i < pcnt; i++, q++, r++) { 638*199767f8SToomas Soome VDEV_RAIDZ_64MUL_2(*q, mask); 639*199767f8SToomas Soome VDEV_RAIDZ_64MUL_4(*r, mask); 640*199767f8SToomas Soome } 641*199767f8SToomas Soome } 642*199767f8SToomas Soome } 643*199767f8SToomas Soome } 644*199767f8SToomas Soome 645*199767f8SToomas Soome /* 646*199767f8SToomas Soome * Generate RAID parity in the first virtual columns according to the number of 647*199767f8SToomas Soome * parity columns available. 
648*199767f8SToomas Soome */ 649*199767f8SToomas Soome static void 650*199767f8SToomas Soome vdev_raidz_generate_parity(raidz_map_t *rm) 651*199767f8SToomas Soome { 652*199767f8SToomas Soome switch (rm->rm_firstdatacol) { 653*199767f8SToomas Soome case 1: 654*199767f8SToomas Soome vdev_raidz_generate_parity_p(rm); 655*199767f8SToomas Soome break; 656*199767f8SToomas Soome case 2: 657*199767f8SToomas Soome vdev_raidz_generate_parity_pq(rm); 658*199767f8SToomas Soome break; 659*199767f8SToomas Soome case 3: 660*199767f8SToomas Soome vdev_raidz_generate_parity_pqr(rm); 661*199767f8SToomas Soome break; 662*199767f8SToomas Soome default: 663*199767f8SToomas Soome panic("invalid RAID-Z configuration"); 664*199767f8SToomas Soome } 665*199767f8SToomas Soome } 666*199767f8SToomas Soome 667*199767f8SToomas Soome /* BEGIN CSTYLED */ 668*199767f8SToomas Soome /* 669*199767f8SToomas Soome * In the general case of reconstruction, we must solve the system of linear 670*199767f8SToomas Soome * equations defined by the coeffecients used to generate parity as well as 671*199767f8SToomas Soome * the contents of the data and parity disks. This can be expressed with 672*199767f8SToomas Soome * vectors for the original data (D) and the actual data (d) and parity (p) 673*199767f8SToomas Soome * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 674*199767f8SToomas Soome * 675*199767f8SToomas Soome * __ __ __ __ 676*199767f8SToomas Soome * | | __ __ | p_0 | 677*199767f8SToomas Soome * | V | | D_0 | | p_m-1 | 678*199767f8SToomas Soome * | | x | : | = | d_0 | 679*199767f8SToomas Soome * | I | | D_n-1 | | : | 680*199767f8SToomas Soome * | | ~~ ~~ | d_n-1 | 681*199767f8SToomas Soome * ~~ ~~ ~~ ~~ 682*199767f8SToomas Soome * 683*199767f8SToomas Soome * I is simply a square identity matrix of size n, and V is a vandermonde 684*199767f8SToomas Soome * matrix defined by the coeffecients we chose for the various parity columns 685*199767f8SToomas Soome * (1, 2, 4). 
Note that these values were chosen both for simplicity, speedy 686*199767f8SToomas Soome * computation as well as linear separability. 687*199767f8SToomas Soome * 688*199767f8SToomas Soome * __ __ __ __ 689*199767f8SToomas Soome * | 1 .. 1 1 1 | | p_0 | 690*199767f8SToomas Soome * | 2^n-1 .. 4 2 1 | __ __ | : | 691*199767f8SToomas Soome * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 692*199767f8SToomas Soome * | 1 .. 0 0 0 | | D_1 | | d_0 | 693*199767f8SToomas Soome * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 694*199767f8SToomas Soome * | : : : : | | : | | d_2 | 695*199767f8SToomas Soome * | 0 .. 1 0 0 | | D_n-1 | | : | 696*199767f8SToomas Soome * | 0 .. 0 1 0 | ~~ ~~ | : | 697*199767f8SToomas Soome * | 0 .. 0 0 1 | | d_n-1 | 698*199767f8SToomas Soome * ~~ ~~ ~~ ~~ 699*199767f8SToomas Soome * 700*199767f8SToomas Soome * Note that I, V, d, and p are known. To compute D, we must invert the 701*199767f8SToomas Soome * matrix and use the known data and parity values to reconstruct the unknown 702*199767f8SToomas Soome * data values. We begin by removing the rows in V|I and d|p that correspond 703*199767f8SToomas Soome * to failed or missing columns; we then make V|I square (n x n) and d|p 704*199767f8SToomas Soome * sized n by removing rows corresponding to unused parity from the bottom up 705*199767f8SToomas Soome * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 706*199767f8SToomas Soome * using Gauss-Jordan elimination. 
In the example below we use m=3 parity 707*199767f8SToomas Soome * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 708*199767f8SToomas Soome * __ __ 709*199767f8SToomas Soome * | 1 1 1 1 1 1 1 1 | 710*199767f8SToomas Soome * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 711*199767f8SToomas Soome * | 19 205 116 29 64 16 4 1 | / / 712*199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 | / / 713*199767f8SToomas Soome * | 0 1 0 0 0 0 0 0 | <--' / 714*199767f8SToomas Soome * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 715*199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 | 716*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 | 717*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 | 718*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 | 719*199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 | 720*199767f8SToomas Soome * ~~ ~~ 721*199767f8SToomas Soome * __ __ 722*199767f8SToomas Soome * | 1 1 1 1 1 1 1 1 | 723*199767f8SToomas Soome * | 128 64 32 16 8 4 2 1 | 724*199767f8SToomas Soome * | 19 205 116 29 64 16 4 1 | 725*199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 | 726*199767f8SToomas Soome * | 0 1 0 0 0 0 0 0 | 727*199767f8SToomas Soome * (V|I)' = | 0 0 1 0 0 0 0 0 | 728*199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 | 729*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 | 730*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 | 731*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 | 732*199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 | 733*199767f8SToomas Soome * ~~ ~~ 734*199767f8SToomas Soome * 735*199767f8SToomas Soome * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 736*199767f8SToomas Soome * have carefully chosen the seed values 1, 2, and 4 to ensure that this 737*199767f8SToomas Soome * matrix is not singular. 
738*199767f8SToomas Soome * __ __ 739*199767f8SToomas Soome * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 740*199767f8SToomas Soome * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 741*199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 742*199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 743*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 744*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 745*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 746*199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 747*199767f8SToomas Soome * ~~ ~~ 748*199767f8SToomas Soome * __ __ 749*199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 750*199767f8SToomas Soome * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 751*199767f8SToomas Soome * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 752*199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 753*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 754*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 755*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 756*199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 757*199767f8SToomas Soome * ~~ ~~ 758*199767f8SToomas Soome * __ __ 759*199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 760*199767f8SToomas Soome * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 761*199767f8SToomas Soome * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 762*199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 763*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 764*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 765*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 766*199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 767*199767f8SToomas Soome * ~~ ~~ 768*199767f8SToomas Soome * __ __ 769*199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 770*199767f8SToomas Soome * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 771*199767f8SToomas Soome * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 
204 | 772*199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 773*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 774*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 775*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 776*199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 777*199767f8SToomas Soome * ~~ ~~ 778*199767f8SToomas Soome * __ __ 779*199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 780*199767f8SToomas Soome * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 781*199767f8SToomas Soome * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 782*199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 783*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 784*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 785*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 786*199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 787*199767f8SToomas Soome * ~~ ~~ 788*199767f8SToomas Soome * __ __ 789*199767f8SToomas Soome * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 790*199767f8SToomas Soome * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 791*199767f8SToomas Soome * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 792*199767f8SToomas Soome * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 793*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 794*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 795*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 796*199767f8SToomas Soome * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 797*199767f8SToomas Soome * ~~ ~~ 798*199767f8SToomas Soome * __ __ 799*199767f8SToomas Soome * | 0 0 1 0 0 0 0 0 | 800*199767f8SToomas Soome * | 167 100 5 41 159 169 217 208 | 801*199767f8SToomas Soome * | 166 100 4 40 158 168 216 209 | 802*199767f8SToomas Soome * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 803*199767f8SToomas Soome * | 0 0 0 0 1 0 0 0 | 804*199767f8SToomas Soome * | 0 0 0 0 0 1 0 0 | 805*199767f8SToomas Soome * | 0 0 0 0 0 0 1 0 | 806*199767f8SToomas Soome * | 0 
0 0 0 0 0 0 1 | 807*199767f8SToomas Soome * ~~ ~~ 808*199767f8SToomas Soome * 809*199767f8SToomas Soome * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 810*199767f8SToomas Soome * of the missing data. 811*199767f8SToomas Soome * 812*199767f8SToomas Soome * As is apparent from the example above, the only non-trivial rows in the 813*199767f8SToomas Soome * inverse matrix correspond to the data disks that we're trying to 814*199767f8SToomas Soome * reconstruct. Indeed, those are the only rows we need as the others would 815*199767f8SToomas Soome * only be useful for reconstructing data known or assumed to be valid. For 816*199767f8SToomas Soome * that reason, we only build the coefficients in the rows that correspond to 817*199767f8SToomas Soome * targeted columns. 818*199767f8SToomas Soome */ 819*199767f8SToomas Soome /* END CSTYLED */ 820*199767f8SToomas Soome 821*199767f8SToomas Soome static void 822*199767f8SToomas Soome vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 823*199767f8SToomas Soome uint8_t **rows) 824*199767f8SToomas Soome { 825*199767f8SToomas Soome int i, j; 826*199767f8SToomas Soome int pow; 827*199767f8SToomas Soome 828*199767f8SToomas Soome ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 829*199767f8SToomas Soome 830*199767f8SToomas Soome /* 831*199767f8SToomas Soome * Fill in the missing rows of interest. 
832*199767f8SToomas Soome */ 833*199767f8SToomas Soome for (i = 0; i < nmap; i++) { 834*199767f8SToomas Soome ASSERT3S(0, <=, map[i]); 835*199767f8SToomas Soome ASSERT3S(map[i], <=, 2); 836*199767f8SToomas Soome 837*199767f8SToomas Soome pow = map[i] * n; 838*199767f8SToomas Soome if (pow > 255) 839*199767f8SToomas Soome pow -= 255; 840*199767f8SToomas Soome ASSERT(pow <= 255); 841*199767f8SToomas Soome 842*199767f8SToomas Soome for (j = 0; j < n; j++) { 843*199767f8SToomas Soome pow -= map[i]; 844*199767f8SToomas Soome if (pow < 0) 845*199767f8SToomas Soome pow += 255; 846*199767f8SToomas Soome rows[i][j] = vdev_raidz_pow2[pow]; 847*199767f8SToomas Soome } 848*199767f8SToomas Soome } 849*199767f8SToomas Soome } 850*199767f8SToomas Soome 851*199767f8SToomas Soome static void 852*199767f8SToomas Soome vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 853*199767f8SToomas Soome uint8_t **rows, uint8_t **invrows, const uint8_t *used) 854*199767f8SToomas Soome { 855*199767f8SToomas Soome int i, j, ii, jj; 856*199767f8SToomas Soome uint8_t log; 857*199767f8SToomas Soome 858*199767f8SToomas Soome /* 859*199767f8SToomas Soome * Assert that the first nmissing entries from the array of used 860*199767f8SToomas Soome * columns correspond to parity columns and that subsequent entries 861*199767f8SToomas Soome * correspond to data columns. 862*199767f8SToomas Soome */ 863*199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 864*199767f8SToomas Soome ASSERT3S(used[i], <, rm->rm_firstdatacol); 865*199767f8SToomas Soome } 866*199767f8SToomas Soome for (; i < n; i++) { 867*199767f8SToomas Soome ASSERT3S(used[i], >=, rm->rm_firstdatacol); 868*199767f8SToomas Soome } 869*199767f8SToomas Soome 870*199767f8SToomas Soome /* 871*199767f8SToomas Soome * First initialize the storage where we'll compute the inverse rows. 
872*199767f8SToomas Soome */ 873*199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 874*199767f8SToomas Soome for (j = 0; j < n; j++) { 875*199767f8SToomas Soome invrows[i][j] = (i == j) ? 1 : 0; 876*199767f8SToomas Soome } 877*199767f8SToomas Soome } 878*199767f8SToomas Soome 879*199767f8SToomas Soome /* 880*199767f8SToomas Soome * Subtract all trivial rows from the rows of consequence. 881*199767f8SToomas Soome */ 882*199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 883*199767f8SToomas Soome for (j = nmissing; j < n; j++) { 884*199767f8SToomas Soome ASSERT3U(used[j], >=, rm->rm_firstdatacol); 885*199767f8SToomas Soome jj = used[j] - rm->rm_firstdatacol; 886*199767f8SToomas Soome ASSERT3S(jj, <, n); 887*199767f8SToomas Soome invrows[i][j] = rows[i][jj]; 888*199767f8SToomas Soome rows[i][jj] = 0; 889*199767f8SToomas Soome } 890*199767f8SToomas Soome } 891*199767f8SToomas Soome 892*199767f8SToomas Soome /* 893*199767f8SToomas Soome * For each of the rows of interest, we must normalize it and subtract 894*199767f8SToomas Soome * a multiple of it from the other rows. 895*199767f8SToomas Soome */ 896*199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 897*199767f8SToomas Soome for (j = 0; j < missing[i]; j++) { 898*199767f8SToomas Soome ASSERT3U(rows[i][j], ==, 0); 899*199767f8SToomas Soome } 900*199767f8SToomas Soome ASSERT3U(rows[i][missing[i]], !=, 0); 901*199767f8SToomas Soome 902*199767f8SToomas Soome /* 903*199767f8SToomas Soome * Compute the inverse of the first element and multiply each 904*199767f8SToomas Soome * element in the row by that value. 
905*199767f8SToomas Soome */ 906*199767f8SToomas Soome log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 907*199767f8SToomas Soome 908*199767f8SToomas Soome for (j = 0; j < n; j++) { 909*199767f8SToomas Soome rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 910*199767f8SToomas Soome invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 911*199767f8SToomas Soome } 912*199767f8SToomas Soome 913*199767f8SToomas Soome for (ii = 0; ii < nmissing; ii++) { 914*199767f8SToomas Soome if (i == ii) 915*199767f8SToomas Soome continue; 916*199767f8SToomas Soome 917*199767f8SToomas Soome ASSERT3U(rows[ii][missing[i]], !=, 0); 918*199767f8SToomas Soome 919*199767f8SToomas Soome log = vdev_raidz_log2[rows[ii][missing[i]]]; 920*199767f8SToomas Soome 921*199767f8SToomas Soome for (j = 0; j < n; j++) { 922*199767f8SToomas Soome rows[ii][j] ^= 923*199767f8SToomas Soome vdev_raidz_exp2(rows[i][j], log); 924*199767f8SToomas Soome invrows[ii][j] ^= 925*199767f8SToomas Soome vdev_raidz_exp2(invrows[i][j], log); 926*199767f8SToomas Soome } 927*199767f8SToomas Soome } 928*199767f8SToomas Soome } 929*199767f8SToomas Soome 930*199767f8SToomas Soome /* 931*199767f8SToomas Soome * Verify that the data that is left in the rows are properly part of 932*199767f8SToomas Soome * an identity matrix. 
933*199767f8SToomas Soome */ 934*199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 935*199767f8SToomas Soome for (j = 0; j < n; j++) { 936*199767f8SToomas Soome if (j == missing[i]) { 937*199767f8SToomas Soome ASSERT3U(rows[i][j], ==, 1); 938*199767f8SToomas Soome } else { 939*199767f8SToomas Soome ASSERT3U(rows[i][j], ==, 0); 940*199767f8SToomas Soome } 941*199767f8SToomas Soome } 942*199767f8SToomas Soome } 943*199767f8SToomas Soome } 944*199767f8SToomas Soome 945*199767f8SToomas Soome static void 946*199767f8SToomas Soome vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 947*199767f8SToomas Soome int *missing, uint8_t **invrows, const uint8_t *used) 948*199767f8SToomas Soome { 949*199767f8SToomas Soome int i, j, x, cc, c; 950*199767f8SToomas Soome uint8_t *src; 951*199767f8SToomas Soome uint64_t ccount; 952*199767f8SToomas Soome uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 953*199767f8SToomas Soome uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 954*199767f8SToomas Soome uint8_t log, val; 955*199767f8SToomas Soome int ll; 956*199767f8SToomas Soome uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 957*199767f8SToomas Soome uint8_t *p, *pp; 958*199767f8SToomas Soome size_t psize; 959*199767f8SToomas Soome 960*199767f8SToomas Soome log = 0; /* gcc */ 961*199767f8SToomas Soome psize = sizeof (invlog[0][0]) * n * nmissing; 962*199767f8SToomas Soome p = zfs_alloc(psize); 963*199767f8SToomas Soome 964*199767f8SToomas Soome for (pp = p, i = 0; i < nmissing; i++) { 965*199767f8SToomas Soome invlog[i] = pp; 966*199767f8SToomas Soome pp += n; 967*199767f8SToomas Soome } 968*199767f8SToomas Soome 969*199767f8SToomas Soome for (i = 0; i < nmissing; i++) { 970*199767f8SToomas Soome for (j = 0; j < n; j++) { 971*199767f8SToomas Soome ASSERT3U(invrows[i][j], !=, 0); 972*199767f8SToomas Soome invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 973*199767f8SToomas Soome } 974*199767f8SToomas Soome } 975*199767f8SToomas Soome 976*199767f8SToomas Soome for (i = 0; i < n; i++) { 
977*199767f8SToomas Soome c = used[i]; 978*199767f8SToomas Soome ASSERT3U(c, <, rm->rm_cols); 979*199767f8SToomas Soome 980*199767f8SToomas Soome src = rm->rm_col[c].rc_data; 981*199767f8SToomas Soome ccount = rm->rm_col[c].rc_size; 982*199767f8SToomas Soome for (j = 0; j < nmissing; j++) { 983*199767f8SToomas Soome cc = missing[j] + rm->rm_firstdatacol; 984*199767f8SToomas Soome ASSERT3U(cc, >=, rm->rm_firstdatacol); 985*199767f8SToomas Soome ASSERT3U(cc, <, rm->rm_cols); 986*199767f8SToomas Soome ASSERT3U(cc, !=, c); 987*199767f8SToomas Soome 988*199767f8SToomas Soome dst[j] = rm->rm_col[cc].rc_data; 989*199767f8SToomas Soome dcount[j] = rm->rm_col[cc].rc_size; 990*199767f8SToomas Soome } 991*199767f8SToomas Soome 992*199767f8SToomas Soome ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 993*199767f8SToomas Soome 994*199767f8SToomas Soome for (x = 0; x < ccount; x++, src++) { 995*199767f8SToomas Soome if (*src != 0) 996*199767f8SToomas Soome log = vdev_raidz_log2[*src]; 997*199767f8SToomas Soome 998*199767f8SToomas Soome for (cc = 0; cc < nmissing; cc++) { 999*199767f8SToomas Soome if (x >= dcount[cc]) 1000*199767f8SToomas Soome continue; 1001*199767f8SToomas Soome 1002*199767f8SToomas Soome if (*src == 0) { 1003*199767f8SToomas Soome val = 0; 1004*199767f8SToomas Soome } else { 1005*199767f8SToomas Soome if ((ll = log + invlog[cc][i]) >= 255) 1006*199767f8SToomas Soome ll -= 255; 1007*199767f8SToomas Soome val = vdev_raidz_pow2[ll]; 1008*199767f8SToomas Soome } 1009*199767f8SToomas Soome 1010*199767f8SToomas Soome if (i == 0) 1011*199767f8SToomas Soome dst[cc][x] = val; 1012*199767f8SToomas Soome else 1013*199767f8SToomas Soome dst[cc][x] ^= val; 1014*199767f8SToomas Soome } 1015*199767f8SToomas Soome } 1016*199767f8SToomas Soome } 1017*199767f8SToomas Soome 1018*199767f8SToomas Soome zfs_free(p, psize); 1019*199767f8SToomas Soome } 1020*199767f8SToomas Soome 1021*199767f8SToomas Soome static int 1022*199767f8SToomas Soome 
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1023*199767f8SToomas Soome { 1024*199767f8SToomas Soome int n, i, c, t, tt; 1025*199767f8SToomas Soome int nmissing_rows; 1026*199767f8SToomas Soome int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1027*199767f8SToomas Soome int parity_map[VDEV_RAIDZ_MAXPARITY]; 1028*199767f8SToomas Soome 1029*199767f8SToomas Soome uint8_t *p, *pp; 1030*199767f8SToomas Soome size_t psize; 1031*199767f8SToomas Soome 1032*199767f8SToomas Soome uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1033*199767f8SToomas Soome uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1034*199767f8SToomas Soome uint8_t *used; 1035*199767f8SToomas Soome 1036*199767f8SToomas Soome int code = 0; 1037*199767f8SToomas Soome 1038*199767f8SToomas Soome 1039*199767f8SToomas Soome n = rm->rm_cols - rm->rm_firstdatacol; 1040*199767f8SToomas Soome 1041*199767f8SToomas Soome /* 1042*199767f8SToomas Soome * Figure out which data columns are missing. 1043*199767f8SToomas Soome */ 1044*199767f8SToomas Soome nmissing_rows = 0; 1045*199767f8SToomas Soome for (t = 0; t < ntgts; t++) { 1046*199767f8SToomas Soome if (tgts[t] >= rm->rm_firstdatacol) { 1047*199767f8SToomas Soome missing_rows[nmissing_rows++] = 1048*199767f8SToomas Soome tgts[t] - rm->rm_firstdatacol; 1049*199767f8SToomas Soome } 1050*199767f8SToomas Soome } 1051*199767f8SToomas Soome 1052*199767f8SToomas Soome /* 1053*199767f8SToomas Soome * Figure out which parity columns to use to help generate the missing 1054*199767f8SToomas Soome * data columns. 1055*199767f8SToomas Soome */ 1056*199767f8SToomas Soome for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1057*199767f8SToomas Soome ASSERT(tt < ntgts); 1058*199767f8SToomas Soome ASSERT(c < rm->rm_firstdatacol); 1059*199767f8SToomas Soome 1060*199767f8SToomas Soome /* 1061*199767f8SToomas Soome * Skip any targeted parity columns. 
1062*199767f8SToomas Soome */ 1063*199767f8SToomas Soome if (c == tgts[tt]) { 1064*199767f8SToomas Soome tt++; 1065*199767f8SToomas Soome continue; 1066*199767f8SToomas Soome } 1067*199767f8SToomas Soome 1068*199767f8SToomas Soome code |= 1 << c; 1069*199767f8SToomas Soome 1070*199767f8SToomas Soome parity_map[i] = c; 1071*199767f8SToomas Soome i++; 1072*199767f8SToomas Soome } 1073*199767f8SToomas Soome 1074*199767f8SToomas Soome ASSERT(code != 0); 1075*199767f8SToomas Soome ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1076*199767f8SToomas Soome 1077*199767f8SToomas Soome psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1078*199767f8SToomas Soome nmissing_rows * n + sizeof (used[0]) * n; 1079*199767f8SToomas Soome p = kmem_alloc(psize, KM_SLEEP); 1080*199767f8SToomas Soome 1081*199767f8SToomas Soome for (pp = p, i = 0; i < nmissing_rows; i++) { 1082*199767f8SToomas Soome rows[i] = pp; 1083*199767f8SToomas Soome pp += n; 1084*199767f8SToomas Soome invrows[i] = pp; 1085*199767f8SToomas Soome pp += n; 1086*199767f8SToomas Soome } 1087*199767f8SToomas Soome used = pp; 1088*199767f8SToomas Soome 1089*199767f8SToomas Soome for (i = 0; i < nmissing_rows; i++) { 1090*199767f8SToomas Soome used[i] = parity_map[i]; 1091*199767f8SToomas Soome } 1092*199767f8SToomas Soome 1093*199767f8SToomas Soome for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1094*199767f8SToomas Soome if (tt < nmissing_rows && 1095*199767f8SToomas Soome c == missing_rows[tt] + rm->rm_firstdatacol) { 1096*199767f8SToomas Soome tt++; 1097*199767f8SToomas Soome continue; 1098*199767f8SToomas Soome } 1099*199767f8SToomas Soome 1100*199767f8SToomas Soome ASSERT3S(i, <, n); 1101*199767f8SToomas Soome used[i] = c; 1102*199767f8SToomas Soome i++; 1103*199767f8SToomas Soome } 1104*199767f8SToomas Soome 1105*199767f8SToomas Soome /* 1106*199767f8SToomas Soome * Initialize the interesting rows of the matrix. 
1107*199767f8SToomas Soome */ 1108*199767f8SToomas Soome vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1109*199767f8SToomas Soome 1110*199767f8SToomas Soome /* 1111*199767f8SToomas Soome * Invert the matrix. 1112*199767f8SToomas Soome */ 1113*199767f8SToomas Soome vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1114*199767f8SToomas Soome invrows, used); 1115*199767f8SToomas Soome 1116*199767f8SToomas Soome /* 1117*199767f8SToomas Soome * Reconstruct the missing data using the generated matrix. 1118*199767f8SToomas Soome */ 1119*199767f8SToomas Soome vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1120*199767f8SToomas Soome invrows, used); 1121*199767f8SToomas Soome 1122*199767f8SToomas Soome kmem_free(p, psize); 1123*199767f8SToomas Soome 1124*199767f8SToomas Soome return (code); 1125*199767f8SToomas Soome } 1126*199767f8SToomas Soome 1127*199767f8SToomas Soome static int 1128*199767f8SToomas Soome vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1129*199767f8SToomas Soome { 1130*199767f8SToomas Soome int tgts[VDEV_RAIDZ_MAXPARITY]; 1131*199767f8SToomas Soome int ntgts; 1132*199767f8SToomas Soome int i, c; 1133*199767f8SToomas Soome int code; 1134*199767f8SToomas Soome int nbadparity, nbaddata; 1135*199767f8SToomas Soome 1136*199767f8SToomas Soome /* 1137*199767f8SToomas Soome * The tgts list must already be sorted. 
1138*199767f8SToomas Soome */ 1139*199767f8SToomas Soome for (i = 1; i < nt; i++) { 1140*199767f8SToomas Soome ASSERT(t[i] > t[i - 1]); 1141*199767f8SToomas Soome } 1142*199767f8SToomas Soome 1143*199767f8SToomas Soome nbadparity = rm->rm_firstdatacol; 1144*199767f8SToomas Soome nbaddata = rm->rm_cols - nbadparity; 1145*199767f8SToomas Soome ntgts = 0; 1146*199767f8SToomas Soome for (i = 0, c = 0; c < rm->rm_cols; c++) { 1147*199767f8SToomas Soome if (i < nt && c == t[i]) { 1148*199767f8SToomas Soome tgts[ntgts++] = c; 1149*199767f8SToomas Soome i++; 1150*199767f8SToomas Soome } else if (rm->rm_col[c].rc_error != 0) { 1151*199767f8SToomas Soome tgts[ntgts++] = c; 1152*199767f8SToomas Soome } else if (c >= rm->rm_firstdatacol) { 1153*199767f8SToomas Soome nbaddata--; 1154*199767f8SToomas Soome } else { 1155*199767f8SToomas Soome nbadparity--; 1156*199767f8SToomas Soome } 1157*199767f8SToomas Soome } 1158*199767f8SToomas Soome 1159*199767f8SToomas Soome ASSERT(ntgts >= nt); 1160*199767f8SToomas Soome ASSERT(nbaddata >= 0); 1161*199767f8SToomas Soome ASSERT(nbaddata + nbadparity == ntgts); 1162*199767f8SToomas Soome 1163*199767f8SToomas Soome code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1164*199767f8SToomas Soome ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1165*199767f8SToomas Soome ASSERT(code > 0); 1166*199767f8SToomas Soome return (code); 1167*199767f8SToomas Soome } 1168*199767f8SToomas Soome 1169*199767f8SToomas Soome static raidz_map_t * 1170*199767f8SToomas Soome vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1171*199767f8SToomas Soome uint64_t dcols, uint64_t nparity) 1172*199767f8SToomas Soome { 1173*199767f8SToomas Soome raidz_map_t *rm; 1174*199767f8SToomas Soome uint64_t b = offset >> unit_shift; 1175*199767f8SToomas Soome uint64_t s = size >> unit_shift; 1176*199767f8SToomas Soome uint64_t f = b % dcols; 1177*199767f8SToomas Soome uint64_t o = (b / dcols) << unit_shift; 1178*199767f8SToomas Soome uint64_t q, r, 
c, bc, col, acols, scols, coff, devidx, asize, tot; 1179*199767f8SToomas Soome 1180*199767f8SToomas Soome q = s / (dcols - nparity); 1181*199767f8SToomas Soome r = s - q * (dcols - nparity); 1182*199767f8SToomas Soome bc = (r == 0 ? 0 : r + nparity); 1183*199767f8SToomas Soome tot = s + nparity * (q + (r == 0 ? 0 : 1)); 1184*199767f8SToomas Soome 1185*199767f8SToomas Soome if (q == 0) { 1186*199767f8SToomas Soome acols = bc; 1187*199767f8SToomas Soome scols = MIN(dcols, roundup(bc, nparity + 1)); 1188*199767f8SToomas Soome } else { 1189*199767f8SToomas Soome acols = dcols; 1190*199767f8SToomas Soome scols = dcols; 1191*199767f8SToomas Soome } 1192*199767f8SToomas Soome 1193*199767f8SToomas Soome ASSERT3U(acols, <=, scols); 1194*199767f8SToomas Soome 1195*199767f8SToomas Soome rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols])); 1196*199767f8SToomas Soome 1197*199767f8SToomas Soome rm->rm_cols = acols; 1198*199767f8SToomas Soome rm->rm_scols = scols; 1199*199767f8SToomas Soome rm->rm_bigcols = bc; 1200*199767f8SToomas Soome rm->rm_skipstart = bc; 1201*199767f8SToomas Soome rm->rm_missingdata = 0; 1202*199767f8SToomas Soome rm->rm_missingparity = 0; 1203*199767f8SToomas Soome rm->rm_firstdatacol = nparity; 1204*199767f8SToomas Soome rm->rm_reports = 0; 1205*199767f8SToomas Soome rm->rm_freed = 0; 1206*199767f8SToomas Soome rm->rm_ecksuminjected = 0; 1207*199767f8SToomas Soome 1208*199767f8SToomas Soome asize = 0; 1209*199767f8SToomas Soome 1210*199767f8SToomas Soome for (c = 0; c < scols; c++) { 1211*199767f8SToomas Soome col = f + c; 1212*199767f8SToomas Soome coff = o; 1213*199767f8SToomas Soome if (col >= dcols) { 1214*199767f8SToomas Soome col -= dcols; 1215*199767f8SToomas Soome coff += 1ULL << unit_shift; 1216*199767f8SToomas Soome } 1217*199767f8SToomas Soome rm->rm_col[c].rc_devidx = col; 1218*199767f8SToomas Soome rm->rm_col[c].rc_offset = coff; 1219*199767f8SToomas Soome rm->rm_col[c].rc_data = NULL; 1220*199767f8SToomas Soome rm->rm_col[c].rc_error = 0; 
1221*199767f8SToomas Soome rm->rm_col[c].rc_tried = 0; 1222*199767f8SToomas Soome rm->rm_col[c].rc_skipped = 0; 1223*199767f8SToomas Soome 1224*199767f8SToomas Soome if (c >= acols) 1225*199767f8SToomas Soome rm->rm_col[c].rc_size = 0; 1226*199767f8SToomas Soome else if (c < bc) 1227*199767f8SToomas Soome rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1228*199767f8SToomas Soome else 1229*199767f8SToomas Soome rm->rm_col[c].rc_size = q << unit_shift; 1230*199767f8SToomas Soome 1231*199767f8SToomas Soome asize += rm->rm_col[c].rc_size; 1232*199767f8SToomas Soome } 1233*199767f8SToomas Soome 1234*199767f8SToomas Soome ASSERT3U(asize, ==, tot << unit_shift); 1235*199767f8SToomas Soome rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1236*199767f8SToomas Soome rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1237*199767f8SToomas Soome ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1238*199767f8SToomas Soome ASSERT3U(rm->rm_nskip, <=, nparity); 1239*199767f8SToomas Soome 1240*199767f8SToomas Soome for (c = 0; c < rm->rm_firstdatacol; c++) 1241*199767f8SToomas Soome rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size); 1242*199767f8SToomas Soome 1243*199767f8SToomas Soome rm->rm_col[c].rc_data = data; 1244*199767f8SToomas Soome 1245*199767f8SToomas Soome for (c = c + 1; c < acols; c++) 1246*199767f8SToomas Soome rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1247*199767f8SToomas Soome rm->rm_col[c - 1].rc_size; 1248*199767f8SToomas Soome 1249*199767f8SToomas Soome /* 1250*199767f8SToomas Soome * If all data stored spans all columns, there's a danger that parity 1251*199767f8SToomas Soome * will always be on the same device and, since parity isn't read 1252*199767f8SToomas Soome * during normal operation, that that device's I/O bandwidth won't be 1253*199767f8SToomas Soome * used effectively. We therefore switch the parity every 1MB. 1254*199767f8SToomas Soome * 1255*199767f8SToomas Soome * ... 
at least that was, ostensibly, the theory. As a practical 1256*199767f8SToomas Soome * matter unless we juggle the parity between all devices evenly, we 1257*199767f8SToomas Soome * won't see any benefit. Further, occasional writes that aren't a 1258*199767f8SToomas Soome * multiple of the LCM of the number of children and the minimum 1259*199767f8SToomas Soome * stripe width are sufficient to avoid pessimal behavior. 1260*199767f8SToomas Soome * Unfortunately, this decision created an implicit on-disk format 1261*199767f8SToomas Soome * requirement that we need to support for all eternity, but only 1262*199767f8SToomas Soome * for single-parity RAID-Z. 1263*199767f8SToomas Soome * 1264*199767f8SToomas Soome * If we intend to skip a sector in the zeroth column for padding 1265*199767f8SToomas Soome * we must make sure to note this swap. We will never intend to 1266*199767f8SToomas Soome * skip the first column since at least one data and one parity 1267*199767f8SToomas Soome * column must appear in each row. 
1268*199767f8SToomas Soome */ 1269*199767f8SToomas Soome ASSERT(rm->rm_cols >= 2); 1270*199767f8SToomas Soome ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1271*199767f8SToomas Soome 1272*199767f8SToomas Soome if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1273*199767f8SToomas Soome devidx = rm->rm_col[0].rc_devidx; 1274*199767f8SToomas Soome o = rm->rm_col[0].rc_offset; 1275*199767f8SToomas Soome rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1276*199767f8SToomas Soome rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1277*199767f8SToomas Soome rm->rm_col[1].rc_devidx = devidx; 1278*199767f8SToomas Soome rm->rm_col[1].rc_offset = o; 1279*199767f8SToomas Soome 1280*199767f8SToomas Soome if (rm->rm_skipstart == 0) 1281*199767f8SToomas Soome rm->rm_skipstart = 1; 1282*199767f8SToomas Soome } 1283*199767f8SToomas Soome 1284*199767f8SToomas Soome return (rm); 1285*199767f8SToomas Soome } 1286*199767f8SToomas Soome 1287*199767f8SToomas Soome static void 1288*199767f8SToomas Soome vdev_raidz_map_free(raidz_map_t *rm) 1289*199767f8SToomas Soome { 1290*199767f8SToomas Soome int c; 1291*199767f8SToomas Soome 1292*199767f8SToomas Soome for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1293*199767f8SToomas Soome zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 1294*199767f8SToomas Soome 1295*199767f8SToomas Soome zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 1296*199767f8SToomas Soome } 1297*199767f8SToomas Soome 1298*199767f8SToomas Soome static vdev_t * 1299*199767f8SToomas Soome vdev_child(vdev_t *pvd, uint64_t devidx) 1300*199767f8SToomas Soome { 1301*199767f8SToomas Soome vdev_t *cvd; 1302*199767f8SToomas Soome 1303*199767f8SToomas Soome STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1304*199767f8SToomas Soome if (cvd->v_id == devidx) 1305*199767f8SToomas Soome break; 1306*199767f8SToomas Soome } 1307*199767f8SToomas Soome 1308*199767f8SToomas Soome return (cvd); 1309*199767f8SToomas Soome } 1310*199767f8SToomas Soome 
1311*199767f8SToomas Soome /* 1312*199767f8SToomas Soome * We keep track of whether or not there were any injected errors, so that 1313*199767f8SToomas Soome * any ereports we generate can note it. 1314*199767f8SToomas Soome */ 1315*199767f8SToomas Soome static int 1316*199767f8SToomas Soome raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size) 1317*199767f8SToomas Soome { 1318*199767f8SToomas Soome 1319*199767f8SToomas Soome return (zio_checksum_verify(bp, data)); 1320*199767f8SToomas Soome } 1321*199767f8SToomas Soome 1322*199767f8SToomas Soome /* 1323*199767f8SToomas Soome * Generate the parity from the data columns. If we tried and were able to 1324*199767f8SToomas Soome * read the parity without error, verify that the generated parity matches the 1325*199767f8SToomas Soome * data we read. If it doesn't, we fire off a checksum error. Return the 1326*199767f8SToomas Soome * number such failures. 1327*199767f8SToomas Soome */ 1328*199767f8SToomas Soome static int 1329*199767f8SToomas Soome raidz_parity_verify(raidz_map_t *rm) 1330*199767f8SToomas Soome { 1331*199767f8SToomas Soome void *orig[VDEV_RAIDZ_MAXPARITY]; 1332*199767f8SToomas Soome int c, ret = 0; 1333*199767f8SToomas Soome raidz_col_t *rc; 1334*199767f8SToomas Soome 1335*199767f8SToomas Soome for (c = 0; c < rm->rm_firstdatacol; c++) { 1336*199767f8SToomas Soome rc = &rm->rm_col[c]; 1337*199767f8SToomas Soome if (!rc->rc_tried || rc->rc_error != 0) 1338*199767f8SToomas Soome continue; 1339*199767f8SToomas Soome orig[c] = zfs_alloc(rc->rc_size); 1340*199767f8SToomas Soome bcopy(rc->rc_data, orig[c], rc->rc_size); 1341*199767f8SToomas Soome } 1342*199767f8SToomas Soome 1343*199767f8SToomas Soome vdev_raidz_generate_parity(rm); 1344*199767f8SToomas Soome 1345*199767f8SToomas Soome for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { 1346*199767f8SToomas Soome rc = &rm->rm_col[c]; 1347*199767f8SToomas Soome if (!rc->rc_tried || rc->rc_error != 0) 1348*199767f8SToomas Soome continue; 
1349*199767f8SToomas Soome if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1350*199767f8SToomas Soome rc->rc_error = ECKSUM; 1351*199767f8SToomas Soome ret++; 1352*199767f8SToomas Soome } 1353*199767f8SToomas Soome zfs_free(orig[c], rc->rc_size); 1354*199767f8SToomas Soome } 1355*199767f8SToomas Soome 1356*199767f8SToomas Soome return (ret); 1357*199767f8SToomas Soome } 1358*199767f8SToomas Soome 1359*199767f8SToomas Soome /* 1360*199767f8SToomas Soome * Iterate over all combinations of bad data and attempt a reconstruction. 1361*199767f8SToomas Soome * Note that the algorithm below is non-optimal because it doesn't take into 1362*199767f8SToomas Soome * account how reconstruction is actually performed. For example, with 1363*199767f8SToomas Soome * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1364*199767f8SToomas Soome * is targeted as invalid as if columns 1 and 4 are targeted since in both 1365*199767f8SToomas Soome * cases we'd only use parity information in column 0. 1366*199767f8SToomas Soome */ 1367*199767f8SToomas Soome static int 1368*199767f8SToomas Soome vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data, 1369*199767f8SToomas Soome off_t offset, uint64_t bytes, int total_errors, int data_errors) 1370*199767f8SToomas Soome { 1371*199767f8SToomas Soome raidz_col_t *rc; 1372*199767f8SToomas Soome void *orig[VDEV_RAIDZ_MAXPARITY]; 1373*199767f8SToomas Soome int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1374*199767f8SToomas Soome int *tgts = &tstore[1]; 1375*199767f8SToomas Soome int current, next, i, c, n; 1376*199767f8SToomas Soome int code, ret = 0; 1377*199767f8SToomas Soome 1378*199767f8SToomas Soome ASSERT(total_errors < rm->rm_firstdatacol); 1379*199767f8SToomas Soome 1380*199767f8SToomas Soome /* 1381*199767f8SToomas Soome * This simplifies one edge condition. 
1382*199767f8SToomas Soome */ 1383*199767f8SToomas Soome tgts[-1] = -1; 1384*199767f8SToomas Soome 1385*199767f8SToomas Soome for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1386*199767f8SToomas Soome /* 1387*199767f8SToomas Soome * Initialize the targets array by finding the first n columns 1388*199767f8SToomas Soome * that contain no error. 1389*199767f8SToomas Soome * 1390*199767f8SToomas Soome * If there were no data errors, we need to ensure that we're 1391*199767f8SToomas Soome * always explicitly attempting to reconstruct at least one 1392*199767f8SToomas Soome * data column. To do this, we simply push the highest target 1393*199767f8SToomas Soome * up into the data columns. 1394*199767f8SToomas Soome */ 1395*199767f8SToomas Soome for (c = 0, i = 0; i < n; i++) { 1396*199767f8SToomas Soome if (i == n - 1 && data_errors == 0 && 1397*199767f8SToomas Soome c < rm->rm_firstdatacol) { 1398*199767f8SToomas Soome c = rm->rm_firstdatacol; 1399*199767f8SToomas Soome } 1400*199767f8SToomas Soome 1401*199767f8SToomas Soome while (rm->rm_col[c].rc_error != 0) { 1402*199767f8SToomas Soome c++; 1403*199767f8SToomas Soome ASSERT3S(c, <, rm->rm_cols); 1404*199767f8SToomas Soome } 1405*199767f8SToomas Soome 1406*199767f8SToomas Soome tgts[i] = c++; 1407*199767f8SToomas Soome } 1408*199767f8SToomas Soome 1409*199767f8SToomas Soome /* 1410*199767f8SToomas Soome * Setting tgts[n] simplifies the other edge condition. 1411*199767f8SToomas Soome */ 1412*199767f8SToomas Soome tgts[n] = rm->rm_cols; 1413*199767f8SToomas Soome 1414*199767f8SToomas Soome /* 1415*199767f8SToomas Soome * These buffers were allocated in previous iterations. 
1416*199767f8SToomas Soome */ 1417*199767f8SToomas Soome for (i = 0; i < n - 1; i++) { 1418*199767f8SToomas Soome ASSERT(orig[i] != NULL); 1419*199767f8SToomas Soome } 1420*199767f8SToomas Soome 1421*199767f8SToomas Soome orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size); 1422*199767f8SToomas Soome 1423*199767f8SToomas Soome current = 0; 1424*199767f8SToomas Soome next = tgts[current]; 1425*199767f8SToomas Soome 1426*199767f8SToomas Soome while (current != n) { 1427*199767f8SToomas Soome tgts[current] = next; 1428*199767f8SToomas Soome current = 0; 1429*199767f8SToomas Soome 1430*199767f8SToomas Soome /* 1431*199767f8SToomas Soome * Save off the original data that we're going to 1432*199767f8SToomas Soome * attempt to reconstruct. 1433*199767f8SToomas Soome */ 1434*199767f8SToomas Soome for (i = 0; i < n; i++) { 1435*199767f8SToomas Soome ASSERT(orig[i] != NULL); 1436*199767f8SToomas Soome c = tgts[i]; 1437*199767f8SToomas Soome ASSERT3S(c, >=, 0); 1438*199767f8SToomas Soome ASSERT3S(c, <, rm->rm_cols); 1439*199767f8SToomas Soome rc = &rm->rm_col[c]; 1440*199767f8SToomas Soome bcopy(rc->rc_data, orig[i], rc->rc_size); 1441*199767f8SToomas Soome } 1442*199767f8SToomas Soome 1443*199767f8SToomas Soome /* 1444*199767f8SToomas Soome * Attempt a reconstruction and exit the outer loop on 1445*199767f8SToomas Soome * success. 
1446*199767f8SToomas Soome */ 1447*199767f8SToomas Soome code = vdev_raidz_reconstruct(rm, tgts, n); 1448*199767f8SToomas Soome if (raidz_checksum_verify(bp, data, bytes) == 0) { 1449*199767f8SToomas Soome for (i = 0; i < n; i++) { 1450*199767f8SToomas Soome c = tgts[i]; 1451*199767f8SToomas Soome rc = &rm->rm_col[c]; 1452*199767f8SToomas Soome ASSERT(rc->rc_error == 0); 1453*199767f8SToomas Soome rc->rc_error = ECKSUM; 1454*199767f8SToomas Soome } 1455*199767f8SToomas Soome 1456*199767f8SToomas Soome ret = code; 1457*199767f8SToomas Soome goto done; 1458*199767f8SToomas Soome } 1459*199767f8SToomas Soome 1460*199767f8SToomas Soome /* 1461*199767f8SToomas Soome * Restore the original data. 1462*199767f8SToomas Soome */ 1463*199767f8SToomas Soome for (i = 0; i < n; i++) { 1464*199767f8SToomas Soome c = tgts[i]; 1465*199767f8SToomas Soome rc = &rm->rm_col[c]; 1466*199767f8SToomas Soome bcopy(orig[i], rc->rc_data, rc->rc_size); 1467*199767f8SToomas Soome } 1468*199767f8SToomas Soome 1469*199767f8SToomas Soome do { 1470*199767f8SToomas Soome /* 1471*199767f8SToomas Soome * Find the next valid column after the current 1472*199767f8SToomas Soome * position.. 1473*199767f8SToomas Soome */ 1474*199767f8SToomas Soome for (next = tgts[current] + 1; 1475*199767f8SToomas Soome next < rm->rm_cols && 1476*199767f8SToomas Soome rm->rm_col[next].rc_error != 0; next++) 1477*199767f8SToomas Soome continue; 1478*199767f8SToomas Soome 1479*199767f8SToomas Soome ASSERT(next <= tgts[current + 1]); 1480*199767f8SToomas Soome 1481*199767f8SToomas Soome /* 1482*199767f8SToomas Soome * If that spot is available, we're done here. 1483*199767f8SToomas Soome */ 1484*199767f8SToomas Soome if (next != tgts[current + 1]) 1485*199767f8SToomas Soome break; 1486*199767f8SToomas Soome 1487*199767f8SToomas Soome /* 1488*199767f8SToomas Soome * Otherwise, find the next valid column after 1489*199767f8SToomas Soome * the previous position. 
1490*199767f8SToomas Soome */ 1491*199767f8SToomas Soome for (c = tgts[current - 1] + 1; 1492*199767f8SToomas Soome rm->rm_col[c].rc_error != 0; c++) 1493*199767f8SToomas Soome continue; 1494*199767f8SToomas Soome 1495*199767f8SToomas Soome tgts[current] = c; 1496*199767f8SToomas Soome current++; 1497*199767f8SToomas Soome 1498*199767f8SToomas Soome } while (current != n); 1499*199767f8SToomas Soome } 1500*199767f8SToomas Soome } 1501*199767f8SToomas Soome n--; 1502*199767f8SToomas Soome done: 1503*199767f8SToomas Soome for (i = n - 1; i >= 0; i--) { 1504*199767f8SToomas Soome zfs_free(orig[i], rm->rm_col[0].rc_size); 1505*199767f8SToomas Soome } 1506*199767f8SToomas Soome 1507*199767f8SToomas Soome return (ret); 1508*199767f8SToomas Soome } 1509*199767f8SToomas Soome 1510*199767f8SToomas Soome static int 1511*199767f8SToomas Soome vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1512*199767f8SToomas Soome off_t offset, size_t bytes) 1513*199767f8SToomas Soome { 1514*199767f8SToomas Soome vdev_t *tvd = vd->v_top; 1515*199767f8SToomas Soome vdev_t *cvd; 1516*199767f8SToomas Soome raidz_map_t *rm; 1517*199767f8SToomas Soome raidz_col_t *rc; 1518*199767f8SToomas Soome int c, error; 1519*199767f8SToomas Soome int unexpected_errors; 1520*199767f8SToomas Soome int parity_errors; 1521*199767f8SToomas Soome int parity_untried; 1522*199767f8SToomas Soome int data_errors; 1523*199767f8SToomas Soome int total_errors; 1524*199767f8SToomas Soome int n; 1525*199767f8SToomas Soome int tgts[VDEV_RAIDZ_MAXPARITY]; 1526*199767f8SToomas Soome int code; 1527*199767f8SToomas Soome 1528*199767f8SToomas Soome rc = NULL; /* gcc */ 1529*199767f8SToomas Soome error = 0; 1530*199767f8SToomas Soome 1531*199767f8SToomas Soome rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1532*199767f8SToomas Soome vd->v_nchildren, vd->v_nparity); 1533*199767f8SToomas Soome 1534*199767f8SToomas Soome /* 1535*199767f8SToomas Soome * Iterate over the columns in reverse order so that we 
hit the parity 1536*199767f8SToomas Soome * last -- any errors along the way will force us to read the parity. 1537*199767f8SToomas Soome */ 1538*199767f8SToomas Soome for (c = rm->rm_cols - 1; c >= 0; c--) { 1539*199767f8SToomas Soome rc = &rm->rm_col[c]; 1540*199767f8SToomas Soome cvd = vdev_child(vd, rc->rc_devidx); 1541*199767f8SToomas Soome if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1542*199767f8SToomas Soome if (c >= rm->rm_firstdatacol) 1543*199767f8SToomas Soome rm->rm_missingdata++; 1544*199767f8SToomas Soome else 1545*199767f8SToomas Soome rm->rm_missingparity++; 1546*199767f8SToomas Soome rc->rc_error = ENXIO; 1547*199767f8SToomas Soome rc->rc_tried = 1; /* don't even try */ 1548*199767f8SToomas Soome rc->rc_skipped = 1; 1549*199767f8SToomas Soome continue; 1550*199767f8SToomas Soome } 1551*199767f8SToomas Soome #if 0 /* XXX: Too hard for the boot code. */ 1552*199767f8SToomas Soome if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1553*199767f8SToomas Soome if (c >= rm->rm_firstdatacol) 1554*199767f8SToomas Soome rm->rm_missingdata++; 1555*199767f8SToomas Soome else 1556*199767f8SToomas Soome rm->rm_missingparity++; 1557*199767f8SToomas Soome rc->rc_error = ESTALE; 1558*199767f8SToomas Soome rc->rc_skipped = 1; 1559*199767f8SToomas Soome continue; 1560*199767f8SToomas Soome } 1561*199767f8SToomas Soome #endif 1562*199767f8SToomas Soome if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1563*199767f8SToomas Soome rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1564*199767f8SToomas Soome rc->rc_offset, rc->rc_size); 1565*199767f8SToomas Soome rc->rc_tried = 1; 1566*199767f8SToomas Soome rc->rc_skipped = 0; 1567*199767f8SToomas Soome } 1568*199767f8SToomas Soome } 1569*199767f8SToomas Soome 1570*199767f8SToomas Soome reconstruct: 1571*199767f8SToomas Soome unexpected_errors = 0; 1572*199767f8SToomas Soome parity_errors = 0; 1573*199767f8SToomas Soome parity_untried = 0; 1574*199767f8SToomas Soome data_errors = 0; 
1575*199767f8SToomas Soome total_errors = 0; 1576*199767f8SToomas Soome 1577*199767f8SToomas Soome ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1578*199767f8SToomas Soome ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1579*199767f8SToomas Soome 1580*199767f8SToomas Soome for (c = 0; c < rm->rm_cols; c++) { 1581*199767f8SToomas Soome rc = &rm->rm_col[c]; 1582*199767f8SToomas Soome 1583*199767f8SToomas Soome if (rc->rc_error) { 1584*199767f8SToomas Soome ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1585*199767f8SToomas Soome 1586*199767f8SToomas Soome if (c < rm->rm_firstdatacol) 1587*199767f8SToomas Soome parity_errors++; 1588*199767f8SToomas Soome else 1589*199767f8SToomas Soome data_errors++; 1590*199767f8SToomas Soome 1591*199767f8SToomas Soome if (!rc->rc_skipped) 1592*199767f8SToomas Soome unexpected_errors++; 1593*199767f8SToomas Soome 1594*199767f8SToomas Soome total_errors++; 1595*199767f8SToomas Soome } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1596*199767f8SToomas Soome parity_untried++; 1597*199767f8SToomas Soome } 1598*199767f8SToomas Soome } 1599*199767f8SToomas Soome 1600*199767f8SToomas Soome /* 1601*199767f8SToomas Soome * There are three potential phases for a read: 1602*199767f8SToomas Soome * 1. produce valid data from the columns read 1603*199767f8SToomas Soome * 2. read all disks and try again 1604*199767f8SToomas Soome * 3. perform combinatorial reconstruction 1605*199767f8SToomas Soome * 1606*199767f8SToomas Soome * Each phase is progressively both more expensive and less likely to 1607*199767f8SToomas Soome * occur. If we encounter more errors than we can repair or all phases 1608*199767f8SToomas Soome * fail, we have no choice but to return an error. 
1609*199767f8SToomas Soome */ 1610*199767f8SToomas Soome 1611*199767f8SToomas Soome /* 1612*199767f8SToomas Soome * If the number of errors we saw was correctable -- less than or equal 1613*199767f8SToomas Soome * to the number of parity disks read -- attempt to produce data that 1614*199767f8SToomas Soome * has a valid checksum. Naturally, this case applies in the absence of 1615*199767f8SToomas Soome * any errors. 1616*199767f8SToomas Soome */ 1617*199767f8SToomas Soome if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1618*199767f8SToomas Soome if (data_errors == 0) { 1619*199767f8SToomas Soome if (raidz_checksum_verify(bp, data, bytes) == 0) { 1620*199767f8SToomas Soome /* 1621*199767f8SToomas Soome * If we read parity information (unnecessarily 1622*199767f8SToomas Soome * as it happens since no reconstruction was 1623*199767f8SToomas Soome * needed) regenerate and verify the parity. 1624*199767f8SToomas Soome * We also regenerate parity when resilvering 1625*199767f8SToomas Soome * so we can write it out to the failed device 1626*199767f8SToomas Soome * later. 1627*199767f8SToomas Soome */ 1628*199767f8SToomas Soome if (parity_errors + parity_untried < 1629*199767f8SToomas Soome rm->rm_firstdatacol) { 1630*199767f8SToomas Soome n = raidz_parity_verify(rm); 1631*199767f8SToomas Soome unexpected_errors += n; 1632*199767f8SToomas Soome ASSERT(parity_errors + n <= 1633*199767f8SToomas Soome rm->rm_firstdatacol); 1634*199767f8SToomas Soome } 1635*199767f8SToomas Soome goto done; 1636*199767f8SToomas Soome } 1637*199767f8SToomas Soome } else { 1638*199767f8SToomas Soome /* 1639*199767f8SToomas Soome * We either attempt to read all the parity columns or 1640*199767f8SToomas Soome * none of them. If we didn't try to read parity, we 1641*199767f8SToomas Soome * wouldn't be here in the correctable case. 
There must 1642*199767f8SToomas Soome * also have been fewer parity errors than parity 1643*199767f8SToomas Soome * columns or, again, we wouldn't be in this code path. 1644*199767f8SToomas Soome */ 1645*199767f8SToomas Soome ASSERT(parity_untried == 0); 1646*199767f8SToomas Soome ASSERT(parity_errors < rm->rm_firstdatacol); 1647*199767f8SToomas Soome 1648*199767f8SToomas Soome /* 1649*199767f8SToomas Soome * Identify the data columns that reported an error. 1650*199767f8SToomas Soome */ 1651*199767f8SToomas Soome n = 0; 1652*199767f8SToomas Soome for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1653*199767f8SToomas Soome rc = &rm->rm_col[c]; 1654*199767f8SToomas Soome if (rc->rc_error != 0) { 1655*199767f8SToomas Soome ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1656*199767f8SToomas Soome tgts[n++] = c; 1657*199767f8SToomas Soome } 1658*199767f8SToomas Soome } 1659*199767f8SToomas Soome 1660*199767f8SToomas Soome ASSERT(rm->rm_firstdatacol >= n); 1661*199767f8SToomas Soome 1662*199767f8SToomas Soome code = vdev_raidz_reconstruct(rm, tgts, n); 1663*199767f8SToomas Soome 1664*199767f8SToomas Soome if (raidz_checksum_verify(bp, data, bytes) == 0) { 1665*199767f8SToomas Soome /* 1666*199767f8SToomas Soome * If we read more parity disks than were used 1667*199767f8SToomas Soome * for reconstruction, confirm that the other 1668*199767f8SToomas Soome * parity disks produced correct data. This 1669*199767f8SToomas Soome * routine is suboptimal in that it regenerates 1670*199767f8SToomas Soome * the parity that we already used in addition 1671*199767f8SToomas Soome * to the parity that we're attempting to 1672*199767f8SToomas Soome * verify, but this should be a relatively 1673*199767f8SToomas Soome * uncommon case, and can be optimized if it 1674*199767f8SToomas Soome * becomes a problem. Note that we regenerate 1675*199767f8SToomas Soome * parity when resilvering so we can write it 1676*199767f8SToomas Soome * out to failed devices later. 
1677*199767f8SToomas Soome */ 1678*199767f8SToomas Soome if (parity_errors < rm->rm_firstdatacol - n) { 1679*199767f8SToomas Soome n = raidz_parity_verify(rm); 1680*199767f8SToomas Soome unexpected_errors += n; 1681*199767f8SToomas Soome ASSERT(parity_errors + n <= 1682*199767f8SToomas Soome rm->rm_firstdatacol); 1683*199767f8SToomas Soome } 1684*199767f8SToomas Soome 1685*199767f8SToomas Soome goto done; 1686*199767f8SToomas Soome } 1687*199767f8SToomas Soome } 1688*199767f8SToomas Soome } 1689*199767f8SToomas Soome 1690*199767f8SToomas Soome /* 1691*199767f8SToomas Soome * This isn't a typical situation -- either we got a read 1692*199767f8SToomas Soome * error or a child silently returned bad data. Read every 1693*199767f8SToomas Soome * block so we can try again with as much data and parity as 1694*199767f8SToomas Soome * we can track down. If we've already been through once 1695*199767f8SToomas Soome * before, all children will be marked as tried so we'll 1696*199767f8SToomas Soome * proceed to combinatorial reconstruction. 
1697*199767f8SToomas Soome */ 1698*199767f8SToomas Soome unexpected_errors = 1; 1699*199767f8SToomas Soome rm->rm_missingdata = 0; 1700*199767f8SToomas Soome rm->rm_missingparity = 0; 1701*199767f8SToomas Soome 1702*199767f8SToomas Soome n = 0; 1703*199767f8SToomas Soome for (c = 0; c < rm->rm_cols; c++) { 1704*199767f8SToomas Soome rc = &rm->rm_col[c]; 1705*199767f8SToomas Soome 1706*199767f8SToomas Soome if (rc->rc_tried) 1707*199767f8SToomas Soome continue; 1708*199767f8SToomas Soome 1709*199767f8SToomas Soome cvd = vdev_child(vd, rc->rc_devidx); 1710*199767f8SToomas Soome ASSERT(cvd != NULL); 1711*199767f8SToomas Soome rc->rc_error = cvd->v_read(cvd, NULL, 1712*199767f8SToomas Soome rc->rc_data, rc->rc_offset, rc->rc_size); 1713*199767f8SToomas Soome if (rc->rc_error == 0) 1714*199767f8SToomas Soome n++; 1715*199767f8SToomas Soome rc->rc_tried = 1; 1716*199767f8SToomas Soome rc->rc_skipped = 0; 1717*199767f8SToomas Soome } 1718*199767f8SToomas Soome /* 1719*199767f8SToomas Soome * If we managed to read anything more, retry the 1720*199767f8SToomas Soome * reconstruction. 1721*199767f8SToomas Soome */ 1722*199767f8SToomas Soome if (n > 0) 1723*199767f8SToomas Soome goto reconstruct; 1724*199767f8SToomas Soome 1725*199767f8SToomas Soome /* 1726*199767f8SToomas Soome * At this point we've attempted to reconstruct the data given the 1727*199767f8SToomas Soome * errors we detected, and we've attempted to read all columns. There 1728*199767f8SToomas Soome * must, therefore, be one or more additional problems -- silent errors 1729*199767f8SToomas Soome * resulting in invalid data rather than explicit I/O errors resulting 1730*199767f8SToomas Soome * in absent data. We check if there is enough additional data to 1731*199767f8SToomas Soome * possibly reconstruct the data and then perform combinatorial 1732*199767f8SToomas Soome * reconstruction over all possible combinations. If that fails, 1733*199767f8SToomas Soome * we're cooked. 
1734*199767f8SToomas Soome */ 1735*199767f8SToomas Soome if (total_errors > rm->rm_firstdatacol) { 1736*199767f8SToomas Soome error = EIO; 1737*199767f8SToomas Soome } else if (total_errors < rm->rm_firstdatacol && 1738*199767f8SToomas Soome (code = vdev_raidz_combrec(rm, bp, data, offset, bytes, 1739*199767f8SToomas Soome total_errors, data_errors)) != 0) { 1740*199767f8SToomas Soome /* 1741*199767f8SToomas Soome * If we didn't use all the available parity for the 1742*199767f8SToomas Soome * combinatorial reconstruction, verify that the remaining 1743*199767f8SToomas Soome * parity is correct. 1744*199767f8SToomas Soome */ 1745*199767f8SToomas Soome if (code != (1 << rm->rm_firstdatacol) - 1) 1746*199767f8SToomas Soome (void) raidz_parity_verify(rm); 1747*199767f8SToomas Soome } else { 1748*199767f8SToomas Soome /* 1749*199767f8SToomas Soome * We're here because either: 1750*199767f8SToomas Soome * 1751*199767f8SToomas Soome * total_errors == rm_first_datacol, or 1752*199767f8SToomas Soome * vdev_raidz_combrec() failed 1753*199767f8SToomas Soome * 1754*199767f8SToomas Soome * In either case, there is enough bad data to prevent 1755*199767f8SToomas Soome * reconstruction. 1756*199767f8SToomas Soome * 1757*199767f8SToomas Soome * Start checksum ereports for all children which haven't 1758*199767f8SToomas Soome * failed, and the IO wasn't speculative. 1759*199767f8SToomas Soome */ 1760*199767f8SToomas Soome error = ECKSUM; 1761*199767f8SToomas Soome } 1762*199767f8SToomas Soome 1763*199767f8SToomas Soome done: 1764*199767f8SToomas Soome vdev_raidz_map_free(rm); 1765*199767f8SToomas Soome 1766*199767f8SToomas Soome return (error); 1767*199767f8SToomas Soome } 1768