1 /* 2 * GRUB -- GRand Unified Bootloader 3 * Copyright (C) 1999,2000,2001,2002,2003,2004 Free Software Foundation, Inc. 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 */ 19 20 /* 21 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 /* 26 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 27 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 28 */ 29 30 /* 31 * The zfs plug-in routines for GRUB are: 32 * 33 * zfs_mount() - locates a valid uberblock of the root pool and reads 34 * in its MOS at the memory address MOS. 35 * 36 * zfs_open() - locates a plain file object by following the MOS 37 * and places its dnode at the memory address DNODE. 38 * 39 * zfs_read() - read in the data blocks pointed by the DNODE. 40 * 41 * ZFS_SCRATCH is used as a working area. 42 * 43 * (memory addr) MOS DNODE ZFS_SCRATCH 44 * | | | 45 * +-------V---------V----------V---------------+ 46 * memory | | dnode | dnode | scratch | 47 * | | 512B | 512B | area | 48 * +--------------------------------------------+ 49 */ 50 51 #ifdef FSYS_ZFS 52 53 #include "shared.h" 54 #include "filesys.h" 55 #include "fsys_zfs.h" 56 57 /* cache for a file block of the currently zfs_open()-ed file */ 58 static void *file_buf = NULL; 59 static uint64_t file_start = 0; 60 static uint64_t file_end = 0; 61 62 /* cache for a dnode block */ 63 static dnode_phys_t *dnode_buf = NULL; 64 static dnode_phys_t *dnode_mdn = NULL; 65 static uint64_t dnode_start = 0; 66 static uint64_t dnode_end = 0; 67 68 static uint64_t pool_guid = 0; 69 static uberblock_t current_uberblock; 70 static char *stackbase; 71 72 decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] = 73 { 74 {"inherit", 0}, /* ZIO_COMPRESS_INHERIT */ 75 {"on", lzjb_decompress}, /* ZIO_COMPRESS_ON */ 76 {"off", 0}, /* ZIO_COMPRESS_OFF */ 77 {"lzjb", lzjb_decompress}, /* ZIO_COMPRESS_LZJB */ 78 {"empty", 0}, /* ZIO_COMPRESS_EMPTY */ 79 {"gzip-1", 0}, /* ZIO_COMPRESS_GZIP_1 */ 80 {"gzip-2", 0}, /* ZIO_COMPRESS_GZIP_2 */ 81 {"gzip-3", 0}, /* ZIO_COMPRESS_GZIP_3 */ 82 {"gzip-4", 0}, /* ZIO_COMPRESS_GZIP_4 */ 83 {"gzip-5", 0}, /* ZIO_COMPRESS_GZIP_5 */ 84 {"gzip-6", 0}, /* ZIO_COMPRESS_GZIP_6 */ 85 {"gzip-7", 0}, /* ZIO_COMPRESS_GZIP_7 */ 86 {"gzip-8", 0}, /* ZIO_COMPRESS_GZIP_8 */ 87 {"gzip-9", 0}, /* ZIO_COMPRESS_GZIP_9 */ 88 {"zle", 0}, /* ZIO_COMPRESS_ZLE */ 89 {"lz4", lz4_decompress} /* ZIO_COMPRESS_LZ4 */ 90 }; 91 92 static int zio_read_data(blkptr_t *bp, void *buf, char *stack); 93 94 /* 95 * Our own version of bcmp(). 96 */ 97 static int 98 zfs_bcmp(const void *s1, const void *s2, size_t n) 99 { 100 const uchar_t *ps1 = s1; 101 const uchar_t *ps2 = s2; 102 103 if (s1 != s2 && n != 0) { 104 do { 105 if (*ps1++ != *ps2++) 106 return (1); 107 } while (--n != 0); 108 } 109 110 return (0); 111 } 112 113 /* 114 * Our own version of log2(). Same thing as highbit()-1. 115 */ 116 static int 117 zfs_log2(uint64_t num) 118 { 119 int i = 0; 120 121 while (num > 1) { 122 i++; 123 num = num >> 1; 124 } 125 126 return (i); 127 } 128 129 /* Checksum Functions */ 130 static void 131 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) 132 { 133 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 134 } 135 136 /* Checksum Table and Values */ 137 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 138 {{NULL, NULL}, 0, 0, "inherit"}, 139 {{NULL, NULL}, 0, 0, "on"}, 140 {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"}, 141 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"}, 142 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"}, 143 {{NULL, NULL}, 0, 0, "zilog"}, 144 {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"}, 145 {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"}, 146 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"}, 147 {{NULL, NULL}, 0, 0, "zilog2"}, 148 {{zio_checksum_off, zio_checksum_off}, 0, 0, "noparity"}, 149 {{zio_checksum_SHA512, NULL}, 0, 0, "SHA512"} 150 }; 151 152 /* 153 * zio_checksum_verify: Provides support for checksum verification. 154 * 155 * Fletcher2, Fletcher4, SHA-256 and SHA-512/256 are supported. 156 * 157 * Return: 158 * -1 = Failure 159 * 0 = Success 160 */ 161 static int 162 zio_checksum_verify(blkptr_t *bp, char *data, int size) 163 { 164 zio_cksum_t zc = bp->blk_cksum; 165 uint32_t checksum = BP_GET_CHECKSUM(bp); 166 int byteswap = BP_SHOULD_BYTESWAP(bp); 167 zio_eck_t *zec = (zio_eck_t *)(data + size) - 1; 168 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 169 zio_cksum_t actual_cksum, expected_cksum; 170 171 if (byteswap) { 172 grub_printf("byteswap not supported\n"); 173 return (-1); 174 } 175 176 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) { 177 grub_printf("checksum algorithm %u not supported\n", checksum); 178 return (-1); 179 } 180 181 if (ci->ci_eck) { 182 expected_cksum = zec->zec_cksum; 183 zec->zec_cksum = zc; 184 ci->ci_func[0](data, size, &actual_cksum); 185 zec->zec_cksum = expected_cksum; 186 zc = expected_cksum; 187 } else { 188 ci->ci_func[byteswap](data, size, &actual_cksum); 189 } 190 191 if ((actual_cksum.zc_word[0] - zc.zc_word[0]) | 192 (actual_cksum.zc_word[1] - zc.zc_word[1]) | 193 (actual_cksum.zc_word[2] - zc.zc_word[2]) | 194 (actual_cksum.zc_word[3] - zc.zc_word[3])) 195 return (-1); 196 197 return (0); 198 } 199 200 /* 201 * vdev_label_start returns the physical disk offset (in bytes) of 202 * label "l". 203 */ 204 static uint64_t 205 vdev_label_start(uint64_t psize, int l) 206 { 207 return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? 208 0 : psize - VDEV_LABELS * sizeof (vdev_label_t))); 209 } 210 211 /* 212 * vdev_uberblock_compare takes two uberblock structures and returns an integer 213 * indicating the more recent of the two. 214 * Return Value = 1 if ub2 is more recent 215 * Return Value = -1 if ub1 is more recent 216 * The most recent uberblock is determined using its transaction number and 217 * timestamp. The uberblock with the highest transaction number is 218 * considered "newer". If the transaction numbers of the two blocks match, the 219 * timestamps are compared to determine the "newer" of the two. 220 */ 221 static int 222 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) 223 { 224 if (ub1->ub_txg < ub2->ub_txg) 225 return (-1); 226 if (ub1->ub_txg > ub2->ub_txg) 227 return (1); 228 229 if (ub1->ub_timestamp < ub2->ub_timestamp) 230 return (-1); 231 if (ub1->ub_timestamp > ub2->ub_timestamp) 232 return (1); 233 234 return (0); 235 } 236 237 /* 238 * Three pieces of information are needed to verify an uberblock: the magic 239 * number, the version number, and the checksum. 240 * 241 * Return: 242 * 0 - Success 243 * -1 - Failure 244 */ 245 static int 246 uberblock_verify(uberblock_t *uber, uint64_t ub_size, uint64_t offset) 247 { 248 blkptr_t bp; 249 250 BP_ZERO(&bp); 251 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); 252 BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); 253 ZIO_SET_CHECKSUM(&bp.blk_cksum, offset, 0, 0, 0); 254 255 if (zio_checksum_verify(&bp, (char *)uber, ub_size) != 0) 256 return (-1); 257 258 if (uber->ub_magic == UBERBLOCK_MAGIC && 259 SPA_VERSION_IS_SUPPORTED(uber->ub_version)) 260 return (0); 261 262 return (-1); 263 } 264 265 /* 266 * Find the best uberblock. 267 * Return: 268 * Success - Pointer to the best uberblock. 269 * Failure - NULL 270 */ 271 static uberblock_t * 272 find_bestub(char *ub_array, uint64_t ashift, uint64_t sector) 273 { 274 uberblock_t *ubbest = NULL; 275 uberblock_t *ubnext; 276 uint64_t offset, ub_size; 277 int i; 278 279 ub_size = VDEV_UBERBLOCK_SIZE(ashift); 280 281 for (i = 0; i < VDEV_UBERBLOCK_COUNT(ashift); i++) { 282 ubnext = (uberblock_t *)ub_array; 283 ub_array += ub_size; 284 offset = (sector << SPA_MINBLOCKSHIFT) + 285 VDEV_UBERBLOCK_OFFSET(ashift, i); 286 287 if (uberblock_verify(ubnext, ub_size, offset) != 0) 288 continue; 289 290 if (ubbest == NULL || 291 vdev_uberblock_compare(ubnext, ubbest) > 0) 292 ubbest = ubnext; 293 } 294 295 return (ubbest); 296 } 297 298 /* 299 * Read a block of data based on the gang block address dva, 300 * and put its data in buf. 301 * 302 * Return: 303 * 0 - success 304 * 1 - failure 305 */ 306 static int 307 zio_read_gang(blkptr_t *bp, dva_t *dva, void *buf, char *stack) 308 { 309 zio_gbh_phys_t *zio_gb; 310 uint64_t offset, sector; 311 blkptr_t tmpbp; 312 int i; 313 314 zio_gb = (zio_gbh_phys_t *)stack; 315 stack += SPA_GANGBLOCKSIZE; 316 offset = DVA_GET_OFFSET(dva); 317 sector = DVA_OFFSET_TO_PHYS_SECTOR(offset); 318 319 /* read in the gang block header */ 320 if (devread(sector, 0, SPA_GANGBLOCKSIZE, (char *)zio_gb) == 0) { 321 grub_printf("failed to read in a gang block header\n"); 322 return (1); 323 } 324 325 /* self checksuming the gang block header */ 326 BP_ZERO(&tmpbp); 327 BP_SET_CHECKSUM(&tmpbp, ZIO_CHECKSUM_GANG_HEADER); 328 BP_SET_BYTEORDER(&tmpbp, ZFS_HOST_BYTEORDER); 329 ZIO_SET_CHECKSUM(&tmpbp.blk_cksum, DVA_GET_VDEV(dva), 330 DVA_GET_OFFSET(dva), bp->blk_birth, 0); 331 if (zio_checksum_verify(&tmpbp, (char *)zio_gb, SPA_GANGBLOCKSIZE)) { 332 grub_printf("failed to checksum a gang block header\n"); 333 return (1); 334 } 335 336 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 337 if (BP_IS_HOLE(&zio_gb->zg_blkptr[i])) 338 continue; 339 340 if (zio_read_data(&zio_gb->zg_blkptr[i], buf, stack)) 341 return (1); 342 buf += BP_GET_PSIZE(&zio_gb->zg_blkptr[i]); 343 } 344 345 return (0); 346 } 347 348 /* 349 * Read in a block of raw data to buf. 350 * 351 * Return: 352 * 0 - success 353 * 1 - failure 354 */ 355 static int 356 zio_read_data(blkptr_t *bp, void *buf, char *stack) 357 { 358 int i, psize; 359 360 psize = BP_GET_PSIZE(bp); 361 362 /* pick a good dva from the block pointer */ 363 for (i = 0; i < SPA_DVAS_PER_BP; i++) { 364 uint64_t offset, sector; 365 366 if (bp->blk_dva[i].dva_word[0] == 0 && 367 bp->blk_dva[i].dva_word[1] == 0) 368 continue; 369 370 if (DVA_GET_GANG(&bp->blk_dva[i])) { 371 if (zio_read_gang(bp, &bp->blk_dva[i], buf, stack) != 0) 372 continue; 373 } else { 374 /* read in a data block */ 375 offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 376 sector = DVA_OFFSET_TO_PHYS_SECTOR(offset); 377 if (devread(sector, 0, psize, buf) == 0) 378 continue; 379 } 380 381 /* verify that the checksum matches */ 382 if (zio_checksum_verify(bp, buf, psize) == 0) { 383 return (0); 384 } 385 } 386 387 grub_printf("could not read block due to EIO or ECKSUM\n"); 388 return (1); 389 } 390 391 /* 392 * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be 393 * more than BPE_PAYLOAD_SIZE bytes). 394 */ 395 static void 396 decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) 397 { 398 int psize, i; 399 uint8_t *buf8 = buf; 400 uint64_t w = 0; 401 const uint64_t *bp64 = (const uint64_t *)bp; 402 403 psize = BPE_GET_PSIZE(bp); 404 405 /* 406 * Decode the words of the block pointer into the byte array. 407 * Low bits of first word are the first byte (little endian). 408 */ 409 for (i = 0; i < psize; i++) { 410 if (i % sizeof (w) == 0) { 411 /* beginning of a word */ 412 w = *bp64; 413 bp64++; 414 if (!BPE_IS_PAYLOADWORD(bp, bp64)) 415 bp64++; 416 } 417 buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); 418 } 419 } 420 421 /* 422 * Fill in the buffer with the (decompressed) payload of the embedded 423 * blkptr_t. Takes into account compression and byteorder (the payload is 424 * treated as a stream of bytes). 425 * Return 0 on success, or ENOSPC if it won't fit in the buffer. 426 */ 427 static int 428 decode_embedded_bp(const blkptr_t *bp, void *buf) 429 { 430 int comp; 431 int lsize, psize; 432 uint8_t *dst = buf; 433 uint64_t w = 0; 434 435 lsize = BPE_GET_LSIZE(bp); 436 psize = BPE_GET_PSIZE(bp); 437 comp = BP_GET_COMPRESS(bp); 438 439 if (comp != ZIO_COMPRESS_OFF) { 440 uint8_t dstbuf[BPE_PAYLOAD_SIZE]; 441 442 if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS || 443 decomp_table[comp].decomp_func == NULL) { 444 grub_printf("compression algorithm not supported\n"); 445 return (ERR_FSYS_CORRUPT); 446 } 447 448 decode_embedded_bp_compressed(bp, dstbuf); 449 decomp_table[comp].decomp_func(dstbuf, buf, psize, lsize); 450 } else { 451 decode_embedded_bp_compressed(bp, buf); 452 } 453 454 return (0); 455 } 456 457 /* 458 * Read in a block of data, verify its checksum, decompress if needed, 459 * and put the uncompressed data in buf. 460 * 461 * Return: 462 * 0 - success 463 * errnum - failure 464 */ 465 static int 466 zio_read(blkptr_t *bp, void *buf, char *stack) 467 { 468 int lsize, psize, comp; 469 char *retbuf; 470 471 if (BP_IS_EMBEDDED(bp)) { 472 if (BPE_GET_ETYPE(bp) != BP_EMBEDDED_TYPE_DATA) { 473 grub_printf("unsupported embedded BP (type=%u)\n", 474 (int)BPE_GET_ETYPE(bp)); 475 return (ERR_FSYS_CORRUPT); 476 } 477 return (decode_embedded_bp(bp, buf)); 478 } 479 480 comp = BP_GET_COMPRESS(bp); 481 lsize = BP_GET_LSIZE(bp); 482 psize = BP_GET_PSIZE(bp); 483 484 if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS || 485 (comp != ZIO_COMPRESS_OFF && 486 decomp_table[comp].decomp_func == NULL)) { 487 grub_printf("compression algorithm not supported\n"); 488 return (ERR_FSYS_CORRUPT); 489 } 490 491 if ((char *)buf < stack && ((char *)buf) + lsize > stack) { 492 grub_printf("not enough memory to fit %u bytes on stack\n", 493 lsize); 494 return (ERR_WONT_FIT); 495 } 496 497 retbuf = buf; 498 if (comp != ZIO_COMPRESS_OFF) { 499 buf = stack; 500 stack += psize; 501 } 502 503 if (zio_read_data(bp, buf, stack) != 0) { 504 grub_printf("zio_read_data failed\n"); 505 return (ERR_FSYS_CORRUPT); 506 } 507 508 if (comp != ZIO_COMPRESS_OFF) { 509 if (decomp_table[comp].decomp_func(buf, retbuf, psize, 510 lsize) != 0) { 511 grub_printf("zio_read decompression failed\n"); 512 return (ERR_FSYS_CORRUPT); 513 } 514 } 515 516 return (0); 517 } 518 519 /* 520 * Get the block from a block id. 521 * push the block onto the stack. 522 * 523 * Return: 524 * 0 - success 525 * errnum - failure 526 */ 527 static int 528 dmu_read(dnode_phys_t *dn, uint64_t blkid, void *buf, char *stack) 529 { 530 int idx, level; 531 blkptr_t *bp_array = dn->dn_blkptr; 532 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 533 blkptr_t *bp, *tmpbuf; 534 535 bp = (blkptr_t *)stack; 536 stack += sizeof (blkptr_t); 537 538 tmpbuf = (blkptr_t *)stack; 539 stack += 1<<dn->dn_indblkshift; 540 541 for (level = dn->dn_nlevels - 1; level >= 0; level--) { 542 idx = (blkid >> (epbs * level)) & ((1<<epbs)-1); 543 *bp = bp_array[idx]; 544 if (level == 0) 545 tmpbuf = buf; 546 if (BP_IS_HOLE(bp)) { 547 grub_memset(buf, 0, 548 dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); 549 break; 550 } else if (errnum = zio_read(bp, tmpbuf, stack)) { 551 return (errnum); 552 } 553 554 bp_array = tmpbuf; 555 } 556 557 return (0); 558 } 559 560 /* 561 * mzap_lookup: Looks up property described by "name" and returns the value 562 * in "value". 563 * 564 * Return: 565 * 0 - success 566 * errnum - failure 567 */ 568 static int 569 mzap_lookup(mzap_phys_t *zapobj, int objsize, const char *name, 570 uint64_t *value) 571 { 572 int i, chunks; 573 mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk; 574 575 chunks = objsize / MZAP_ENT_LEN - 1; 576 for (i = 0; i < chunks; i++) { 577 if (grub_strcmp(mzap_ent[i].mze_name, name) == 0) { 578 *value = mzap_ent[i].mze_value; 579 return (0); 580 } 581 } 582 583 return (ERR_FSYS_CORRUPT); 584 } 585 586 static uint64_t 587 zap_hash(uint64_t salt, const char *name) 588 { 589 static uint64_t table[256]; 590 const uint8_t *cp; 591 uint8_t c; 592 uint64_t crc = salt; 593 594 if (table[128] == 0) { 595 uint64_t *ct; 596 int i, j; 597 for (i = 0; i < 256; i++) { 598 for (ct = table + i, *ct = i, j = 8; j > 0; j--) 599 *ct = (*ct >> 1) ^ (-(*ct & 1) & 600 ZFS_CRC64_POLY); 601 } 602 } 603 604 if (crc == 0 || table[128] != ZFS_CRC64_POLY) { 605 errnum = ERR_FSYS_CORRUPT; 606 return (0); 607 } 608 609 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 610 crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF]; 611 612 /* 613 * Only use 28 bits, since we need 4 bits in the cookie for the 614 * collision differentiator. We MUST use the high bits, since 615 * those are the ones that we first pay attention to when 616 * choosing the bucket. 617 */ 618 crc &= ~((1ULL << (64 - 28)) - 1); 619 620 return (crc); 621 } 622 623 /* 624 * Only to be used on 8-bit arrays. 625 * array_len is actual len in bytes (not encoded le_value_length). 626 * buf is null-terminated. 627 */ 628 static int 629 zap_leaf_array_equal(zap_leaf_phys_t *l, int blksft, int chunk, 630 int array_len, const char *buf) 631 { 632 int bseen = 0; 633 634 while (bseen < array_len) { 635 struct zap_leaf_array *la = 636 &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array; 637 int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); 638 639 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft)) 640 return (0); 641 642 if (zfs_bcmp(la->la_array, buf + bseen, toread) != 0) 643 break; 644 chunk = la->la_next; 645 bseen += toread; 646 } 647 return (bseen == array_len); 648 } 649 650 /* 651 * Given a zap_leaf_phys_t, walk thru the zap leaf chunks to get the 652 * value for the property "name". 653 * 654 * Return: 655 * 0 - success 656 * errnum - failure 657 */ 658 static int 659 zap_leaf_lookup(zap_leaf_phys_t *l, int blksft, uint64_t h, 660 const char *name, uint64_t *value) 661 { 662 uint16_t chunk; 663 struct zap_leaf_entry *le; 664 665 /* Verify if this is a valid leaf block */ 666 if (l->l_hdr.lh_block_type != ZBT_LEAF) 667 return (ERR_FSYS_CORRUPT); 668 if (l->l_hdr.lh_magic != ZAP_LEAF_MAGIC) 669 return (ERR_FSYS_CORRUPT); 670 671 for (chunk = l->l_hash[LEAF_HASH(blksft, h)]; 672 chunk != CHAIN_END; chunk = le->le_next) { 673 674 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft)) 675 return (ERR_FSYS_CORRUPT); 676 677 le = ZAP_LEAF_ENTRY(l, blksft, chunk); 678 679 /* Verify the chunk entry */ 680 if (le->le_type != ZAP_CHUNK_ENTRY) 681 return (ERR_FSYS_CORRUPT); 682 683 if (le->le_hash != h) 684 continue; 685 686 if (zap_leaf_array_equal(l, blksft, le->le_name_chunk, 687 le->le_name_length, name)) { 688 689 struct zap_leaf_array *la; 690 uint8_t *ip; 691 692 if (le->le_int_size != 8 || le->le_value_length != 1) 693 return (ERR_FSYS_CORRUPT); 694 695 /* get the uint64_t property value */ 696 la = &ZAP_LEAF_CHUNK(l, blksft, 697 le->le_value_chunk).l_array; 698 ip = la->la_array; 699 700 *value = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | 701 (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | 702 (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | 703 (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; 704 705 return (0); 706 } 707 } 708 709 return (ERR_FSYS_CORRUPT); 710 } 711 712 /* 713 * Fat ZAP lookup 714 * 715 * Return: 716 * 0 - success 717 * errnum - failure 718 */ 719 static int 720 fzap_lookup(dnode_phys_t *zap_dnode, zap_phys_t *zap, 721 const char *name, uint64_t *value, char *stack) 722 { 723 zap_leaf_phys_t *l; 724 uint64_t hash, idx, blkid; 725 int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT); 726 727 /* Verify if this is a fat zap header block */ 728 if (zap->zap_magic != (uint64_t)ZAP_MAGIC || 729 zap->zap_flags != 0) 730 return (ERR_FSYS_CORRUPT); 731 732 hash = zap_hash(zap->zap_salt, name); 733 if (errnum) 734 return (errnum); 735 736 /* get block id from index */ 737 if (zap->zap_ptrtbl.zt_numblks != 0) { 738 /* external pointer tables not supported */ 739 return (ERR_FSYS_CORRUPT); 740 } 741 idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift); 742 blkid = ((uint64_t *)zap)[idx + (1<<(blksft-3-1))]; 743 744 /* Get the leaf block */ 745 l = (zap_leaf_phys_t *)stack; 746 stack += 1<<blksft; 747 if ((1<<blksft) < sizeof (zap_leaf_phys_t)) 748 return (ERR_FSYS_CORRUPT); 749 if (errnum = dmu_read(zap_dnode, blkid, l, stack)) 750 return (errnum); 751 752 return (zap_leaf_lookup(l, blksft, hash, name, value)); 753 } 754 755 /* 756 * Read in the data of a zap object and find the value for a matching 757 * property name. 758 * 759 * Return: 760 * 0 - success 761 * errnum - failure 762 */ 763 static int 764 zap_lookup(dnode_phys_t *zap_dnode, const char *name, uint64_t *val, 765 char *stack) 766 { 767 uint64_t block_type; 768 int size; 769 void *zapbuf; 770 771 /* Read in the first block of the zap object data. */ 772 zapbuf = stack; 773 size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 774 stack += size; 775 776 if ((errnum = dmu_read(zap_dnode, 0, zapbuf, stack)) != 0) 777 return (errnum); 778 779 block_type = *((uint64_t *)zapbuf); 780 781 if (block_type == ZBT_MICRO) { 782 return (mzap_lookup(zapbuf, size, name, val)); 783 } else if (block_type == ZBT_HEADER) { 784 /* this is a fat zap */ 785 return (fzap_lookup(zap_dnode, zapbuf, name, 786 val, stack)); 787 } 788 789 return (ERR_FSYS_CORRUPT); 790 } 791 792 typedef struct zap_attribute { 793 int za_integer_length; 794 uint64_t za_num_integers; 795 uint64_t za_first_integer; 796 char *za_name; 797 } zap_attribute_t; 798 799 typedef int (zap_cb_t)(zap_attribute_t *za, void *arg, char *stack); 800 801 static int 802 zap_iterate(dnode_phys_t *zap_dnode, zap_cb_t *cb, void *arg, char *stack) 803 { 804 uint32_t size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 805 zap_attribute_t za; 806 int i; 807 mzap_phys_t *mzp = (mzap_phys_t *)stack; 808 stack += size; 809 810 if ((errnum = dmu_read(zap_dnode, 0, mzp, stack)) != 0) 811 return (errnum); 812 813 /* 814 * Iteration over fatzap objects has not yet been implemented. 815 * If we encounter a pool in which there are more features for 816 * read than can fit inside a microzap (i.e., more than 2048 817 * features for read), we can add support for fatzap iteration. 818 * For now, fail. 819 */ 820 if (mzp->mz_block_type != ZBT_MICRO) { 821 grub_printf("feature information stored in fatzap, pool " 822 "version not supported\n"); 823 return (1); 824 } 825 826 za.za_integer_length = 8; 827 za.za_num_integers = 1; 828 for (i = 0; i < size / MZAP_ENT_LEN - 1; i++) { 829 mzap_ent_phys_t *mzep = &mzp->mz_chunk[i]; 830 int err; 831 832 za.za_first_integer = mzep->mze_value; 833 za.za_name = mzep->mze_name; 834 err = cb(&za, arg, stack); 835 if (err != 0) 836 return (err); 837 } 838 839 return (0); 840 } 841 842 /* 843 * Get the dnode of an object number from the metadnode of an object set. 844 * 845 * Input 846 * mdn - metadnode to get the object dnode 847 * objnum - object number for the object dnode 848 * type - if nonzero, object must be of this type 849 * buf - data buffer that holds the returning dnode 850 * stack - scratch area 851 * 852 * Return: 853 * 0 - success 854 * errnum - failure 855 */ 856 static int 857 dnode_get(dnode_phys_t *mdn, uint64_t objnum, uint8_t type, dnode_phys_t *buf, 858 char *stack) 859 { 860 uint64_t blkid, blksz; /* the block id this object dnode is in */ 861 int epbs; /* shift of number of dnodes in a block */ 862 int idx; /* index within a block */ 863 dnode_phys_t *dnbuf; 864 865 blksz = mdn->dn_datablkszsec << SPA_MINBLOCKSHIFT; 866 epbs = zfs_log2(blksz) - DNODE_SHIFT; 867 blkid = objnum >> epbs; 868 idx = objnum & ((1<<epbs)-1); 869 870 if (dnode_buf != NULL && dnode_mdn == mdn && 871 objnum >= dnode_start && objnum < dnode_end) { 872 grub_memmove(buf, &dnode_buf[idx], DNODE_SIZE); 873 VERIFY_DN_TYPE(buf, type); 874 return (0); 875 } 876 877 if (dnode_buf && blksz == 1<<DNODE_BLOCK_SHIFT) { 878 dnbuf = dnode_buf; 879 dnode_mdn = mdn; 880 dnode_start = blkid << epbs; 881 dnode_end = (blkid + 1) << epbs; 882 } else { 883 dnbuf = (dnode_phys_t *)stack; 884 stack += blksz; 885 } 886 887 if (errnum = dmu_read(mdn, blkid, (char *)dnbuf, stack)) 888 return (errnum); 889 890 grub_memmove(buf, &dnbuf[idx], DNODE_SIZE); 891 VERIFY_DN_TYPE(buf, type); 892 893 return (0); 894 } 895 896 /* 897 * Check if this is a special file that resides at the top 898 * dataset of the pool. Currently this is the GRUB menu, 899 * boot signature and boot signature backup. 900 * str starts with '/'. 901 */ 902 static int 903 is_top_dataset_file(char *str) 904 { 905 char *tptr; 906 907 if ((tptr = grub_strstr(str, "menu.lst")) && 908 (tptr[8] == '\0' || tptr[8] == ' ') && 909 *(tptr-1) == '/') 910 return (1); 911 912 if (grub_strncmp(str, BOOTSIGN_DIR"/", 913 grub_strlen(BOOTSIGN_DIR) + 1) == 0) 914 return (1); 915 916 if (grub_strcmp(str, BOOTSIGN_BACKUP) == 0) 917 return (1); 918 919 return (0); 920 } 921 922 static int 923 check_feature(zap_attribute_t *za, void *arg, char *stack) 924 { 925 const char **names = arg; 926 int i; 927 928 if (za->za_first_integer == 0) 929 return (0); 930 931 for (i = 0; names[i] != NULL; i++) { 932 if (grub_strcmp(za->za_name, names[i]) == 0) { 933 return (0); 934 } 935 } 936 grub_printf("missing feature for read '%s'\n", za->za_name); 937 return (ERR_NEWER_VERSION); 938 } 939 940 /* 941 * Get the file dnode for a given file name where mdn is the meta dnode 942 * for this ZFS object set. When found, place the file dnode in dn. 943 * The 'path' argument will be mangled. 944 * 945 * Return: 946 * 0 - success 947 * errnum - failure 948 */ 949 static int 950 dnode_get_path(dnode_phys_t *mdn, char *path, dnode_phys_t *dn, 951 char *stack) 952 { 953 uint64_t objnum, version; 954 char *cname, ch; 955 956 if (errnum = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE, 957 dn, stack)) 958 return (errnum); 959 960 if (errnum = zap_lookup(dn, ZPL_VERSION_STR, &version, stack)) 961 return (errnum); 962 if (version > ZPL_VERSION) 963 return (-1); 964 965 if (errnum = zap_lookup(dn, ZFS_ROOT_OBJ, &objnum, stack)) 966 return (errnum); 967 968 if (errnum = dnode_get(mdn, objnum, DMU_OT_DIRECTORY_CONTENTS, 969 dn, stack)) 970 return (errnum); 971 972 /* skip leading slashes */ 973 while (*path == '/') 974 path++; 975 976 while (*path && !grub_isspace(*path)) { 977 978 /* get the next component name */ 979 cname = path; 980 while (*path && !grub_isspace(*path) && *path != '/') 981 path++; 982 ch = *path; 983 *path = 0; /* ensure null termination */ 984 985 if (errnum = zap_lookup(dn, cname, &objnum, stack)) 986 return (errnum); 987 988 objnum = ZFS_DIRENT_OBJ(objnum); 989 if (errnum = dnode_get(mdn, objnum, 0, dn, stack)) 990 return (errnum); 991 992 *path = ch; 993 while (*path == '/') 994 path++; 995 } 996 997 /* We found the dnode for this file. Verify if it is a plain file. */ 998 VERIFY_DN_TYPE(dn, DMU_OT_PLAIN_FILE_CONTENTS); 999 1000 return (0); 1001 } 1002 1003 /* 1004 * Get the default 'bootfs' property value from the rootpool. 1005 * 1006 * Return: 1007 * 0 - success 1008 * errnum -failure 1009 */ 1010 static int 1011 get_default_bootfsobj(dnode_phys_t *mosmdn, uint64_t *obj, char *stack) 1012 { 1013 uint64_t objnum = 0; 1014 dnode_phys_t *dn = (dnode_phys_t *)stack; 1015 stack += DNODE_SIZE; 1016 1017 if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT, 1018 DMU_OT_OBJECT_DIRECTORY, dn, stack)) 1019 return (errnum); 1020 1021 /* 1022 * find the object number for 'pool_props', and get the dnode 1023 * of the 'pool_props'. 1024 */ 1025 if (zap_lookup(dn, DMU_POOL_PROPS, &objnum, stack)) 1026 return (ERR_FILESYSTEM_NOT_FOUND); 1027 1028 if (errnum = dnode_get(mosmdn, objnum, DMU_OT_POOL_PROPS, dn, stack)) 1029 return (errnum); 1030 1031 if (zap_lookup(dn, ZPOOL_PROP_BOOTFS, &objnum, stack)) 1032 return (ERR_FILESYSTEM_NOT_FOUND); 1033 1034 if (!objnum) 1035 return (ERR_FILESYSTEM_NOT_FOUND); 1036 1037 *obj = objnum; 1038 return (0); 1039 } 1040 1041 /* 1042 * List of pool features that the grub implementation of ZFS supports for 1043 * read. Note that features that are only required for write do not need 1044 * to be listed here since grub opens pools in read-only mode. 1045 * 1046 * When this list is updated the version number in usr/src/grub/capability 1047 * must be incremented to ensure the new grub gets installed. 1048 */ 1049 static const char *spa_feature_names[] = { 1050 "org.illumos:lz4_compress", 1051 "com.delphix:hole_birth", 1052 "com.delphix:extensible_dataset", 1053 "com.delphix:embedded_data", 1054 "org.open-zfs:large_blocks", 1055 "org.illumos:sha512", 1056 NULL 1057 }; 1058 1059 /* 1060 * Checks whether the MOS features that are active are supported by this 1061 * (GRUB's) implementation of ZFS. 1062 * 1063 * Return: 1064 * 0: Success. 1065 * errnum: Failure. 1066 */ 1067 static int 1068 check_mos_features(dnode_phys_t *mosmdn, char *stack) 1069 { 1070 uint64_t objnum; 1071 dnode_phys_t *dn; 1072 uint8_t error = 0; 1073 1074 dn = (dnode_phys_t *)stack; 1075 stack += DNODE_SIZE; 1076 1077 if ((errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT, 1078 DMU_OT_OBJECT_DIRECTORY, dn, stack)) != 0) 1079 return (errnum); 1080 1081 /* 1082 * Find the object number for 'features_for_read' and retrieve its 1083 * corresponding dnode. Note that we don't check features_for_write 1084 * because GRUB is not opening the pool for write. 1085 */ 1086 if ((errnum = zap_lookup(dn, DMU_POOL_FEATURES_FOR_READ, &objnum, 1087 stack)) != 0) 1088 return (errnum); 1089 1090 if ((errnum = dnode_get(mosmdn, objnum, DMU_OTN_ZAP_METADATA, 1091 dn, stack)) != 0) 1092 return (errnum); 1093 1094 return (zap_iterate(dn, check_feature, spa_feature_names, stack)); 1095 } 1096 1097 /* 1098 * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname), 1099 * e.g. pool/rootfs, or a given object number (obj), e.g. the object number 1100 * of pool/rootfs. 1101 * 1102 * If no fsname and no obj are given, return the DSL_DIR metadnode. 1103 * If fsname is given, return its metadnode and its matching object number. 1104 * If only obj is given, return the metadnode for this object number. 1105 * 1106 * Return: 1107 * 0 - success 1108 * errnum - failure 1109 */ 1110 static int 1111 get_objset_mdn(dnode_phys_t *mosmdn, char *fsname, uint64_t *obj, 1112 dnode_phys_t *mdn, char *stack) 1113 { 1114 uint64_t objnum, headobj; 1115 char *cname, ch; 1116 blkptr_t *bp; 1117 objset_phys_t *osp; 1118 int issnapshot = 0; 1119 char *snapname; 1120 1121 if (fsname == NULL && obj) { 1122 headobj = *obj; 1123 goto skip; 1124 } 1125 1126 if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT, 1127 DMU_OT_OBJECT_DIRECTORY, mdn, stack)) 1128 return (errnum); 1129 1130 if (errnum = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum, 1131 stack)) 1132 return (errnum); 1133 1134 if (errnum = dnode_get(mosmdn, objnum, 0, mdn, stack)) 1135 return (errnum); 1136 1137 if (fsname == NULL) { 1138 headobj = 1139 ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj; 1140 goto skip; 1141 } 1142 1143 /* take out the pool name */ 1144 while (*fsname && !grub_isspace(*fsname) && *fsname != '/') 1145 fsname++; 1146 1147 while (*fsname && !grub_isspace(*fsname)) { 1148 uint64_t childobj; 1149 1150 while (*fsname == '/') 1151 fsname++; 1152 1153 cname = fsname; 1154 while (*fsname && !grub_isspace(*fsname) && *fsname != '/') 1155 fsname++; 1156 ch = *fsname; 1157 *fsname = 0; 1158 1159 snapname = cname; 1160 while (*snapname && !grub_isspace(*snapname) && *snapname != 1161 '@') 1162 snapname++; 1163 if (*snapname == '@') { 1164 issnapshot = 1; 1165 *snapname = 0; 1166 } 1167 childobj = 1168 ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_child_dir_zapobj; 1169 if (errnum = dnode_get(mosmdn, childobj, 1170 DMU_OT_DSL_DIR_CHILD_MAP, mdn, stack)) 1171 return (errnum); 1172 1173 if (zap_lookup(mdn, cname, &objnum, stack)) 1174 return (ERR_FILESYSTEM_NOT_FOUND); 1175 1176 if (errnum = dnode_get(mosmdn, objnum, 0, 1177 mdn, stack)) 1178 return (errnum); 1179 1180 *fsname = ch; 1181 if (issnapshot) 1182 *snapname = '@'; 1183 } 1184 headobj = ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj; 1185 if (obj) 1186 *obj = headobj; 1187 1188 skip: 1189 if (errnum = dnode_get(mosmdn, headobj, 0, mdn, stack)) 1190 return (errnum); 1191 if (issnapshot) { 1192 uint64_t snapobj; 1193 1194 snapobj = ((dsl_dataset_phys_t *)DN_BONUS(mdn))-> 1195 ds_snapnames_zapobj; 1196 1197 if (errnum = dnode_get(mosmdn, snapobj, 1198 DMU_OT_DSL_DS_SNAP_MAP, mdn, stack)) 1199 return (errnum); 1200 if (zap_lookup(mdn, snapname + 1, &headobj, stack)) 1201 return (ERR_FILESYSTEM_NOT_FOUND); 1202 if (errnum = dnode_get(mosmdn, headobj, 0, mdn, stack)) 1203 return (errnum); 1204 if (obj) 1205 *obj = headobj; 1206 } 1207 1208 bp = &((dsl_dataset_phys_t *)DN_BONUS(mdn))->ds_bp; 1209 osp = (objset_phys_t *)stack; 1210 stack += sizeof (objset_phys_t); 1211 if (errnum = zio_read(bp, osp, stack)) 1212 return (errnum); 1213 1214 grub_memmove((char *)mdn, (char *)&osp->os_meta_dnode, DNODE_SIZE); 1215 1216 return (0); 1217 } 1218 1219 /* 1220 * For a given XDR packed nvlist, verify the first 4 bytes and move on. 1221 * 1222 * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) : 1223 * 1224 * encoding method/host endian (4 bytes) 1225 * nvl_version (4 bytes) 1226 * nvl_nvflag (4 bytes) 1227 * encoded nvpairs: 1228 * encoded size of the nvpair (4 bytes) 1229 * decoded size of the nvpair (4 bytes) 1230 * name string size (4 bytes) 1231 * name string data (sizeof(NV_ALIGN4(string)) 1232 * data type (4 bytes) 1233 * # of elements in the nvpair (4 bytes) 1234 * data 1235 * 2 zero's for the last nvpair 1236 * (end of the entire list) (8 bytes) 1237 * 1238 * Return: 1239 * 0 - success 1240 * 1 - failure 1241 */ 1242 static int 1243 nvlist_unpack(char *nvlist, char **out) 1244 { 1245 /* Verify if the 1st and 2nd byte in the nvlist are valid. */ 1246 if (nvlist[0] != NV_ENCODE_XDR || nvlist[1] != HOST_ENDIAN) 1247 return (1); 1248 1249 *out = nvlist + 4; 1250 return (0); 1251 } 1252 1253 static char * 1254 nvlist_array(char *nvlist, int index) 1255 { 1256 int i, encode_size; 1257 1258 for (i = 0; i < index; i++) { 1259 /* skip the header, nvl_version, and nvl_nvflag */ 1260 nvlist = nvlist + 4 * 2; 1261 1262 while (encode_size = BSWAP_32(*(uint32_t *)nvlist)) 1263 nvlist += encode_size; /* goto the next nvpair */ 1264 1265 nvlist = nvlist + 4 * 2; /* skip the ending 2 zeros - 8 bytes */ 1266 } 1267 1268 return (nvlist); 1269 } 1270 1271 /* 1272 * The nvlist_next_nvpair() function returns a handle to the next nvpair in the 1273 * list following nvpair. If nvpair is NULL, the first pair is returned. If 1274 * nvpair is the last pair in the nvlist, NULL is returned. 1275 */ 1276 static char * 1277 nvlist_next_nvpair(char *nvl, char *nvpair) 1278 { 1279 char *cur, *prev; 1280 int encode_size; 1281 1282 if (nvl == NULL) 1283 return (NULL); 1284 1285 if (nvpair == NULL) { 1286 /* skip over nvl_version and nvl_nvflag */ 1287 nvpair = nvl + 4 * 2; 1288 } else { 1289 /* skip to the next nvpair */ 1290 encode_size = BSWAP_32(*(uint32_t *)nvpair); 1291 nvpair += encode_size; 1292 } 1293 1294 /* 8 bytes of 0 marks the end of the list */ 1295 if (*(uint64_t *)nvpair == 0) 1296 return (NULL); 1297 1298 return (nvpair); 1299 } 1300 1301 /* 1302 * This function returns 0 on success and 1 on failure. On success, a string 1303 * containing the name of nvpair is saved in buf. 1304 */ 1305 static int 1306 nvpair_name(char *nvp, char *buf, int buflen) 1307 { 1308 int len; 1309 1310 /* skip over encode/decode size */ 1311 nvp += 4 * 2; 1312 1313 len = BSWAP_32(*(uint32_t *)nvp); 1314 if (buflen < len + 1) 1315 return (1); 1316 1317 grub_memmove(buf, nvp + 4, len); 1318 buf[len] = '\0'; 1319 1320 return (0); 1321 } 1322 1323 /* 1324 * This function retrieves the value of the nvpair in the form of enumerated 1325 * type data_type_t. This is used to determine the appropriate type to pass to 1326 * nvpair_value(). 1327 */ 1328 static int 1329 nvpair_type(char *nvp) 1330 { 1331 int name_len, type; 1332 1333 /* skip over encode/decode size */ 1334 nvp += 4 * 2; 1335 1336 /* skip over name_len */ 1337 name_len = BSWAP_32(*(uint32_t *)nvp); 1338 nvp += 4; 1339 1340 /* skip over name */ 1341 nvp = nvp + ((name_len + 3) & ~3); /* align */ 1342 1343 type = BSWAP_32(*(uint32_t *)nvp); 1344 1345 return (type); 1346 } 1347 1348 static int 1349 nvpair_value(char *nvp, void *val, int valtype, int *nelmp) 1350 { 1351 int name_len, type, slen; 1352 char *strval = val; 1353 uint64_t *intval = val; 1354 1355 /* skip over encode/decode size */ 1356 nvp += 4 * 2; 1357 1358 /* skip over name_len */ 1359 name_len = BSWAP_32(*(uint32_t *)nvp); 1360 nvp += 4; 1361 1362 /* skip over name */ 1363 nvp = nvp + ((name_len + 3) & ~3); /* align */ 1364 1365 /* skip over type */ 1366 type = BSWAP_32(*(uint32_t *)nvp); 1367 nvp += 4; 1368 1369 if (type == valtype) { 1370 int nelm; 1371 1372 nelm = BSWAP_32(*(uint32_t *)nvp); 1373 if (valtype != DATA_TYPE_BOOLEAN && nelm < 1) 1374 return (1); 1375 nvp += 4; 1376 1377 switch (valtype) { 1378 case DATA_TYPE_BOOLEAN: 1379 return (0); 1380 1381 case DATA_TYPE_STRING: 1382 slen = BSWAP_32(*(uint32_t *)nvp); 1383 nvp += 4; 1384 grub_memmove(strval, nvp, slen); 1385 strval[slen] = '\0'; 1386 return (0); 1387 1388 case DATA_TYPE_UINT64: 1389 *intval = BSWAP_64(*(uint64_t *)nvp); 1390 return (0); 1391 1392 case DATA_TYPE_NVLIST: 1393 *(void **)val = (void *)nvp; 1394 return (0); 1395 1396 case DATA_TYPE_NVLIST_ARRAY: 1397 *(void **)val = (void *)nvp; 1398 if (nelmp) 1399 *nelmp = nelm; 1400 return (0); 1401 } 1402 } 1403 1404 return (1); 1405 } 1406 1407 static int 1408 nvlist_lookup_value(char *nvlist, char *name, void *val, int valtype, 1409 int *nelmp) 1410 { 1411 char *nvpair; 1412 1413 for (nvpair = nvlist_next_nvpair(nvlist, NULL); 1414 nvpair != NULL; 1415 nvpair = nvlist_next_nvpair(nvlist, nvpair)) { 1416 int name_len = BSWAP_32(*(uint32_t *)(nvpair + 4 * 2)); 1417 char *nvp_name = nvpair + 4 * 3; 1418 1419 if ((grub_strncmp(nvp_name, name, name_len) == 0) && 1420 nvpair_type(nvpair) == valtype) { 1421 return (nvpair_value(nvpair, val, valtype, nelmp)); 1422 } 1423 } 1424 return (1); 1425 } 1426 1427 /* 1428 * Check if this vdev is online and is in a good state. 1429 */ 1430 static int 1431 vdev_validate(char *nv) 1432 { 1433 uint64_t ival; 1434 1435 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_OFFLINE, &ival, 1436 DATA_TYPE_UINT64, NULL) == 0 || 1437 nvlist_lookup_value(nv, ZPOOL_CONFIG_FAULTED, &ival, 1438 DATA_TYPE_UINT64, NULL) == 0 || 1439 nvlist_lookup_value(nv, ZPOOL_CONFIG_REMOVED, &ival, 1440 DATA_TYPE_UINT64, NULL) == 0) 1441 return (ERR_DEV_VALUES); 1442 1443 return (0); 1444 } 1445 1446 /* 1447 * Get a valid vdev pathname/devid from the boot device. 1448 * The caller should already allocate MAXPATHLEN memory for bootpath and devid. 1449 */ 1450 static int 1451 vdev_get_bootpath(char *nv, uint64_t inguid, char *devid, char *bootpath, 1452 int is_spare) 1453 { 1454 char type[16]; 1455 1456 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_TYPE, &type, DATA_TYPE_STRING, 1457 NULL)) 1458 return (ERR_FSYS_CORRUPT); 1459 1460 if (grub_strcmp(type, VDEV_TYPE_DISK) == 0) { 1461 uint64_t guid; 1462 1463 if (vdev_validate(nv) != 0) 1464 return (ERR_NO_BOOTPATH); 1465 1466 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_GUID, 1467 &guid, DATA_TYPE_UINT64, NULL) != 0) 1468 return (ERR_NO_BOOTPATH); 1469 1470 if (guid != inguid) 1471 return (ERR_NO_BOOTPATH); 1472 1473 /* for a spare vdev, pick the disk labeled with "is_spare" */ 1474 if (is_spare) { 1475 uint64_t spare = 0; 1476 (void) nvlist_lookup_value(nv, ZPOOL_CONFIG_IS_SPARE, 1477 &spare, DATA_TYPE_UINT64, NULL); 1478 if (!spare) 1479 return (ERR_NO_BOOTPATH); 1480 } 1481 1482 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_PHYS_PATH, 1483 bootpath, DATA_TYPE_STRING, NULL) != 0) 1484 bootpath[0] = '\0'; 1485 1486 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_DEVID, 1487 devid, DATA_TYPE_STRING, NULL) != 0) 1488 devid[0] = '\0'; 1489 1490 if (grub_strlen(bootpath) >= MAXPATHLEN || 1491 grub_strlen(devid) >= MAXPATHLEN) 1492 return (ERR_WONT_FIT); 1493 1494 return (0); 1495 1496 } else if (grub_strcmp(type, VDEV_TYPE_MIRROR) == 0 || 1497 grub_strcmp(type, VDEV_TYPE_REPLACING) == 0 || 1498 (is_spare = (grub_strcmp(type, VDEV_TYPE_SPARE) == 0))) { 1499 int nelm, i; 1500 char *child; 1501 1502 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_CHILDREN, &child, 1503 DATA_TYPE_NVLIST_ARRAY, &nelm)) 1504 return (ERR_FSYS_CORRUPT); 1505 1506 for (i = 0; i < nelm; i++) { 1507 char *child_i; 1508 1509 child_i = nvlist_array(child, i); 1510 if (vdev_get_bootpath(child_i, inguid, devid, 1511 bootpath, is_spare) == 0) 1512 return (0); 1513 } 1514 } 1515 1516 return (ERR_NO_BOOTPATH); 1517 } 1518 1519 /* 1520 * Check the disk label information and retrieve needed vdev name-value pairs. 1521 * 1522 * Return: 1523 * 0 - success 1524 * ERR_* - failure 1525 */ 1526 static int 1527 check_pool_label(uint64_t sector, char *stack, char *outdevid, 1528 char *outpath, uint64_t *outguid, uint64_t *outashift, uint64_t *outversion) 1529 { 1530 vdev_phys_t *vdev; 1531 uint64_t pool_state, txg = 0; 1532 char *nvlist, *nv, *features; 1533 uint64_t diskguid; 1534 1535 sector += (VDEV_SKIP_SIZE >> SPA_MINBLOCKSHIFT); 1536 1537 /* Read in the vdev name-value pair list (112K). */ 1538 if (devread(sector, 0, VDEV_PHYS_SIZE, stack) == 0) 1539 return (ERR_READ); 1540 1541 vdev = (vdev_phys_t *)stack; 1542 stack += sizeof (vdev_phys_t); 1543 1544 if (nvlist_unpack(vdev->vp_nvlist, &nvlist)) 1545 return (ERR_FSYS_CORRUPT); 1546 1547 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_STATE, &pool_state, 1548 DATA_TYPE_UINT64, NULL)) 1549 return (ERR_FSYS_CORRUPT); 1550 1551 if (pool_state == POOL_STATE_DESTROYED) 1552 return (ERR_FILESYSTEM_NOT_FOUND); 1553 1554 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_NAME, 1555 current_rootpool, DATA_TYPE_STRING, NULL)) 1556 return (ERR_FSYS_CORRUPT); 1557 1558 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_TXG, &txg, 1559 DATA_TYPE_UINT64, NULL)) 1560 return (ERR_FSYS_CORRUPT); 1561 1562 /* not an active device */ 1563 if (txg == 0) 1564 return (ERR_NO_BOOTPATH); 1565 1566 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VERSION, outversion, 1567 DATA_TYPE_UINT64, NULL)) 1568 return (ERR_FSYS_CORRUPT); 1569 if (!SPA_VERSION_IS_SUPPORTED(*outversion)) 1570 return (ERR_NEWER_VERSION); 1571 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VDEV_TREE, &nv, 1572 DATA_TYPE_NVLIST, NULL)) 1573 return (ERR_FSYS_CORRUPT); 1574 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_GUID, &diskguid, 1575 DATA_TYPE_UINT64, NULL)) 1576 return (ERR_FSYS_CORRUPT); 1577 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_ASHIFT, outashift, 1578 DATA_TYPE_UINT64, NULL) != 0) 1579 return (ERR_FSYS_CORRUPT); 1580 if (vdev_get_bootpath(nv, diskguid, outdevid, outpath, 0)) 1581 return (ERR_NO_BOOTPATH); 1582 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_GUID, outguid, 1583 DATA_TYPE_UINT64, NULL)) 1584 return (ERR_FSYS_CORRUPT); 1585 1586 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ, 1587 &features, DATA_TYPE_NVLIST, NULL) == 0) { 1588 char *nvp; 1589 char *name = stack; 1590 stack += MAXNAMELEN; 1591 1592 for (nvp = nvlist_next_nvpair(features, NULL); 1593 nvp != NULL; 1594 nvp = nvlist_next_nvpair(features, nvp)) { 1595 zap_attribute_t za; 1596 1597 if (nvpair_name(nvp, name, MAXNAMELEN) != 0) 1598 return (ERR_FSYS_CORRUPT); 1599 1600 za.za_integer_length = 8; 1601 za.za_num_integers = 1; 1602 za.za_first_integer = 1; 1603 za.za_name = name; 1604 if (check_feature(&za, spa_feature_names, stack) != 0) 1605 return (ERR_NEWER_VERSION); 1606 } 1607 } 1608 1609 return (0); 1610 } 1611 1612 /* 1613 * zfs_mount() locates a valid uberblock of the root pool and read in its MOS 1614 * to the memory address MOS. 1615 * 1616 * Return: 1617 * 1 - success 1618 * 0 - failure 1619 */ 1620 int 1621 zfs_mount(void) 1622 { 1623 char *stack, *ub_array; 1624 int label = 0; 1625 uberblock_t *ubbest; 1626 objset_phys_t *osp; 1627 char tmp_bootpath[MAXNAMELEN]; 1628 char tmp_devid[MAXNAMELEN]; 1629 uint64_t tmp_guid, ashift, version; 1630 uint64_t adjpl = (uint64_t)part_length << SPA_MINBLOCKSHIFT; 1631 int err = errnum; /* preserve previous errnum state */ 1632 1633 /* if it's our first time here, zero the best uberblock out */ 1634 if (best_drive == 0 && best_part == 0 && find_best_root) { 1635 grub_memset(¤t_uberblock, 0, sizeof (uberblock_t)); 1636 pool_guid = 0; 1637 } 1638 1639 stackbase = ZFS_SCRATCH; 1640 stack = stackbase; 1641 ub_array = stack; 1642 stack += VDEV_UBERBLOCK_RING; 1643 1644 osp = (objset_phys_t *)stack; 1645 stack += sizeof (objset_phys_t); 1646 adjpl = P2ALIGN(adjpl, (uint64_t)sizeof (vdev_label_t)); 1647 1648 for (label = 0; label < VDEV_LABELS; label++) { 1649 1650 /* 1651 * some eltorito stacks don't give us a size and 1652 * we end up setting the size to MAXUINT, further 1653 * some of these devices stop working once a single 1654 * read past the end has been issued. Checking 1655 * for a maximum part_length and skipping the backup 1656 * labels at the end of the slice/partition/device 1657 * avoids breaking down on such devices. 1658 */ 1659 if (part_length == MAXUINT && label == 2) 1660 break; 1661 1662 uint64_t sector = vdev_label_start(adjpl, 1663 label) >> SPA_MINBLOCKSHIFT; 1664 1665 /* Read in the uberblock ring (128K). */ 1666 if (devread(sector + 1667 ((VDEV_SKIP_SIZE + VDEV_PHYS_SIZE) >> SPA_MINBLOCKSHIFT), 1668 0, VDEV_UBERBLOCK_RING, ub_array) == 0) 1669 continue; 1670 1671 if (check_pool_label(sector, stack, tmp_devid, 1672 tmp_bootpath, &tmp_guid, &ashift, &version)) 1673 continue; 1674 1675 if (pool_guid == 0) 1676 pool_guid = tmp_guid; 1677 1678 if ((ubbest = find_bestub(ub_array, ashift, sector)) == NULL || 1679 zio_read(&ubbest->ub_rootbp, osp, stack) != 0) 1680 continue; 1681 1682 VERIFY_OS_TYPE(osp, DMU_OST_META); 1683 1684 if (version >= SPA_VERSION_FEATURES && 1685 check_mos_features(&osp->os_meta_dnode, stack) != 0) 1686 continue; 1687 1688 if (find_best_root && ((pool_guid != tmp_guid) || 1689 vdev_uberblock_compare(ubbest, &(current_uberblock)) <= 0)) 1690 continue; 1691 1692 /* Got the MOS. Save it at the memory addr MOS. */ 1693 grub_memmove(MOS, &osp->os_meta_dnode, DNODE_SIZE); 1694 grub_memmove(¤t_uberblock, ubbest, sizeof (uberblock_t)); 1695 grub_memmove(current_bootpath, tmp_bootpath, MAXNAMELEN); 1696 grub_memmove(current_devid, tmp_devid, grub_strlen(tmp_devid)); 1697 is_zfs_mount = 1; 1698 return (1); 1699 } 1700 1701 /* 1702 * While some fs impls. (tftp) rely on setting and keeping 1703 * global errnums set, others won't reset it and will break 1704 * when issuing rawreads. The goal here is to simply not 1705 * have zfs mount attempts impact the previous state. 1706 */ 1707 errnum = err; 1708 return (0); 1709 } 1710 1711 /* 1712 * zfs_open() locates a file in the rootpool by following the 1713 * MOS and places the dnode of the file in the memory address DNODE. 1714 * 1715 * Return: 1716 * 1 - success 1717 * 0 - failure 1718 */ 1719 int 1720 zfs_open(char *filename) 1721 { 1722 char *stack; 1723 dnode_phys_t *mdn; 1724 1725 file_buf = NULL; 1726 stackbase = ZFS_SCRATCH; 1727 stack = stackbase; 1728 1729 mdn = (dnode_phys_t *)stack; 1730 stack += sizeof (dnode_phys_t); 1731 1732 dnode_mdn = NULL; 1733 dnode_buf = (dnode_phys_t *)stack; 1734 stack += 1<<DNODE_BLOCK_SHIFT; 1735 1736 /* 1737 * menu.lst is placed at the root pool filesystem level, 1738 * do not goto 'current_bootfs'. 1739 */ 1740 if (is_top_dataset_file(filename)) { 1741 if (errnum = get_objset_mdn(MOS, NULL, NULL, mdn, stack)) 1742 return (0); 1743 1744 current_bootfs_obj = 0; 1745 } else { 1746 if (current_bootfs[0] == '\0') { 1747 /* Get the default root filesystem object number */ 1748 if (errnum = get_default_bootfsobj(MOS, 1749 ¤t_bootfs_obj, stack)) 1750 return (0); 1751 1752 if (errnum = get_objset_mdn(MOS, NULL, 1753 ¤t_bootfs_obj, mdn, stack)) 1754 return (0); 1755 } else { 1756 if (errnum = get_objset_mdn(MOS, current_bootfs, 1757 ¤t_bootfs_obj, mdn, stack)) { 1758 grub_memset(current_bootfs, 0, MAXNAMELEN); 1759 return (0); 1760 } 1761 } 1762 } 1763 1764 if (dnode_get_path(mdn, filename, DNODE, stack)) { 1765 errnum = ERR_FILE_NOT_FOUND; 1766 return (0); 1767 } 1768 1769 /* get the file size and set the file position to 0 */ 1770 1771 /* 1772 * For DMU_OT_SA we will need to locate the SIZE attribute 1773 * attribute, which could be either in the bonus buffer 1774 * or the "spill" block. 1775 */ 1776 if (DNODE->dn_bonustype == DMU_OT_SA) { 1777 sa_hdr_phys_t *sahdrp; 1778 int hdrsize; 1779 1780 if (DNODE->dn_bonuslen != 0) { 1781 sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE); 1782 } else { 1783 if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 1784 blkptr_t *bp = &DNODE->dn_spill; 1785 void *buf; 1786 1787 buf = (void *)stack; 1788 stack += BP_GET_LSIZE(bp); 1789 1790 /* reset errnum to rawread() failure */ 1791 errnum = 0; 1792 if (zio_read(bp, buf, stack) != 0) { 1793 return (0); 1794 } 1795 sahdrp = buf; 1796 } else { 1797 errnum = ERR_FSYS_CORRUPT; 1798 return (0); 1799 } 1800 } 1801 hdrsize = SA_HDR_SIZE(sahdrp); 1802 filemax = *(uint64_t *)((char *)sahdrp + hdrsize + 1803 SA_SIZE_OFFSET); 1804 } else { 1805 filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size; 1806 } 1807 filepos = 0; 1808 1809 dnode_buf = NULL; 1810 return (1); 1811 } 1812 1813 /* 1814 * zfs_read reads in the data blocks pointed by the DNODE. 1815 * 1816 * Return: 1817 * len - the length successfully read in to the buffer 1818 * 0 - failure 1819 */ 1820 int 1821 zfs_read(char *buf, int len) 1822 { 1823 char *stack; 1824 int blksz, length, movesize; 1825 1826 if (file_buf == NULL) { 1827 file_buf = stackbase; 1828 stackbase += SPA_MAXBLOCKSIZE; 1829 file_start = file_end = 0; 1830 } 1831 stack = stackbase; 1832 1833 /* 1834 * If offset is in memory, move it into the buffer provided and return. 1835 */ 1836 if (filepos >= file_start && filepos+len <= file_end) { 1837 grub_memmove(buf, file_buf + filepos - file_start, len); 1838 filepos += len; 1839 return (len); 1840 } 1841 1842 blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT; 1843 1844 /* 1845 * Note: for GRUB, SPA_MAXBLOCKSIZE is 128KB. There is not enough 1846 * memory to allocate the new max blocksize (16MB), so while 1847 * GRUB understands the large_blocks on-disk feature, it can't 1848 * actually read large blocks. 1849 */ 1850 if (blksz > SPA_MAXBLOCKSIZE) { 1851 grub_printf("blocks larger than 128K are not supported\n"); 1852 return (0); 1853 } 1854 1855 /* 1856 * Entire Dnode is too big to fit into the space available. We 1857 * will need to read it in chunks. This could be optimized to 1858 * read in as large a chunk as there is space available, but for 1859 * now, this only reads in one data block at a time. 1860 */ 1861 length = len; 1862 while (length) { 1863 /* 1864 * Find requested blkid and the offset within that block. 1865 */ 1866 uint64_t blkid = filepos / blksz; 1867 1868 if (errnum = dmu_read(DNODE, blkid, file_buf, stack)) 1869 return (0); 1870 1871 file_start = blkid * blksz; 1872 file_end = file_start + blksz; 1873 1874 movesize = MIN(length, file_end - filepos); 1875 1876 grub_memmove(buf, file_buf + filepos - file_start, 1877 movesize); 1878 buf += movesize; 1879 length -= movesize; 1880 filepos += movesize; 1881 } 1882 1883 return (len); 1884 } 1885 1886 /* 1887 * No-Op 1888 */ 1889 int 1890 zfs_embed(int *start_sector, int needed_sectors) 1891 { 1892 return (1); 1893 } 1894 1895 #endif /* FSYS_ZFS */ 1896