1/*
2 *  GRUB  --  GRand Unified Bootloader
3 *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
4 *
5 *  This program is free software; you can redistribute it and/or modify
6 *  it under the terms of the GNU General Public License as published by
7 *  the Free Software Foundation; either version 2 of the License, or
8 *  (at your option) any later version.
9 *
10 *  This program is distributed in the hope that it will be useful,
11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 *  GNU General Public License for more details.
14 *
15 *  You should have received a copy of the GNU General Public License
16 *  along with this program; if not, write to the Free Software
17 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20/*
21 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
22 * Use is subject to license terms.
23 */
24
25/*
26 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
27 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
28 * Copyright (c) 2014 Integros [integros.com]
29 */
30
31/*
32 * The zfs plug-in routines for GRUB are:
33 *
34 * zfs_mount() - locates a valid uberblock of the root pool and reads
35 *		in its MOS at the memory address MOS.
36 *
37 * zfs_open() - locates a plain file object by following the MOS
38 *		and places its dnode at the memory address DNODE.
39 *
 * zfs_read() - reads in the data blocks pointed to by the DNODE.
41 *
42 * ZFS_SCRATCH is used as a working area.
43 *
44 * (memory addr)   MOS      DNODE	ZFS_SCRATCH
45 *		    |         |          |
46 *	    +-------V---------V----------V---------------+
47 *   memory |       | dnode   | dnode    |  scratch      |
48 *	    |       | 512B    | 512B     |  area         |
49 *	    +--------------------------------------------+
50 */
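
/*
 * For orientation, a hypothetical caller (not part of this file) would
 * drive these entry points roughly as follows; the names path, dest and
 * len are illustrative only:
 *
 *	if (zfs_mount() == 0)		(locate best uberblock, read MOS)
 *		return (0);
 *	if (zfs_open(path) == 0)	(MOS -> dataset -> file dnode)
 *		return (0);
 *	nread = zfs_read(dest, len);	(copy file data into dest)
 */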
51
52#ifdef	FSYS_ZFS
53
54#include "shared.h"
55#include "filesys.h"
56#include "fsys_zfs.h"
57
58/* cache for a file block of the currently zfs_open()-ed file */
59static void *file_buf = NULL;
60static uint64_t file_start = 0;
61static uint64_t file_end = 0;
62
63/* cache for a dnode block */
64static dnode_phys_t *dnode_buf = NULL;
65static dnode_phys_t *dnode_mdn = NULL;
66static uint64_t dnode_start = 0;
67static uint64_t dnode_end = 0;
68
69static uint64_t pool_guid = 0;
70static uberblock_t current_uberblock;
71static char *stackbase;
72
73decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] =
74{
75	{"inherit", 0},			/* ZIO_COMPRESS_INHERIT */
76	{"on", lzjb_decompress}, 	/* ZIO_COMPRESS_ON */
77	{"off", 0},			/* ZIO_COMPRESS_OFF */
78	{"lzjb", lzjb_decompress},	/* ZIO_COMPRESS_LZJB */
79	{"empty", 0},			/* ZIO_COMPRESS_EMPTY */
80	{"gzip-1", 0},			/* ZIO_COMPRESS_GZIP_1 */
81	{"gzip-2", 0},			/* ZIO_COMPRESS_GZIP_2 */
82	{"gzip-3", 0},			/* ZIO_COMPRESS_GZIP_3 */
83	{"gzip-4", 0},			/* ZIO_COMPRESS_GZIP_4 */
84	{"gzip-5", 0},			/* ZIO_COMPRESS_GZIP_5 */
85	{"gzip-6", 0},			/* ZIO_COMPRESS_GZIP_6 */
86	{"gzip-7", 0},			/* ZIO_COMPRESS_GZIP_7 */
87	{"gzip-8", 0},			/* ZIO_COMPRESS_GZIP_8 */
88	{"gzip-9", 0},			/* ZIO_COMPRESS_GZIP_9 */
89	{"zle", 0},			/* ZIO_COMPRESS_ZLE */
90	{"lz4", lz4_decompress}		/* ZIO_COMPRESS_LZ4 */
91};
92
93static int zio_read_data(blkptr_t *bp, void *buf, char *stack);
94
95/*
96 * Our own version of bcmp().
97 */
98static int
99zfs_bcmp(const void *s1, const void *s2, size_t n)
100{
101	const uchar_t *ps1 = s1;
102	const uchar_t *ps2 = s2;
103
104	if (s1 != s2 && n != 0) {
105		do {
106			if (*ps1++ != *ps2++)
107				return (1);
108		} while (--n != 0);
109	}
110
111	return (0);
112}
113
114/*
115 * Our own version of log2().  Same thing as highbit()-1.
116 */
117static int
118zfs_log2(uint64_t num)
119{
120	int i = 0;
121
122	while (num > 1) {
123		i++;
124		num = num >> 1;
125	}
126
127	return (i);
128}
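
/*
 * Examples (illustrative): zfs_log2(512) == 9 and zfs_log2(16384) == 14.
 * Note that zfs_log2(0) and zfs_log2(1) both return 0; callers only pass
 * power-of-two block and dnode sizes.
 */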
129
130/* Checksum Functions */
131static void
132zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
133{
134	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
135}
136
137/* Checksum Table and Values */
138zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
139	{{NULL,			NULL},			0, 0,	"inherit"},
140	{{NULL,			NULL},			0, 0,	"on"},
141	{{zio_checksum_off,	zio_checksum_off},	0, 0,	"off"},
142	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1,	"label"},
143	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1,	"gang_header"},
144	{{NULL,			NULL},			0, 0,	"zilog"},
145	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0,	"fletcher2"},
146	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0,	"fletcher4"},
147	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0,	"SHA256"},
148	{{NULL,			NULL},			0, 0,	"zilog2"},
149	{{zio_checksum_off,	zio_checksum_off},	0, 0,	"noparity"},
150	{{zio_checksum_SHA512,	NULL},			0, 0,	"SHA512"}
151};
152
153/*
154 * zio_checksum_verify: Provides support for checksum verification.
155 *
156 * Fletcher2, Fletcher4, SHA-256 and SHA-512/256 are supported.
157 *
158 * Return:
159 * 	-1 = Failure
160 *	 0 = Success
161 */
162static int
163zio_checksum_verify(blkptr_t *bp, char *data, int size)
164{
165	zio_cksum_t zc = bp->blk_cksum;
166	uint32_t checksum = BP_GET_CHECKSUM(bp);
167	int byteswap = BP_SHOULD_BYTESWAP(bp);
168	zio_eck_t *zec = (zio_eck_t *)(data + size) - 1;
169	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
170	zio_cksum_t actual_cksum, expected_cksum;
171
172	if (byteswap) {
173		grub_printf("byteswap not supported\n");
174		return (-1);
175	}
176
177	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) {
178		grub_printf("checksum algorithm %u not supported\n", checksum);
179		return (-1);
180	}
181
182	if (ci->ci_eck) {
183		expected_cksum = zec->zec_cksum;
184		zec->zec_cksum = zc;
185		ci->ci_func[0](data, size, &actual_cksum);
186		zec->zec_cksum = expected_cksum;
187		zc = expected_cksum;
188	} else {
189		ci->ci_func[byteswap](data, size, &actual_cksum);
190	}
191
192	if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
193	    (actual_cksum.zc_word[1] - zc.zc_word[1]) |
194	    (actual_cksum.zc_word[2] - zc.zc_word[2]) |
195	    (actual_cksum.zc_word[3] - zc.zc_word[3]))
196		return (-1);
197
198	return (0);
199}
200
201/*
202 * vdev_label_start returns the physical disk offset (in bytes) of
203 * label "l".
204 */
205static uint64_t
206vdev_label_start(uint64_t psize, int l)
207{
208	return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
209	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
210}
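
/*
 * For illustration, assuming the usual on-disk constants where
 * sizeof (vdev_label_t) is 256K and VDEV_LABELS is 4: labels 0 and 1
 * start at byte offsets 0 and 256K from the beginning of the device,
 * and the backup labels 2 and 3 start at psize - 512K and psize - 256K.
 */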
211
212/*
213 * vdev_uberblock_compare takes two uberblock structures and returns an integer
214 * indicating the more recent of the two.
 * 	Return Value = 1 if ub1 is more recent
 * 	Return Value = -1 if ub2 is more recent
217 * The most recent uberblock is determined using its transaction number and
218 * timestamp.  The uberblock with the highest transaction number is
219 * considered "newer".  If the transaction numbers of the two blocks match, the
220 * timestamps are compared to determine the "newer" of the two.
221 */
222static int
223vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
224{
225	if (ub1->ub_txg < ub2->ub_txg)
226		return (-1);
227	if (ub1->ub_txg > ub2->ub_txg)
228		return (1);
229
230	if (ub1->ub_timestamp < ub2->ub_timestamp)
231		return (-1);
232	if (ub1->ub_timestamp > ub2->ub_timestamp)
233		return (1);
234
235	return (0);
236}
237
238/*
239 * Three pieces of information are needed to verify an uberblock: the magic
240 * number, the version number, and the checksum.
241 *
242 * Return:
243 *     0 - Success
244 *    -1 - Failure
245 */
246static int
247uberblock_verify(uberblock_t *uber, uint64_t ub_size, uint64_t offset)
248{
249	blkptr_t bp;
250
251	BP_ZERO(&bp);
252	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
253	BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
254	ZIO_SET_CHECKSUM(&bp.blk_cksum, offset, 0, 0, 0);
255
256	if (zio_checksum_verify(&bp, (char *)uber, ub_size) != 0)
257		return (-1);
258
259	if (uber->ub_magic == UBERBLOCK_MAGIC &&
260	    SPA_VERSION_IS_SUPPORTED(uber->ub_version))
261		return (0);
262
263	return (-1);
264}
265
266/*
267 * Find the best uberblock.
268 * Return:
269 *    Success - Pointer to the best uberblock.
270 *    Failure - NULL
271 */
272static uberblock_t *
273find_bestub(char *ub_array, uint64_t ashift, uint64_t sector)
274{
275	uberblock_t *ubbest = NULL;
276	uberblock_t *ubnext;
277	uint64_t offset, ub_size;
278	int i;
279
280	ub_size = VDEV_UBERBLOCK_SIZE(ashift);
281
282	for (i = 0; i < VDEV_UBERBLOCK_COUNT(ashift); i++) {
283		ubnext = (uberblock_t *)ub_array;
284		ub_array += ub_size;
285		offset = (sector << SPA_MINBLOCKSHIFT) +
286		    VDEV_UBERBLOCK_OFFSET(ashift, i);
287
288		if (uberblock_verify(ubnext, ub_size, offset) != 0)
289			continue;
290
291		if (ubbest == NULL ||
292		    vdev_uberblock_compare(ubnext, ubbest) > 0)
293			ubbest = ubnext;
294	}
295
296	return (ubbest);
297}
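
/*
 * Sizing note, for illustration: each slot in the 128K uberblock ring is
 * max(1 << ashift, 1 << UBERBLOCK_SHIFT) bytes, so a 512-byte-sector pool
 * (ashift = 9) is scanned as 128 slots of 1K, while a 4K-sector pool
 * (ashift = 12) is scanned as 32 slots of 4K.
 */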
298
299/*
300 * Read a block of data based on the gang block address dva,
301 * and put its data in buf.
302 *
303 * Return:
304 *	0 - success
305 *	1 - failure
306 */
307static int
308zio_read_gang(blkptr_t *bp, dva_t *dva, void *buf, char *stack)
309{
310	zio_gbh_phys_t *zio_gb;
311	uint64_t offset, sector;
312	blkptr_t tmpbp;
313	int i;
314
315	zio_gb = (zio_gbh_phys_t *)stack;
316	stack += SPA_GANGBLOCKSIZE;
317	offset = DVA_GET_OFFSET(dva);
318	sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
319
320	/* read in the gang block header */
321	if (devread(sector, 0, SPA_GANGBLOCKSIZE, (char *)zio_gb) == 0) {
322		grub_printf("failed to read in a gang block header\n");
323		return (1);
324	}
325
	/* self-checksum the gang block header */
327	BP_ZERO(&tmpbp);
328	BP_SET_CHECKSUM(&tmpbp, ZIO_CHECKSUM_GANG_HEADER);
329	BP_SET_BYTEORDER(&tmpbp, ZFS_HOST_BYTEORDER);
330	ZIO_SET_CHECKSUM(&tmpbp.blk_cksum, DVA_GET_VDEV(dva),
331	    DVA_GET_OFFSET(dva), bp->blk_birth, 0);
332	if (zio_checksum_verify(&tmpbp, (char *)zio_gb, SPA_GANGBLOCKSIZE)) {
333		grub_printf("failed to checksum a gang block header\n");
334		return (1);
335	}
336
337	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
338		if (BP_IS_HOLE(&zio_gb->zg_blkptr[i]))
339			continue;
340
341		if (zio_read_data(&zio_gb->zg_blkptr[i], buf, stack))
342			return (1);
343		buf += BP_GET_PSIZE(&zio_gb->zg_blkptr[i]);
344	}
345
346	return (0);
347}
348
349/*
350 * Read in a block of raw data to buf.
351 *
352 * Return:
353 *	0 - success
354 *	1 - failure
355 */
356static int
357zio_read_data(blkptr_t *bp, void *buf, char *stack)
358{
359	int i, psize;
360
361	psize = BP_GET_PSIZE(bp);
362
363	/* pick a good dva from the block pointer */
364	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
365		uint64_t offset, sector;
366
367		if (bp->blk_dva[i].dva_word[0] == 0 &&
368		    bp->blk_dva[i].dva_word[1] == 0)
369			continue;
370
371		if (DVA_GET_GANG(&bp->blk_dva[i])) {
372			if (zio_read_gang(bp, &bp->blk_dva[i], buf, stack) != 0)
373				continue;
374		} else {
375			/* read in a data block */
376			offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
377			sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
378			if (devread(sector, 0, psize, buf) == 0)
379				continue;
380		}
381
382		/* verify that the checksum matches */
383		if (zio_checksum_verify(bp, buf, psize) == 0) {
384			return (0);
385		}
386	}
387
388	grub_printf("could not read block due to EIO or ECKSUM\n");
389	return (1);
390}
391
392/*
393 * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
394 * more than BPE_PAYLOAD_SIZE bytes).
395 */
396static void
397decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
398{
399	int psize, i;
400	uint8_t *buf8 = buf;
401	uint64_t w = 0;
402	const uint64_t *bp64 = (const uint64_t *)bp;
403
404	psize = BPE_GET_PSIZE(bp);
405
406	/*
407	 * Decode the words of the block pointer into the byte array.
408	 * Low bits of first word are the first byte (little endian).
409	 */
410	for (i = 0; i < psize; i++) {
411		if (i % sizeof (w) == 0) {
412			/* beginning of a word */
413			w = *bp64;
414			bp64++;
415			if (!BPE_IS_PAYLOADWORD(bp, bp64))
416				bp64++;
417		}
418		buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
419	}
420}
421
422/*
423 * Fill in the buffer with the (decompressed) payload of the embedded
424 * blkptr_t.  Takes into account compression and byteorder (the payload is
425 * treated as a stream of bytes).
 * Return 0 on success, or ERR_FSYS_CORRUPT if the compression algorithm
 * is not supported.
427 */
428static int
429decode_embedded_bp(const blkptr_t *bp, void *buf)
430{
431	int comp;
432	int lsize, psize;
433	uint8_t *dst = buf;
434	uint64_t w = 0;
435
436	lsize = BPE_GET_LSIZE(bp);
437	psize = BPE_GET_PSIZE(bp);
438	comp = BP_GET_COMPRESS(bp);
439
440	if (comp != ZIO_COMPRESS_OFF) {
441		uint8_t dstbuf[BPE_PAYLOAD_SIZE];
442
443		if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
444		    decomp_table[comp].decomp_func == NULL) {
445			grub_printf("compression algorithm not supported\n");
446			return (ERR_FSYS_CORRUPT);
447		}
448
449		decode_embedded_bp_compressed(bp, dstbuf);
450		decomp_table[comp].decomp_func(dstbuf, buf, psize, lsize);
451	} else {
452		decode_embedded_bp_compressed(bp, buf);
453	}
454
455	return (0);
456}
457
458/*
459 * Read in a block of data, verify its checksum, decompress if needed,
460 * and put the uncompressed data in buf.
461 *
462 * Return:
463 *	0 - success
464 *	errnum - failure
465 */
466static int
467zio_read(blkptr_t *bp, void *buf, char *stack)
468{
469	int lsize, psize, comp;
470	char *retbuf;
471
472	if (BP_IS_EMBEDDED(bp)) {
473		if (BPE_GET_ETYPE(bp) != BP_EMBEDDED_TYPE_DATA) {
474			grub_printf("unsupported embedded BP (type=%u)\n",
475			    (int)BPE_GET_ETYPE(bp));
476			return (ERR_FSYS_CORRUPT);
477		}
478		return (decode_embedded_bp(bp, buf));
479	}
480
481	comp = BP_GET_COMPRESS(bp);
482	lsize = BP_GET_LSIZE(bp);
483	psize = BP_GET_PSIZE(bp);
484
485	if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
486	    (comp != ZIO_COMPRESS_OFF &&
487	    decomp_table[comp].decomp_func == NULL)) {
488		grub_printf("compression algorithm not supported\n");
489		return (ERR_FSYS_CORRUPT);
490	}
491
492	if ((char *)buf < stack && ((char *)buf) + lsize > stack) {
493		grub_printf("not enough memory to fit %u bytes on stack\n",
494		    lsize);
495		return (ERR_WONT_FIT);
496	}
497
498	retbuf = buf;
499	if (comp != ZIO_COMPRESS_OFF) {
500		buf = stack;
501		stack += psize;
502	}
503
504	if (zio_read_data(bp, buf, stack) != 0) {
505		grub_printf("zio_read_data failed\n");
506		return (ERR_FSYS_CORRUPT);
507	}
508
509	if (comp != ZIO_COMPRESS_OFF) {
510		if (decomp_table[comp].decomp_func(buf, retbuf, psize,
511		    lsize) != 0) {
512			grub_printf("zio_read decompression failed\n");
513			return (ERR_FSYS_CORRUPT);
514		}
515	}
516
517	return (0);
518}
519
/*
 * Read the block with block id "blkid" from the dnode into buf,
 * walking down the indirect block tree as needed.  "stack" is used
 * as a scratch area for the intermediate indirect blocks.
 *
 * Return:
 * 	0 - success
 * 	errnum - failure
 */
528static int
529dmu_read(dnode_phys_t *dn, uint64_t blkid, void *buf, char *stack)
530{
531	int idx, level;
532	blkptr_t *bp_array = dn->dn_blkptr;
533	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
534	blkptr_t *bp, *tmpbuf;
535
536	bp = (blkptr_t *)stack;
537	stack += sizeof (blkptr_t);
538
539	tmpbuf = (blkptr_t *)stack;
540	stack += 1<<dn->dn_indblkshift;
541
542	for (level = dn->dn_nlevels - 1; level >= 0; level--) {
543		idx = (blkid >> (epbs * level)) & ((1<<epbs)-1);
544		*bp = bp_array[idx];
545		if (level == 0)
546			tmpbuf = buf;
547		if (BP_IS_HOLE(bp)) {
548			grub_memset(buf, 0,
549			    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
550			break;
551		} else if (errnum = zio_read(bp, tmpbuf, stack)) {
552			return (errnum);
553		}
554
555		bp_array = tmpbuf;
556	}
557
558	return (0);
559}
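
/*
 * Indirection math, for illustration: with 128K indirect blocks
 * (dn_indblkshift = 17) and 128-byte block pointers (SPA_BLKPTRSHIFT = 7),
 * epbs is 10, i.e. 1024 block pointers per indirect block.  For a
 * two-level dnode, block id 2500 is reached through dn_blkptr index
 * 2500 >> 10 = 2 at level 1 and index 2500 & 1023 = 452 within that
 * indirect block at level 0.
 */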
560
561/*
562 * mzap_lookup: Looks up property described by "name" and returns the value
563 * in "value".
564 *
565 * Return:
566 *	0 - success
567 *	errnum - failure
568 */
569static int
570mzap_lookup(mzap_phys_t *zapobj, int objsize, const char *name,
571    uint64_t *value)
572{
573	int i, chunks;
574	mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
575
576	chunks = objsize / MZAP_ENT_LEN - 1;
577	for (i = 0; i < chunks; i++) {
578		if (grub_strcmp(mzap_ent[i].mze_name, name) == 0) {
579			*value = mzap_ent[i].mze_value;
580			return (0);
581		}
582	}
583
584	return (ERR_FSYS_CORRUPT);
585}
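
/*
 * Layout note, for illustration: a microzap block is an array of 64-byte
 * chunks (MZAP_ENT_LEN); the first chunk is the mzap_phys_t header and the
 * rest are mzap_ent_phys_t entries, hence the "objsize / MZAP_ENT_LEN - 1"
 * count above.  A 16K microzap block thus holds up to 255 entries.
 */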
586
587static uint64_t
588zap_hash(uint64_t salt, const char *name)
589{
590	static uint64_t table[256];
591	const uint8_t *cp;
592	uint8_t c;
593	uint64_t crc = salt;
594
595	if (table[128] == 0) {
596		uint64_t *ct;
597		int i, j;
598		for (i = 0; i < 256; i++) {
599			for (ct = table + i, *ct = i, j = 8; j > 0; j--)
600				*ct = (*ct >> 1) ^ (-(*ct & 1) &
601				    ZFS_CRC64_POLY);
602		}
603	}
604
605	if (crc == 0 || table[128] != ZFS_CRC64_POLY) {
606		errnum = ERR_FSYS_CORRUPT;
607		return (0);
608	}
609
610	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
611		crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
612
613	/*
614	 * Only use 28 bits, since we need 4 bits in the cookie for the
615	 * collision differentiator.  We MUST use the high bits, since
616	 * those are the ones that we first pay attention to when
617	 * choosing the bucket.
618	 */
619	crc &= ~((1ULL << (64 - 28)) - 1);
620
621	return (crc);
622}
623
624/*
625 * Only to be used on 8-bit arrays.
626 * array_len is actual len in bytes (not encoded le_value_length).
627 * buf is null-terminated.
628 */
629static int
630zap_leaf_array_equal(zap_leaf_phys_t *l, int blksft, int chunk,
631    int array_len, const char *buf)
632{
633	int bseen = 0;
634
635	while (bseen < array_len) {
636		struct zap_leaf_array *la =
637		    &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
638		int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
639
640		if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
641			return (0);
642
643		if (zfs_bcmp(la->la_array, buf + bseen, toread) != 0)
644			break;
645		chunk = la->la_next;
646		bseen += toread;
647	}
648	return (bseen == array_len);
649}
650
651/*
 * Given a zap_leaf_phys_t, walk through the zap leaf chunks to get the
653 * value for the property "name".
654 *
655 * Return:
656 *	0 - success
657 *	errnum - failure
658 */
659static int
660zap_leaf_lookup(zap_leaf_phys_t *l, int blksft, uint64_t h,
661    const char *name, uint64_t *value)
662{
663	uint16_t chunk;
664	struct zap_leaf_entry *le;
665
666	/* Verify if this is a valid leaf block */
667	if (l->l_hdr.lh_block_type != ZBT_LEAF)
668		return (ERR_FSYS_CORRUPT);
669	if (l->l_hdr.lh_magic != ZAP_LEAF_MAGIC)
670		return (ERR_FSYS_CORRUPT);
671
672	for (chunk = l->l_hash[LEAF_HASH(blksft, h)];
673	    chunk != CHAIN_END; chunk = le->le_next) {
674
675		if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
676			return (ERR_FSYS_CORRUPT);
677
678		le = ZAP_LEAF_ENTRY(l, blksft, chunk);
679
680		/* Verify the chunk entry */
681		if (le->le_type != ZAP_CHUNK_ENTRY)
682			return (ERR_FSYS_CORRUPT);
683
684		if (le->le_hash != h)
685			continue;
686
687		if (zap_leaf_array_equal(l, blksft, le->le_name_chunk,
688		    le->le_name_length, name)) {
689
690			struct zap_leaf_array *la;
691			uint8_t *ip;
692
693			if (le->le_int_size != 8 || le->le_value_length != 1)
694				return (ERR_FSYS_CORRUPT);
695
696			/* get the uint64_t property value */
697			la = &ZAP_LEAF_CHUNK(l, blksft,
698			    le->le_value_chunk).l_array;
699			ip = la->la_array;
700
701			*value = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
702			    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
703			    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
704			    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
705
706			return (0);
707		}
708	}
709
710	return (ERR_FSYS_CORRUPT);
711}
712
713/*
714 * Fat ZAP lookup
715 *
716 * Return:
717 *	0 - success
718 *	errnum - failure
719 */
720static int
721fzap_lookup(dnode_phys_t *zap_dnode, zap_phys_t *zap,
722    const char *name, uint64_t *value, char *stack)
723{
724	zap_leaf_phys_t *l;
725	uint64_t hash, idx, blkid;
726	int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);
727
728	/* Verify if this is a fat zap header block */
729	if (zap->zap_magic != (uint64_t)ZAP_MAGIC ||
730	    zap->zap_flags != 0)
731		return (ERR_FSYS_CORRUPT);
732
733	hash = zap_hash(zap->zap_salt, name);
734	if (errnum)
735		return (errnum);
736
737	/* get block id from index */
738	if (zap->zap_ptrtbl.zt_numblks != 0) {
739		/* external pointer tables not supported */
740		return (ERR_FSYS_CORRUPT);
741	}
742	idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
743	blkid = ((uint64_t *)zap)[idx + (1<<(blksft-3-1))];
744
745	/* Get the leaf block */
746	l = (zap_leaf_phys_t *)stack;
747	stack += 1<<blksft;
748	if ((1<<blksft) < sizeof (zap_leaf_phys_t))
749		return (ERR_FSYS_CORRUPT);
750	if (errnum = dmu_read(zap_dnode, blkid, l, stack))
751		return (errnum);
752
753	return (zap_leaf_lookup(l, blksft, hash, name, value));
754}
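
/*
 * Pointer table math, for illustration: the embedded pointer table
 * occupies the second half of the fat zap header block, so with a 16K
 * header block (blksft = 14) the uint64_t table starts at word
 * 1 << (14 - 3 - 1) = 1024 and the leaf block id is read from word
 * 1024 + idx, where idx is the top zt_shift bits of the hash.
 */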
755
756/*
757 * Read in the data of a zap object and find the value for a matching
758 * property name.
759 *
760 * Return:
761 *	0 - success
762 *	errnum - failure
763 */
764static int
765zap_lookup(dnode_phys_t *zap_dnode, const char *name, uint64_t *val,
766    char *stack)
767{
768	uint64_t block_type;
769	int size;
770	void *zapbuf;
771
772	/* Read in the first block of the zap object data. */
773	zapbuf = stack;
774	size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
775	stack += size;
776
777	if ((errnum = dmu_read(zap_dnode, 0, zapbuf, stack)) != 0)
778		return (errnum);
779
780	block_type = *((uint64_t *)zapbuf);
781
782	if (block_type == ZBT_MICRO) {
783		return (mzap_lookup(zapbuf, size, name, val));
784	} else if (block_type == ZBT_HEADER) {
785		/* this is a fat zap */
786		return (fzap_lookup(zap_dnode, zapbuf, name,
787		    val, stack));
788	}
789
790	return (ERR_FSYS_CORRUPT);
791}
792
793typedef struct zap_attribute {
794	int za_integer_length;
795	uint64_t za_num_integers;
796	uint64_t za_first_integer;
797	char *za_name;
798} zap_attribute_t;
799
800typedef int (zap_cb_t)(zap_attribute_t *za, void *arg, char *stack);
801
802static int
803zap_iterate(dnode_phys_t *zap_dnode, zap_cb_t *cb, void *arg, char *stack)
804{
805	uint32_t size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
806	zap_attribute_t za;
807	int i;
808	mzap_phys_t *mzp = (mzap_phys_t *)stack;
809	stack += size;
810
811	if ((errnum = dmu_read(zap_dnode, 0, mzp, stack)) != 0)
812		return (errnum);
813
814	/*
815	 * Iteration over fatzap objects has not yet been implemented.
816	 * If we encounter a pool in which there are more features for
817	 * read than can fit inside a microzap (i.e., more than 2048
818	 * features for read), we can add support for fatzap iteration.
819	 * For now, fail.
820	 */
821	if (mzp->mz_block_type != ZBT_MICRO) {
822		grub_printf("feature information stored in fatzap, pool "
823		    "version not supported\n");
824		return (1);
825	}
826
827	za.za_integer_length = 8;
828	za.za_num_integers = 1;
829	for (i = 0; i < size / MZAP_ENT_LEN - 1; i++) {
830		mzap_ent_phys_t *mzep = &mzp->mz_chunk[i];
831		int err;
832
833		za.za_first_integer = mzep->mze_value;
834		za.za_name = mzep->mze_name;
835		err = cb(&za, arg, stack);
836		if (err != 0)
837			return (err);
838	}
839
840	return (0);
841}
842
843/*
844 * Get the dnode of an object number from the metadnode of an object set.
845 *
846 * Input
847 *	mdn - metadnode to get the object dnode
848 *	objnum - object number for the object dnode
849 *	type - if nonzero, object must be of this type
850 *	buf - data buffer that holds the returning dnode
851 *	stack - scratch area
852 *
853 * Return:
854 *	0 - success
855 *	errnum - failure
856 */
857static int
858dnode_get(dnode_phys_t *mdn, uint64_t objnum, uint8_t type, dnode_phys_t *buf,
859    char *stack)
860{
861	uint64_t blkid, blksz; /* the block id this object dnode is in */
862	int epbs; /* shift of number of dnodes in a block */
863	int idx; /* index within a block */
864	dnode_phys_t *dnbuf;
865
866	blksz = mdn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
867	epbs = zfs_log2(blksz) - DNODE_SHIFT;
868	blkid = objnum >> epbs;
869	idx = objnum & ((1<<epbs)-1);
870
871	if (dnode_buf != NULL && dnode_mdn == mdn &&
872	    objnum >= dnode_start && objnum < dnode_end) {
873		grub_memmove(buf, &dnode_buf[idx], DNODE_SIZE);
874		VERIFY_DN_TYPE(buf, type);
875		return (0);
876	}
877
878	if (dnode_buf && blksz == 1<<DNODE_BLOCK_SHIFT) {
879		dnbuf = dnode_buf;
880		dnode_mdn = mdn;
881		dnode_start = blkid << epbs;
882		dnode_end = (blkid + 1) << epbs;
883	} else {
884		dnbuf = (dnode_phys_t *)stack;
885		stack += blksz;
886	}
887
888	if (errnum = dmu_read(mdn, blkid, (char *)dnbuf, stack))
889		return (errnum);
890
891	grub_memmove(buf, &dnbuf[idx], DNODE_SIZE);
892	VERIFY_DN_TYPE(buf, type);
893
894	return (0);
895}
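
/*
 * For illustration: with 16K metadnode blocks (blksz = 1 << 14) and
 * 512-byte dnodes (DNODE_SHIFT = 9), epbs is 5, i.e. 32 dnodes per block.
 * Object number 100 is then found at index 100 & 31 = 4 within metadnode
 * block 100 >> 5 = 3.
 */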
896
897/*
 * Check whether this is a special file that resides in the top-level
 * dataset of the pool. Currently these are the GRUB menu, the boot
 * signature, and the boot signature backup.
901 * str starts with '/'.
902 */
903static int
904is_top_dataset_file(char *str)
905{
906	char *tptr;
907
908	if ((tptr = grub_strstr(str, "menu.lst")) &&
909	    (tptr[8] == '\0' || tptr[8] == ' ') &&
910	    *(tptr-1) == '/')
911		return (1);
912
913	if (grub_strncmp(str, BOOTSIGN_DIR"/",
914	    grub_strlen(BOOTSIGN_DIR) + 1) == 0)
915		return (1);
916
917	if (grub_strcmp(str, BOOTSIGN_BACKUP) == 0)
918		return (1);
919
920	return (0);
921}
922
923static int
924check_feature(zap_attribute_t *za, void *arg, char *stack)
925{
926	const char **names = arg;
927	int i;
928
929	if (za->za_first_integer == 0)
930		return (0);
931
932	for (i = 0; names[i] != NULL; i++) {
933		if (grub_strcmp(za->za_name, names[i]) == 0) {
934			return (0);
935		}
936	}
937	grub_printf("missing feature for read '%s'\n", za->za_name);
938	return (ERR_NEWER_VERSION);
939}
940
941/*
942 * Get the file dnode for a given file name where mdn is the meta dnode
943 * for this ZFS object set. When found, place the file dnode in dn.
944 * The 'path' argument will be mangled.
945 *
946 * Return:
947 *	0 - success
948 *	errnum - failure
949 */
950static int
951dnode_get_path(dnode_phys_t *mdn, char *path, dnode_phys_t *dn,
952    char *stack)
953{
954	uint64_t objnum, version;
955	char *cname, ch;
956
957	if (errnum = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
958	    dn, stack))
959		return (errnum);
960
961	if (errnum = zap_lookup(dn, ZPL_VERSION_STR, &version, stack))
962		return (errnum);
963	if (version > ZPL_VERSION)
964		return (-1);
965
966	if (errnum = zap_lookup(dn, ZFS_ROOT_OBJ, &objnum, stack))
967		return (errnum);
968
969	if (errnum = dnode_get(mdn, objnum, DMU_OT_DIRECTORY_CONTENTS,
970	    dn, stack))
971		return (errnum);
972
973	/* skip leading slashes */
974	while (*path == '/')
975		path++;
976
977	while (*path && !grub_isspace(*path)) {
978
979		/* get the next component name */
980		cname = path;
981		while (*path && !grub_isspace(*path) && *path != '/')
982			path++;
983		ch = *path;
984		*path = 0;   /* ensure null termination */
985
986		if (errnum = zap_lookup(dn, cname, &objnum, stack))
987			return (errnum);
988
989		objnum = ZFS_DIRENT_OBJ(objnum);
990		if (errnum = dnode_get(mdn, objnum, 0, dn, stack))
991			return (errnum);
992
993		*path = ch;
994		while (*path == '/')
995			path++;
996	}
997
998	/* We found the dnode for this file. Verify if it is a plain file. */
999	VERIFY_DN_TYPE(dn, DMU_OT_PLAIN_FILE_CONTENTS);
1000
1001	return (0);
1002}
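
/*
 * For illustration, opening "/boot/grub/menu.lst" makes three passes
 * through the loop above: zap_lookup() of "boot" in the root directory
 * yields a directory dnode, "grub" yields another, and "menu.lst" yields
 * the dnode that is finally verified to be a plain file.
 */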
1003
1004/*
1005 * Get the default 'bootfs' property value from the rootpool.
1006 *
1007 * Return:
1008 *	0 - success
 *	errnum - failure
1010 */
1011static int
1012get_default_bootfsobj(dnode_phys_t *mosmdn, uint64_t *obj, char *stack)
1013{
1014	uint64_t objnum = 0;
1015	dnode_phys_t *dn = (dnode_phys_t *)stack;
1016	stack += DNODE_SIZE;
1017
1018	if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1019	    DMU_OT_OBJECT_DIRECTORY, dn, stack))
1020		return (errnum);
1021
1022	/*
	 * Find the object number of 'pool_props' and read in its dnode.
1025	 */
1026	if (zap_lookup(dn, DMU_POOL_PROPS, &objnum, stack))
1027		return (ERR_FILESYSTEM_NOT_FOUND);
1028
1029	if (errnum = dnode_get(mosmdn, objnum, DMU_OT_POOL_PROPS, dn, stack))
1030		return (errnum);
1031
1032	if (zap_lookup(dn, ZPOOL_PROP_BOOTFS, &objnum, stack))
1033		return (ERR_FILESYSTEM_NOT_FOUND);
1034
1035	if (!objnum)
1036		return (ERR_FILESYSTEM_NOT_FOUND);
1037
1038	*obj = objnum;
1039	return (0);
1040}
1041
1042/*
1043 * List of pool features that the grub implementation of ZFS supports for
1044 * read. Note that features that are only required for write do not need
1045 * to be listed here since grub opens pools in read-only mode.
1046 *
1047 * When this list is updated the version number in usr/src/grub/capability
1048 * must be incremented to ensure the new grub gets installed.
1049 */
1050static const char *spa_feature_names[] = {
1051	"org.illumos:lz4_compress",
1052	"com.delphix:hole_birth",
1053	"com.delphix:extensible_dataset",
1054	"com.delphix:embedded_data",
1055	"org.open-zfs:large_blocks",
1056	"org.illumos:sha512",
1057	NULL
1058};
1059
1060/*
1061 * Checks whether the MOS features that are active are supported by this
1062 * (GRUB's) implementation of ZFS.
1063 *
1064 * Return:
1065 *	0: Success.
1066 *	errnum: Failure.
1067 */
1068static int
1069check_mos_features(dnode_phys_t *mosmdn, char *stack)
1070{
1071	uint64_t objnum;
1072	dnode_phys_t *dn;
1073	uint8_t error = 0;
1074
1075	dn = (dnode_phys_t *)stack;
1076	stack += DNODE_SIZE;
1077
1078	if ((errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1079	    DMU_OT_OBJECT_DIRECTORY, dn, stack)) != 0)
1080		return (errnum);
1081
1082	/*
1083	 * Find the object number for 'features_for_read' and retrieve its
1084	 * corresponding dnode. Note that we don't check features_for_write
1085	 * because GRUB is not opening the pool for write.
1086	 */
1087	if ((errnum = zap_lookup(dn, DMU_POOL_FEATURES_FOR_READ, &objnum,
1088	    stack)) != 0)
1089		return (errnum);
1090
1091	if ((errnum = dnode_get(mosmdn, objnum, DMU_OTN_ZAP_METADATA,
1092	    dn, stack)) != 0)
1093		return (errnum);
1094
1095	return (zap_iterate(dn, check_feature, spa_feature_names, stack));
1096}
1097
1098/*
1099 * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
1100 * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
1101 * of pool/rootfs.
1102 *
1103 * If no fsname and no obj are given, return the DSL_DIR metadnode.
1104 * If fsname is given, return its metadnode and its matching object number.
1105 * If only obj is given, return the metadnode for this object number.
1106 *
1107 * Return:
1108 *	0 - success
1109 *	errnum - failure
1110 */
1111static int
1112get_objset_mdn(dnode_phys_t *mosmdn, char *fsname, uint64_t *obj,
1113    dnode_phys_t *mdn, char *stack)
1114{
1115	uint64_t objnum, headobj;
1116	char *cname, ch;
1117	blkptr_t *bp;
1118	objset_phys_t *osp;
1119	int issnapshot = 0;
1120	char *snapname;
1121
1122	if (fsname == NULL && obj) {
1123		headobj = *obj;
1124		goto skip;
1125	}
1126
1127	if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1128	    DMU_OT_OBJECT_DIRECTORY, mdn, stack))
1129		return (errnum);
1130
1131	if (errnum = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum,
1132	    stack))
1133		return (errnum);
1134
1135	if (errnum = dnode_get(mosmdn, objnum, 0, mdn, stack))
1136		return (errnum);
1137
1138	if (fsname == NULL) {
1139		headobj =
1140		    ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
1141		goto skip;
1142	}
1143
1144	/* take out the pool name */
1145	while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
1146		fsname++;
1147
1148	while (*fsname && !grub_isspace(*fsname)) {
1149		uint64_t childobj;
1150
1151		while (*fsname == '/')
1152			fsname++;
1153
1154		cname = fsname;
1155		while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
1156			fsname++;
1157		ch = *fsname;
1158		*fsname = 0;
1159
1160		snapname = cname;
1161		while (*snapname && !grub_isspace(*snapname) && *snapname !=
1162		    '@')
1163			snapname++;
1164		if (*snapname == '@') {
1165			issnapshot = 1;
1166			*snapname = 0;
1167		}
1168		childobj =
1169		    ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_child_dir_zapobj;
1170		if (errnum = dnode_get(mosmdn, childobj,
1171		    DMU_OT_DSL_DIR_CHILD_MAP, mdn, stack))
1172			return (errnum);
1173
1174		if (zap_lookup(mdn, cname, &objnum, stack))
1175			return (ERR_FILESYSTEM_NOT_FOUND);
1176
1177		if (errnum = dnode_get(mosmdn, objnum, 0,
1178		    mdn, stack))
1179			return (errnum);
1180
1181		*fsname = ch;
1182		if (issnapshot)
1183			*snapname = '@';
1184	}
1185	headobj = ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
1186	if (obj)
1187		*obj = headobj;
1188
1189skip:
1190	if (errnum = dnode_get(mosmdn, headobj, 0, mdn, stack))
1191		return (errnum);
1192	if (issnapshot) {
1193		uint64_t snapobj;
1194
1195		snapobj = ((dsl_dataset_phys_t *)DN_BONUS(mdn))->
1196		    ds_snapnames_zapobj;
1197
1198		if (errnum = dnode_get(mosmdn, snapobj,
1199		    DMU_OT_DSL_DS_SNAP_MAP, mdn, stack))
1200			return (errnum);
1201		if (zap_lookup(mdn, snapname + 1, &headobj, stack))
1202			return (ERR_FILESYSTEM_NOT_FOUND);
1203		if (errnum = dnode_get(mosmdn, headobj, 0, mdn, stack))
1204			return (errnum);
1205		if (obj)
1206			*obj = headobj;
1207	}
1208
1209	bp = &((dsl_dataset_phys_t *)DN_BONUS(mdn))->ds_bp;
1210	osp = (objset_phys_t *)stack;
1211	stack += sizeof (objset_phys_t);
1212	if (errnum = zio_read(bp, osp, stack))
1213		return (errnum);
1214
1215	grub_memmove((char *)mdn, (char *)&osp->os_meta_dnode, DNODE_SIZE);
1216
1217	return (0);
1218}
1219
1220/*
1221 * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1222 *
1223 * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1224 *
1225 *      encoding method/host endian     (4 bytes)
1226 *      nvl_version                     (4 bytes)
1227 *      nvl_nvflag                      (4 bytes)
1228 *	encoded nvpairs:
1229 *		encoded size of the nvpair      (4 bytes)
1230 *		decoded size of the nvpair      (4 bytes)
1231 *		name string size                (4 bytes)
 *		name string data                (sizeof (NV_ALIGN4(string)))
1233 *		data type                       (4 bytes)
1234 *		# of elements in the nvpair     (4 bytes)
1235 *		data
 *      two zeros for the last nvpair
1237 *		(end of the entire list)	(8 bytes)
1238 *
1239 * Return:
1240 *	0 - success
1241 *	1 - failure
1242 */
1243static int
1244nvlist_unpack(char *nvlist, char **out)
1245{
	/* Verify that the encoding method and endianness are valid. */
1247	if (nvlist[0] != NV_ENCODE_XDR || nvlist[1] != HOST_ENDIAN)
1248		return (1);
1249
1250	*out = nvlist + 4;
1251	return (0);
1252}
1253
1254static char *
1255nvlist_array(char *nvlist, int index)
1256{
1257	int i, encode_size;
1258
1259	for (i = 0; i < index; i++) {
1260		/* skip the header, nvl_version, and nvl_nvflag */
1261		nvlist = nvlist + 4 * 2;
1262
1263		while (encode_size = BSWAP_32(*(uint32_t *)nvlist))
1264			nvlist += encode_size; /* goto the next nvpair */
1265
1266		nvlist = nvlist + 4 * 2; /* skip the ending 2 zeros - 8 bytes */
1267	}
1268
1269	return (nvlist);
1270}
1271
1272/*
1273 * The nvlist_next_nvpair() function returns a handle to the next nvpair in the
1274 * list following nvpair. If nvpair is NULL, the first pair is returned. If
1275 * nvpair is the last pair in the nvlist, NULL is returned.
1276 */
1277static char *
1278nvlist_next_nvpair(char *nvl, char *nvpair)
1279{
1280	char *cur, *prev;
1281	int encode_size;
1282
1283	if (nvl == NULL)
1284		return (NULL);
1285
1286	if (nvpair == NULL) {
1287		/* skip over nvl_version and nvl_nvflag */
1288		nvpair = nvl + 4 * 2;
1289	} else {
1290		/* skip to the next nvpair */
1291		encode_size = BSWAP_32(*(uint32_t *)nvpair);
1292		nvpair += encode_size;
1293	}
1294
1295	/* 8 bytes of 0 marks the end of the list */
1296	if (*(uint64_t *)nvpair == 0)
1297		return (NULL);
1298
1299	return (nvpair);
1300}
1301
1302/*
1303 * This function returns 0 on success and 1 on failure. On success, a string
1304 * containing the name of nvpair is saved in buf.
1305 */
1306static int
1307nvpair_name(char *nvp, char *buf, int buflen)
1308{
1309	int len;
1310
1311	/* skip over encode/decode size */
1312	nvp += 4 * 2;
1313
1314	len = BSWAP_32(*(uint32_t *)nvp);
1315	if (buflen < len + 1)
1316		return (1);
1317
1318	grub_memmove(buf, nvp + 4, len);
1319	buf[len] = '\0';
1320
1321	return (0);
1322}
1323
1324/*
1325 * This function retrieves the value of the nvpair in the form of enumerated
1326 * type data_type_t. This is used to determine the appropriate type to pass to
1327 * nvpair_value().
1328 */
1329static int
1330nvpair_type(char *nvp)
1331{
1332	int name_len, type;
1333
1334	/* skip over encode/decode size */
1335	nvp += 4 * 2;
1336
1337	/* skip over name_len */
1338	name_len = BSWAP_32(*(uint32_t *)nvp);
1339	nvp += 4;
1340
1341	/* skip over name */
1342	nvp = nvp + ((name_len + 3) & ~3); /* align */
1343
1344	type = BSWAP_32(*(uint32_t *)nvp);
1345
1346	return (type);
1347}
1348
1349static int
1350nvpair_value(char *nvp, void *val, int valtype, int *nelmp)
1351{
1352	int name_len, type, slen;
1353	char *strval = val;
1354	uint64_t *intval = val;
1355
1356	/* skip over encode/decode size */
1357	nvp += 4 * 2;
1358
1359	/* skip over name_len */
1360	name_len = BSWAP_32(*(uint32_t *)nvp);
1361	nvp += 4;
1362
1363	/* skip over name */
1364	nvp = nvp + ((name_len + 3) & ~3); /* align */
1365
1366	/* skip over type */
1367	type = BSWAP_32(*(uint32_t *)nvp);
1368	nvp += 4;
1369
1370	if (type == valtype) {
1371		int nelm;
1372
1373		nelm = BSWAP_32(*(uint32_t *)nvp);
1374		if (valtype != DATA_TYPE_BOOLEAN && nelm < 1)
1375			return (1);
1376		nvp += 4;
1377
1378		switch (valtype) {
1379		case DATA_TYPE_BOOLEAN:
1380			return (0);
1381
1382		case DATA_TYPE_STRING:
1383			slen = BSWAP_32(*(uint32_t *)nvp);
1384			nvp += 4;
1385			grub_memmove(strval, nvp, slen);
1386			strval[slen] = '\0';
1387			return (0);
1388
1389		case DATA_TYPE_UINT64:
1390			*intval = BSWAP_64(*(uint64_t *)nvp);
1391			return (0);
1392
1393		case DATA_TYPE_NVLIST:
1394			*(void **)val = (void *)nvp;
1395			return (0);
1396
1397		case DATA_TYPE_NVLIST_ARRAY:
1398			*(void **)val = (void *)nvp;
1399			if (nelmp)
1400				*nelmp = nelm;
1401			return (0);
1402		}
1403	}
1404
1405	return (1);
1406}
1407
1408static int
1409nvlist_lookup_value(char *nvlist, char *name, void *val, int valtype,
1410    int *nelmp)
1411{
1412	char *nvpair;
1413
1414	for (nvpair = nvlist_next_nvpair(nvlist, NULL);
1415	    nvpair != NULL;
1416	    nvpair = nvlist_next_nvpair(nvlist, nvpair)) {
1417		int name_len = BSWAP_32(*(uint32_t *)(nvpair + 4 * 2));
1418		char *nvp_name = nvpair + 4 * 3;
1419
1420		if ((grub_strncmp(nvp_name, name, name_len) == 0) &&
1421		    nvpair_type(nvpair) == valtype) {
1422			return (nvpair_value(nvpair, val, valtype, nelmp));
1423		}
1424	}
1425	return (1);
1426}
1427
1428/*
1429 * Check if this vdev is online and is in a good state.
1430 */
1431static int
1432vdev_validate(char *nv)
1433{
1434	uint64_t ival;
1435
1436	if (nvlist_lookup_value(nv, ZPOOL_CONFIG_OFFLINE, &ival,
1437	    DATA_TYPE_UINT64, NULL) == 0 ||
1438	    nvlist_lookup_value(nv, ZPOOL_CONFIG_FAULTED, &ival,
1439	    DATA_TYPE_UINT64, NULL) == 0 ||
1440	    nvlist_lookup_value(nv, ZPOOL_CONFIG_REMOVED, &ival,
1441	    DATA_TYPE_UINT64, NULL) == 0)
1442		return (ERR_DEV_VALUES);
1443
1444	return (0);
1445}
1446
1447/*
1448 * Get a valid vdev pathname/devid from the boot device.
 * The caller must have already allocated MAXPATHLEN bytes for bootpath
 * and devid.
1450 */
1451static int
1452vdev_get_bootpath(char *nv, uint64_t inguid, char *devid, char *bootpath,
1453    int is_spare)
1454{
1455	char type[16];
1456
1457	if (nvlist_lookup_value(nv, ZPOOL_CONFIG_TYPE, &type, DATA_TYPE_STRING,
1458	    NULL))
1459		return (ERR_FSYS_CORRUPT);
1460
1461	if (grub_strcmp(type, VDEV_TYPE_DISK) == 0) {
1462		uint64_t guid;
1463
1464		if (vdev_validate(nv) != 0)
1465			return (ERR_NO_BOOTPATH);
1466
1467		if (nvlist_lookup_value(nv, ZPOOL_CONFIG_GUID,
1468		    &guid, DATA_TYPE_UINT64, NULL) != 0)
1469			return (ERR_NO_BOOTPATH);
1470
1471		if (guid != inguid)
1472			return (ERR_NO_BOOTPATH);
1473
1474		/* for a spare vdev, pick the disk labeled with "is_spare" */
1475		if (is_spare) {
1476			uint64_t spare = 0;
1477			(void) nvlist_lookup_value(nv, ZPOOL_CONFIG_IS_SPARE,
1478			    &spare, DATA_TYPE_UINT64, NULL);
1479			if (!spare)
1480				return (ERR_NO_BOOTPATH);
1481		}
1482
1483		if (nvlist_lookup_value(nv, ZPOOL_CONFIG_PHYS_PATH,
1484		    bootpath, DATA_TYPE_STRING, NULL) != 0)
1485			bootpath[0] = '\0';
1486
1487		if (nvlist_lookup_value(nv, ZPOOL_CONFIG_DEVID,
1488		    devid, DATA_TYPE_STRING, NULL) != 0)
1489			devid[0] = '\0';
1490
1491		if (grub_strlen(bootpath) >= MAXPATHLEN ||
1492		    grub_strlen(devid) >= MAXPATHLEN)
1493			return (ERR_WONT_FIT);
1494
1495		return (0);
1496
1497	} else if (grub_strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
1498	    grub_strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
1499	    (is_spare = (grub_strcmp(type, VDEV_TYPE_SPARE) == 0))) {
1500		int nelm, i;
1501		char *child;
1502
1503		if (nvlist_lookup_value(nv, ZPOOL_CONFIG_CHILDREN, &child,
1504		    DATA_TYPE_NVLIST_ARRAY, &nelm))
1505			return (ERR_FSYS_CORRUPT);
1506
1507		for (i = 0; i < nelm; i++) {
1508			char *child_i;
1509
1510			child_i = nvlist_array(child, i);
1511			if (vdev_get_bootpath(child_i, inguid, devid,
1512			    bootpath, is_spare) == 0)
1513				return (0);
1514		}
1515	}
1516
1517	return (ERR_NO_BOOTPATH);
1518}
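
/*
 * For illustration: for a pool whose top-level vdev is a two-way mirror,
 * the label's vdev tree is a "mirror" nvlist with a children array of two
 * "disk" nvlists.  The recursion above descends into each child and
 * returns the physpath/devid of whichever child's guid matches the guid
 * recorded in the label being examined.
 */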
1519
1520/*
1521 * Check the disk label information and retrieve needed vdev name-value pairs.
1522 *
1523 * Return:
1524 *	0 - success
1525 *	ERR_* - failure
1526 */
1527static int
1528check_pool_label(uint64_t sector, char *stack, char *outdevid,
1529    char *outpath, uint64_t *outguid, uint64_t *outashift, uint64_t *outversion)
1530{
1531	vdev_phys_t *vdev;
1532	uint64_t pool_state, txg = 0;
1533	char *nvlist, *nv, *features;
1534	uint64_t diskguid;
1535
1536	sector += (VDEV_SKIP_SIZE >> SPA_MINBLOCKSHIFT);
1537
1538	/* Read in the vdev name-value pair list (112K). */
1539	if (devread(sector, 0, VDEV_PHYS_SIZE, stack) == 0)
1540		return (ERR_READ);
1541
1542	vdev = (vdev_phys_t *)stack;
1543	stack += sizeof (vdev_phys_t);
1544
1545	if (nvlist_unpack(vdev->vp_nvlist, &nvlist))
1546		return (ERR_FSYS_CORRUPT);
1547
1548	if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_STATE, &pool_state,
1549	    DATA_TYPE_UINT64, NULL))
1550		return (ERR_FSYS_CORRUPT);
1551
1552	if (pool_state == POOL_STATE_DESTROYED)
1553		return (ERR_FILESYSTEM_NOT_FOUND);
1554
1555	if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_NAME,
1556	    current_rootpool, DATA_TYPE_STRING, NULL))
1557		return (ERR_FSYS_CORRUPT);
1558
1559	if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_TXG, &txg,
1560	    DATA_TYPE_UINT64, NULL))
1561		return (ERR_FSYS_CORRUPT);
1562
1563	/* not an active device */
1564	if (txg == 0)
1565		return (ERR_NO_BOOTPATH);
1566
1567	if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VERSION, outversion,
1568	    DATA_TYPE_UINT64, NULL))
1569		return (ERR_FSYS_CORRUPT);
1570	if (!SPA_VERSION_IS_SUPPORTED(*outversion))
1571		return (ERR_NEWER_VERSION);
1572	if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VDEV_TREE, &nv,
1573	    DATA_TYPE_NVLIST, NULL))
1574		return (ERR_FSYS_CORRUPT);
1575	if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_GUID, &diskguid,
1576	    DATA_TYPE_UINT64, NULL))
1577		return (ERR_FSYS_CORRUPT);
1578	if (nvlist_lookup_value(nv, ZPOOL_CONFIG_ASHIFT, outashift,
1579	    DATA_TYPE_UINT64, NULL) != 0)
1580		return (ERR_FSYS_CORRUPT);
1581	if (vdev_get_bootpath(nv, diskguid, outdevid, outpath, 0))
1582		return (ERR_NO_BOOTPATH);
1583	if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_GUID, outguid,
1584	    DATA_TYPE_UINT64, NULL))
1585		return (ERR_FSYS_CORRUPT);
1586
1587	if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1588	    &features, DATA_TYPE_NVLIST, NULL) == 0) {
1589		char *nvp;
1590		char *name = stack;
1591		stack += MAXNAMELEN;
1592
1593		for (nvp = nvlist_next_nvpair(features, NULL);
1594		    nvp != NULL;
1595		    nvp = nvlist_next_nvpair(features, nvp)) {
1596			zap_attribute_t za;
1597
1598			if (nvpair_name(nvp, name, MAXNAMELEN) != 0)
1599				return (ERR_FSYS_CORRUPT);
1600
1601			za.za_integer_length = 8;
1602			za.za_num_integers = 1;
1603			za.za_first_integer = 1;
1604			za.za_name = name;
1605			if (check_feature(&za, spa_feature_names, stack) != 0)
1606				return (ERR_NEWER_VERSION);
1607		}
1608	}
1609
1610	return (0);
1611}
1612
1613/*
 * zfs_mount() locates a valid uberblock of the root pool and reads in its
 * MOS at the memory address MOS.
1616 *
1617 * Return:
1618 *	1 - success
1619 *	0 - failure
1620 */
1621int
1622zfs_mount(void)
1623{
1624	char *stack, *ub_array;
1625	int label = 0;
1626	uberblock_t *ubbest;
1627	objset_phys_t *osp;
1628	char tmp_bootpath[MAXNAMELEN];
1629	char tmp_devid[MAXNAMELEN];
1630	uint64_t tmp_guid, ashift, version;
1631	uint64_t adjpl = (uint64_t)part_length << SPA_MINBLOCKSHIFT;
1632	int err = errnum; /* preserve previous errnum state */
1633
1634	/* if it's our first time here, zero the best uberblock out */
1635	if (best_drive == 0 && best_part == 0 && find_best_root) {
1636		grub_memset(&current_uberblock, 0, sizeof (uberblock_t));
1637		pool_guid = 0;
1638	}
1639
1640	stackbase = ZFS_SCRATCH;
1641	stack = stackbase;
1642	ub_array = stack;
1643	stack += VDEV_UBERBLOCK_RING;
1644
1645	osp = (objset_phys_t *)stack;
1646	stack += sizeof (objset_phys_t);
1647	adjpl = P2ALIGN(adjpl, (uint64_t)sizeof (vdev_label_t));
1648
1649	for (label = 0; label < VDEV_LABELS; label++) {
1650
1651		/*
		 * Some El Torito stacks don't give us a size, so we end up
		 * setting the size to MAXUINT; worse, some of these devices
		 * stop working once a single read past the end has been
		 * issued.  Checking for a maximal part_length and skipping
		 * the backup labels at the end of the slice/partition/device
		 * avoids breaking down on such devices.
1659		 */
1660		if (part_length == MAXUINT && label == 2)
1661			break;
1662
1663		uint64_t sector = vdev_label_start(adjpl,
1664		    label) >> SPA_MINBLOCKSHIFT;
1665
1666		/* Read in the uberblock ring (128K). */
1667		if (devread(sector  +
1668		    ((VDEV_SKIP_SIZE + VDEV_PHYS_SIZE) >> SPA_MINBLOCKSHIFT),
1669		    0, VDEV_UBERBLOCK_RING, ub_array) == 0)
1670			continue;
1671
1672		if (check_pool_label(sector, stack, tmp_devid,
1673		    tmp_bootpath, &tmp_guid, &ashift, &version))
1674			continue;
1675
1676		if (pool_guid == 0)
1677			pool_guid = tmp_guid;
1678
1679		if ((ubbest = find_bestub(ub_array, ashift, sector)) == NULL ||
1680		    zio_read(&ubbest->ub_rootbp, osp, stack) != 0)
1681			continue;
1682
1683		VERIFY_OS_TYPE(osp, DMU_OST_META);
1684
1685		if (version >= SPA_VERSION_FEATURES &&
1686		    check_mos_features(&osp->os_meta_dnode, stack) != 0)
1687			continue;
1688
1689		if (find_best_root && ((pool_guid != tmp_guid) ||
1690		    vdev_uberblock_compare(ubbest, &(current_uberblock)) <= 0))
1691			continue;
1692
1693		/* Got the MOS. Save it at the memory addr MOS. */
1694		grub_memmove(MOS, &osp->os_meta_dnode, DNODE_SIZE);
1695		grub_memmove(&current_uberblock, ubbest, sizeof (uberblock_t));
1696		grub_memmove(current_bootpath, tmp_bootpath, MAXNAMELEN);
1697		grub_memmove(current_devid, tmp_devid, grub_strlen(tmp_devid));
1698		is_zfs_mount = 1;
1699		return (1);
1700	}
1701
1702	/*
	 * Some filesystem implementations (e.g. tftp) rely on the global
	 * errnum being set and left alone, while others never reset it and
	 * would break when issuing raw reads.  The goal here is simply to
	 * keep zfs mount attempts from disturbing the previous state.
1707	 */
1708	errnum = err;
1709	return (0);
1710}
1711
1712/*
1713 * zfs_open() locates a file in the rootpool by following the
1714 * MOS and places the dnode of the file in the memory address DNODE.
1715 *
1716 * Return:
1717 *	1 - success
1718 *	0 - failure
1719 */
1720int
1721zfs_open(char *filename)
1722{
1723	char *stack;
1724	dnode_phys_t *mdn;
1725
1726	file_buf = NULL;
1727	stackbase = ZFS_SCRATCH;
1728	stack = stackbase;
1729
1730	mdn = (dnode_phys_t *)stack;
1731	stack += sizeof (dnode_phys_t);
1732
1733	dnode_mdn = NULL;
1734	dnode_buf = (dnode_phys_t *)stack;
1735	stack += 1<<DNODE_BLOCK_SHIFT;
1736
1737	/*
	 * menu.lst is placed at the top level of the root pool filesystem;
	 * do not descend into 'current_bootfs'.
1740	 */
1741	if (is_top_dataset_file(filename)) {
1742		if (errnum = get_objset_mdn(MOS, NULL, NULL, mdn, stack))
1743			return (0);
1744
1745		current_bootfs_obj = 0;
1746	} else {
1747		if (current_bootfs[0] == '\0') {
1748			/* Get the default root filesystem object number */
1749			if (errnum = get_default_bootfsobj(MOS,
1750			    &current_bootfs_obj, stack))
1751				return (0);
1752
1753			if (errnum = get_objset_mdn(MOS, NULL,
1754			    &current_bootfs_obj, mdn, stack))
1755				return (0);
1756		} else {
1757			if (errnum = get_objset_mdn(MOS, current_bootfs,
1758			    &current_bootfs_obj, mdn, stack)) {
1759				grub_memset(current_bootfs, 0, MAXNAMELEN);
1760				return (0);
1761			}
1762		}
1763	}
1764
1765	if (dnode_get_path(mdn, filename, DNODE, stack)) {
1766		errnum = ERR_FILE_NOT_FOUND;
1767		return (0);
1768	}
1769
1770	/* get the file size and set the file position to 0 */
1771
1772	/*
	 * For DMU_OT_SA we will need to locate the SIZE attribute,
	 * which could be either in the bonus buffer or the "spill"
	 * block.
1776	 */
1777	if (DNODE->dn_bonustype == DMU_OT_SA) {
1778		sa_hdr_phys_t *sahdrp;
1779		int hdrsize;
1780
1781		if (DNODE->dn_bonuslen != 0) {
1782			sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE);
1783		} else {
1784			if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
1785				blkptr_t *bp = &DNODE->dn_spill;
1786				void *buf;
1787
1788				buf = (void *)stack;
1789				stack += BP_GET_LSIZE(bp);
1790
1791				/* reset errnum to rawread() failure */
1792				errnum = 0;
1793				if (zio_read(bp, buf, stack) != 0) {
1794					return (0);
1795				}
1796				sahdrp = buf;
1797			} else {
1798				errnum = ERR_FSYS_CORRUPT;
1799				return (0);
1800			}
1801		}
1802		hdrsize = SA_HDR_SIZE(sahdrp);
1803		filemax = *(uint64_t *)((char *)sahdrp + hdrsize +
1804		    SA_SIZE_OFFSET);
1805	} else {
1806		filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size;
1807	}
1808	filepos = 0;
1809
1810	dnode_buf = NULL;
1811	return (1);
1812}
1813
1814/*
 * zfs_read() reads in the data blocks pointed to by the DNODE.
1816 *
1817 * Return:
1818 *	len - the length successfully read in to the buffer
1819 *	0   - failure
1820 */
1821int
1822zfs_read(char *buf, int len)
1823{
1824	char *stack;
1825	int blksz, length, movesize;
1826
1827	if (file_buf == NULL) {
1828		file_buf = stackbase;
1829		stackbase += SPA_MAXBLOCKSIZE;
1830		file_start = file_end = 0;
1831	}
1832	stack = stackbase;
1833
1834	/*
	 * If the requested range is already cached in file_buf, copy it
	 * into the buffer provided and return.
1836	 */
1837	if (filepos >= file_start && filepos+len <= file_end) {
1838		grub_memmove(buf, file_buf + filepos - file_start, len);
1839		filepos += len;
1840		return (len);
1841	}
1842
1843	blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1844
1845	/*
1846	 * Note: for GRUB, SPA_MAXBLOCKSIZE is 128KB.  There is not enough
1847	 * memory to allocate the new max blocksize (16MB), so while
1848	 * GRUB understands the large_blocks on-disk feature, it can't
1849	 * actually read large blocks.
1850	 */
1851	if (blksz > SPA_MAXBLOCKSIZE) {
1852		grub_printf("blocks larger than 128K are not supported\n");
1853		return (0);
1854	}
1855
1856	/*
	 * The requested range is not in the cache buffer, which holds at
	 * most one block, so read it in chunks.  This could be optimized
	 * to read in as large a chunk as there is space available, but
	 * for now this only reads in one data block at a time.
1861	 */
1862	length = len;
1863	while (length) {
1864		/*
1865		 * Find requested blkid and the offset within that block.
1866		 */
1867		uint64_t blkid = filepos / blksz;
1868
1869		if (errnum = dmu_read(DNODE, blkid, file_buf, stack))
1870			return (0);
1871
1872		file_start = blkid * blksz;
1873		file_end = file_start + blksz;
1874
1875		movesize = MIN(length, file_end - filepos);
1876
1877		grub_memmove(buf, file_buf + filepos - file_start,
1878		    movesize);
1879		buf += movesize;
1880		length -= movesize;
1881		filepos += movesize;
1882	}
1883
1884	return (len);
1885}
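
/*
 * For illustration (hypothetical values): with a 128K block size, a
 * zfs_read() of 100000 bytes starting at filepos 200000 first caches
 * block 1 (bytes 131072-262143) and copies 62144 bytes, then caches
 * block 2 and copies the remaining 37856 bytes, leaving filepos at
 * 300000.
 */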
1886
1887/*
1888 * No-Op
1889 */
1890int
1891zfs_embed(unsigned long long *start_sector, int needed_sectors)
1892{
1893	return (1);
1894}
1895
1896#endif /* FSYS_ZFS */
1897