/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2016, 2017 Intel Corporation.
 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
 */

/*
 * Functions to convert between a list of vdevs and an nvlist representing the
 * configuration.  Each entry in the list can be one of:
 *
 *	Device vdevs
 *		disk=(path=..., devid=...)
 *		file=(path=...)
 *
 *	Group vdevs
 *		raidz[1|2]=(...)
 *		mirror=(...)
 *
 *	Hot spares
 *
 * While the underlying implementation supports it, group vdevs cannot contain
 * other group vdevs.  All userland verification of devices is contained within
 * this file.  If successful, the nvlist returned can be passed directly to the
 * kernel; we've done as much verification as possible in userland.
 *
 * Hot spares are a special case, and passed down as an array of disk vdevs, at
 * the same level as the root of the vdev tree.
 *
 * The only function exported by this file is 'make_root_vdev'.  The
 * function performs several passes:
 *
 *	1. Construct the vdev specification.  Performs syntax validation and
 *         makes sure each device is valid.
 *	2. Check for devices in use.  Using libdiskmgt, makes sure that no
 *         devices are already in use.  Some can be overridden using the
 *         'force' flag, others cannot.
 *	3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
 *	4. Call libzfs to label any whole disks with an EFI label.
 */

65fa9e406ahrens#include <assert.h>
66fa9e406ahrens#include <devid.h>
67fa9e406ahrens#include <errno.h>
68fa9e406ahrens#include <fcntl.h>
69fa9e406ahrens#include <libdiskmgt.h>
70fa9e406ahrens#include <libintl.h>
71fa9e406ahrens#include <libnvpair.h>
72d8ab6e1Don Brady#include <libzutil.h>
73f94275cAdam Leventhal#include <limits.h>
745711d39loli#include <sys/spa.h>
75fa9e406ahrens#include <stdio.h>
76fa9e406ahrens#include <string.h>
77fa9e406ahrens#include <unistd.h>
78fa9e406ahrens#include <sys/efi_partition.h>
79fa9e406ahrens#include <sys/stat.h>
80fa9e406ahrens#include <sys/vtoc.h>
81fa9e406ahrens#include <sys/mntent.h>
83fa9e406ahrens#include "zpool_util.h"

/*
 * Slice suffix that is_whole_disk() appends to a disk name when building the
 * raw device path it probes.
 */
#define	BACKUP_SLICE	"s2"

88fa9e406ahrens * For any given vdev specification, we can have multiple errors.  The
89fa9e406ahrens * vdev_error() function keeps track of whether we have seen an error yet, and
90fa9e406ahrens * prints out a header if its the first error we've seen.
91fa9e406ahrens */
9299653d4eschrockboolean_t error_seen;
9399653d4eschrockboolean_t is_force;
9699653d4eschrockstatic void
97fa9e406ahrensvdev_error(const char *fmt, ...)
99fa9e406ahrens	va_list ap;
101fa9e406ahrens	if (!error_seen) {
102fa9e406ahrens		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
103fa9e406ahrens		if (!is_force)
104fa9e406ahrens			(void) fprintf(stderr, gettext("use '-f' to override "
105fa9e406ahrens			    "the following errors:\n"));
106fa9e406ahrens		else
107fa9e406ahrens			(void) fprintf(stderr, gettext("the following errors "
108fa9e406ahrens			    "must be manually repaired:\n"));
10999653d4eschrock		error_seen = B_TRUE;
110fa9e406ahrens	}
112fa9e406ahrens	va_start(ap, fmt);
113fa9e406ahrens	(void) vfprintf(stderr, fmt, ap);
114fa9e406ahrens	va_end(ap);
/*
 * Report a failure of libdiskmgt's in-use checking as a warning on stderr.
 *
 *	error	errno-style code handed back by a libdiskmgt call
 */
static void
libdiskmgt_error(int error)
{
	switch (error) {
	case ENXIO:
	case ENODEV:
		/*
		 * These are valid results if the device doesn't live in
		 * /dev/dsk.  Don't bother printing a message in this case.
		 */
		return;
	default:
		break;
	}

	(void) fprintf(stderr, gettext("warning: device in use checking "
	    "failed: %s\n"), strerror(error));
}

13246a2abfeschrock * Validate a device, passing the bulk of the work off to libdiskmgt.
133fa9e406ahrens */
1348488aebtaylorstatic int
13599653d4eschrockcheck_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
13746a2abfeschrock	char *msg;
13846a2abfeschrock	int error = 0;
13903a818bmmusante	dm_who_type_t who;
14103a818bmmusante	if (force)
14203a818bmmusante		who = DM_WHO_ZPOOL_FORCE;
14303a818bmmusante	else if (isspare)
14403a818bmmusante		who = DM_WHO_ZPOOL_SPARE;
14503a818bmmusante	else
14603a818bmmusante		who = DM_WHO_ZPOOL;
14803a818bmmusante	if (dm_inuse((char *)path, &msg, who, &error) || error) {
14946a2abfeschrock		if (error != 0) {
15046a2abfeschrock			libdiskmgt_error(error);
15146a2abfeschrock			return (0);
15246657f8mmusante		} else {
15346a2abfeschrock			vdev_error("%s", msg);
15446a2abfeschrock			free(msg);
155181c2f4mmusante			return (-1);
156fa9e406ahrens		}
157fa9e406ahrens	}
159fa9e406ahrens	/*
16046a2abfeschrock	 * If we're given a whole disk, ignore overlapping slices since we're
16146a2abfeschrock	 * about to label it anyway.
162fa9e406ahrens	 */
16346a2abfeschrock	error = 0;
16446a2abfeschrock	if (!wholedisk && !force &&
16546a2abfeschrock	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
166181c2f4mmusante		if (error == 0) {
167181c2f4mmusante			/* dm_isoverlapping returned -1 */
168181c2f4mmusante			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
169181c2f4mmusante			free(msg);
170181c2f4mmusante			return (-1);
171181c2f4mmusante		} else if (error != ENODEV) {
172181c2f4mmusante			/* libdiskmgt's devcache only handles physical drives */
17346a2abfeschrock			libdiskmgt_error(error);
17446a2abfeschrock			return (0);
175fa9e406ahrens		}
17646a2abfeschrock	}
178181c2f4mmusante	return (0);
183fa9e406ahrens * Validate a whole disk.  Iterate over all slices on the disk and make sure
184fa9e406ahrens * that none is in use by calling check_slice().
185fa9e406ahrens */
1868488aebtaylorstatic int
18799653d4eschrockcheck_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
189fa9e406ahrens	dm_descriptor_t *drive, *media, *slice;
190fa9e406ahrens	int err = 0;
191fa9e406ahrens	int i;
192fa9e406ahrens	int ret;
194fa9e406ahrens	/*
195fa9e406ahrens	 * Get the drive associated with this disk.  This should never fail,
196fa9e406ahrens	 * because we already have an alias handle open for the device.
197fa9e406ahrens	 */
198fa9e406ahrens	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
1998a5bcf7Toomas Soome	    &err)) == NULL || *drive == 0) {
20046a2abfeschrock		if (err)
20146a2abfeschrock			libdiskmgt_error(err);
20246a2abfeschrock		return (0);
20346a2abfeschrock	}
205fa9e406ahrens	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
20646a2abfeschrock	    &err)) == NULL) {
20746a2abfeschrock		dm_free_descriptors(drive);
20846a2abfeschrock		if (err)
20946a2abfeschrock			libdiskmgt_error(err);
21046a2abfeschrock		return (0);
21146a2abfeschrock	}
213fa9e406ahrens	dm_free_descriptors(drive);
215fa9e406ahrens	/*
216fa9e406ahrens	 * It is possible that the user has specified a removable media drive,
217fa9e406ahrens	 * and the media is not present.
218fa9e406ahrens	 */
2198a5bcf7Toomas Soome	if (*media == 0) {
220fa9e406ahrens		dm_free_descriptors(media);
22146a2abfeschrock		vdev_error(gettext("'%s' has no media in drive\n"), name);
222fa9e406ahrens		return (-1);
223fa9e406ahrens	}
225fa9e406ahrens	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
22646a2abfeschrock	    &err)) == NULL) {
22746a2abfeschrock		dm_free_descriptors(media);
22846a2abfeschrock		if (err)
22946a2abfeschrock			libdiskmgt_error(err);
23046a2abfeschrock		return (0);
23146a2abfeschrock	}
233fa9e406ahrens	dm_free_descriptors(media);
235fa9e406ahrens	ret = 0;
237fa9e406ahrens	/*
238fa9e406ahrens	 * Iterate over all slices and report any errors.  We don't care about
239fa9e406ahrens	 * overlapping slices because we are using the whole disk.
240fa9e406ahrens	 */
2418a5bcf7Toomas Soome	for (i = 0; slice[i] != 0; i++) {
24299653d4eschrock		char *name = dm_get_name(slice[i], &err);
24499653d4eschrock		if (check_slice(name, force, B_TRUE, isspare) != 0)
245fa9e406ahrens			ret = -1;
24799653d4eschrock		dm_free_name(name);
248fa9e406ahrens	}
250fa9e406ahrens	dm_free_descriptors(slice);
251fa9e406ahrens	return (ret);
25546a2abfeschrock * Validate a device.
256fa9e406ahrens */
2578488aebtaylorstatic int
25899653d4eschrockcheck_device(const char *path, boolean_t force, boolean_t isspare)
260fa9e406ahrens	dm_descriptor_t desc;
261fa9e406ahrens	int err;
26246a2abfeschrock	char *dev;
264fa9e406ahrens	/*
265fa9e406ahrens	 * For whole disks, libdiskmgt does not include the leading dev path.
266fa9e406ahrens	 */
267fa9e406ahrens	dev = strrchr(path, '/');
268fa9e406ahrens	assert(dev != NULL);
269fa9e406ahrens	dev++;
2708a5bcf7Toomas Soome	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != 0) {
27199653d4eschrock		err = check_disk(path, desc, force, isspare);
27246a2abfeschrock		dm_free_descriptor(desc);
27346a2abfeschrock		return (err);
274fa9e406ahrens	}
27699653d4eschrock	return (check_slice(path, force, B_FALSE, isspare));
280fa9e406ahrens * Check that a file is valid.  All we can do in this case is check that it's
281181c2f4mmusante * not in use by another pool, and not in use by swap.
282fa9e406ahrens */
2838488aebtaylorstatic int
28499653d4eschrockcheck_file(const char *file, boolean_t force, boolean_t isspare)
28646a2abfeschrock	char  *name;
287fa9e406ahrens	int fd;
288fa9e406ahrens	int ret = 0;
289181c2f4mmusante	int err;
29046a2abfeschrock	pool_state_t state;
29199653d4eschrock	boolean_t inuse;
293181c2f4mmusante	if (dm_inuse_swap(file, &err)) {
294181c2f4mmusante		if (err)
295181c2f4mmusante			libdiskmgt_error(err);
296181c2f4mmusante		else
297181c2f4mmusante			vdev_error(gettext("%s is currently used by swap. "
298181c2f4mmusante			    "Please see swap(1M).\n"), file);
299181c2f4mmusante		return (-1);
300181c2f4mmusante	}
302fa9e406ahrens	if ((fd = open(file, O_RDONLY)) < 0)
303fa9e406ahrens		return (0);
30599653d4eschrock	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
30646a2abfeschrock		const char *desc;
30846a2abfeschrock		switch (state) {
30946a2abfeschrock		case POOL_STATE_ACTIVE:
31046a2abfeschrock			desc = gettext("active");
31146a2abfeschrock			break;
31346a2abfeschrock		case POOL_STATE_EXPORTED:
31446a2abfeschrock			desc = gettext("exported");
31546a2abfeschrock			break;
31746a2abfeschrock		case POOL_STATE_POTENTIALLY_ACTIVE:
31846a2abfeschrock			desc = gettext("potentially active");
31946a2abfeschrock			break;
32146a2abfeschrock		default:
32246a2abfeschrock			desc = gettext("unknown");
32346a2abfeschrock			break;
32446a2abfeschrock		}
32699653d4eschrock		/*
32799653d4eschrock		 * Allow hot spares to be shared between pools.
32899653d4eschrock		 */
32999653d4eschrock		if (state == POOL_STATE_SPARE && isspare)
33099653d4eschrock			return (0);
33299653d4eschrock		if (state == POOL_STATE_ACTIVE ||
33399653d4eschrock		    state == POOL_STATE_SPARE || !force) {
33499653d4eschrock			switch (state) {
33599653d4eschrock			case POOL_STATE_SPARE:
33699653d4eschrock				vdev_error(gettext("%s is reserved as a hot "
33799653d4eschrock				    "spare for pool %s\n"), file, name);
33899653d4eschrock				break;
33999653d4eschrock			default:
34099653d4eschrock				vdev_error(gettext("%s is part of %s pool "
34199653d4eschrock				    "'%s'\n"), file, desc, name);
34299653d4eschrock				break;
34399653d4eschrock			}
344fa9e406ahrens			ret = -1;
345fa9e406ahrens		}
347fa9e406ahrens		free(name);
348fa9e406ahrens	}
350fa9e406ahrens	(void) close(fd);
351fa9e406ahrens	return (ret);
3568488aebtaylor * By "whole disk" we mean an entire physical disk (something we can
3578488aebtaylor * label, toggle the write cache on, etc.) as opposed to the full
3588488aebtaylor * capacity of a pseudo-device such as lofi or did.  We act as if we
3598488aebtaylor * are labeling the disk, which should be a pretty good test of whether
3608488aebtaylor * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
3618488aebtaylor * it isn't.
3628488aebtaylor */
36399653d4eschrockstatic boolean_t
3648488aebtayloris_whole_disk(const char *arg)
3668488aebtaylor	struct dk_gpt *label;
3678488aebtaylor	int	fd;
3688488aebtaylor	char	path[MAXPATHLEN];
3708488aebtaylor	(void) snprintf(path, sizeof (path), "%s%s%s",
3716401734Will Andrews	    ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
3728488aebtaylor	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
3738488aebtaylor		return (B_FALSE);
3748488aebtaylor	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
3758488aebtaylor		(void) close(fd);
3768488aebtaylor		return (B_FALSE);
3778488aebtaylor	}
3788488aebtaylor	efi_free(label);
3798488aebtaylor	(void) close(fd);
3808488aebtaylor	return (B_TRUE);
384fa9e406ahrens * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
385fa9e406ahrens * device, fill in the device id to make a complete nvlist.  Valid forms for a
386fa9e406ahrens * leaf vdev are:
387fa9e406ahrens *
3888a5bcf7Toomas Soome *	/dev/dsk/xxx	Complete disk path
3898a5bcf7Toomas Soome *	/xxx		Full path to file
3908a5bcf7Toomas Soome *	xxx		Shorthand for /dev/dsk/xxx
391fa9e406ahrens */
3928488aebtaylorstatic nvlist_t *
3935711d39lolimake_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
395fa9e406ahrens	char path[MAXPATHLEN];
396fa9e406ahrens	struct stat64 statbuf;
397fa9e406ahrens	nvlist_t *vdev = NULL;
398fa9e406ahrens	char *type = NULL;
39999653d4eschrock	boolean_t wholedisk = B_FALSE;
4005711d39loli	uint64_t ashift = 0;
402fa9e406ahrens	/*
403fa9e406ahrens	 * Determine what type of vdev this is, and put the full path into
404fa9e406ahrens	 * 'path'.  We detect whether this is a device of file afterwards by
405fa9e406ahrens	 * checking the st_mode of the file.
406fa9e406ahrens	 */
407fa9e406ahrens	if (arg[0] == '/') {
408fa9e406ahrens		/*
409fa9e406ahrens		 * Complete device or file path.  Exact type is determined by
410fa9e406ahrens		 * examining the file descriptor afterwards.
411fa9e406ahrens		 */
4128488aebtaylor		wholedisk = is_whole_disk(arg);
4138488aebtaylor		if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
414fa9e406ahrens			(void) fprintf(stderr,
415fa9e406ahrens			    gettext("cannot open '%s': %s\n"),
416fa9e406ahrens			    arg, strerror(errno));
417fa9e406ahrens			return (NULL);
418fa9e406ahrens		}
420fa9e406ahrens		(void) strlcpy(path, arg, sizeof (path));
421fa9e406ahrens	} else {
422fa9e406ahrens		/*
423fa9e406ahrens		 * This may be a short path for a device, or it could be total
424fa9e406ahrens		 * gibberish.  Check to see if it's a known device in
425fa9e406ahrens		 * /dev/dsk/.  As part of this check, see if we've been given a
426fa9e406ahrens		 * an entire disk (minus the slice number).
427fa9e406ahrens		 */
4286401734Will Andrews		(void) snprintf(path, sizeof (path), "%s/%s", ZFS_DISK_ROOT,
429fa9e406ahrens		    arg);
4308488aebtaylor		wholedisk = is_whole_disk(path);
4318488aebtaylor		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
432fa9e406ahrens			/*
433fa9e406ahrens			 * If we got ENOENT, then the user gave us
434fa9e406ahrens			 * gibberish, so try to direct them with a
435fa9e406ahrens			 * reasonable error message.  Otherwise,
436fa9e406ahrens			 * regurgitate strerror() since it's the best we
437fa9e406ahrens			 * can do.
438fa9e406ahrens			 */
439fa9e406ahrens			if (errno == ENOENT) {
440fa9e406ahrens				(void) fprintf(stderr,
441fa9e406ahrens				    gettext("cannot open '%s': no such "
4426401734Will Andrews				    "device in %s\n"), arg, ZFS_DISK_ROOT);
443fa9e406ahrens				(void) fprintf(stderr,
444fa9e406ahrens				    gettext("must be a full path or "
445fa9e406ahrens				    "shorthand device name\n"));
446fa9e406ahrens				return (NULL);
447fa9e406ahrens			} else {
448fa9e406ahrens				(void) fprintf(stderr,
449fa9e406ahrens				    gettext("cannot open '%s': %s\n"),
450fa9e406ahrens				    path, strerror(errno));
451fa9e406ahrens				return (NULL);
452fa9e406ahrens			}
453fa9e406ahrens		}
454fa9e406ahrens	}
456fa9e406ahrens	/*
457fa9e406ahrens	 * Determine whether this is a device or a file.
458fa9e406ahrens	 */
4598488aebtaylor	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
460fa9e406ahrens		type = VDEV_TYPE_DISK;
461fa9e406ahrens	} else if (S_ISREG(statbuf.st_mode)) {
462fa9e406ahrens		type = VDEV_TYPE_FILE;
463fa9e406ahrens	} else {
464fa9e406ahrens		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
465fa9e406ahrens		    "block device or regular file\n"), path);
466fa9e406ahrens		return (NULL);
467fa9e406ahrens	}
469fa9e406ahrens	/*
470fa9e406ahrens	 * Finally, we have the complete device or file, and we know that it is
471fa9e406ahrens	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
472fa9e406ahrens	 * vdevs have a 'path' element, and devices also have a 'devid' element.
473fa9e406ahrens	 */
474fa9e406ahrens	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
475fa9e406ahrens	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
476fa9e406ahrens	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
4778654d02perrin	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
478663207aDon Brady	if (is_log)
479663207aDon Brady		verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
480663207aDon Brady		    VDEV_ALLOC_BIAS_LOG) == 0);
481afefbcdeschrock	if (strcmp(type, VDEV_TYPE_DISK) == 0)
482afefbcdeschrock		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
483afefbcdeschrock		    (uint64_t)wholedisk) == 0);
4855711d39loli	if (props != NULL) {
4865711d39loli		char *value = NULL;
4885711d39loli		if (nvlist_lookup_string(props,
4895711d39loli		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
4905711d39loli			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
4915711d39loli				(void) fprintf(stderr,
4925711d39loli				    gettext("ashift must be a number.\n"));
4935711d39loli				return (NULL);
4945711d39loli			}
4955711d39loli			if (ashift != 0 &&
4965711d39loli			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
4975711d39loli				(void) fprintf(stderr,
4985711d39loli				    gettext("invalid 'ashift=%" PRIu64 "' "
4995711d39loli				    "property: only values between %" PRId32 " "
5005711d39loli				    "and %" PRId32 " are allowed.\n"),
5015711d39loli				    ashift, ASHIFT_MIN, ASHIFT_MAX);
5025711d39loli				return (NULL);
5035711d39loli			}
5045711d39loli		}
5055711d39loli	}
507fa9e406ahrens	/*
508fa9e406ahrens	 * For a whole disk, defer getting its devid until after labeling it.
509fa9e406ahrens	 */
510fa9e406ahrens	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
511fa9e406ahrens		/*
512fa9e406ahrens		 * Get the devid for the device.
513fa9e406ahrens		 */
514fa9e406ahrens		int fd;
515fa9e406ahrens		ddi_devid_t devid;
516fa9e406ahrens		char *minor = NULL, *devid_str = NULL;
518fa9e406ahrens		if ((fd = open(path, O_RDONLY)) < 0) {
519fa9e406ahrens			(void) fprintf(stderr, gettext("cannot open '%s': "
520fa9e406ahrens			    "%s\n"), path, strerror(errno));
521fa9e406ahrens			nvlist_free(vdev);
522fa9e406ahrens			return (NULL);
523fa9e406ahrens		}
525fa9e406ahrens		if (devid_get(fd, &devid) == 0) {
526fa9e406ahrens			if (devid_get_minor_name(fd, &minor) == 0 &&
527fa9e406ahrens			    (devid_str = devid_str_encode(devid, minor)) !=
528fa9e406ahrens			    NULL) {
529fa9e406ahrens				verify(nvlist_add_string(vdev,
530fa9e406ahrens				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
531fa9e406ahrens			}
532fa9e406ahrens			if (devid_str != NULL)
533fa9e406ahrens				devid_str_free(devid_str);
534fa9e406ahrens			if (minor != NULL)
535fa9e406ahrens				devid_str_free(minor);
536fa9e406ahrens			devid_free(devid);
537fa9e406ahrens		}
539fa9e406ahrens		(void) close(fd);
540fa9e406ahrens	}
5425711d39loli	if (ashift > 0)
5435711d39loli		(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
545fa9e406ahrens	return (vdev);
/*
 * Go through and verify the replication level of the pool is consistent.
 * Performs the following checks:
 *
 *	For the new spec, verifies that devices in mirrors and raidz are the
 *	same size.
 *
 *	If the current configuration already has inconsistent replication
 *	levels, ignore any other potential problems in the new spec.
 *
 *	Otherwise, make sure that the current spec (if there is one) and the new
 *	spec have consistent replication levels.
 *
 *	If there is no current spec (create), make sure new spec has at least
 *	one general purpose vdev.
 */
typedef struct replication_level {
	char *zprl_type;	/* vdev type string of the toplevel vdev */
	uint64_t zprl_children;	/* child count; 1 for a plain disk or file */
	uint64_t zprl_parity;	/* raidz parity; 0 for every other type */
} replication_level_t;

/*
 * Size difference, in bytes (16MB), tolerated between the devices of one
 * mirror or raidz before get_replication() reports them as different sizes.
 */
#define	ZPOOL_FUZZ	(16 * 1024 * 1024)

572663207aDon Bradystatic boolean_t
573663207aDon Bradyis_raidz_mirror(replication_level_t *a, replication_level_t *b,
574663207aDon Brady    replication_level_t **raidz, replication_level_t **mirror)
575663207aDon Brady{
576663207aDon Brady	if (strcmp(a->zprl_type, "raidz") == 0 &&
577663207aDon Brady	    strcmp(b->zprl_type, "mirror") == 0) {
578663207aDon Brady		*raidz = a;
579663207aDon Brady		*mirror = b;
580663207aDon Brady		return (B_TRUE);
581663207aDon Brady	}
582663207aDon Brady	return (B_FALSE);
583663207aDon Brady}
584663207aDon Brady
/*
 * Given a list of toplevel vdevs, return the current replication level.  If
 * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
 * an error message will be displayed for each self-inconsistent vdev.
 */
5908488aebtaylorstatic replication_level_t *
59199653d4eschrockget_replication(nvlist_t *nvroot, boolean_t fatal)
593fa9e406ahrens	nvlist_t **top;
594fa9e406ahrens	uint_t t, toplevels;
595fa9e406ahrens	nvlist_t **child;
596fa9e406ahrens	uint_t c, children;
597fa9e406ahrens	nvlist_t *nv;
598fa9e406ahrens	char *type;
599b327cd3Igor Kozhukhov	replication_level_t lastrep = {0};
600b327cd3Igor Kozhukhov	replication_level_t rep;
601b327cd3Igor Kozhukhov	replication_level_t *ret;
602663207aDon Brady	replication_level_t *raidz, *mirror;
60399653d4eschrock	boolean_t dontreport;
605fa9e406ahrens	ret = safe_malloc(sizeof (replication_level_t));
607fa9e406ahrens	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
608fa9e406ahrens	    &top, &toplevels) == 0);
610fa9e406ahrens	for (t = 0; t < toplevels; t++) {
6118654d02perrin		uint64_t is_log = B_FALSE;
613fa9e406ahrens		nv = top[t];
6158654d02perrin		/*
6168654d02perrin		 * For separate logs we ignore the top level vdev replication
6178654d02perrin		 * constraints.
6188654d02perrin		 */
6198654d02perrin		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
6208654d02perrin		if (is_log)
6218654d02perrin			continue;
6238654d02perrin		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
6248654d02perrin		    &type) == 0);
625fa9e406ahrens		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
626fa9e406ahrens		    &child, &children) != 0) {
627fa9e406ahrens			/*
628fa9e406ahrens			 * This is a 'file' or 'disk' vdev.
629fa9e406ahrens			 */
63099653d4eschrock			rep.zprl_type = type;
63199653d4eschrock			rep.zprl_children = 1;
63299653d4eschrock			rep.zprl_parity = 0;
633fa9e406ahrens		} else {
634fa9e406ahrens			uint64_t vdev_size;
636fa9e406ahrens			/*
637fa9e406ahrens			 * This is a mirror or RAID-Z vdev.  Go through and make
638fa9e406ahrens			 * sure the contents are all the same (files vs. disks),
639fa9e406ahrens			 * keeping track of the number of elements in the
640fa9e406ahrens			 * process.
641fa9e406ahrens			 *
642fa9e406ahrens			 * We also check that the size of each vdev (if it can
643fa9e406ahrens			 * be determined) is the same.
644fa9e406ahrens			 */
64599653d4eschrock			rep.zprl_type = type;
64699653d4eschrock			rep.zprl_children = 0;
64899653d4eschrock			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
64999653d4eschrock				verify(nvlist_lookup_uint64(nv,
65099653d4eschrock				    ZPOOL_CONFIG_NPARITY,
65199653d4eschrock				    &rep.zprl_parity) == 0);
65299653d4eschrock				assert(rep.zprl_parity != 0);
65399653d4eschrock			} else {
65499653d4eschrock				rep.zprl_parity = 0;
65599653d4eschrock			}
657fa9e406ahrens			/*
6588654d02perrin			 * The 'dontreport' variable indicates that we've
659fa9e406ahrens			 * already reported an error for this spec, so don't
660fa9e406ahrens			 * bother doing it again.
661fa9e406ahrens			 */
662fa9e406ahrens			type = NULL;
663fa9e406ahrens			dontreport = 0;
664fa9e406ahrens			vdev_size = -1ULL;
665fa9e406ahrens			for (c = 0; c < children; c++) {
666fa9e406ahrens				nvlist_t *cnv = child[c];
667fa9e406ahrens				char *path;
668fa9e406ahrens				struct stat64 statbuf;
669fa9e406ahrens				uint64_t size = -1ULL;
670fa9e406ahrens				char *childtype;
671fa9e406ahrens				int fd, err;
67399653d4eschrock				rep.zprl_children++;
675fa9e406ahrens				verify(nvlist_lookup_string(cnv,
676fa9e406ahrens				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
67894de1d4eschrock				/*
6798654d02perrin				 * If this is a replacing or spare vdev, then
680ac0215floli				 * get the real first child of the vdev: do this
681ac0215floli				 * in a loop because replacing and spare vdevs
682ac0215floli				 * can be nested.
68394de1d4eschrock				 */
684ac0215floli				while (strcmp(childtype,
68594de1d4eschrock				    VDEV_TYPE_REPLACING) == 0 ||
68694de1d4eschrock				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
68794de1d4eschrock					nvlist_t **rchild;
68894de1d4eschrock					uint_t rchildren;
69094de1d4eschrock					verify(nvlist_lookup_nvlist_array(cnv,
69194de1d4eschrock					    ZPOOL_CONFIG_CHILDREN, &rchild,
69294de1d4eschrock					    &rchildren) == 0);
69394de1d4eschrock					assert(rchildren == 2);
69494de1d4eschrock					cnv = rchild[0];
69694de1d4eschrock					verify(nvlist_lookup_string(cnv,
69794de1d4eschrock					    ZPOOL_CONFIG_TYPE,
69894de1d4eschrock					    &childtype) == 0);
69994de1d4eschrock				}
701fa9e406ahrens				verify(nvlist_lookup_string(cnv,
702fa9e406ahrens				    ZPOOL_CONFIG_PATH, &path) == 0);
704fa9e406ahrens				/*
705fa9e406ahrens				 * If we have a raidz/mirror that combines disks
706fa9e406ahrens				 * with files, report it as an error.
707fa9e406ahrens				 */
708fa9e406ahrens				if (!dontreport && type != NULL &&
709fa9e406ahrens				    strcmp(type, childtype) != 0) {
710fa9e406ahrens					if (ret != NULL)
711fa9e406ahrens						free(ret);
712fa9e406ahrens					ret = NULL;
713fa9e406ahrens					if (fatal)
714fa9e406ahrens						vdev_error(gettext(
715fa9e406ahrens						    "mismatched replication "
716fa9e406ahrens						    "level: %s contains both "
717fa9e406ahrens						    "files and devices\n"),
71899653d4eschrock						    rep.zprl_type);
719fa9e406ahrens					else
720fa9e406ahrens						return (NULL);
72199653d4eschrock					dontreport = B_TRUE;
722fa9e406ahrens				}
724fa9e406ahrens				/*
725fa9e406ahrens				 * According to stat(2), the value of 'st_size'
726fa9e406ahrens				 * is undefined for block devices and character
727fa9e406ahrens				 * devices.  But there is no effective way to
728fa9e406ahrens				 * determine the real size in userland.
729fa9e406ahrens				 *
730fa9e406ahrens				 * Instead, we'll take advantage of an
731fa9e406ahrens				 * implementation detail of spec_size().  If the
732fa9e406ahrens				 * device is currently open, then we (should)
733fa9e406ahrens				 * return a valid size.
734fa9e406ahrens				 *
735fa9e406ahrens				 * If we still don't get a valid size (indicated
736fa9e406ahrens				 * by a size of 0 or MAXOFFSET_T), then ignore
737fa9e406ahrens				 * this device altogether.
738fa9e406ahrens				 */
739fa9e406ahrens				if ((fd = open(path, O_RDONLY)) >= 0) {
740fa9e406ahrens					err = fstat64(fd, &statbuf);
741fa9e406ahrens					(void) close(fd);
742fa9e406ahrens				} else {
743fa9e406ahrens					err = stat64(path, &statbuf);
744fa9e406ahrens				}
746fa9e406ahrens				if (err != 0 ||
747fa9e406ahrens				    statbuf.st_size == 0 ||
748fa9e406ahrens				    statbuf.st_size == MAXOFFSET_T)
749fa9e406ahrens					continue;
751fa9e406ahrens				size = statbuf.st_size;
753fa9e406ahrens				/*
7548488aebtaylor				 * Also make sure that devices and
7558488aebtaylor				 * slices have a consistent size.  If
7568488aebtaylor				 * they differ by a significant amount
7578488aebtaylor				 * (~16MB) then report an error.
758fa9e406ahrens				 */
7598488aebtaylor				if (!dontreport &&
7608488aebtaylor				    (vdev_size != -1ULL &&
7618488aebtaylor				    (labs(size - vdev_size) >
7628488aebtaylor				    ZPOOL_FUZZ))) {
763fa9e406ahrens					if (ret != NULL)
764fa9e406ahrens						free(ret);
765fa9e406ahrens					ret = NULL;
766fa9e406ahrens					if (fatal)
767fa9e406ahrens						vdev_error(gettext(
768fa9e406ahrens						    "%s contains devices of "
769fa9e406ahrens						    "different sizes\n"),
77099653d4eschrock						    rep.zprl_type);
771fa9e406ahrens					else
772fa9e406ahrens						return (NULL);
77399653d4eschrock					dontreport = B_TRUE;
774fa9e406ahrens				}
776fa9e406ahrens				type = childtype;
777fa9e406ahrens				vdev_size = size;
778fa9e406ahrens			}
779fa9e406ahrens		}
781fa9e406ahrens		/*
782fa9e406ahrens		 * At this point, we have the replication of the last toplevel
783663207aDon Brady		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
784fa9e406ahrens		 * different.
785fa9e406ahrens		 */
78699653d4eschrock		if (lastrep.zprl_type != NULL) {
787663207aDon Brady			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
788663207aDon Brady			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
789663207aDon Brady				/*
790663207aDon Brady				 * Accepted raidz and mirror when they can
791663207aDon Brady				 * handle the same number of disk failures.
792663207aDon Brady				 */
793663207aDon Brady				if (raidz->zprl_parity !=
794663207aDon Brady				    mirror->zprl_children - 1) {
795663207aDon Brady					if (ret != NULL)
796663207aDon Brady						free(ret);
797663207aDon Brady					ret = NULL;
798663207aDon Brady					if (fatal)
799663207aDon Brady						vdev_error(gettext(
800663207aDon Brady						    "mismatched replication "
801663207aDon Brady						    "level: "
802663207aDon Brady						    "%s and %s vdevs with "
803663207aDon Brady						    "different redundancy, "
804663207aDon Brady						    "%llu vs. %llu (%llu-way) "
805663207aDon Brady						    "are present\n"),
806663207aDon Brady						    raidz->zprl_type,
807663207aDon Brady						    mirror->zprl_type,
808663207aDon Brady						    raidz->zprl_parity,
809663207aDon Brady						    mirror->zprl_children - 1,
810663207aDon Brady						    mirror->zprl_children);
811663207aDon Brady					else
812663207aDon Brady						return (NULL);
813663207aDon Brady				}
814663207aDon Brady			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
815663207aDon Brady			    0) {
816fa9e406ahrens				if (ret != NULL)
817fa9e406ahrens					free(ret);
818fa9e406ahrens				ret = NULL;
819fa9e406ahrens				if (fatal)
820fa9e406ahrens					vdev_error(gettext(
82199653d4eschrock					    "mismatched replication level: "