1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5ea8dc4b6Seschrock * Common Development and Distribution License (the "License"). 6ea8dc4b6Seschrock * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21*99653d4eSeschrock 22fa9e4066Sahrens /* 2346a2abf2Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24fa9e4066Sahrens * Use is subject to license terms. 25fa9e4066Sahrens */ 26fa9e4066Sahrens 27fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28fa9e4066Sahrens 29fa9e4066Sahrens /* 30fa9e4066Sahrens * Functions to convert between a list of vdevs and an nvlist representing the 31fa9e4066Sahrens * configuration. Each entry in the list can be one of: 32fa9e4066Sahrens * 33fa9e4066Sahrens * Device vdevs 34fa9e4066Sahrens * disk=(path=..., devid=...) 35fa9e4066Sahrens * file=(path=...) 36fa9e4066Sahrens * 37fa9e4066Sahrens * Group vdevs 38*99653d4eSeschrock * raidz[1|2]=(...) 39fa9e4066Sahrens * mirror=(...) 40fa9e4066Sahrens * 41*99653d4eSeschrock * Hot spares 42*99653d4eSeschrock * 43fa9e4066Sahrens * While the underlying implementation supports it, group vdevs cannot contain 44fa9e4066Sahrens * other group vdevs. All userland verification of devices is contained within 45fa9e4066Sahrens * this file. If successful, the nvlist returned can be passed directly to the 46fa9e4066Sahrens * kernel; we've done as much verification as possible in userland. 47fa9e4066Sahrens * 48*99653d4eSeschrock * Hot spares are a special case, and passed down as an array of disk vdevs, at 49*99653d4eSeschrock * the same level as the root of the vdev tree. 50*99653d4eSeschrock * 51fa9e4066Sahrens * The only function exported by this file is 'get_vdev_spec'. The function 52fa9e4066Sahrens * performs several passes: 53fa9e4066Sahrens * 54fa9e4066Sahrens * 1. Construct the vdev specification. Performs syntax validation and 55fa9e4066Sahrens * makes sure each device is valid. 56fa9e4066Sahrens * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57fa9e4066Sahrens * devices are also in use. Some can be overridden using the 'force' 58fa9e4066Sahrens * flag, others cannot. 59fa9e4066Sahrens * 3. Check for replication errors if the 'force' flag is not specified. 60fa9e4066Sahrens * validates that the replication level is consistent across the 61fa9e4066Sahrens * entire pool. 62fa9e4066Sahrens * 4. Label any whole disks with an EFI label. 63fa9e4066Sahrens */ 64fa9e4066Sahrens 65fa9e4066Sahrens #include <assert.h> 66fa9e4066Sahrens #include <devid.h> 67fa9e4066Sahrens #include <errno.h> 68fa9e4066Sahrens #include <fcntl.h> 69fa9e4066Sahrens #include <libdiskmgt.h> 70fa9e4066Sahrens #include <libintl.h> 71fa9e4066Sahrens #include <libnvpair.h> 72fa9e4066Sahrens #include <stdio.h> 73fa9e4066Sahrens #include <string.h> 74fa9e4066Sahrens #include <unistd.h> 75fa9e4066Sahrens #include <sys/efi_partition.h> 76fa9e4066Sahrens #include <sys/stat.h> 77fa9e4066Sahrens #include <sys/vtoc.h> 78fa9e4066Sahrens #include <sys/mntent.h> 79fa9e4066Sahrens 80fa9e4066Sahrens #include <libzfs.h> 81fa9e4066Sahrens 82fa9e4066Sahrens #include "zpool_util.h" 83fa9e4066Sahrens 84fa9e4066Sahrens #define DISK_ROOT "/dev/dsk" 85fa9e4066Sahrens #define RDISK_ROOT "/dev/rdsk" 86fa9e4066Sahrens #define BACKUP_SLICE "s2" 87fa9e4066Sahrens 88fa9e4066Sahrens /* 89fa9e4066Sahrens * For any given vdev specification, we can have multiple errors. The 90fa9e4066Sahrens * vdev_error() function keeps track of whether we have seen an error yet, and 91fa9e4066Sahrens * prints out a header if its the first error we've seen. 92fa9e4066Sahrens */ 93*99653d4eSeschrock boolean_t error_seen; 94*99653d4eSeschrock boolean_t is_force; 95fa9e4066Sahrens 96*99653d4eSeschrock /*PRINTFLIKE1*/ 97*99653d4eSeschrock static void 98fa9e4066Sahrens vdev_error(const char *fmt, ...) 99fa9e4066Sahrens { 100fa9e4066Sahrens va_list ap; 101fa9e4066Sahrens 102fa9e4066Sahrens if (!error_seen) { 103fa9e4066Sahrens (void) fprintf(stderr, gettext("invalid vdev specification\n")); 104fa9e4066Sahrens if (!is_force) 105fa9e4066Sahrens (void) fprintf(stderr, gettext("use '-f' to override " 106fa9e4066Sahrens "the following errors:\n")); 107fa9e4066Sahrens else 108fa9e4066Sahrens (void) fprintf(stderr, gettext("the following errors " 109fa9e4066Sahrens "must be manually repaired:\n")); 110*99653d4eSeschrock error_seen = B_TRUE; 111fa9e4066Sahrens } 112fa9e4066Sahrens 113fa9e4066Sahrens va_start(ap, fmt); 114fa9e4066Sahrens (void) vfprintf(stderr, fmt, ap); 115fa9e4066Sahrens va_end(ap); 116fa9e4066Sahrens } 117fa9e4066Sahrens 11846a2abf2Seschrock static void 11946a2abf2Seschrock libdiskmgt_error(int error) 120fa9e4066Sahrens { 121ea8dc4b6Seschrock /* 122*99653d4eSeschrock * ENXIO/ENODEV is a valid error message if the device doesn't live in 123ea8dc4b6Seschrock * /dev/dsk. Don't bother printing an error message in this case. 124ea8dc4b6Seschrock */ 125*99653d4eSeschrock if (error == ENXIO || error == ENODEV) 126ea8dc4b6Seschrock return; 127ea8dc4b6Seschrock 12846a2abf2Seschrock (void) fprintf(stderr, gettext("warning: device in use checking " 12946a2abf2Seschrock "failed: %s\n"), strerror(error)); 130fa9e4066Sahrens } 131fa9e4066Sahrens 132fa9e4066Sahrens /* 13346a2abf2Seschrock * Validate a device, passing the bulk of the work off to libdiskmgt. 134fa9e4066Sahrens */ 135fa9e4066Sahrens int 136*99653d4eSeschrock check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 137fa9e4066Sahrens { 13846a2abf2Seschrock char *msg; 13946a2abf2Seschrock int error = 0; 140fa9e4066Sahrens int ret = 0; 141fa9e4066Sahrens 14246a2abf2Seschrock if (dm_inuse((char *)path, &msg, 14346a2abf2Seschrock force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL, &error) || error) { 14446a2abf2Seschrock if (error != 0) { 14546a2abf2Seschrock libdiskmgt_error(error); 14646a2abf2Seschrock return (0); 147*99653d4eSeschrock } else if (!isspare || 148*99653d4eSeschrock strstr(msg, gettext("hot spare")) == NULL) { 149*99653d4eSeschrock /* 150*99653d4eSeschrock * The above check is a rather severe hack. It would 151*99653d4eSeschrock * probably make more sense to have DM_WHO_ZPOOL_SPARE 152*99653d4eSeschrock * instead. 153*99653d4eSeschrock */ 15446a2abf2Seschrock vdev_error("%s", msg); 15546a2abf2Seschrock free(msg); 156*99653d4eSeschrock ret = -1; 157fa9e4066Sahrens } 158fa9e4066Sahrens 159fa9e4066Sahrens } 160fa9e4066Sahrens 161fa9e4066Sahrens /* 16246a2abf2Seschrock * If we're given a whole disk, ignore overlapping slices since we're 16346a2abf2Seschrock * about to label it anyway. 164fa9e4066Sahrens */ 16546a2abf2Seschrock error = 0; 16646a2abf2Seschrock if (!wholedisk && !force && 16746a2abf2Seschrock (dm_isoverlapping((char *)path, &msg, &error) || error)) { 16846a2abf2Seschrock if (error != 0) { 16946a2abf2Seschrock libdiskmgt_error(error); 17046a2abf2Seschrock return (0); 171fa9e4066Sahrens } else { 17246a2abf2Seschrock vdev_error("%s overlaps with %s\n", path, msg); 17346a2abf2Seschrock free(msg); 174fa9e4066Sahrens } 175fa9e4066Sahrens 17646a2abf2Seschrock ret = -1; 17746a2abf2Seschrock } 178fa9e4066Sahrens 17946a2abf2Seschrock return (ret); 180fa9e4066Sahrens } 181fa9e4066Sahrens 182fa9e4066Sahrens /* 183fa9e4066Sahrens * Validate a whole disk. Iterate over all slices on the disk and make sure 184fa9e4066Sahrens * that none is in use by calling check_slice(). 185fa9e4066Sahrens */ 186fa9e4066Sahrens /* ARGSUSED */ 187fa9e4066Sahrens int 188*99653d4eSeschrock check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 189fa9e4066Sahrens { 190fa9e4066Sahrens dm_descriptor_t *drive, *media, *slice; 191fa9e4066Sahrens int err = 0; 192fa9e4066Sahrens int i; 193fa9e4066Sahrens int ret; 194fa9e4066Sahrens 195fa9e4066Sahrens /* 196fa9e4066Sahrens * Get the drive associated with this disk. This should never fail, 197fa9e4066Sahrens * because we already have an alias handle open for the device. 198fa9e4066Sahrens */ 199fa9e4066Sahrens if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 20046a2abf2Seschrock &err)) == NULL || *drive == NULL) { 20146a2abf2Seschrock if (err) 20246a2abf2Seschrock libdiskmgt_error(err); 20346a2abf2Seschrock return (0); 20446a2abf2Seschrock } 205fa9e4066Sahrens 206fa9e4066Sahrens if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 20746a2abf2Seschrock &err)) == NULL) { 20846a2abf2Seschrock dm_free_descriptors(drive); 20946a2abf2Seschrock if (err) 21046a2abf2Seschrock libdiskmgt_error(err); 21146a2abf2Seschrock return (0); 21246a2abf2Seschrock } 213fa9e4066Sahrens 214fa9e4066Sahrens dm_free_descriptors(drive); 215fa9e4066Sahrens 216fa9e4066Sahrens /* 217fa9e4066Sahrens * It is possible that the user has specified a removable media drive, 218fa9e4066Sahrens * and the media is not present. 219fa9e4066Sahrens */ 220fa9e4066Sahrens if (*media == NULL) { 221fa9e4066Sahrens dm_free_descriptors(media); 22246a2abf2Seschrock vdev_error(gettext("'%s' has no media in drive\n"), name); 223fa9e4066Sahrens return (-1); 224fa9e4066Sahrens } 225fa9e4066Sahrens 226fa9e4066Sahrens if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 22746a2abf2Seschrock &err)) == NULL) { 22846a2abf2Seschrock dm_free_descriptors(media); 22946a2abf2Seschrock if (err) 23046a2abf2Seschrock libdiskmgt_error(err); 23146a2abf2Seschrock return (0); 23246a2abf2Seschrock } 233fa9e4066Sahrens 234fa9e4066Sahrens dm_free_descriptors(media); 235fa9e4066Sahrens 236fa9e4066Sahrens ret = 0; 237fa9e4066Sahrens 238fa9e4066Sahrens /* 239fa9e4066Sahrens * Iterate over all slices and report any errors. We don't care about 240fa9e4066Sahrens * overlapping slices because we are using the whole disk. 241fa9e4066Sahrens */ 242fa9e4066Sahrens for (i = 0; slice[i] != NULL; i++) { 243*99653d4eSeschrock char *name = dm_get_name(slice[i], &err); 244*99653d4eSeschrock 245*99653d4eSeschrock if (check_slice(name, force, B_TRUE, isspare) != 0) 246fa9e4066Sahrens ret = -1; 247*99653d4eSeschrock 248*99653d4eSeschrock dm_free_name(name); 249fa9e4066Sahrens } 250fa9e4066Sahrens 251fa9e4066Sahrens dm_free_descriptors(slice); 252fa9e4066Sahrens return (ret); 253fa9e4066Sahrens } 254fa9e4066Sahrens 255fa9e4066Sahrens /* 25646a2abf2Seschrock * Validate a device. 257fa9e4066Sahrens */ 258fa9e4066Sahrens int 259*99653d4eSeschrock check_device(const char *path, boolean_t force, boolean_t isspare) 260fa9e4066Sahrens { 261fa9e4066Sahrens dm_descriptor_t desc; 262fa9e4066Sahrens int err; 26346a2abf2Seschrock char *dev; 264fa9e4066Sahrens 265fa9e4066Sahrens /* 266fa9e4066Sahrens * For whole disks, libdiskmgt does not include the leading dev path. 267fa9e4066Sahrens */ 268fa9e4066Sahrens dev = strrchr(path, '/'); 269fa9e4066Sahrens assert(dev != NULL); 270fa9e4066Sahrens dev++; 27146a2abf2Seschrock if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 272*99653d4eSeschrock err = check_disk(path, desc, force, isspare); 27346a2abf2Seschrock dm_free_descriptor(desc); 27446a2abf2Seschrock return (err); 275fa9e4066Sahrens } 276fa9e4066Sahrens 277*99653d4eSeschrock return (check_slice(path, force, B_FALSE, isspare)); 278fa9e4066Sahrens } 279fa9e4066Sahrens 280fa9e4066Sahrens /* 281fa9e4066Sahrens * Check that a file is valid. All we can do in this case is check that it's 282fa9e4066Sahrens * not in use by another pool. 283fa9e4066Sahrens */ 284fa9e4066Sahrens int 285*99653d4eSeschrock check_file(const char *file, boolean_t force, boolean_t isspare) 286fa9e4066Sahrens { 28746a2abf2Seschrock char *name; 288fa9e4066Sahrens int fd; 289fa9e4066Sahrens int ret = 0; 29046a2abf2Seschrock pool_state_t state; 291*99653d4eSeschrock boolean_t inuse; 292fa9e4066Sahrens 293fa9e4066Sahrens if ((fd = open(file, O_RDONLY)) < 0) 294fa9e4066Sahrens return (0); 295fa9e4066Sahrens 296*99653d4eSeschrock if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 29746a2abf2Seschrock const char *desc; 29846a2abf2Seschrock 29946a2abf2Seschrock switch (state) { 30046a2abf2Seschrock case POOL_STATE_ACTIVE: 30146a2abf2Seschrock desc = gettext("active"); 30246a2abf2Seschrock break; 30346a2abf2Seschrock 30446a2abf2Seschrock case POOL_STATE_EXPORTED: 30546a2abf2Seschrock desc = gettext("exported"); 30646a2abf2Seschrock break; 30746a2abf2Seschrock 30846a2abf2Seschrock case POOL_STATE_POTENTIALLY_ACTIVE: 30946a2abf2Seschrock desc = gettext("potentially active"); 31046a2abf2Seschrock break; 31146a2abf2Seschrock 31246a2abf2Seschrock default: 31346a2abf2Seschrock desc = gettext("unknown"); 31446a2abf2Seschrock break; 31546a2abf2Seschrock } 31646a2abf2Seschrock 317*99653d4eSeschrock /* 318*99653d4eSeschrock * Allow hot spares to be shared between pools. 319*99653d4eSeschrock */ 320*99653d4eSeschrock if (state == POOL_STATE_SPARE && isspare) 321*99653d4eSeschrock return (0); 322*99653d4eSeschrock 323*99653d4eSeschrock if (state == POOL_STATE_ACTIVE || 324*99653d4eSeschrock state == POOL_STATE_SPARE || !force) { 325*99653d4eSeschrock switch (state) { 326*99653d4eSeschrock case POOL_STATE_SPARE: 327*99653d4eSeschrock vdev_error(gettext("%s is reserved as a hot " 328*99653d4eSeschrock "spare for pool %s\n"), file, name); 329*99653d4eSeschrock break; 330*99653d4eSeschrock default: 331*99653d4eSeschrock vdev_error(gettext("%s is part of %s pool " 332*99653d4eSeschrock "'%s'\n"), file, desc, name); 333*99653d4eSeschrock break; 334*99653d4eSeschrock } 335fa9e4066Sahrens ret = -1; 336fa9e4066Sahrens } 337fa9e4066Sahrens 338fa9e4066Sahrens free(name); 339fa9e4066Sahrens } 340fa9e4066Sahrens 341fa9e4066Sahrens (void) close(fd); 342fa9e4066Sahrens return (ret); 343fa9e4066Sahrens } 344fa9e4066Sahrens 345*99653d4eSeschrock static boolean_t 346fa9e4066Sahrens is_whole_disk(const char *arg, struct stat64 *statbuf) 347fa9e4066Sahrens { 348fa9e4066Sahrens char path[MAXPATHLEN]; 349fa9e4066Sahrens 350fa9e4066Sahrens (void) snprintf(path, sizeof (path), "%s%s", arg, BACKUP_SLICE); 351fa9e4066Sahrens if (stat64(path, statbuf) == 0) 352*99653d4eSeschrock return (B_TRUE); 353fa9e4066Sahrens 354*99653d4eSeschrock return (B_FALSE); 355fa9e4066Sahrens } 356fa9e4066Sahrens 357fa9e4066Sahrens /* 358fa9e4066Sahrens * Create a leaf vdev. Determine if this is a file or a device. If it's a 359fa9e4066Sahrens * device, fill in the device id to make a complete nvlist. Valid forms for a 360fa9e4066Sahrens * leaf vdev are: 361fa9e4066Sahrens * 362fa9e4066Sahrens * /dev/dsk/xxx Complete disk path 363fa9e4066Sahrens * /xxx Full path to file 364fa9e4066Sahrens * xxx Shorthand for /dev/dsk/xxx 365fa9e4066Sahrens */ 366fa9e4066Sahrens nvlist_t * 367fa9e4066Sahrens make_leaf_vdev(const char *arg) 368fa9e4066Sahrens { 369fa9e4066Sahrens char path[MAXPATHLEN]; 370fa9e4066Sahrens struct stat64 statbuf; 371fa9e4066Sahrens nvlist_t *vdev = NULL; 372fa9e4066Sahrens char *type = NULL; 373*99653d4eSeschrock boolean_t wholedisk = B_FALSE; 374fa9e4066Sahrens 375fa9e4066Sahrens /* 376fa9e4066Sahrens * Determine what type of vdev this is, and put the full path into 377fa9e4066Sahrens * 'path'. We detect whether this is a device of file afterwards by 378fa9e4066Sahrens * checking the st_mode of the file. 379fa9e4066Sahrens */ 380fa9e4066Sahrens if (arg[0] == '/') { 381fa9e4066Sahrens /* 382fa9e4066Sahrens * Complete device or file path. Exact type is determined by 383fa9e4066Sahrens * examining the file descriptor afterwards. 384fa9e4066Sahrens */ 385fa9e4066Sahrens if (is_whole_disk(arg, &statbuf)) { 386*99653d4eSeschrock wholedisk = B_TRUE; 387fa9e4066Sahrens } else if (stat64(arg, &statbuf) != 0) { 388fa9e4066Sahrens (void) fprintf(stderr, 389fa9e4066Sahrens gettext("cannot open '%s': %s\n"), 390fa9e4066Sahrens arg, strerror(errno)); 391fa9e4066Sahrens return (NULL); 392fa9e4066Sahrens } 393fa9e4066Sahrens 394fa9e4066Sahrens (void) strlcpy(path, arg, sizeof (path)); 395fa9e4066Sahrens } else { 396fa9e4066Sahrens /* 397fa9e4066Sahrens * This may be a short path for a device, or it could be total 398fa9e4066Sahrens * gibberish. Check to see if it's a known device in 399fa9e4066Sahrens * /dev/dsk/. As part of this check, see if we've been given a 400fa9e4066Sahrens * an entire disk (minus the slice number). 401fa9e4066Sahrens */ 402fa9e4066Sahrens (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 403fa9e4066Sahrens arg); 404fa9e4066Sahrens if (is_whole_disk(path, &statbuf)) { 405*99653d4eSeschrock wholedisk = B_TRUE; 406fa9e4066Sahrens } else if (stat64(path, &statbuf) != 0) { 407fa9e4066Sahrens /* 408fa9e4066Sahrens * If we got ENOENT, then the user gave us 409fa9e4066Sahrens * gibberish, so try to direct them with a 410fa9e4066Sahrens * reasonable error message. Otherwise, 411fa9e4066Sahrens * regurgitate strerror() since it's the best we 412fa9e4066Sahrens * can do. 413fa9e4066Sahrens */ 414fa9e4066Sahrens if (errno == ENOENT) { 415fa9e4066Sahrens (void) fprintf(stderr, 416fa9e4066Sahrens gettext("cannot open '%s': no such " 417fa9e4066Sahrens "device in %s\n"), arg, DISK_ROOT); 418fa9e4066Sahrens (void) fprintf(stderr, 419fa9e4066Sahrens gettext("must be a full path or " 420fa9e4066Sahrens "shorthand device name\n")); 421fa9e4066Sahrens return (NULL); 422fa9e4066Sahrens } else { 423fa9e4066Sahrens (void) fprintf(stderr, 424fa9e4066Sahrens gettext("cannot open '%s': %s\n"), 425fa9e4066Sahrens path, strerror(errno)); 426fa9e4066Sahrens return (NULL); 427fa9e4066Sahrens } 428fa9e4066Sahrens } 429fa9e4066Sahrens } 430fa9e4066Sahrens 431fa9e4066Sahrens /* 432fa9e4066Sahrens * Determine whether this is a device or a file. 433fa9e4066Sahrens */ 434fa9e4066Sahrens if (S_ISBLK(statbuf.st_mode)) { 435fa9e4066Sahrens type = VDEV_TYPE_DISK; 436fa9e4066Sahrens } else if (S_ISREG(statbuf.st_mode)) { 437fa9e4066Sahrens type = VDEV_TYPE_FILE; 438fa9e4066Sahrens } else { 439fa9e4066Sahrens (void) fprintf(stderr, gettext("cannot use '%s': must be a " 440fa9e4066Sahrens "block device or regular file\n"), path); 441fa9e4066Sahrens return (NULL); 442fa9e4066Sahrens } 443fa9e4066Sahrens 444fa9e4066Sahrens /* 445fa9e4066Sahrens * Finally, we have the complete device or file, and we know that it is 446fa9e4066Sahrens * acceptable to use. Construct the nvlist to describe this vdev. All 447fa9e4066Sahrens * vdevs have a 'path' element, and devices also have a 'devid' element. 448fa9e4066Sahrens */ 449fa9e4066Sahrens verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 450fa9e4066Sahrens verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 451fa9e4066Sahrens verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 452afefbcddSeschrock if (strcmp(type, VDEV_TYPE_DISK) == 0) 453afefbcddSeschrock verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 454afefbcddSeschrock (uint64_t)wholedisk) == 0); 455fa9e4066Sahrens 456fa9e4066Sahrens /* 457fa9e4066Sahrens * For a whole disk, defer getting its devid until after labeling it. 458fa9e4066Sahrens */ 459fa9e4066Sahrens if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 460fa9e4066Sahrens /* 461fa9e4066Sahrens * Get the devid for the device. 462fa9e4066Sahrens */ 463fa9e4066Sahrens int fd; 464fa9e4066Sahrens ddi_devid_t devid; 465fa9e4066Sahrens char *minor = NULL, *devid_str = NULL; 466fa9e4066Sahrens 467fa9e4066Sahrens if ((fd = open(path, O_RDONLY)) < 0) { 468fa9e4066Sahrens (void) fprintf(stderr, gettext("cannot open '%s': " 469fa9e4066Sahrens "%s\n"), path, strerror(errno)); 470fa9e4066Sahrens nvlist_free(vdev); 471fa9e4066Sahrens return (NULL); 472fa9e4066Sahrens } 473fa9e4066Sahrens 474fa9e4066Sahrens if (devid_get(fd, &devid) == 0) { 475fa9e4066Sahrens if (devid_get_minor_name(fd, &minor) == 0 && 476fa9e4066Sahrens (devid_str = devid_str_encode(devid, minor)) != 477fa9e4066Sahrens NULL) { 478fa9e4066Sahrens verify(nvlist_add_string(vdev, 479fa9e4066Sahrens ZPOOL_CONFIG_DEVID, devid_str) == 0); 480fa9e4066Sahrens } 481fa9e4066Sahrens if (devid_str != NULL) 482fa9e4066Sahrens devid_str_free(devid_str); 483fa9e4066Sahrens if (minor != NULL) 484fa9e4066Sahrens devid_str_free(minor); 485fa9e4066Sahrens devid_free(devid); 486fa9e4066Sahrens } 487fa9e4066Sahrens 488fa9e4066Sahrens (void) close(fd); 489fa9e4066Sahrens } 490fa9e4066Sahrens 491fa9e4066Sahrens return (vdev); 492fa9e4066Sahrens } 493fa9e4066Sahrens 494fa9e4066Sahrens /* 495fa9e4066Sahrens * Go through and verify the replication level of the pool is consistent. 496fa9e4066Sahrens * Performs the following checks: 497fa9e4066Sahrens * 498fa9e4066Sahrens * For the new spec, verifies that devices in mirrors and raidz are the 499fa9e4066Sahrens * same size. 500fa9e4066Sahrens * 501fa9e4066Sahrens * If the current configuration already has inconsistent replication 502fa9e4066Sahrens * levels, ignore any other potential problems in the new spec. 503fa9e4066Sahrens * 504fa9e4066Sahrens * Otherwise, make sure that the current spec (if there is one) and the new 505fa9e4066Sahrens * spec have consistent replication levels. 506fa9e4066Sahrens */ 507fa9e4066Sahrens typedef struct replication_level { 508*99653d4eSeschrock char *zprl_type; 509*99653d4eSeschrock uint64_t zprl_children; 510*99653d4eSeschrock uint64_t zprl_parity; 511fa9e4066Sahrens } replication_level_t; 512fa9e4066Sahrens 513fa9e4066Sahrens /* 514fa9e4066Sahrens * Given a list of toplevel vdevs, return the current replication level. If 515fa9e4066Sahrens * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 516fa9e4066Sahrens * an error message will be displayed for each self-inconsistent vdev. 517fa9e4066Sahrens */ 518fa9e4066Sahrens replication_level_t * 519*99653d4eSeschrock get_replication(nvlist_t *nvroot, boolean_t fatal) 520fa9e4066Sahrens { 521fa9e4066Sahrens nvlist_t **top; 522fa9e4066Sahrens uint_t t, toplevels; 523fa9e4066Sahrens nvlist_t **child; 524fa9e4066Sahrens uint_t c, children; 525fa9e4066Sahrens nvlist_t *nv; 526fa9e4066Sahrens char *type; 527fa9e4066Sahrens replication_level_t lastrep, rep, *ret; 528*99653d4eSeschrock boolean_t dontreport; 529fa9e4066Sahrens 530fa9e4066Sahrens ret = safe_malloc(sizeof (replication_level_t)); 531fa9e4066Sahrens 532fa9e4066Sahrens verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 533fa9e4066Sahrens &top, &toplevels) == 0); 534fa9e4066Sahrens 535*99653d4eSeschrock lastrep.zprl_type = NULL; 536fa9e4066Sahrens for (t = 0; t < toplevels; t++) { 537fa9e4066Sahrens nv = top[t]; 538fa9e4066Sahrens 539fa9e4066Sahrens verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 540fa9e4066Sahrens 541fa9e4066Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 542fa9e4066Sahrens &child, &children) != 0) { 543fa9e4066Sahrens /* 544fa9e4066Sahrens * This is a 'file' or 'disk' vdev. 545fa9e4066Sahrens */ 546*99653d4eSeschrock rep.zprl_type = type; 547*99653d4eSeschrock rep.zprl_children = 1; 548*99653d4eSeschrock rep.zprl_parity = 0; 549fa9e4066Sahrens } else { 550fa9e4066Sahrens uint64_t vdev_size; 551fa9e4066Sahrens 552fa9e4066Sahrens /* 553fa9e4066Sahrens * This is a mirror or RAID-Z vdev. Go through and make 554fa9e4066Sahrens * sure the contents are all the same (files vs. disks), 555fa9e4066Sahrens * keeping track of the number of elements in the 556fa9e4066Sahrens * process. 557fa9e4066Sahrens * 558fa9e4066Sahrens * We also check that the size of each vdev (if it can 559fa9e4066Sahrens * be determined) is the same. 560fa9e4066Sahrens */ 561*99653d4eSeschrock rep.zprl_type = type; 562*99653d4eSeschrock rep.zprl_children = 0; 563*99653d4eSeschrock 564*99653d4eSeschrock if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 565*99653d4eSeschrock verify(nvlist_lookup_uint64(nv, 566*99653d4eSeschrock ZPOOL_CONFIG_NPARITY, 567*99653d4eSeschrock &rep.zprl_parity) == 0); 568*99653d4eSeschrock assert(rep.zprl_parity != 0); 569*99653d4eSeschrock } else { 570*99653d4eSeschrock rep.zprl_parity = 0; 571*99653d4eSeschrock } 572fa9e4066Sahrens 573fa9e4066Sahrens /* 574fa9e4066Sahrens * The 'dontreport' variable indicatest that we've 575fa9e4066Sahrens * already reported an error for this spec, so don't 576fa9e4066Sahrens * bother doing it again. 577fa9e4066Sahrens */ 578fa9e4066Sahrens type = NULL; 579fa9e4066Sahrens dontreport = 0; 580fa9e4066Sahrens vdev_size = -1ULL; 581fa9e4066Sahrens for (c = 0; c < children; c++) { 582fa9e4066Sahrens nvlist_t *cnv = child[c]; 583fa9e4066Sahrens char *path; 584fa9e4066Sahrens struct stat64 statbuf; 585fa9e4066Sahrens uint64_t size = -1ULL; 586fa9e4066Sahrens char *childtype; 587fa9e4066Sahrens int fd, err; 588fa9e4066Sahrens 589*99653d4eSeschrock rep.zprl_children++; 590fa9e4066Sahrens 591fa9e4066Sahrens verify(nvlist_lookup_string(cnv, 592fa9e4066Sahrens ZPOOL_CONFIG_TYPE, &childtype) == 0); 593fa9e4066Sahrens verify(nvlist_lookup_string(cnv, 594fa9e4066Sahrens ZPOOL_CONFIG_PATH, &path) == 0); 595fa9e4066Sahrens 596fa9e4066Sahrens /* 597fa9e4066Sahrens * If we have a raidz/mirror that combines disks 598fa9e4066Sahrens * with files, report it as an error. 599fa9e4066Sahrens */ 600fa9e4066Sahrens if (!dontreport && type != NULL && 601fa9e4066Sahrens strcmp(type, childtype) != 0) { 602fa9e4066Sahrens if (ret != NULL) 603fa9e4066Sahrens free(ret); 604fa9e4066Sahrens ret = NULL; 605fa9e4066Sahrens if (fatal) 606fa9e4066Sahrens vdev_error(gettext( 607fa9e4066Sahrens "mismatched replication " 608fa9e4066Sahrens "level: %s contains both " 609fa9e4066Sahrens "files and devices\n"), 610*99653d4eSeschrock rep.zprl_type); 611fa9e4066Sahrens else 612fa9e4066Sahrens return (NULL); 613*99653d4eSeschrock dontreport = B_TRUE; 614fa9e4066Sahrens } 615fa9e4066Sahrens 616fa9e4066Sahrens /* 617fa9e4066Sahrens * According to stat(2), the value of 'st_size' 618fa9e4066Sahrens * is undefined for block devices and character 619fa9e4066Sahrens * devices. But there is no effective way to 620fa9e4066Sahrens * determine the real size in userland. 621fa9e4066Sahrens * 622fa9e4066Sahrens * Instead, we'll take advantage of an 623fa9e4066Sahrens * implementation detail of spec_size(). If the 624fa9e4066Sahrens * device is currently open, then we (should) 625fa9e4066Sahrens * return a valid size. 626fa9e4066Sahrens * 627fa9e4066Sahrens * If we still don't get a valid size (indicated 628fa9e4066Sahrens * by a size of 0 or MAXOFFSET_T), then ignore 629fa9e4066Sahrens * this device altogether. 630fa9e4066Sahrens */ 631fa9e4066Sahrens if ((fd = open(path, O_RDONLY)) >= 0) { 632fa9e4066Sahrens err = fstat64(fd, &statbuf); 633fa9e4066Sahrens (void) close(fd); 634fa9e4066Sahrens } else { 635fa9e4066Sahrens err = stat64(path, &statbuf); 636fa9e4066Sahrens } 637fa9e4066Sahrens 638fa9e4066Sahrens if (err != 0 || 639fa9e4066Sahrens statbuf.st_size == 0 || 640fa9e4066Sahrens statbuf.st_size == MAXOFFSET_T) 641fa9e4066Sahrens continue; 642fa9e4066Sahrens 643fa9e4066Sahrens size = statbuf.st_size; 644fa9e4066Sahrens 645fa9e4066Sahrens /* 646fa9e4066Sahrens * Also check the size of each device. If they 647fa9e4066Sahrens * differ, then report an error. 648fa9e4066Sahrens */ 649fa9e4066Sahrens if (!dontreport && vdev_size != -1ULL && 650fa9e4066Sahrens size != vdev_size) { 651fa9e4066Sahrens if (ret != NULL) 652fa9e4066Sahrens free(ret); 653fa9e4066Sahrens ret = NULL; 654fa9e4066Sahrens if (fatal) 655fa9e4066Sahrens vdev_error(gettext( 656fa9e4066Sahrens "%s contains devices of " 657fa9e4066Sahrens "different sizes\n"), 658*99653d4eSeschrock rep.zprl_type); 659fa9e4066Sahrens else 660fa9e4066Sahrens return (NULL); 661*99653d4eSeschrock dontreport = B_TRUE; 662fa9e4066Sahrens } 663fa9e4066Sahrens 664fa9e4066Sahrens type = childtype; 665fa9e4066Sahrens vdev_size = size; 666fa9e4066Sahrens } 667fa9e4066Sahrens } 668fa9e4066Sahrens 669fa9e4066Sahrens /* 670fa9e4066Sahrens * At this point, we have the replication of the last toplevel 671fa9e4066Sahrens * vdev in 'rep'. Compare it to 'lastrep' to see if its 672fa9e4066Sahrens * different. 673fa9e4066Sahrens */ 674*99653d4eSeschrock if (lastrep.zprl_type != NULL) { 675*99653d4eSeschrock if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 676fa9e4066Sahrens if (ret != NULL) 677fa9e4066Sahrens free(ret); 678fa9e4066Sahrens ret = NULL; 679fa9e4066Sahrens if (fatal) 680fa9e4066Sahrens vdev_error(gettext( 681*99653d4eSeschrock "mismatched replication level: " 682*99653d4eSeschrock "both %s and %s vdevs are " 683fa9e4066Sahrens "present\n"), 684*99653d4eSeschrock lastrep.zprl_type, rep.zprl_type); 685fa9e4066Sahrens else 686fa9e4066Sahrens return (NULL); 687*99653d4eSeschrock } else if (lastrep.zprl_parity != rep.zprl_parity) { 688fa9e4066Sahrens if (ret) 689fa9e4066Sahrens free(ret); 690fa9e4066Sahrens ret = NULL; 691fa9e4066Sahrens if (fatal) 692fa9e4066Sahrens vdev_error(gettext( 693*99653d4eSeschrock "mismatched replication level: " 694*99653d4eSeschrock "both %llu and %llu device parity " 695*99653d4eSeschrock "%s vdevs are present\n"), 696*99653d4eSeschrock lastrep.zprl_parity, 697*99653d4eSeschrock rep.zprl_parity, 698*99653d4eSeschrock rep.zprl_type); 699*99653d4eSeschrock else 700*99653d4eSeschrock return (NULL); 701*99653d4eSeschrock } else if (lastrep.zprl_children != rep.zprl_children) { 702*99653d4eSeschrock if (ret) 703*99653d4eSeschrock free(ret); 704*99653d4eSeschrock ret = NULL; 705*99653d4eSeschrock if (fatal) 706*99653d4eSeschrock vdev_error(gettext( 707*99653d4eSeschrock "mismatched replication level: " 708*99653d4eSeschrock "both %llu-way and %llu-way %s " 709fa9e4066Sahrens "vdevs are present\n"), 710*99653d4eSeschrock lastrep.zprl_children, 711*99653d4eSeschrock rep.zprl_children, 712*99653d4eSeschrock rep.zprl_type); 713fa9e4066Sahrens else 714fa9e4066Sahrens return (NULL); 715fa9e4066Sahrens } 716fa9e4066Sahrens } 717fa9e4066Sahrens lastrep = rep; 718fa9e4066Sahrens } 719fa9e4066Sahrens 720*99653d4eSeschrock if (ret != NULL) 721*99653d4eSeschrock *ret = rep; 722fa9e4066Sahrens 723fa9e4066Sahrens return (ret); 724fa9e4066Sahrens } 725fa9e4066Sahrens 726fa9e4066Sahrens /* 727fa9e4066Sahrens * Check the replication level of the vdev spec against the current pool. Calls 728fa9e4066Sahrens * get_replication() to make sure the new spec is self-consistent. If the pool 729fa9e4066Sahrens * has a consistent replication level, then we ignore any errors. Otherwise, 730fa9e4066Sahrens * report any difference between the two. 731fa9e4066Sahrens */ 732fa9e4066Sahrens int 733fa9e4066Sahrens check_replication(nvlist_t *config, nvlist_t *newroot) 734fa9e4066Sahrens { 735fa9e4066Sahrens replication_level_t *current = NULL, *new; 736fa9e4066Sahrens int ret; 737fa9e4066Sahrens 738fa9e4066Sahrens /* 739fa9e4066Sahrens * If we have a current pool configuration, check to see if it's 740fa9e4066Sahrens * self-consistent. If not, simply return success. 741fa9e4066Sahrens */ 742fa9e4066Sahrens if (config != NULL) { 743fa9e4066Sahrens nvlist_t *nvroot; 744fa9e4066Sahrens 745fa9e4066Sahrens verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 746fa9e4066Sahrens &nvroot) == 0); 747*99653d4eSeschrock if ((current = get_replication(nvroot, B_FALSE)) == NULL) 748fa9e4066Sahrens return (0); 749fa9e4066Sahrens } 750fa9e4066Sahrens 751fa9e4066Sahrens /* 752fa9e4066Sahrens * Get the replication level of the new vdev spec, reporting any 753fa9e4066Sahrens * inconsistencies found. 754fa9e4066Sahrens */ 755*99653d4eSeschrock if ((new = get_replication(newroot, B_TRUE)) == NULL) { 756fa9e4066Sahrens free(current); 757fa9e4066Sahrens return (-1); 758fa9e4066Sahrens } 759fa9e4066Sahrens 760fa9e4066Sahrens /* 761fa9e4066Sahrens * Check to see if the new vdev spec matches the replication level of 762fa9e4066Sahrens * the current pool. 763fa9e4066Sahrens */ 764fa9e4066Sahrens ret = 0; 765fa9e4066Sahrens if (current != NULL) { 766*99653d4eSeschrock if (strcmp(current->zprl_type, new->zprl_type) != 0) { 767fa9e4066Sahrens vdev_error(gettext( 768*99653d4eSeschrock "mismatched replication level: pool uses %s " 769*99653d4eSeschrock "and new vdev is %s\n"), 770*99653d4eSeschrock current->zprl_type, new->zprl_type); 771*99653d4eSeschrock ret = -1; 772*99653d4eSeschrock } else if (current->zprl_parity != new->zprl_parity) { 773*99653d4eSeschrock vdev_error(gettext( 774*99653d4eSeschrock "mismatched replication level: pool uses %llu " 775*99653d4eSeschrock "device parity and new vdev uses %llu\n"), 776*99653d4eSeschrock current->zprl_parity, new->zprl_parity); 777*99653d4eSeschrock ret = -1; 778*99653d4eSeschrock } else if (current->zprl_children != new->zprl_children) { 779*99653d4eSeschrock vdev_error(gettext( 780*99653d4eSeschrock "mismatched replication level: pool uses %llu-way " 781*99653d4eSeschrock "%s and new vdev uses %llu-way %s\n"), 782*99653d4eSeschrock current->zprl_children, current->zprl_type, 783*99653d4eSeschrock new->zprl_children, new->zprl_type); 784fa9e4066Sahrens ret = -1; 785fa9e4066Sahrens } 786fa9e4066Sahrens } 787fa9e4066Sahrens 788fa9e4066Sahrens free(new); 789fa9e4066Sahrens if (current != NULL) 790fa9e4066Sahrens free(current); 791fa9e4066Sahrens 792fa9e4066Sahrens return (ret); 793fa9e4066Sahrens } 794fa9e4066Sahrens 795fa9e4066Sahrens /* 796fa9e4066Sahrens * Label an individual disk. The name provided is the short name, stripped of 797fa9e4066Sahrens * any leading /dev path. 798fa9e4066Sahrens */ 799fa9e4066Sahrens int 800fa9e4066Sahrens label_disk(char *name) 801fa9e4066Sahrens { 802fa9e4066Sahrens char path[MAXPATHLEN]; 803fa9e4066Sahrens struct dk_gpt *vtoc; 804fa9e4066Sahrens int fd; 805fa9e4066Sahrens size_t resv = 16384; 806fa9e4066Sahrens 807fa9e4066Sahrens (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, 808fa9e4066Sahrens BACKUP_SLICE); 809fa9e4066Sahrens 810fa9e4066Sahrens if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { 811fa9e4066Sahrens /* 812fa9e4066Sahrens * This shouldn't happen. We've long since verified that this 813fa9e4066Sahrens * is a valid device. 814fa9e4066Sahrens */ 815fa9e4066Sahrens (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 816fa9e4066Sahrens path, strerror(errno)); 817fa9e4066Sahrens return (-1); 818fa9e4066Sahrens } 819fa9e4066Sahrens 820fa9e4066Sahrens 821fa9e4066Sahrens if (efi_alloc_and_init(fd, 9, &vtoc) != 0) { 822fa9e4066Sahrens /* 823fa9e4066Sahrens * The only way this can fail is if we run out of memory, or we 824fa9e4066Sahrens * were unable to read the disk geometry. 825fa9e4066Sahrens */ 826fa9e4066Sahrens if (errno == ENOMEM) 827fa9e4066Sahrens no_memory(); 828fa9e4066Sahrens 829fa9e4066Sahrens (void) fprintf(stderr, gettext("cannot label '%s': unable to " 830fa9e4066Sahrens "read disk geometry\n"), name); 831fa9e4066Sahrens (void) close(fd); 832fa9e4066Sahrens return (-1); 833fa9e4066Sahrens } 834fa9e4066Sahrens 835fa9e4066Sahrens vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba; 836fa9e4066Sahrens vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 - 837fa9e4066Sahrens vtoc->efi_first_u_lba - resv; 838fa9e4066Sahrens 839fa9e4066Sahrens /* 840fa9e4066Sahrens * Why we use V_USR: V_BACKUP confuses users, and is considered 841fa9e4066Sahrens * disposable by some EFI utilities (since EFI doesn't have a backup 842fa9e4066Sahrens * slice). V_UNASSIGNED is supposed to be used only for zero size 843fa9e4066Sahrens * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, 844fa9e4066Sahrens * etc. were all pretty specific. V_USR is as close to reality as we 845fa9e4066Sahrens * can get, in the absence of V_OTHER. 846fa9e4066Sahrens */ 847fa9e4066Sahrens vtoc->efi_parts[0].p_tag = V_USR; 848fa9e4066Sahrens (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); 849fa9e4066Sahrens 850fa9e4066Sahrens vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv; 851fa9e4066Sahrens vtoc->efi_parts[8].p_size = resv; 852fa9e4066Sahrens vtoc->efi_parts[8].p_tag = V_RESERVED; 853fa9e4066Sahrens 854fa9e4066Sahrens if (efi_write(fd, vtoc) != 0) { 855fa9e4066Sahrens /* 856fa9e4066Sahrens * Currently, EFI labels are not supported for IDE disks, and it 857fa9e4066Sahrens * is likely that they will not be supported on other drives for 858fa9e4066Sahrens * some time. Print out a helpful error message directing the 859fa9e4066Sahrens * user to manually label the disk and give a specific slice. 860fa9e4066Sahrens */ 861fa9e4066Sahrens (void) fprintf(stderr, gettext("cannot label '%s': failed to " 862fa9e4066Sahrens "write EFI label\n"), name); 863fa9e4066Sahrens (void) fprintf(stderr, gettext("use fdisk(1M) to partition " 864fa9e4066Sahrens "the disk, and provide a specific slice\n")); 865fa9e4066Sahrens (void) close(fd); 866*99653d4eSeschrock efi_free(vtoc); 867fa9e4066Sahrens return (-1); 868fa9e4066Sahrens } 869fa9e4066Sahrens 870fa9e4066Sahrens (void) close(fd); 871*99653d4eSeschrock efi_free(vtoc); 872fa9e4066Sahrens return (0); 873fa9e4066Sahrens } 874fa9e4066Sahrens 875fa9e4066Sahrens /* 876fa9e4066Sahrens * Go through and find any whole disks in the vdev specification, labelling them 877fa9e4066Sahrens * as appropriate. When constructing the vdev spec, we were unable to open this 878fa9e4066Sahrens * device in order to provide a devid. Now that we have labelled the disk and 879fa9e4066Sahrens * know that slice 0 is valid, we can construct the devid now. 880fa9e4066Sahrens * 881fa9e4066Sahrens * If the disk was already labelled with an EFI label, we will have gotten the 882fa9e4066Sahrens * devid already (because we were able to open the whole disk). Otherwise, we 883fa9e4066Sahrens * need to get the devid after we label the disk. 884fa9e4066Sahrens */ 885fa9e4066Sahrens int 886fa9e4066Sahrens make_disks(nvlist_t *nv) 887fa9e4066Sahrens { 888fa9e4066Sahrens nvlist_t **child; 889fa9e4066Sahrens uint_t c, children; 890fa9e4066Sahrens char *type, *path, *diskname; 891fa9e4066Sahrens char buf[MAXPATHLEN]; 892afefbcddSeschrock uint64_t wholedisk; 893fa9e4066Sahrens int fd; 894fa9e4066Sahrens int ret; 895fa9e4066Sahrens ddi_devid_t devid; 896fa9e4066Sahrens char *minor = NULL, *devid_str = NULL; 897fa9e4066Sahrens 898fa9e4066Sahrens verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 899fa9e4066Sahrens 900fa9e4066Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 901fa9e4066Sahrens &child, &children) != 0) { 902fa9e4066Sahrens 903fa9e4066Sahrens if (strcmp(type, VDEV_TYPE_DISK) != 0) 904fa9e4066Sahrens return (0); 905fa9e4066Sahrens 906fa9e4066Sahrens /* 907fa9e4066Sahrens * We have a disk device. Get the path to the device 908fa9e4066Sahrens * and see if its a whole disk by appending the backup 909fa9e4066Sahrens * slice and stat()ing the device. 910fa9e4066Sahrens */ 911fa9e4066Sahrens verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 912fa9e4066Sahrens 913afefbcddSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 914afefbcddSeschrock &wholedisk) != 0 || !wholedisk) 915fa9e4066Sahrens return (0); 916fa9e4066Sahrens 917fa9e4066Sahrens diskname = strrchr(path, '/'); 918fa9e4066Sahrens assert(diskname != NULL); 919fa9e4066Sahrens diskname++; 920fa9e4066Sahrens if (label_disk(diskname) != 0) 921fa9e4066Sahrens return (-1); 922fa9e4066Sahrens 923fa9e4066Sahrens /* 924fa9e4066Sahrens * Fill in the devid, now that we've labeled the disk. 925fa9e4066Sahrens */ 926fa9e4066Sahrens (void) snprintf(buf, sizeof (buf), "%ss0", path); 927fa9e4066Sahrens if ((fd = open(buf, O_RDONLY)) < 0) { 928fa9e4066Sahrens (void) fprintf(stderr, 929fa9e4066Sahrens gettext("cannot open '%s': %s\n"), 930fa9e4066Sahrens buf, strerror(errno)); 931fa9e4066Sahrens return (-1); 932fa9e4066Sahrens } 933fa9e4066Sahrens 934fa9e4066Sahrens if (devid_get(fd, &devid) == 0) { 935fa9e4066Sahrens if (devid_get_minor_name(fd, &minor) == 0 && 936fa9e4066Sahrens (devid_str = devid_str_encode(devid, minor)) != 937fa9e4066Sahrens NULL) { 938fa9e4066Sahrens verify(nvlist_add_string(nv, 939fa9e4066Sahrens ZPOOL_CONFIG_DEVID, devid_str) == 0); 940fa9e4066Sahrens } 941fa9e4066Sahrens if (devid_str != NULL) 942fa9e4066Sahrens devid_str_free(devid_str); 943fa9e4066Sahrens if (minor != NULL) 944fa9e4066Sahrens devid_str_free(minor); 945fa9e4066Sahrens devid_free(devid); 946fa9e4066Sahrens } 947fa9e4066Sahrens 948afefbcddSeschrock /* 949afefbcddSeschrock * Update the path to refer to the 's0' slice. The presence of 950afefbcddSeschrock * the 'whole_disk' field indicates to the CLI that we should 951afefbcddSeschrock * chop off the slice number when displaying the device in 952afefbcddSeschrock * future output. 953afefbcddSeschrock */ 954afefbcddSeschrock verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); 955afefbcddSeschrock 956fa9e4066Sahrens (void) close(fd); 957fa9e4066Sahrens 958fa9e4066Sahrens return (0); 959fa9e4066Sahrens } 960fa9e4066Sahrens 961fa9e4066Sahrens for (c = 0; c < children; c++) 962fa9e4066Sahrens if ((ret = make_disks(child[c])) != 0) 963fa9e4066Sahrens return (ret); 964fa9e4066Sahrens 965*99653d4eSeschrock if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 966*99653d4eSeschrock &child, &children) == 0) 967*99653d4eSeschrock for (c = 0; c < children; c++) 968*99653d4eSeschrock if ((ret = make_disks(child[c])) != 0) 969*99653d4eSeschrock return (ret); 970*99653d4eSeschrock 971fa9e4066Sahrens return (0); 972fa9e4066Sahrens } 973fa9e4066Sahrens 974*99653d4eSeschrock /* 975*99653d4eSeschrock * Determine if the given path is a hot spare within the given configuration. 976*99653d4eSeschrock */ 977*99653d4eSeschrock static boolean_t 978*99653d4eSeschrock is_spare(nvlist_t *config, const char *path) 979*99653d4eSeschrock { 980*99653d4eSeschrock int fd; 981*99653d4eSeschrock pool_state_t state; 982*99653d4eSeschrock char *name; 983*99653d4eSeschrock nvlist_t *label; 984*99653d4eSeschrock uint64_t guid, spareguid; 985*99653d4eSeschrock nvlist_t *nvroot; 986*99653d4eSeschrock nvlist_t **spares; 987*99653d4eSeschrock uint_t i, nspares; 988*99653d4eSeschrock boolean_t inuse; 989*99653d4eSeschrock 990*99653d4eSeschrock if ((fd = open(path, O_RDONLY)) < 0) 991*99653d4eSeschrock return (B_FALSE); 992*99653d4eSeschrock 993*99653d4eSeschrock if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 994*99653d4eSeschrock !inuse || 995*99653d4eSeschrock state != POOL_STATE_SPARE || 996*99653d4eSeschrock zpool_read_label(fd, &label) != 0) { 997*99653d4eSeschrock (void) close(fd); 998*99653d4eSeschrock return (B_FALSE); 999*99653d4eSeschrock } 1000*99653d4eSeschrock 1001*99653d4eSeschrock (void) close(fd); 1002*99653d4eSeschrock verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1003*99653d4eSeschrock nvlist_free(label); 1004*99653d4eSeschrock 1005*99653d4eSeschrock verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1006*99653d4eSeschrock &nvroot) == 0); 1007*99653d4eSeschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1008*99653d4eSeschrock &spares, &nspares) == 0) { 1009*99653d4eSeschrock for (i = 0; i < nspares; i++) { 1010*99653d4eSeschrock verify(nvlist_lookup_uint64(spares[i], 1011*99653d4eSeschrock ZPOOL_CONFIG_GUID, &spareguid) == 0); 1012*99653d4eSeschrock if (spareguid == guid) 1013*99653d4eSeschrock return (B_TRUE); 1014*99653d4eSeschrock } 1015*99653d4eSeschrock } 1016*99653d4eSeschrock 1017*99653d4eSeschrock return (B_FALSE); 1018*99653d4eSeschrock } 1019*99653d4eSeschrock 1020fa9e4066Sahrens /* 1021fa9e4066Sahrens * Go through and find any devices that are in use. We rely on libdiskmgt for 1022fa9e4066Sahrens * the majority of this task. 1023fa9e4066Sahrens */ 1024fa9e4066Sahrens int 1025*99653d4eSeschrock check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1026*99653d4eSeschrock int isspare) 1027fa9e4066Sahrens { 1028fa9e4066Sahrens nvlist_t **child; 1029fa9e4066Sahrens uint_t c, children; 1030fa9e4066Sahrens char *type, *path; 1031fa9e4066Sahrens int ret; 1032*99653d4eSeschrock char buf[MAXPATHLEN]; 1033*99653d4eSeschrock uint64_t wholedisk; 1034fa9e4066Sahrens 1035fa9e4066Sahrens verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1036fa9e4066Sahrens 1037fa9e4066Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1038fa9e4066Sahrens &child, &children) != 0) { 1039fa9e4066Sahrens 1040fa9e4066Sahrens verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1041fa9e4066Sahrens 1042*99653d4eSeschrock /* 1043*99653d4eSeschrock * As a generic check, we look to see if this is a replace of a 1044*99653d4eSeschrock * hot spare within the same pool. If so, we allow it 1045*99653d4eSeschrock * regardless of what libdiskmgt or zpool_in_use() says. 1046*99653d4eSeschrock */ 1047*99653d4eSeschrock if (isreplacing) { 1048*99653d4eSeschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1049*99653d4eSeschrock &wholedisk) == 0 && wholedisk) 1050*99653d4eSeschrock (void) snprintf(buf, sizeof (buf), "%ss0", 1051*99653d4eSeschrock path); 1052*99653d4eSeschrock else 1053*99653d4eSeschrock (void) strlcpy(buf, path, sizeof (buf)); 1054*99653d4eSeschrock if (is_spare(config, buf)) 1055*99653d4eSeschrock return (0); 1056*99653d4eSeschrock } 1057*99653d4eSeschrock 1058fa9e4066Sahrens if (strcmp(type, VDEV_TYPE_DISK) == 0) 1059*99653d4eSeschrock ret = check_device(path, force, isspare); 1060fa9e4066Sahrens 1061fa9e4066Sahrens if (strcmp(type, VDEV_TYPE_FILE) == 0) 1062*99653d4eSeschrock ret = check_file(path, force, isspare); 1063fa9e4066Sahrens 1064fa9e4066Sahrens return (ret); 1065fa9e4066Sahrens } 1066fa9e4066Sahrens 1067fa9e4066Sahrens for (c = 0; c < children; c++) 1068*99653d4eSeschrock if ((ret = check_in_use(config, child[c], force, 1069*99653d4eSeschrock isreplacing, B_FALSE)) != 0) 1070fa9e4066Sahrens return (ret); 1071fa9e4066Sahrens 1072*99653d4eSeschrock if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1073*99653d4eSeschrock &child, &children) == 0) 1074*99653d4eSeschrock for (c = 0; c < children; c++) 1075*99653d4eSeschrock if ((ret = check_in_use(config, child[c], force, 1076*99653d4eSeschrock isreplacing, B_TRUE)) != 0) 1077*99653d4eSeschrock return (ret); 1078*99653d4eSeschrock 1079fa9e4066Sahrens return (0); 1080fa9e4066Sahrens } 1081fa9e4066Sahrens 1082*99653d4eSeschrock const char * 1083*99653d4eSeschrock is_grouping(const char *type, int *mindev) 1084*99653d4eSeschrock { 1085*99653d4eSeschrock if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { 1086*99653d4eSeschrock if (mindev != NULL) 1087*99653d4eSeschrock *mindev = 2; 1088*99653d4eSeschrock return (VDEV_TYPE_RAIDZ); 1089*99653d4eSeschrock } 1090*99653d4eSeschrock 1091*99653d4eSeschrock if (strcmp(type, "raidz2") == 0) { 1092*99653d4eSeschrock if (mindev != NULL) 1093*99653d4eSeschrock *mindev = 3; 1094*99653d4eSeschrock return (VDEV_TYPE_RAIDZ); 1095*99653d4eSeschrock } 1096*99653d4eSeschrock 1097*99653d4eSeschrock if (strcmp(type, "mirror") == 0) { 1098*99653d4eSeschrock if (mindev != NULL) 1099*99653d4eSeschrock *mindev = 2; 1100*99653d4eSeschrock return (VDEV_TYPE_MIRROR); 1101*99653d4eSeschrock } 1102*99653d4eSeschrock 1103*99653d4eSeschrock if (strcmp(type, "spare") == 0) { 1104*99653d4eSeschrock if (mindev != NULL) 1105*99653d4eSeschrock *mindev = 1; 1106*99653d4eSeschrock return (VDEV_TYPE_SPARE); 1107*99653d4eSeschrock } 1108*99653d4eSeschrock 1109*99653d4eSeschrock return (NULL); 1110*99653d4eSeschrock } 1111*99653d4eSeschrock 1112fa9e4066Sahrens /* 1113fa9e4066Sahrens * Construct a syntactically valid vdev specification, 1114fa9e4066Sahrens * and ensure that all devices and files exist and can be opened. 1115fa9e4066Sahrens * Note: we don't bother freeing anything in the error paths 1116fa9e4066Sahrens * because the program is just going to exit anyway. 1117fa9e4066Sahrens */ 1118fa9e4066Sahrens nvlist_t * 1119fa9e4066Sahrens construct_spec(int argc, char **argv) 1120fa9e4066Sahrens { 1121*99653d4eSeschrock nvlist_t *nvroot, *nv, **top, **spares; 1122*99653d4eSeschrock int t, toplevels, mindev, nspares; 1123*99653d4eSeschrock const char *type; 1124fa9e4066Sahrens 1125fa9e4066Sahrens top = NULL; 1126fa9e4066Sahrens toplevels = 0; 1127*99653d4eSeschrock spares = NULL; 1128*99653d4eSeschrock nspares = 0; 1129fa9e4066Sahrens 1130fa9e4066Sahrens while (argc > 0) { 1131fa9e4066Sahrens nv = NULL; 1132fa9e4066Sahrens 1133fa9e4066Sahrens /* 1134fa9e4066Sahrens * If it's a mirror or raidz, the subsequent arguments are 1135fa9e4066Sahrens * its leaves -- until we encounter the next mirror or raidz. 1136fa9e4066Sahrens */ 1137*99653d4eSeschrock if ((type = is_grouping(argv[0], &mindev)) != NULL) { 1138fa9e4066Sahrens nvlist_t **child = NULL; 1139*99653d4eSeschrock int c, children = 0; 1140*99653d4eSeschrock 1141*99653d4eSeschrock if (strcmp(type, VDEV_TYPE_SPARE) == 0 && 1142*99653d4eSeschrock spares != NULL) { 1143*99653d4eSeschrock (void) fprintf(stderr, gettext("invalid vdev " 1144*99653d4eSeschrock "specification: 'spare' can be " 1145*99653d4eSeschrock "specified only once\n")); 1146*99653d4eSeschrock return (NULL); 1147*99653d4eSeschrock } 1148fa9e4066Sahrens 1149fa9e4066Sahrens for (c = 1; c < argc; c++) { 1150*99653d4eSeschrock if (is_grouping(argv[c], NULL) != NULL) 1151fa9e4066Sahrens break; 1152fa9e4066Sahrens children++; 1153fa9e4066Sahrens child = realloc(child, 1154fa9e4066Sahrens children * sizeof (nvlist_t *)); 1155fa9e4066Sahrens if (child == NULL) 1156fa9e4066Sahrens no_memory(); 1157fa9e4066Sahrens if ((nv = make_leaf_vdev(argv[c])) == NULL) 1158fa9e4066Sahrens return (NULL); 1159fa9e4066Sahrens child[children - 1] = nv; 1160fa9e4066Sahrens } 1161fa9e4066Sahrens 1162*99653d4eSeschrock if (children < mindev) { 1163*99653d4eSeschrock (void) fprintf(stderr, gettext("invalid vdev " 1164*99653d4eSeschrock "specification: %s requires at least %d " 1165*99653d4eSeschrock "devices\n"), argv[0], mindev); 1166fa9e4066Sahrens return (NULL); 1167fa9e4066Sahrens } 1168fa9e4066Sahrens 1169*99653d4eSeschrock argc -= c; 1170*99653d4eSeschrock argv += c; 1171*99653d4eSeschrock 1172*99653d4eSeschrock if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1173*99653d4eSeschrock spares = child; 1174*99653d4eSeschrock nspares = children; 1175*99653d4eSeschrock continue; 1176*99653d4eSeschrock } else { 1177*99653d4eSeschrock verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1178*99653d4eSeschrock 0) == 0); 1179*99653d4eSeschrock verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1180*99653d4eSeschrock type) == 0); 1181*99653d4eSeschrock if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1182*99653d4eSeschrock verify(nvlist_add_uint64(nv, 1183*99653d4eSeschrock ZPOOL_CONFIG_NPARITY, 1184*99653d4eSeschrock mindev - 1) == 0); 1185*99653d4eSeschrock } 1186*99653d4eSeschrock verify(nvlist_add_nvlist_array(nv, 1187*99653d4eSeschrock ZPOOL_CONFIG_CHILDREN, child, 1188*99653d4eSeschrock children) == 0); 1189fa9e4066Sahrens 1190*99653d4eSeschrock for (c = 0; c < children; c++) 1191*99653d4eSeschrock nvlist_free(child[c]); 1192*99653d4eSeschrock free(child); 1193*99653d4eSeschrock } 1194fa9e4066Sahrens } else { 1195fa9e4066Sahrens /* 1196fa9e4066Sahrens * We have a device. Pass off to make_leaf_vdev() to 1197fa9e4066Sahrens * construct the appropriate nvlist describing the vdev. 1198fa9e4066Sahrens */ 1199fa9e4066Sahrens if ((nv = make_leaf_vdev(argv[0])) == NULL) 1200fa9e4066Sahrens return (NULL); 1201fa9e4066Sahrens argc--; 1202fa9e4066Sahrens argv++; 1203fa9e4066Sahrens } 1204fa9e4066Sahrens 1205fa9e4066Sahrens toplevels++; 1206fa9e4066Sahrens top = realloc(top, toplevels * sizeof (nvlist_t *)); 1207fa9e4066Sahrens if (top == NULL) 1208fa9e4066Sahrens no_memory(); 1209fa9e4066Sahrens top[toplevels - 1] = nv; 1210fa9e4066Sahrens } 1211fa9e4066Sahrens 1212*99653d4eSeschrock if (toplevels == 0 && nspares == 0) { 1213*99653d4eSeschrock (void) fprintf(stderr, gettext("invalid vdev " 1214*99653d4eSeschrock "specification: at least one toplevel vdev must be " 1215*99653d4eSeschrock "specified\n")); 1216*99653d4eSeschrock return (NULL); 1217*99653d4eSeschrock } 1218*99653d4eSeschrock 1219fa9e4066Sahrens /* 1220fa9e4066Sahrens * Finally, create nvroot and add all top-level vdevs to it. 1221fa9e4066Sahrens */ 1222fa9e4066Sahrens verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1223fa9e4066Sahrens verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1224fa9e4066Sahrens VDEV_TYPE_ROOT) == 0); 1225fa9e4066Sahrens verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1226fa9e4066Sahrens top, toplevels) == 0); 1227*99653d4eSeschrock if (nspares != 0) 1228*99653d4eSeschrock verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1229*99653d4eSeschrock spares, nspares) == 0); 1230fa9e4066Sahrens 1231fa9e4066Sahrens for (t = 0; t < toplevels; t++) 1232fa9e4066Sahrens nvlist_free(top[t]); 1233*99653d4eSeschrock for (t = 0; t < nspares; t++) 1234*99653d4eSeschrock nvlist_free(spares[t]); 1235*99653d4eSeschrock if (spares) 1236*99653d4eSeschrock free(spares); 1237fa9e4066Sahrens free(top); 1238fa9e4066Sahrens 1239fa9e4066Sahrens return (nvroot); 1240fa9e4066Sahrens } 1241fa9e4066Sahrens 1242fa9e4066Sahrens /* 1243fa9e4066Sahrens * Get and validate the contents of the given vdev specification. This ensures 1244fa9e4066Sahrens * that the nvlist returned is well-formed, that all the devices exist, and that 1245fa9e4066Sahrens * they are not currently in use by any other known consumer. The 'poolconfig' 1246fa9e4066Sahrens * parameter is the current configuration of the pool when adding devices 1247fa9e4066Sahrens * existing pool, and is used to perform additional checks, such as changing the 1248fa9e4066Sahrens * replication level of the pool. It can be 'NULL' to indicate that this is a 1249fa9e4066Sahrens * new pool. The 'force' flag controls whether devices should be forcefully 1250fa9e4066Sahrens * added, even if they appear in use. 1251fa9e4066Sahrens */ 1252fa9e4066Sahrens nvlist_t * 1253fa9e4066Sahrens make_root_vdev(nvlist_t *poolconfig, int force, int check_rep, 1254*99653d4eSeschrock boolean_t isreplacing, int argc, char **argv) 1255fa9e4066Sahrens { 1256fa9e4066Sahrens nvlist_t *newroot; 1257fa9e4066Sahrens 1258fa9e4066Sahrens is_force = force; 1259fa9e4066Sahrens 1260fa9e4066Sahrens /* 1261fa9e4066Sahrens * Construct the vdev specification. If this is successful, we know 1262fa9e4066Sahrens * that we have a valid specification, and that all devices can be 1263fa9e4066Sahrens * opened. 1264fa9e4066Sahrens */ 1265fa9e4066Sahrens if ((newroot = construct_spec(argc, argv)) == NULL) 1266fa9e4066Sahrens return (NULL); 1267fa9e4066Sahrens 1268fa9e4066Sahrens /* 1269fa9e4066Sahrens * Validate each device to make sure that its not shared with another 1270fa9e4066Sahrens * subsystem. We do this even if 'force' is set, because there are some 1271fa9e4066Sahrens * uses (such as a dedicated dump device) that even '-f' cannot 1272fa9e4066Sahrens * override. 1273fa9e4066Sahrens */ 1274*99653d4eSeschrock if (check_in_use(poolconfig, newroot, force, isreplacing, 1275*99653d4eSeschrock B_FALSE) != 0) { 1276fa9e4066Sahrens nvlist_free(newroot); 1277fa9e4066Sahrens return (NULL); 1278fa9e4066Sahrens } 1279fa9e4066Sahrens 1280fa9e4066Sahrens /* 1281fa9e4066Sahrens * Check the replication level of the given vdevs and report any errors 1282fa9e4066Sahrens * found. We include the existing pool spec, if any, as we need to 1283fa9e4066Sahrens * catch changes against the existing replication level. 1284fa9e4066Sahrens */ 1285fa9e4066Sahrens if (check_rep && check_replication(poolconfig, newroot) != 0) { 1286fa9e4066Sahrens nvlist_free(newroot); 1287fa9e4066Sahrens return (NULL); 1288fa9e4066Sahrens } 1289fa9e4066Sahrens 1290fa9e4066Sahrens /* 1291fa9e4066Sahrens * Run through the vdev specification and label any whole disks found. 1292fa9e4066Sahrens */ 1293fa9e4066Sahrens if (make_disks(newroot) != 0) { 1294fa9e4066Sahrens nvlist_free(newroot); 1295fa9e4066Sahrens return (NULL); 1296fa9e4066Sahrens } 1297fa9e4066Sahrens 1298fa9e4066Sahrens return (newroot); 1299fa9e4066Sahrens } 1300