/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Functions to convert between a list of vdevs and an nvlist representing the
 * configuration. Each entry in the list can be one of:
 *
 *	Device vdevs
 *		disk=(path=..., devid=...)
 *		file=(path=...)
 *
 *	Group vdevs
 *		raidz[1|2]=(...)
 *		mirror=(...)
 *
 *	Hot spares
 *
 * While the underlying implementation supports it, group vdevs cannot contain
 * other group vdevs. All userland verification of devices is contained within
 * this file. If successful, the nvlist returned can be passed directly to the
 * kernel; we've done as much verification as possible in userland.
 *
 * Hot spares are a special case, and passed down as an array of disk vdevs, at
 * the same level as the root of the vdev tree.
 *
 * The only function exported by this file is 'make_root_vdev'. The
 * function performs several passes:
 *
 *	1. Construct the vdev specification.
Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Call libzfs to label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <devid.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libdiskmgt.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <stdio.h> 73 #include <string.h> 74 #include <unistd.h> 75 #include <sys/efi_partition.h> 76 #include <sys/stat.h> 77 #include <sys/vtoc.h> 78 #include <sys/mntent.h> 79 80 #include "zpool_util.h" 81 82 #define DISK_ROOT "/dev/dsk" 83 #define RDISK_ROOT "/dev/rdsk" 84 #define BACKUP_SLICE "s2" 85 86 /* 87 * For any given vdev specification, we can have multiple errors. The 88 * vdev_error() function keeps track of whether we have seen an error yet, and 89 * prints out a header if its the first error we've seen. 90 */ 91 boolean_t error_seen; 92 boolean_t is_force; 93 94 /*PRINTFLIKE1*/ 95 static void 96 vdev_error(const char *fmt, ...) 97 { 98 va_list ap; 99 100 if (!error_seen) { 101 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 102 if (!is_force) 103 (void) fprintf(stderr, gettext("use '-f' to override " 104 "the following errors:\n")); 105 else 106 (void) fprintf(stderr, gettext("the following errors " 107 "must be manually repaired:\n")); 108 error_seen = B_TRUE; 109 } 110 111 va_start(ap, fmt); 112 (void) vfprintf(stderr, fmt, ap); 113 va_end(ap); 114 } 115 116 static void 117 libdiskmgt_error(int error) 118 { 119 /* 120 * ENXIO/ENODEV is a valid error message if the device doesn't live in 121 * /dev/dsk. Don't bother printing an error message in this case. 
122 */ 123 if (error == ENXIO || error == ENODEV) 124 return; 125 126 (void) fprintf(stderr, gettext("warning: device in use checking " 127 "failed: %s\n"), strerror(error)); 128 } 129 130 /* 131 * Validate a device, passing the bulk of the work off to libdiskmgt. 132 */ 133 static int 134 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 135 { 136 char *msg; 137 int error = 0; 138 139 if (dm_inuse((char *)path, &msg, isspare ? DM_WHO_ZPOOL_SPARE : 140 (force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL), &error) || error) { 141 if (error != 0) { 142 libdiskmgt_error(error); 143 return (0); 144 } else { 145 vdev_error("%s", msg); 146 free(msg); 147 return (-1); 148 } 149 } 150 151 /* 152 * If we're given a whole disk, ignore overlapping slices since we're 153 * about to label it anyway. 154 */ 155 error = 0; 156 if (!wholedisk && !force && 157 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 158 if (error == 0) { 159 /* dm_isoverlapping returned -1 */ 160 vdev_error(gettext("%s overlaps with %s\n"), path, msg); 161 free(msg); 162 return (-1); 163 } else if (error != ENODEV) { 164 /* libdiskmgt's devcache only handles physical drives */ 165 libdiskmgt_error(error); 166 return (0); 167 } 168 } 169 170 return (0); 171 } 172 173 174 /* 175 * Validate a whole disk. Iterate over all slices on the disk and make sure 176 * that none is in use by calling check_slice(). 177 */ 178 static int 179 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 180 { 181 dm_descriptor_t *drive, *media, *slice; 182 int err = 0; 183 int i; 184 int ret; 185 186 /* 187 * Get the drive associated with this disk. This should never fail, 188 * because we already have an alias handle open for the device. 
189 */ 190 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 191 &err)) == NULL || *drive == NULL) { 192 if (err) 193 libdiskmgt_error(err); 194 return (0); 195 } 196 197 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 198 &err)) == NULL) { 199 dm_free_descriptors(drive); 200 if (err) 201 libdiskmgt_error(err); 202 return (0); 203 } 204 205 dm_free_descriptors(drive); 206 207 /* 208 * It is possible that the user has specified a removable media drive, 209 * and the media is not present. 210 */ 211 if (*media == NULL) { 212 dm_free_descriptors(media); 213 vdev_error(gettext("'%s' has no media in drive\n"), name); 214 return (-1); 215 } 216 217 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 218 &err)) == NULL) { 219 dm_free_descriptors(media); 220 if (err) 221 libdiskmgt_error(err); 222 return (0); 223 } 224 225 dm_free_descriptors(media); 226 227 ret = 0; 228 229 /* 230 * Iterate over all slices and report any errors. We don't care about 231 * overlapping slices because we are using the whole disk. 232 */ 233 for (i = 0; slice[i] != NULL; i++) { 234 char *name = dm_get_name(slice[i], &err); 235 236 if (check_slice(name, force, B_TRUE, isspare) != 0) 237 ret = -1; 238 239 dm_free_name(name); 240 } 241 242 dm_free_descriptors(slice); 243 return (ret); 244 } 245 246 /* 247 * Validate a device. 248 */ 249 static int 250 check_device(const char *path, boolean_t force, boolean_t isspare) 251 { 252 dm_descriptor_t desc; 253 int err; 254 char *dev; 255 256 /* 257 * For whole disks, libdiskmgt does not include the leading dev path. 258 */ 259 dev = strrchr(path, '/'); 260 assert(dev != NULL); 261 dev++; 262 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 263 err = check_disk(path, desc, force, isspare); 264 dm_free_descriptor(desc); 265 return (err); 266 } 267 268 return (check_slice(path, force, B_FALSE, isspare)); 269 } 270 271 /* 272 * Check that a file is valid. 
All we can do in this case is check that it's 273 * not in use by another pool, and not in use by swap. 274 */ 275 static int 276 check_file(const char *file, boolean_t force, boolean_t isspare) 277 { 278 char *name; 279 int fd; 280 int ret = 0; 281 int err; 282 pool_state_t state; 283 boolean_t inuse; 284 285 if (dm_inuse_swap(file, &err)) { 286 if (err) 287 libdiskmgt_error(err); 288 else 289 vdev_error(gettext("%s is currently used by swap. " 290 "Please see swap(1M).\n"), file); 291 return (-1); 292 } 293 294 if ((fd = open(file, O_RDONLY)) < 0) 295 return (0); 296 297 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 298 const char *desc; 299 300 switch (state) { 301 case POOL_STATE_ACTIVE: 302 desc = gettext("active"); 303 break; 304 305 case POOL_STATE_EXPORTED: 306 desc = gettext("exported"); 307 break; 308 309 case POOL_STATE_POTENTIALLY_ACTIVE: 310 desc = gettext("potentially active"); 311 break; 312 313 default: 314 desc = gettext("unknown"); 315 break; 316 } 317 318 /* 319 * Allow hot spares to be shared between pools. 320 */ 321 if (state == POOL_STATE_SPARE && isspare) 322 return (0); 323 324 if (state == POOL_STATE_ACTIVE || 325 state == POOL_STATE_SPARE || !force) { 326 switch (state) { 327 case POOL_STATE_SPARE: 328 vdev_error(gettext("%s is reserved as a hot " 329 "spare for pool %s\n"), file, name); 330 break; 331 default: 332 vdev_error(gettext("%s is part of %s pool " 333 "'%s'\n"), file, desc, name); 334 break; 335 } 336 ret = -1; 337 } 338 339 free(name); 340 } 341 342 (void) close(fd); 343 return (ret); 344 } 345 346 347 /* 348 * By "whole disk" we mean an entire physical disk (something we can 349 * label, toggle the write cache on, etc.) as opposed to the full 350 * capacity of a pseudo-device such as lofi or did. We act as if we 351 * are labeling the disk, which should be a pretty good test of whether 352 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 353 * it isn't. 
354 */ 355 static boolean_t 356 is_whole_disk(const char *arg) 357 { 358 struct dk_gpt *label; 359 int fd; 360 char path[MAXPATHLEN]; 361 362 (void) snprintf(path, sizeof (path), "%s%s%s", 363 RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 364 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 365 return (B_FALSE); 366 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 367 (void) close(fd); 368 return (B_FALSE); 369 } 370 efi_free(label); 371 (void) close(fd); 372 return (B_TRUE); 373 } 374 375 /* 376 * Create a leaf vdev. Determine if this is a file or a device. If it's a 377 * device, fill in the device id to make a complete nvlist. Valid forms for a 378 * leaf vdev are: 379 * 380 * /dev/dsk/xxx Complete disk path 381 * /xxx Full path to file 382 * xxx Shorthand for /dev/dsk/xxx 383 */ 384 static nvlist_t * 385 make_leaf_vdev(const char *arg, uint64_t is_log) 386 { 387 char path[MAXPATHLEN]; 388 struct stat64 statbuf; 389 nvlist_t *vdev = NULL; 390 char *type = NULL; 391 boolean_t wholedisk = B_FALSE; 392 393 /* 394 * Determine what type of vdev this is, and put the full path into 395 * 'path'. We detect whether this is a device of file afterwards by 396 * checking the st_mode of the file. 397 */ 398 if (arg[0] == '/') { 399 /* 400 * Complete device or file path. Exact type is determined by 401 * examining the file descriptor afterwards. 402 */ 403 wholedisk = is_whole_disk(arg); 404 if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 405 (void) fprintf(stderr, 406 gettext("cannot open '%s': %s\n"), 407 arg, strerror(errno)); 408 return (NULL); 409 } 410 411 (void) strlcpy(path, arg, sizeof (path)); 412 } else { 413 /* 414 * This may be a short path for a device, or it could be total 415 * gibberish. Check to see if it's a known device in 416 * /dev/dsk/. As part of this check, see if we've been given a 417 * an entire disk (minus the slice number). 
418 */ 419 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 420 arg); 421 wholedisk = is_whole_disk(path); 422 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 423 /* 424 * If we got ENOENT, then the user gave us 425 * gibberish, so try to direct them with a 426 * reasonable error message. Otherwise, 427 * regurgitate strerror() since it's the best we 428 * can do. 429 */ 430 if (errno == ENOENT) { 431 (void) fprintf(stderr, 432 gettext("cannot open '%s': no such " 433 "device in %s\n"), arg, DISK_ROOT); 434 (void) fprintf(stderr, 435 gettext("must be a full path or " 436 "shorthand device name\n")); 437 return (NULL); 438 } else { 439 (void) fprintf(stderr, 440 gettext("cannot open '%s': %s\n"), 441 path, strerror(errno)); 442 return (NULL); 443 } 444 } 445 } 446 447 /* 448 * Determine whether this is a device or a file. 449 */ 450 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 451 type = VDEV_TYPE_DISK; 452 } else if (S_ISREG(statbuf.st_mode)) { 453 type = VDEV_TYPE_FILE; 454 } else { 455 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 456 "block device or regular file\n"), path); 457 return (NULL); 458 } 459 460 /* 461 * Finally, we have the complete device or file, and we know that it is 462 * acceptable to use. Construct the nvlist to describe this vdev. All 463 * vdevs have a 'path' element, and devices also have a 'devid' element. 464 */ 465 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 466 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 467 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 468 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 469 if (strcmp(type, VDEV_TYPE_DISK) == 0) 470 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 471 (uint64_t)wholedisk) == 0); 472 473 /* 474 * For a whole disk, defer getting its devid until after labeling it. 475 */ 476 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 477 /* 478 * Get the devid for the device. 
479 */ 480 int fd; 481 ddi_devid_t devid; 482 char *minor = NULL, *devid_str = NULL; 483 484 if ((fd = open(path, O_RDONLY)) < 0) { 485 (void) fprintf(stderr, gettext("cannot open '%s': " 486 "%s\n"), path, strerror(errno)); 487 nvlist_free(vdev); 488 return (NULL); 489 } 490 491 if (devid_get(fd, &devid) == 0) { 492 if (devid_get_minor_name(fd, &minor) == 0 && 493 (devid_str = devid_str_encode(devid, minor)) != 494 NULL) { 495 verify(nvlist_add_string(vdev, 496 ZPOOL_CONFIG_DEVID, devid_str) == 0); 497 } 498 if (devid_str != NULL) 499 devid_str_free(devid_str); 500 if (minor != NULL) 501 devid_str_free(minor); 502 devid_free(devid); 503 } 504 505 (void) close(fd); 506 } 507 508 return (vdev); 509 } 510 511 /* 512 * Go through and verify the replication level of the pool is consistent. 513 * Performs the following checks: 514 * 515 * For the new spec, verifies that devices in mirrors and raidz are the 516 * same size. 517 * 518 * If the current configuration already has inconsistent replication 519 * levels, ignore any other potential problems in the new spec. 520 * 521 * Otherwise, make sure that the current spec (if there is one) and the new 522 * spec have consistent replication levels. 523 */ 524 typedef struct replication_level { 525 char *zprl_type; 526 uint64_t zprl_children; 527 uint64_t zprl_parity; 528 } replication_level_t; 529 530 #define ZPOOL_FUZZ (16 * 1024 * 1024) 531 532 /* 533 * Given a list of toplevel vdevs, return the current replication level. If 534 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 535 * an error message will be displayed for each self-inconsistent vdev. 
536 */ 537 static replication_level_t * 538 get_replication(nvlist_t *nvroot, boolean_t fatal) 539 { 540 nvlist_t **top; 541 uint_t t, toplevels; 542 nvlist_t **child; 543 uint_t c, children; 544 nvlist_t *nv; 545 char *type; 546 replication_level_t lastrep, rep, *ret; 547 boolean_t dontreport; 548 uint64_t is_log; 549 550 ret = safe_malloc(sizeof (replication_level_t)); 551 552 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 553 &top, &toplevels) == 0); 554 555 lastrep.zprl_type = NULL; 556 for (t = 0; t < toplevels; t++) { 557 uint64_t is_log = B_FALSE; 558 559 nv = top[t]; 560 561 /* 562 * For separate logs we ignore the top level vdev replication 563 * constraints. 564 */ 565 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 566 if (is_log) 567 continue; 568 569 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, 570 &type) == 0); 571 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 572 &child, &children) != 0) { 573 /* 574 * This is a 'file' or 'disk' vdev. 575 */ 576 rep.zprl_type = type; 577 rep.zprl_children = 1; 578 rep.zprl_parity = 0; 579 } else { 580 uint64_t vdev_size; 581 582 /* 583 * This is a mirror or RAID-Z vdev. Go through and make 584 * sure the contents are all the same (files vs. disks), 585 * keeping track of the number of elements in the 586 * process. 587 * 588 * We also check that the size of each vdev (if it can 589 * be determined) is the same. 590 */ 591 rep.zprl_type = type; 592 rep.zprl_children = 0; 593 594 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 595 verify(nvlist_lookup_uint64(nv, 596 ZPOOL_CONFIG_NPARITY, 597 &rep.zprl_parity) == 0); 598 assert(rep.zprl_parity != 0); 599 } else { 600 rep.zprl_parity = 0; 601 } 602 603 /* 604 * The 'dontreport' variable indicates that we've 605 * already reported an error for this spec, so don't 606 * bother doing it again. 
607 */ 608 type = NULL; 609 dontreport = 0; 610 vdev_size = -1ULL; 611 for (c = 0; c < children; c++) { 612 nvlist_t *cnv = child[c]; 613 char *path; 614 struct stat64 statbuf; 615 uint64_t size = -1ULL; 616 char *childtype; 617 int fd, err; 618 619 rep.zprl_children++; 620 621 verify(nvlist_lookup_string(cnv, 622 ZPOOL_CONFIG_TYPE, &childtype) == 0); 623 624 /* 625 * If this is a replacing or spare vdev, then 626 * get the real first child of the vdev. 627 */ 628 if (strcmp(childtype, 629 VDEV_TYPE_REPLACING) == 0 || 630 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 631 nvlist_t **rchild; 632 uint_t rchildren; 633 634 verify(nvlist_lookup_nvlist_array(cnv, 635 ZPOOL_CONFIG_CHILDREN, &rchild, 636 &rchildren) == 0); 637 assert(rchildren == 2); 638 cnv = rchild[0]; 639 640 verify(nvlist_lookup_string(cnv, 641 ZPOOL_CONFIG_TYPE, 642 &childtype) == 0); 643 } 644 645 verify(nvlist_lookup_string(cnv, 646 ZPOOL_CONFIG_PATH, &path) == 0); 647 648 /* 649 * If we have a raidz/mirror that combines disks 650 * with files, report it as an error. 651 */ 652 if (!dontreport && type != NULL && 653 strcmp(type, childtype) != 0) { 654 if (ret != NULL) 655 free(ret); 656 ret = NULL; 657 if (fatal) 658 vdev_error(gettext( 659 "mismatched replication " 660 "level: %s contains both " 661 "files and devices\n"), 662 rep.zprl_type); 663 else 664 return (NULL); 665 dontreport = B_TRUE; 666 } 667 668 /* 669 * According to stat(2), the value of 'st_size' 670 * is undefined for block devices and character 671 * devices. But there is no effective way to 672 * determine the real size in userland. 673 * 674 * Instead, we'll take advantage of an 675 * implementation detail of spec_size(). If the 676 * device is currently open, then we (should) 677 * return a valid size. 678 * 679 * If we still don't get a valid size (indicated 680 * by a size of 0 or MAXOFFSET_T), then ignore 681 * this device altogether. 
682 */ 683 if ((fd = open(path, O_RDONLY)) >= 0) { 684 err = fstat64(fd, &statbuf); 685 (void) close(fd); 686 } else { 687 err = stat64(path, &statbuf); 688 } 689 690 if (err != 0 || 691 statbuf.st_size == 0 || 692 statbuf.st_size == MAXOFFSET_T) 693 continue; 694 695 size = statbuf.st_size; 696 697 /* 698 * Also make sure that devices and 699 * slices have a consistent size. If 700 * they differ by a significant amount 701 * (~16MB) then report an error. 702 */ 703 if (!dontreport && 704 (vdev_size != -1ULL && 705 (labs(size - vdev_size) > 706 ZPOOL_FUZZ))) { 707 if (ret != NULL) 708 free(ret); 709 ret = NULL; 710 if (fatal) 711 vdev_error(gettext( 712 "%s contains devices of " 713 "different sizes\n"), 714 rep.zprl_type); 715 else 716 return (NULL); 717 dontreport = B_TRUE; 718 } 719 720 type = childtype; 721 vdev_size = size; 722 } 723 } 724 725 /* 726 * At this point, we have the replication of the last toplevel 727 * vdev in 'rep'. Compare it to 'lastrep' to see if its 728 * different. 
729 */ 730 if (lastrep.zprl_type != NULL) { 731 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 732 if (ret != NULL) 733 free(ret); 734 ret = NULL; 735 if (fatal) 736 vdev_error(gettext( 737 "mismatched replication level: " 738 "both %s and %s vdevs are " 739 "present\n"), 740 lastrep.zprl_type, rep.zprl_type); 741 else 742 return (NULL); 743 } else if (lastrep.zprl_parity != rep.zprl_parity) { 744 if (ret) 745 free(ret); 746 ret = NULL; 747 if (fatal) 748 vdev_error(gettext( 749 "mismatched replication level: " 750 "both %llu and %llu device parity " 751 "%s vdevs are present\n"), 752 lastrep.zprl_parity, 753 rep.zprl_parity, 754 rep.zprl_type); 755 else 756 return (NULL); 757 } else if (lastrep.zprl_children != rep.zprl_children) { 758 if (ret) 759 free(ret); 760 ret = NULL; 761 if (fatal) 762 vdev_error(gettext( 763 "mismatched replication level: " 764 "both %llu-way and %llu-way %s " 765 "vdevs are present\n"), 766 lastrep.zprl_children, 767 rep.zprl_children, 768 rep.zprl_type); 769 else 770 return (NULL); 771 } 772 } 773 lastrep = rep; 774 } 775 776 if (ret != NULL) 777 *ret = rep; 778 779 return (ret); 780 } 781 782 /* 783 * Check the replication level of the vdev spec against the current pool. Calls 784 * get_replication() to make sure the new spec is self-consistent. If the pool 785 * has a consistent replication level, then we ignore any errors. Otherwise, 786 * report any difference between the two. 787 */ 788 static int 789 check_replication(nvlist_t *config, nvlist_t *newroot) 790 { 791 nvlist_t **child; 792 uint_t children; 793 replication_level_t *current = NULL, *new; 794 int ret; 795 796 /* 797 * If we have a current pool configuration, check to see if it's 798 * self-consistent. If not, simply return success. 
799 */ 800 if (config != NULL) { 801 nvlist_t *nvroot; 802 803 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 804 &nvroot) == 0); 805 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 806 return (0); 807 } 808 /* 809 * for spares there may be no children, and therefore no 810 * replication level to check 811 */ 812 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 813 &child, &children) != 0) || (children == 0)) { 814 free(current); 815 return (0); 816 } 817 818 /* 819 * If all we have is logs then there's no replication level to check. 820 */ 821 if (num_logs(newroot) == children) { 822 free(current); 823 return (0); 824 } 825 826 /* 827 * Get the replication level of the new vdev spec, reporting any 828 * inconsistencies found. 829 */ 830 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 831 free(current); 832 return (-1); 833 } 834 835 /* 836 * Check to see if the new vdev spec matches the replication level of 837 * the current pool. 838 */ 839 ret = 0; 840 if (current != NULL) { 841 if (strcmp(current->zprl_type, new->zprl_type) != 0) { 842 vdev_error(gettext( 843 "mismatched replication level: pool uses %s " 844 "and new vdev is %s\n"), 845 current->zprl_type, new->zprl_type); 846 ret = -1; 847 } else if (current->zprl_parity != new->zprl_parity) { 848 vdev_error(gettext( 849 "mismatched replication level: pool uses %llu " 850 "device parity and new vdev uses %llu\n"), 851 current->zprl_parity, new->zprl_parity); 852 ret = -1; 853 } else if (current->zprl_children != new->zprl_children) { 854 vdev_error(gettext( 855 "mismatched replication level: pool uses %llu-way " 856 "%s and new vdev uses %llu-way %s\n"), 857 current->zprl_children, current->zprl_type, 858 new->zprl_children, new->zprl_type); 859 ret = -1; 860 } 861 } 862 863 free(new); 864 if (current != NULL) 865 free(current); 866 867 return (ret); 868 } 869 870 /* 871 * Go through and find any whole disks in the vdev specification, labelling them 872 * as 
appropriate. When constructing the vdev spec, we were unable to open this
 * device in order to provide a devid. Now that we have labelled the disk and
 * know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labeled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk). Otherwise, we
 * need to get the devid after we label the disk.
 *
 * Recurses over ZPOOL_CONFIG_CHILDREN and then ZPOOL_CONFIG_SPARES of 'nv'.
 * Returns 0 on success, or the first non-zero result encountered.
 */
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv)
{
	nvlist_t **kids;
	uint_t i, nkids;
	char *vtype, *devpath, *base;
	char slicepath[MAXPATHLEN];
	uint64_t whole;
	int slicefd;
	int rc;
	ddi_devid_t id;
	char *minor_name = NULL, *id_str = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vtype) == 0);

	/*
	 * Interior vdev: descend into the children first, then into any
	 * spares hanging off this node.
	 */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &kids, &nkids) == 0) {
		for (i = 0; i < nkids; i++) {
			rc = make_disks(zhp, kids[i]);
			if (rc != 0)
				return (rc);
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
		    &kids, &nkids) == 0) {
			for (i = 0; i < nkids; i++) {
				rc = make_disks(zhp, kids[i]);
				if (rc != 0)
					return (rc);
			}
		}

		return (0);
	}

	/*
	 * Leaf vdev. Only disks that were flagged as whole disks when the
	 * spec was built need any work here.
	 */
	if (strcmp(vtype, VDEV_TYPE_DISK) != 0)
		return (0);

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &devpath) == 0);
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &whole) != 0)
		return (0);
	if (!whole)
		return (0);

	/* The labelling routine wants the bare disk name, not the full path. */
	base = strrchr(devpath, '/');
	assert(base != NULL);
	base++;
	if (zpool_label_disk(g_zfs, zhp, base) == -1)
		return (-1);

	/*
	 * Fill in the devid, now that we've labeled the disk.
	 */
	(void) snprintf(slicepath, sizeof (slicepath), "%ss0", devpath);
	slicefd = open(slicepath, O_RDONLY);
	if (slicefd < 0) {
		(void) fprintf(stderr,
		    gettext("cannot open '%s': %s\n"),
		    slicepath, strerror(errno));
		return (-1);
	}

	if (devid_get(slicefd, &id) == 0) {
		if (devid_get_minor_name(slicefd, &minor_name) == 0) {
			id_str = devid_str_encode(id, minor_name);
			if (id_str != NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, id_str) == 0);
			}
		}
		if (id_str != NULL)
			devid_str_free(id_str);
		if (minor_name != NULL)
			devid_str_free(minor_name);
		devid_free(id);
	}

	/*
	 * Update the path to refer to the 's0' slice. The presence of
	 * the 'whole_disk' field indicates to the CLI that we should
	 * chop off the slice number when displaying the device in
	 * future output.
	 */
	verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, slicepath) == 0);

	(void) close(slicefd);

	return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
970 */ 971 static boolean_t 972 is_spare(nvlist_t *config, const char *path) 973 { 974 int fd; 975 pool_state_t state; 976 char *name = NULL; 977 nvlist_t *label; 978 uint64_t guid, spareguid; 979 nvlist_t *nvroot; 980 nvlist_t **spares; 981 uint_t i, nspares; 982 boolean_t inuse; 983 984 if ((fd = open(path, O_RDONLY)) < 0) 985 return (B_FALSE); 986 987 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 988 !inuse || 989 state != POOL_STATE_SPARE || 990 zpool_read_label(fd, &label) != 0) { 991 free(name); 992 (void) close(fd); 993 return (B_FALSE); 994 } 995 free(name); 996 997 (void) close(fd); 998 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 999 nvlist_free(label); 1000 1001 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1002 &nvroot) == 0); 1003 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1004 &spares, &nspares) == 0) { 1005 for (i = 0; i < nspares; i++) { 1006 verify(nvlist_lookup_uint64(spares[i], 1007 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1008 if (spareguid == guid) 1009 return (B_TRUE); 1010 } 1011 } 1012 1013 return (B_FALSE); 1014 } 1015 1016 /* 1017 * Go through and find any devices that are in use. We rely on libdiskmgt for 1018 * the majority of this task. 1019 */ 1020 static int 1021 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1022 int isspare) 1023 { 1024 nvlist_t **child; 1025 uint_t c, children; 1026 char *type, *path; 1027 int ret; 1028 char buf[MAXPATHLEN]; 1029 uint64_t wholedisk; 1030 1031 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1032 1033 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1034 &child, &children) != 0) { 1035 1036 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1037 1038 /* 1039 * As a generic check, we look to see if this is a replace of a 1040 * hot spare within the same pool. If so, we allow it 1041 * regardless of what libdiskmgt or zpool_in_use() says. 
1042 */ 1043 if (isreplacing) { 1044 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1045 &wholedisk) == 0 && wholedisk) 1046 (void) snprintf(buf, sizeof (buf), "%ss0", 1047 path); 1048 else 1049 (void) strlcpy(buf, path, sizeof (buf)); 1050 if (is_spare(config, buf)) 1051 return (0); 1052 } 1053 1054 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1055 ret = check_device(path, force, isspare); 1056 1057 if (strcmp(type, VDEV_TYPE_FILE) == 0) 1058 ret = check_file(path, force, isspare); 1059 1060 return (ret); 1061 } 1062 1063 for (c = 0; c < children; c++) 1064 if ((ret = check_in_use(config, child[c], force, 1065 isreplacing, B_FALSE)) != 0) 1066 return (ret); 1067 1068 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1069 &child, &children) == 0) 1070 for (c = 0; c < children; c++) 1071 if ((ret = check_in_use(config, child[c], force, 1072 isreplacing, B_TRUE)) != 0) 1073 return (ret); 1074 return (0); 1075 } 1076 1077 static const char * 1078 is_grouping(const char *type, int *mindev) 1079 { 1080 if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { 1081 if (mindev != NULL) 1082 *mindev = 2; 1083 return (VDEV_TYPE_RAIDZ); 1084 } 1085 1086 if (strcmp(type, "raidz2") == 0) { 1087 if (mindev != NULL) 1088 *mindev = 3; 1089 return (VDEV_TYPE_RAIDZ); 1090 } 1091 1092 if (strcmp(type, "mirror") == 0) { 1093 if (mindev != NULL) 1094 *mindev = 2; 1095 return (VDEV_TYPE_MIRROR); 1096 } 1097 1098 if (strcmp(type, "spare") == 0) { 1099 if (mindev != NULL) 1100 *mindev = 1; 1101 return (VDEV_TYPE_SPARE); 1102 } 1103 1104 if (strcmp(type, "log") == 0) { 1105 if (mindev != NULL) 1106 *mindev = 1; 1107 return (VDEV_TYPE_LOG); 1108 } 1109 1110 return (NULL); 1111 } 1112 1113 /* 1114 * Construct a syntactically valid vdev specification, 1115 * and ensure that all devices and files exist and can be opened. 1116 * Note: we don't bother freeing anything in the error paths 1117 * because the program is just going to exit anyway. 
1118 */ 1119 nvlist_t * 1120 construct_spec(int argc, char **argv) 1121 { 1122 nvlist_t *nvroot, *nv, **top, **spares; 1123 int t, toplevels, mindev, nspares, nlogs; 1124 const char *type; 1125 uint64_t is_log; 1126 boolean_t seen_logs; 1127 1128 top = NULL; 1129 toplevels = 0; 1130 spares = NULL; 1131 nspares = 0; 1132 nlogs = 0; 1133 is_log = B_FALSE; 1134 seen_logs = B_FALSE; 1135 1136 while (argc > 0) { 1137 nv = NULL; 1138 1139 /* 1140 * If it's a mirror or raidz, the subsequent arguments are 1141 * its leaves -- until we encounter the next mirror or raidz. 1142 */ 1143 if ((type = is_grouping(argv[0], &mindev)) != NULL) { 1144 nvlist_t **child = NULL; 1145 int c, children = 0; 1146 1147 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1148 if (spares != NULL) { 1149 (void) fprintf(stderr, 1150 gettext("invalid vdev " 1151 "specification: 'spare' can be " 1152 "specified only once\n")); 1153 return (NULL); 1154 } 1155 is_log = B_FALSE; 1156 } 1157 1158 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1159 if (seen_logs) { 1160 (void) fprintf(stderr, 1161 gettext("invalid vdev " 1162 "specification: 'log' can be " 1163 "specified only once\n")); 1164 return (NULL); 1165 } 1166 seen_logs = B_TRUE; 1167 is_log = B_TRUE; 1168 argc--; 1169 argv++; 1170 /* 1171 * A log is not a real grouping device. 1172 * We just set is_log and continue. 
1173 */ 1174 continue; 1175 } 1176 1177 if (is_log) { 1178 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1179 (void) fprintf(stderr, 1180 gettext("invalid vdev " 1181 "specification: unsupported 'log' " 1182 "device: %s\n"), type); 1183 return (NULL); 1184 } 1185 nlogs++; 1186 } 1187 1188 for (c = 1; c < argc; c++) { 1189 if (is_grouping(argv[c], NULL) != NULL) 1190 break; 1191 children++; 1192 child = realloc(child, 1193 children * sizeof (nvlist_t *)); 1194 if (child == NULL) 1195 zpool_no_memory(); 1196 if ((nv = make_leaf_vdev(argv[c], B_FALSE)) 1197 == NULL) 1198 return (NULL); 1199 child[children - 1] = nv; 1200 } 1201 1202 if (children < mindev) { 1203 (void) fprintf(stderr, gettext("invalid vdev " 1204 "specification: %s requires at least %d " 1205 "devices\n"), argv[0], mindev); 1206 return (NULL); 1207 } 1208 1209 argc -= c; 1210 argv += c; 1211 1212 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1213 spares = child; 1214 nspares = children; 1215 continue; 1216 } else { 1217 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1218 0) == 0); 1219 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1220 type) == 0); 1221 verify(nvlist_add_uint64(nv, 1222 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1223 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1224 verify(nvlist_add_uint64(nv, 1225 ZPOOL_CONFIG_NPARITY, 1226 mindev - 1) == 0); 1227 } 1228 verify(nvlist_add_nvlist_array(nv, 1229 ZPOOL_CONFIG_CHILDREN, child, 1230 children) == 0); 1231 1232 for (c = 0; c < children; c++) 1233 nvlist_free(child[c]); 1234 free(child); 1235 } 1236 } else { 1237 /* 1238 * We have a device. Pass off to make_leaf_vdev() to 1239 * construct the appropriate nvlist describing the vdev. 
1240 */ 1241 if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) 1242 return (NULL); 1243 if (is_log) 1244 nlogs++; 1245 argc--; 1246 argv++; 1247 } 1248 1249 toplevels++; 1250 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1251 if (top == NULL) 1252 zpool_no_memory(); 1253 top[toplevels - 1] = nv; 1254 } 1255 1256 if (toplevels == 0 && nspares == 0) { 1257 (void) fprintf(stderr, gettext("invalid vdev " 1258 "specification: at least one toplevel vdev must be " 1259 "specified\n")); 1260 return (NULL); 1261 } 1262 1263 if (seen_logs && nlogs == 0) { 1264 (void) fprintf(stderr, gettext("invalid vdev specification: " 1265 "log requires at least 1 device\n")); 1266 return (NULL); 1267 } 1268 1269 /* 1270 * Finally, create nvroot and add all top-level vdevs to it. 1271 */ 1272 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1273 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1274 VDEV_TYPE_ROOT) == 0); 1275 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1276 top, toplevels) == 0); 1277 if (nspares != 0) 1278 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1279 spares, nspares) == 0); 1280 1281 for (t = 0; t < toplevels; t++) 1282 nvlist_free(top[t]); 1283 for (t = 0; t < nspares; t++) 1284 nvlist_free(spares[t]); 1285 if (spares) 1286 free(spares); 1287 free(top); 1288 1289 return (nvroot); 1290 } 1291 1292 1293 /* 1294 * Get and validate the contents of the given vdev specification. This ensures 1295 * that the nvlist returned is well-formed, that all the devices exist, and that 1296 * they are not currently in use by any other known consumer. The 'poolconfig' 1297 * parameter is the current configuration of the pool when adding devices 1298 * existing pool, and is used to perform additional checks, such as changing the 1299 * replication level of the pool. It can be 'NULL' to indicate that this is a 1300 * new pool. The 'force' flag controls whether devices should be forcefully 1301 * added, even if they appear in use. 
1302 */ 1303 nvlist_t * 1304 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, 1305 boolean_t isreplacing, int argc, char **argv) 1306 { 1307 nvlist_t *newroot; 1308 nvlist_t *poolconfig = NULL; 1309 is_force = force; 1310 1311 /* 1312 * Construct the vdev specification. If this is successful, we know 1313 * that we have a valid specification, and that all devices can be 1314 * opened. 1315 */ 1316 if ((newroot = construct_spec(argc, argv)) == NULL) 1317 return (NULL); 1318 1319 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) 1320 return (NULL); 1321 1322 /* 1323 * Validate each device to make sure that its not shared with another 1324 * subsystem. We do this even if 'force' is set, because there are some 1325 * uses (such as a dedicated dump device) that even '-f' cannot 1326 * override. 1327 */ 1328 if (check_in_use(poolconfig, newroot, force, isreplacing, 1329 B_FALSE) != 0) { 1330 nvlist_free(newroot); 1331 return (NULL); 1332 } 1333 1334 /* 1335 * Check the replication level of the given vdevs and report any errors 1336 * found. We include the existing pool spec, if any, as we need to 1337 * catch changes against the existing replication level. 1338 */ 1339 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1340 nvlist_free(newroot); 1341 return (NULL); 1342 } 1343 1344 /* 1345 * Run through the vdev specification and label any whole disks found. 1346 */ 1347 if (make_disks(zhp, newroot) != 0) { 1348 nvlist_free(newroot); 1349 return (NULL); 1350 } 1351 1352 return (newroot); 1353 } 1354