1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2013 Steven Hartland. All rights reserved. 26 * Copyright (c) 2014 Integros [integros.com] 27 * Copyright 2017 Joyent, Inc. 28 * Copyright 2017 RackTop Systems. 29 */ 30 31 /* 32 * The objective of this program is to provide a DMU/ZAP/SPA stress test 33 * that runs entirely in userland, is easy to use, and easy to extend. 34 * 35 * The overall design of the ztest program is as follows: 36 * 37 * (1) For each major functional area (e.g. adding vdevs to a pool, 38 * creating and destroying datasets, reading and writing objects, etc) 39 * we have a simple routine to test that functionality. These 40 * individual routines do not have to do anything "stressful". 41 * 42 * (2) We turn these simple functionality tests into a stress test by 43 * running them all in parallel, with as many threads as desired, 44 * and spread across as many datasets, objects, and vdevs as desired. 
45 * 46 * (3) While all this is happening, we inject faults into the pool to 47 * verify that self-healing data really works. 48 * 49 * (4) Every time we open a dataset, we change its checksum and compression 50 * functions. Thus even individual objects vary from block to block 51 * in which checksum they use and whether they're compressed. 52 * 53 * (5) To verify that we never lose on-disk consistency after a crash, 54 * we run the entire test in a child of the main process. 55 * At random times, the child self-immolates with a SIGKILL. 56 * This is the software equivalent of pulling the power cord. 57 * The parent then runs the test again, using the existing 58 * storage pool, as many times as desired. If backwards compatibility 59 * testing is enabled ztest will sometimes run the "older" version 60 * of ztest after a SIGKILL. 61 * 62 * (6) To verify that we don't have future leaks or temporal incursions, 63 * many of the functional tests record the transaction group number 64 * as part of their data. When reading old data, they verify that 65 * the transaction group number is less than the current, open txg. 66 * If you add a new test, please do this if applicable. 67 * 68 * When run with no arguments, ztest runs for about five minutes and 69 * produces no output if successful. To get a little bit of information, 70 * specify -V. To get more information, specify -VV, and so on. 71 * 72 * To turn this into an overnight stress test, use -T to specify run time. 73 * 74 * You can ask for more vdevs [-v], datasets [-d], or threads [-t] 75 * to increase the pool capacity, fanout, and overall stress level. 76 * 77 * Use the -k option to set the desired frequency of kills. 78 * 79 * When ztest invokes itself it passes all relevant information through a 80 * temporary file which is mmap-ed in the child process. This allows shared 81 * memory to survive the exec syscall. 
The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file. The information stored in this file
 * must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/vdev_initialize.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <umem.h>
#include <dlfcn.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <libnvpair.h>
#include <libcmdutils.h>

/*
 * File descriptors, -1 until opened.  ztest_fd_rand is the source that
 * ztest_random() reads from; ztest_fd_data is presumably the shared data
 * file described above (its use is not visible in this part of the file).
 */
static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

/*
 * Header of the shared file: sizes and counts of the shared structures
 * that follow it (see the comment at the top of this file).
 */
typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
} ztest_shared_hdr_t;

static ztest_shared_hdr_t *ztest_shared_hdr;

/*
 * Run-time options.  The option letters below are the ones handled by
 * process_options() and described by usage().
 */
typedef struct ztest_shared_opts {
	char zo_pool[ZFS_MAX_DATASET_NAME_LEN];	/* -p pool name */
	char zo_dir[ZFS_MAX_DATASET_NAME_LEN];	/* -f dir for vdev files */
	char zo_alt_ztest[MAXNAMELEN];	/* alternate ztest, derived from -B */
	char zo_alt_libpath[MAXNAMELEN]; /* alternate lib dir, from -B */
	uint64_t zo_vdevs;		/* -v number of vdevs */
	uint64_t zo_vdevtime;		/* computed from zo_time / zo_vdevs */
	size_t zo_vdev_size;		/* -s size of each vdev */
	int zo_ashift;			/* -a alignment shift, 0 = random */
	int zo_mirrors;			/* -m mirror copies */
	int zo_raidz;			/* -r raidz disks */
	int zo_raidz_parity;		/* -R raidz parity */
	int zo_datasets;		/* -d datasets */
	int zo_threads;			/* -t threads */
	uint64_t zo_passtime;		/* -P time per pass, in seconds */
	uint64_t zo_killrate;		/* -k kill percentage */
	int zo_verbose;			/* incremented once per -V */
	int zo_init;			/* -i init count; -E sets it to 0 */
	uint64_t zo_time;		/* -T total run time, in seconds */
	uint64_t zo_maxloops;		/* -F max loops in spa_freeze() */
	uint64_t zo_metaslab_force_ganging; /* -g gang block threshold */
} ztest_shared_opts_t;

/*
 * Defaults copied into ztest_opts by process_options() before the command
 * line is parsed; usage() prints them.  zo_vdevtime has no default here
 * because process_options() always computes it.
 */
static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
	.zo_dir = { '/', 't', 'm', 'p', '\0' },
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = 5,
	.zo_ashift = SPA_MINBLOCKSHIFT,
	.zo_mirrors = 2,
	.zo_raidz = 4,
	.zo_raidz_parity = 1,
	.zo_vdev_size = SPA_MINDEVSIZE * 4,	/* 256m default size */
	.zo_datasets = 7,
	.zo_threads = 23,
	.zo_passtime = 60,		/* 60 seconds */
	.zo_killrate = 70,		/* 70% kill rate */
	.zo_verbose = 0,
	.zo_init = 1,
	.zo_time = 300,			/* 5 minutes */
	.zo_maxloops = 50,		/* max loops during spa_freeze() */
	.zo_metaslab_force_ganging = 32 << 10
};

/* Tunables defined outside this file. */
extern uint64_t metaslab_force_ganging;
extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
extern boolean_t zfs_abd_scatter_enabled;
extern int dmu_object_alloc_chunk_shift;
extern boolean_t zfs_force_some_double_word_sm_entries;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;

typedef struct ztest_shared_ds {
	uint64_t	zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
/* Address of the d'th entry of the ztest_shared_ds array. */
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

#define	BT_MAGIC	0x123456789abcdefULL
/*
 * Note: expands to an expression that reads `zs`, so a variable of that
 * name with a zs_mirrors member (see ztest_shared_t) must be in scope at
 * every use.
 */
#define	MAXFAULTS()	\
	(MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)

/* ZTEST_IO_TYPES is last, so it equals the number of I/O types. */
enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

/*
 * NOTE(review): presumably the self-describing tag the tests store with
 * their data (bt_magic would hold BT_MAGIC); the code that fills it in is
 * not in this part of the file -- confirm against the users.
 */
typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_dnodesize;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * It would be better to use a rangelock_t per object. Unfortunately
 * the rangelock_t is not a drop-in replacement for rl_t, because we
 * still need to map from object ID to rangelock_t.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

/*
 * Reader/writer lock built from a mutex and a condition variable (see
 * ztest_rll_init() and ztest_rll_lock()).
 */
typedef struct rll {
	void		*rll_writer;	/* non-NULL while a writer holds it */
	int		rll_readers;	/* number of readers holding it */
	kmutex_t	rll_lock;	/* protects the two fields above */
	kcondvar_t	rll_cv;		/* waited on while the lock is busy */
} rll_t;

typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

/* Sizes of the per-dataset zd_range_lock[] and zd_object_lock[] arrays. */
#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64

/*
 * Object descriptor. Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_crdnodesize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[ZFS_MAX_DATASET_NAME_LEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t	*zd_os;
	krwlock_t	zd_zilog_lock;
	zilog_t		*zd_zilog;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[ZFS_MAX_DATASET_NAME_LEN];
	kmutex_t	zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
/* Signature shared by every test routine listed in ztest_info[]. */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

/*
 * One entry of the ztest_info[] table.  zi_interval is a pointer rather
 * than a value so that an entry can refer to a variable filled in at run
 * time (some entries point at ztest_opts.zo_vdevtime).  The zopt_*
 * intervals the table points at are defined as multiples of NANOSEC.
 */
typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
/* Address of the c'th entry of the ztest_shared_callstate array. */
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

/*
 * Note: these aren't static because we want dladdr() to work.
 */
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_object_next_chunk;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_ddt_repair;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;
ztest_func_t ztest_device_removal;
ztest_func_t ztest_remap_blocks;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_verify_dnode_bt;

/* Call intervals referenced by ztest_info[], as multiples of NANOSEC. */
uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

/*
 * Table of test routines: { function, iterations per execution, interval }.
 * ztest_dmu_prealloc is declared above but its entry is compiled out.
 * The two vdev add/remove entries use ztest_opts.zo_vdevtime, which
 * process_options() computes from the -T and -v settings.
 */
ztest_info_t ztest_info[] = {
	{ ztest_dmu_read_write,			1,	&zopt_always	},
	{ ztest_dmu_write_parallel,		10,	&zopt_always	},
	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
	{ ztest_dmu_object_next_chunk,		1,	&zopt_sometimes	},
	{ ztest_dmu_commit_callbacks,		1,	&zopt_always	},
	{ ztest_zap,				30,	&zopt_always	},
	{ ztest_zap_parallel,			100,	&zopt_always	},
	{ ztest_split_pool,			1,	&zopt_always	},
	{ ztest_zil_commit,			1,	&zopt_incessant	},
	{ ztest_zil_remount,			1,	&zopt_sometimes	},
	{ ztest_dmu_read_write_zcopy,		1,	&zopt_often	},
	{ ztest_dmu_objset_create_destroy,	1,	&zopt_often	},
	{ ztest_dsl_prop_get_set,		1,	&zopt_often	},
	{ ztest_spa_prop_get_set,		1,	&zopt_sometimes	},
#if 0
	{ ztest_dmu_prealloc,			1,	&zopt_sometimes	},
#endif
	{ ztest_fzap,				1,	&zopt_sometimes	},
	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_sometimes	},
	{ ztest_spa_create_destroy,		1,	&zopt_sometimes	},
	{ ztest_fault_inject,			1,	&zopt_sometimes	},
	{ ztest_ddt_repair,			1,	&zopt_sometimes	},
	{ ztest_dmu_snapshot_hold,		1,	&zopt_sometimes	},
	{ ztest_reguid,				1,	&zopt_rarely	},
	{ ztest_scrub,				1,	&zopt_rarely	},
	{ ztest_spa_upgrade,			1,	&zopt_rarely	},
	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
	{ ztest_vdev_attach_detach,		1,	&zopt_sometimes	},
	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
	{ ztest_vdev_add_remove,		1,
	    &ztest_opts.zo_vdevtime				},
	{ ztest_vdev_aux_add_remove,		1,
	    &ztest_opts.zo_vdevtime				},
	{ ztest_device_removal,			1,	&zopt_sometimes	},
	{ ztest_remap_blocks,			1,	&zopt_sometimes },
	{ ztest_spa_checkpoint_create_discard,	1,	&zopt_rarely	},
	{ ztest_initialize,			1,	&zopt_sometimes },
	{ ztest_verify_dnode_bt,		1,	&zopt_sometimes }
};

/* Number of entries in ztest_info[]. */
#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	kmutex_t	zcl_callbacks_lock;
	list_t		zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t	zs_do_init;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count; /* bumped by ztest_record_enospc() */
	uint64_t	zs_vdev_next_leaf; /* next leaf number, make_vdev_file() */
	uint64_t	zs_vdev_aux;	/* aux device number, make_vdev_file() */
	uint64_t	zs_alloc;	/* recorded by ztest_kill() */
	uint64_t	zs_space;	/* recorded by ztest_kill() */
	uint64_t	zs_splits;
	uint64_t	zs_mirrors;	/* read by MAXFAULTS() */
	uint64_t	zs_metaslab_sz;
	uint64_t	zs_metaslab_df_alloc_threshold;
	uint64_t	zs_guid;
} ztest_shared_t;

/* Note: no parentheses in the expansion, so take care inside expressions. */
#define	ID_PARALLEL	-1ULL

/*
 * snprintf() formats used by make_vdev_file() to name vdev files:
 * <dir>/<pool>.<leaf>a for ordinary leaves and <dir>/<pool>.<aux>.<n>
 * for aux devices.
 */
static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static kmutex_t ztest_vdev_lock;
static boolean_t ztest_device_removal_active = B_FALSE;
static kmutex_t ztest_checkpoint_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests. To modify the namespace, consumers must grab
 * this lock as writer. Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static krwlock_t ztest_name_lock;

/* When B_TRUE, fatal() calls abort(); otherwise it exits with status 3. */
static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;

/* ZTEST_OBJECTS is last, so it equals the number of entries before it. */
enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static void usage(boolean_t) __NORETURN;

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
476 */ 477 const char * 478 _umem_debug_init() 479 { 480 return ("default,verbose"); /* $UMEM_DEBUG setting */ 481 } 482 483 const char * 484 _umem_logging_init(void) 485 { 486 return ("fail,contents"); /* $UMEM_LOGGING setting */ 487 } 488 489 #define FATAL_MSG_SZ 1024 490 491 char *fatal_msg; 492 493 static void 494 fatal(int do_perror, char *message, ...) 495 { 496 va_list args; 497 int save_errno = errno; 498 char buf[FATAL_MSG_SZ]; 499 500 (void) fflush(stdout); 501 502 va_start(args, message); 503 (void) sprintf(buf, "ztest: "); 504 /* LINTED */ 505 (void) vsprintf(buf + strlen(buf), message, args); 506 va_end(args); 507 if (do_perror) { 508 (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), 509 ": %s", strerror(save_errno)); 510 } 511 (void) fprintf(stderr, "%s\n", buf); 512 fatal_msg = buf; /* to ease debugging */ 513 if (ztest_dump_core) 514 abort(); 515 exit(3); 516 } 517 518 static int 519 str2shift(const char *buf) 520 { 521 const char *ends = "BKMGTPEZ"; 522 int i; 523 524 if (buf[0] == '\0') 525 return (0); 526 for (i = 0; i < strlen(ends); i++) { 527 if (toupper(buf[0]) == ends[i]) 528 break; 529 } 530 if (i == strlen(ends)) { 531 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", 532 buf); 533 usage(B_FALSE); 534 } 535 if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { 536 return (10*i); 537 } 538 (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); 539 usage(B_FALSE); 540 /* NOTREACHED */ 541 } 542 543 static uint64_t 544 nicenumtoull(const char *buf) 545 { 546 char *end; 547 uint64_t val; 548 549 val = strtoull(buf, &end, 0); 550 if (end == buf) { 551 (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); 552 usage(B_FALSE); 553 } else if (end[0] == '.') { 554 double fval = strtod(buf, &end); 555 fval *= pow(2, str2shift(end)); 556 if (fval > UINT64_MAX) { 557 (void) fprintf(stderr, "ztest: value too large: %s\n", 558 buf); 559 usage(B_FALSE); 560 } 561 val = (uint64_t)fval; 562 } else { 
563 int shift = str2shift(end); 564 if (shift >= 64 || (val << shift) >> shift != val) { 565 (void) fprintf(stderr, "ztest: value too large: %s\n", 566 buf); 567 usage(B_FALSE); 568 } 569 val <<= shift; 570 } 571 return (val); 572 } 573 574 static void 575 usage(boolean_t requested) 576 { 577 const ztest_shared_opts_t *zo = &ztest_opts_defaults; 578 579 char nice_vdev_size[NN_NUMBUF_SZ]; 580 char nice_force_ganging[NN_NUMBUF_SZ]; 581 FILE *fp = requested ? stdout : stderr; 582 583 nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); 584 nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, 585 sizeof (nice_force_ganging)); 586 587 (void) fprintf(fp, "Usage: %s\n" 588 "\t[-v vdevs (default: %llu)]\n" 589 "\t[-s size_of_each_vdev (default: %s)]\n" 590 "\t[-a alignment_shift (default: %d)] use 0 for random\n" 591 "\t[-m mirror_copies (default: %d)]\n" 592 "\t[-r raidz_disks (default: %d)]\n" 593 "\t[-R raidz_parity (default: %d)]\n" 594 "\t[-d datasets (default: %d)]\n" 595 "\t[-t threads (default: %d)]\n" 596 "\t[-g gang_block_threshold (default: %s)]\n" 597 "\t[-i init_count (default: %d)] initialize pool i times\n" 598 "\t[-k kill_percentage (default: %llu%%)]\n" 599 "\t[-p pool_name (default: %s)]\n" 600 "\t[-f dir (default: %s)] file directory for vdev files\n" 601 "\t[-V] verbose (use multiple times for ever more blather)\n" 602 "\t[-E] use existing pool instead of creating new one\n" 603 "\t[-T time (default: %llu sec)] total run time\n" 604 "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" 605 "\t[-P passtime (default: %llu sec)] time per pass\n" 606 "\t[-B alt_ztest (default: <none>)] alternate ztest path\n" 607 "\t[-o variable=value] ... 
set global variable to an unsigned\n" 608 "\t 32-bit integer value\n" 609 "\t[-h] (print help)\n" 610 "", 611 zo->zo_pool, 612 (u_longlong_t)zo->zo_vdevs, /* -v */ 613 nice_vdev_size, /* -s */ 614 zo->zo_ashift, /* -a */ 615 zo->zo_mirrors, /* -m */ 616 zo->zo_raidz, /* -r */ 617 zo->zo_raidz_parity, /* -R */ 618 zo->zo_datasets, /* -d */ 619 zo->zo_threads, /* -t */ 620 nice_force_ganging, /* -g */ 621 zo->zo_init, /* -i */ 622 (u_longlong_t)zo->zo_killrate, /* -k */ 623 zo->zo_pool, /* -p */ 624 zo->zo_dir, /* -f */ 625 (u_longlong_t)zo->zo_time, /* -T */ 626 (u_longlong_t)zo->zo_maxloops, /* -F */ 627 (u_longlong_t)zo->zo_passtime); 628 exit(requested ? 0 : 1); 629 } 630 631 static void 632 process_options(int argc, char **argv) 633 { 634 char *path; 635 ztest_shared_opts_t *zo = &ztest_opts; 636 637 int opt; 638 uint64_t value; 639 char altdir[MAXNAMELEN] = { 0 }; 640 641 bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); 642 643 while ((opt = getopt(argc, argv, 644 "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:o:")) != EOF) { 645 value = 0; 646 switch (opt) { 647 case 'v': 648 case 's': 649 case 'a': 650 case 'm': 651 case 'r': 652 case 'R': 653 case 'd': 654 case 't': 655 case 'g': 656 case 'i': 657 case 'k': 658 case 'T': 659 case 'P': 660 case 'F': 661 value = nicenumtoull(optarg); 662 } 663 switch (opt) { 664 case 'v': 665 zo->zo_vdevs = value; 666 break; 667 case 's': 668 zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); 669 break; 670 case 'a': 671 zo->zo_ashift = value; 672 break; 673 case 'm': 674 zo->zo_mirrors = value; 675 break; 676 case 'r': 677 zo->zo_raidz = MAX(1, value); 678 break; 679 case 'R': 680 zo->zo_raidz_parity = MIN(MAX(value, 1), 3); 681 break; 682 case 'd': 683 zo->zo_datasets = MAX(1, value); 684 break; 685 case 't': 686 zo->zo_threads = MAX(1, value); 687 break; 688 case 'g': 689 zo->zo_metaslab_force_ganging = 690 MAX(SPA_MINBLOCKSIZE << 1, value); 691 break; 692 case 'i': 693 zo->zo_init = value; 694 break; 695 case 'k': 696 zo->zo_killrate = 
value; 697 break; 698 case 'p': 699 (void) strlcpy(zo->zo_pool, optarg, 700 sizeof (zo->zo_pool)); 701 break; 702 case 'f': 703 path = realpath(optarg, NULL); 704 if (path == NULL) { 705 (void) fprintf(stderr, "error: %s: %s\n", 706 optarg, strerror(errno)); 707 usage(B_FALSE); 708 } else { 709 (void) strlcpy(zo->zo_dir, path, 710 sizeof (zo->zo_dir)); 711 } 712 break; 713 case 'V': 714 zo->zo_verbose++; 715 break; 716 case 'E': 717 zo->zo_init = 0; 718 break; 719 case 'T': 720 zo->zo_time = value; 721 break; 722 case 'P': 723 zo->zo_passtime = MAX(1, value); 724 break; 725 case 'F': 726 zo->zo_maxloops = MAX(1, value); 727 break; 728 case 'B': 729 (void) strlcpy(altdir, optarg, sizeof (altdir)); 730 break; 731 case 'o': 732 if (set_global_var(optarg) != 0) 733 usage(B_FALSE); 734 break; 735 case 'h': 736 usage(B_TRUE); 737 break; 738 case '?': 739 default: 740 usage(B_FALSE); 741 break; 742 } 743 } 744 745 zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); 746 747 zo->zo_vdevtime = 748 (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : 749 UINT64_MAX >> 2); 750 751 if (strlen(altdir) > 0) { 752 char *cmd; 753 char *realaltdir; 754 char *bin; 755 char *ztest; 756 char *isa; 757 int isalen; 758 759 cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 760 realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 761 762 VERIFY(NULL != realpath(getexecname(), cmd)); 763 if (0 != access(altdir, F_OK)) { 764 ztest_dump_core = B_FALSE; 765 fatal(B_TRUE, "invalid alternate ztest path: %s", 766 altdir); 767 } 768 VERIFY(NULL != realpath(altdir, realaltdir)); 769 770 /* 771 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest". 772 * We want to extract <isa> to determine if we should use 773 * 32 or 64 bit binaries. 
774 */ 775 bin = strstr(cmd, "/usr/bin/"); 776 ztest = strstr(bin, "/ztest"); 777 isa = bin + 9; 778 isalen = ztest - isa; 779 (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), 780 "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); 781 (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), 782 "%s/usr/lib/%.*s", realaltdir, isalen, isa); 783 784 if (0 != access(zo->zo_alt_ztest, X_OK)) { 785 ztest_dump_core = B_FALSE; 786 fatal(B_TRUE, "invalid alternate ztest: %s", 787 zo->zo_alt_ztest); 788 } else if (0 != access(zo->zo_alt_libpath, X_OK)) { 789 ztest_dump_core = B_FALSE; 790 fatal(B_TRUE, "invalid alternate lib directory %s", 791 zo->zo_alt_libpath); 792 } 793 794 umem_free(cmd, MAXPATHLEN); 795 umem_free(realaltdir, MAXPATHLEN); 796 } 797 } 798 799 static void 800 ztest_kill(ztest_shared_t *zs) 801 { 802 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); 803 zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); 804 805 /* 806 * Before we kill off ztest, make sure that the config is updated. 807 * See comment above spa_write_cachefile(). 
808 */ 809 mutex_enter(&spa_namespace_lock); 810 spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); 811 mutex_exit(&spa_namespace_lock); 812 813 zfs_dbgmsg_print(FTAG); 814 (void) kill(getpid(), SIGKILL); 815 } 816 817 static uint64_t 818 ztest_random(uint64_t range) 819 { 820 uint64_t r; 821 822 ASSERT3S(ztest_fd_rand, >=, 0); 823 824 if (range == 0) 825 return (0); 826 827 if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) 828 fatal(1, "short read from /dev/urandom"); 829 830 return (r % range); 831 } 832 833 /* ARGSUSED */ 834 static void 835 ztest_record_enospc(const char *s) 836 { 837 ztest_shared->zs_enospc_count++; 838 } 839 840 static uint64_t 841 ztest_get_ashift(void) 842 { 843 if (ztest_opts.zo_ashift == 0) 844 return (SPA_MINBLOCKSHIFT + ztest_random(5)); 845 return (ztest_opts.zo_ashift); 846 } 847 848 static nvlist_t * 849 make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) 850 { 851 char pathbuf[MAXPATHLEN]; 852 uint64_t vdev; 853 nvlist_t *file; 854 855 if (ashift == 0) 856 ashift = ztest_get_ashift(); 857 858 if (path == NULL) { 859 path = pathbuf; 860 861 if (aux != NULL) { 862 vdev = ztest_shared->zs_vdev_aux; 863 (void) snprintf(path, sizeof (pathbuf), 864 ztest_aux_template, ztest_opts.zo_dir, 865 pool == NULL ? ztest_opts.zo_pool : pool, 866 aux, vdev); 867 } else { 868 vdev = ztest_shared->zs_vdev_next_leaf++; 869 (void) snprintf(path, sizeof (pathbuf), 870 ztest_dev_template, ztest_opts.zo_dir, 871 pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); 872 } 873 } 874 875 if (size != 0) { 876 int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); 877 if (fd == -1) 878 fatal(1, "can't open %s", path); 879 if (ftruncate(fd, size) != 0) 880 fatal(1, "can't ftruncate %s", path); 881 (void) close(fd); 882 } 883 884 VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); 885 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); 886 VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); 887 VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); 888 889 return (file); 890 } 891 892 static nvlist_t * 893 make_vdev_raidz(char *path, char *aux, char *pool, size_t size, 894 uint64_t ashift, int r) 895 { 896 nvlist_t *raidz, **child; 897 int c; 898 899 if (r < 2) 900 return (make_vdev_file(path, aux, pool, size, ashift)); 901 child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); 902 903 for (c = 0; c < r; c++) 904 child[c] = make_vdev_file(path, aux, pool, size, ashift); 905 906 VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); 907 VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, 908 VDEV_TYPE_RAIDZ) == 0); 909 VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, 910 ztest_opts.zo_raidz_parity) == 0); 911 VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, 912 child, r) == 0); 913 914 for (c = 0; c < r; c++) 915 nvlist_free(child[c]); 916 917 umem_free(child, r * sizeof (nvlist_t *)); 918 919 return (raidz); 920 } 921 922 static nvlist_t * 923 make_vdev_mirror(char *path, char *aux, char *pool, size_t size, 924 uint64_t ashift, int r, int m) 925 { 926 nvlist_t *mirror, **child; 927 int c; 928 929 if (m < 1) 930 return (make_vdev_raidz(path, aux, pool, size, ashift, r)); 931 932 child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); 933 934 for (c = 0; c < m; c++) 935 child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); 936 937 VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); 938 VERIFY(nvlist_add_string(mirror, 
ZPOOL_CONFIG_TYPE, 939 VDEV_TYPE_MIRROR) == 0); 940 VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, 941 child, m) == 0); 942 943 for (c = 0; c < m; c++) 944 nvlist_free(child[c]); 945 946 umem_free(child, m * sizeof (nvlist_t *)); 947 948 return (mirror); 949 } 950 951 static nvlist_t * 952 make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, 953 int log, int r, int m, int t) 954 { 955 nvlist_t *root, **child; 956 int c; 957 958 ASSERT(t > 0); 959 960 child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); 961 962 for (c = 0; c < t; c++) { 963 child[c] = make_vdev_mirror(path, aux, pool, size, ashift, 964 r, m); 965 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, 966 log) == 0); 967 } 968 969 VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); 970 VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); 971 VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, 972 child, t) == 0); 973 974 for (c = 0; c < t; c++) 975 nvlist_free(child[c]); 976 977 umem_free(child, t * sizeof (nvlist_t *)); 978 979 return (root); 980 } 981 982 /* 983 * Find a random spa version. Returns back a random spa version in the 984 * range [initial_version, SPA_VERSION_FEATURES]. 985 */ 986 static uint64_t 987 ztest_random_spa_version(uint64_t initial_version) 988 { 989 uint64_t version = initial_version; 990 991 if (version <= SPA_VERSION_BEFORE_FEATURES) { 992 version = version + 993 ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); 994 } 995 996 if (version > SPA_VERSION_BEFORE_FEATURES) 997 version = SPA_VERSION_FEATURES; 998 999 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 1000 return (version); 1001 } 1002 1003 static int 1004 ztest_random_blocksize(void) 1005 { 1006 uint64_t block_shift; 1007 /* 1008 * Choose a block size >= the ashift. 1009 * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 
1010 */ 1011 int maxbs = SPA_OLD_MAXBLOCKSHIFT; 1012 if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) 1013 maxbs = 20; 1014 block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); 1015 return (1 << (SPA_MINBLOCKSHIFT + block_shift)); 1016 } 1017 1018 static int 1019 ztest_random_dnodesize(void) 1020 { 1021 int slots; 1022 int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; 1023 1024 if (max_slots == DNODE_MIN_SLOTS) 1025 return (DNODE_MIN_SIZE); 1026 1027 /* 1028 * Weight the random distribution more heavily toward smaller 1029 * dnode sizes since that is more likely to reflect real-world 1030 * usage. 1031 */ 1032 ASSERT3U(max_slots, >, 4); 1033 switch (ztest_random(10)) { 1034 case 0: 1035 slots = 5 + ztest_random(max_slots - 4); 1036 break; 1037 case 1 ... 4: 1038 slots = 2 + ztest_random(3); 1039 break; 1040 default: 1041 slots = 1; 1042 break; 1043 } 1044 1045 return (slots << DNODE_SHIFT); 1046 } 1047 1048 static int 1049 ztest_random_ibshift(void) 1050 { 1051 return (DN_MIN_INDBLKSHIFT + 1052 ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); 1053 } 1054 1055 static uint64_t 1056 ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) 1057 { 1058 uint64_t top; 1059 vdev_t *rvd = spa->spa_root_vdev; 1060 vdev_t *tvd; 1061 1062 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1063 1064 do { 1065 top = ztest_random(rvd->vdev_children); 1066 tvd = rvd->vdev_child[top]; 1067 } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || 1068 tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); 1069 1070 return (top); 1071 } 1072 1073 static uint64_t 1074 ztest_random_dsl_prop(zfs_prop_t prop) 1075 { 1076 uint64_t value; 1077 1078 do { 1079 value = zfs_prop_random_value(prop, ztest_random(-1ULL)); 1080 } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); 1081 1082 return (value); 1083 } 1084 1085 static int 1086 ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, 1087 boolean_t inherit) 
1088 { 1089 const char *propname = zfs_prop_to_name(prop); 1090 const char *valname; 1091 char setpoint[MAXPATHLEN]; 1092 uint64_t curval; 1093 int error; 1094 1095 error = dsl_prop_set_int(osname, propname, 1096 (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); 1097 1098 if (error == ENOSPC) { 1099 ztest_record_enospc(FTAG); 1100 return (error); 1101 } 1102 ASSERT0(error); 1103 1104 VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); 1105 1106 if (ztest_opts.zo_verbose >= 6) { 1107 VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); 1108 (void) printf("%s %s = %s at '%s'\n", 1109 osname, propname, valname, setpoint); 1110 } 1111 1112 return (error); 1113 } 1114 1115 static int 1116 ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) 1117 { 1118 spa_t *spa = ztest_spa; 1119 nvlist_t *props = NULL; 1120 int error; 1121 1122 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); 1123 VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); 1124 1125 error = spa_prop_set(spa, props); 1126 1127 nvlist_free(props); 1128 1129 if (error == ENOSPC) { 1130 ztest_record_enospc(FTAG); 1131 return (error); 1132 } 1133 ASSERT0(error); 1134 1135 return (error); 1136 } 1137 1138 static void 1139 ztest_rll_init(rll_t *rll) 1140 { 1141 rll->rll_writer = NULL; 1142 rll->rll_readers = 0; 1143 mutex_init(&rll->rll_lock, NULL, USYNC_THREAD, NULL); 1144 cv_init(&rll->rll_cv, NULL, USYNC_THREAD, NULL); 1145 } 1146 1147 static void 1148 ztest_rll_destroy(rll_t *rll) 1149 { 1150 ASSERT(rll->rll_writer == NULL); 1151 ASSERT(rll->rll_readers == 0); 1152 mutex_destroy(&rll->rll_lock); 1153 cv_destroy(&rll->rll_cv); 1154 } 1155 1156 static void 1157 ztest_rll_lock(rll_t *rll, rl_type_t type) 1158 { 1159 mutex_enter(&rll->rll_lock); 1160 1161 if (type == RL_READER) { 1162 while (rll->rll_writer != NULL) 1163 cv_wait(&rll->rll_cv, &rll->rll_lock); 1164 rll->rll_readers++; 1165 } else { 1166 while (rll->rll_writer != NULL || rll->rll_readers) 
1167 cv_wait(&rll->rll_cv, &rll->rll_lock); 1168 rll->rll_writer = curthread; 1169 } 1170 1171 mutex_exit(&rll->rll_lock); 1172 } 1173 1174 static void 1175 ztest_rll_unlock(rll_t *rll) 1176 { 1177 mutex_enter(&rll->rll_lock); 1178 1179 if (rll->rll_writer) { 1180 ASSERT(rll->rll_readers == 0); 1181 rll->rll_writer = NULL; 1182 } else { 1183 ASSERT(rll->rll_readers != 0); 1184 ASSERT(rll->rll_writer == NULL); 1185 rll->rll_readers--; 1186 } 1187 1188 if (rll->rll_writer == NULL && rll->rll_readers == 0) 1189 cv_broadcast(&rll->rll_cv); 1190 1191 mutex_exit(&rll->rll_lock); 1192 } 1193 1194 static void 1195 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) 1196 { 1197 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1198 1199 ztest_rll_lock(rll, type); 1200 } 1201 1202 static void 1203 ztest_object_unlock(ztest_ds_t *zd, uint64_t object) 1204 { 1205 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; 1206 1207 ztest_rll_unlock(rll); 1208 } 1209 1210 static rl_t * 1211 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, 1212 uint64_t size, rl_type_t type) 1213 { 1214 uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); 1215 rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; 1216 rl_t *rl; 1217 1218 rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); 1219 rl->rl_object = object; 1220 rl->rl_offset = offset; 1221 rl->rl_size = size; 1222 rl->rl_lock = rll; 1223 1224 ztest_rll_lock(rll, type); 1225 1226 return (rl); 1227 } 1228 1229 static void 1230 ztest_range_unlock(rl_t *rl) 1231 { 1232 rll_t *rll = rl->rl_lock; 1233 1234 ztest_rll_unlock(rll); 1235 1236 umem_free(rl, sizeof (*rl)); 1237 } 1238 1239 static void 1240 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) 1241 { 1242 zd->zd_os = os; 1243 zd->zd_zilog = dmu_objset_zil(os); 1244 zd->zd_shared = szd; 1245 dmu_objset_name(os, zd->zd_name); 1246 1247 if (zd->zd_shared != NULL) 1248 zd->zd_shared->zd_seq = 0; 1249 
1250 rw_init(&zd->zd_zilog_lock, NULL, USYNC_THREAD, NULL); 1251 mutex_init(&zd->zd_dirobj_lock, NULL, USYNC_THREAD, NULL); 1252 1253 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1254 ztest_rll_init(&zd->zd_object_lock[l]); 1255 1256 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) 1257 ztest_rll_init(&zd->zd_range_lock[l]); 1258 } 1259 1260 static void 1261 ztest_zd_fini(ztest_ds_t *zd) 1262 { 1263 mutex_destroy(&zd->zd_dirobj_lock); 1264 1265 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) 1266 ztest_rll_destroy(&zd->zd_object_lock[l]); 1267 1268 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) 1269 ztest_rll_destroy(&zd->zd_range_lock[l]); 1270 } 1271 1272 #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) 1273 1274 static uint64_t 1275 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) 1276 { 1277 uint64_t txg; 1278 int error; 1279 1280 /* 1281 * Attempt to assign tx to some transaction group. 1282 */ 1283 error = dmu_tx_assign(tx, txg_how); 1284 if (error) { 1285 if (error == ERESTART) { 1286 ASSERT(txg_how == TXG_NOWAIT); 1287 dmu_tx_wait(tx); 1288 } else { 1289 ASSERT3U(error, ==, ENOSPC); 1290 ztest_record_enospc(tag); 1291 } 1292 dmu_tx_abort(tx); 1293 return (0); 1294 } 1295 txg = dmu_tx_get_txg(tx); 1296 ASSERT(txg != 0); 1297 return (txg); 1298 } 1299 1300 static void 1301 ztest_pattern_set(void *buf, uint64_t size, uint64_t value) 1302 { 1303 uint64_t *ip = buf; 1304 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); 1305 1306 while (ip < ip_end) 1307 *ip++ = value; 1308 } 1309 1310 static boolean_t 1311 ztest_pattern_match(void *buf, uint64_t size, uint64_t value) 1312 { 1313 uint64_t *ip = buf; 1314 uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); 1315 uint64_t diff = 0; 1316 1317 while (ip < ip_end) 1318 diff |= (value - *ip++); 1319 1320 return (diff == 0); 1321 } 1322 1323 static void 1324 ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1325 uint64_t dnodesize, uint64_t 
offset, uint64_t gen, uint64_t txg, 1326 uint64_t crtxg) 1327 { 1328 bt->bt_magic = BT_MAGIC; 1329 bt->bt_objset = dmu_objset_id(os); 1330 bt->bt_object = object; 1331 bt->bt_dnodesize = dnodesize; 1332 bt->bt_offset = offset; 1333 bt->bt_gen = gen; 1334 bt->bt_txg = txg; 1335 bt->bt_crtxg = crtxg; 1336 } 1337 1338 static void 1339 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, 1340 uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, 1341 uint64_t crtxg) 1342 { 1343 ASSERT3U(bt->bt_magic, ==, BT_MAGIC); 1344 ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); 1345 ASSERT3U(bt->bt_object, ==, object); 1346 ASSERT3U(bt->bt_dnodesize, ==, dnodesize); 1347 ASSERT3U(bt->bt_offset, ==, offset); 1348 ASSERT3U(bt->bt_gen, <=, gen); 1349 ASSERT3U(bt->bt_txg, <=, txg); 1350 ASSERT3U(bt->bt_crtxg, ==, crtxg); 1351 } 1352 1353 static ztest_block_tag_t * 1354 ztest_bt_bonus(dmu_buf_t *db) 1355 { 1356 dmu_object_info_t doi; 1357 ztest_block_tag_t *bt; 1358 1359 dmu_object_info_from_db(db, &doi); 1360 ASSERT3U(doi.doi_bonus_size, <=, db->db_size); 1361 ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); 1362 bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); 1363 1364 return (bt); 1365 } 1366 1367 /* 1368 * Generate a token to fill up unused bonus buffer space. Try to make 1369 * it unique to the object, generation, and offset to verify that data 1370 * is not getting overwritten by data from other dnodes. 1371 */ 1372 #define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ 1373 (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) 1374 1375 /* 1376 * Fill up the unused bonus buffer region before the block tag with a 1377 * verifiable pattern. Filling the whole bonus area with non-zero data 1378 * helps ensure that all dnode traversal code properly skips the 1379 * interior regions of large dnodes. 
1380 */ 1381 void 1382 ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1383 objset_t *os, uint64_t gen) 1384 { 1385 uint64_t *bonusp; 1386 1387 ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); 1388 1389 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1390 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1391 gen, bonusp - (uint64_t *)db->db_data); 1392 *bonusp = token; 1393 } 1394 } 1395 1396 /* 1397 * Verify that the unused area of a bonus buffer is filled with the 1398 * expected tokens. 1399 */ 1400 void 1401 ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, 1402 objset_t *os, uint64_t gen) 1403 { 1404 uint64_t *bonusp; 1405 1406 for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { 1407 uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), 1408 gen, bonusp - (uint64_t *)db->db_data); 1409 VERIFY3U(*bonusp, ==, token); 1410 } 1411 } 1412 1413 /* 1414 * ZIL logging ops 1415 */ 1416 1417 #define lrz_type lr_mode 1418 #define lrz_blocksize lr_uid 1419 #define lrz_ibshift lr_gid 1420 #define lrz_bonustype lr_rdev 1421 #define lrz_dnodesize lr_crtime[1] 1422 1423 static void 1424 ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) 1425 { 1426 char *name = (void *)(lr + 1); /* name follows lr */ 1427 size_t namesize = strlen(name) + 1; 1428 itx_t *itx; 1429 1430 if (zil_replaying(zd->zd_zilog, tx)) 1431 return; 1432 1433 itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); 1434 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1435 sizeof (*lr) + namesize - sizeof (lr_t)); 1436 1437 zil_itx_assign(zd->zd_zilog, itx, tx); 1438 } 1439 1440 static void 1441 ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) 1442 { 1443 char *name = (void *)(lr + 1); /* name follows lr */ 1444 size_t namesize = strlen(name) + 1; 1445 itx_t *itx; 1446 1447 if (zil_replaying(zd->zd_zilog, tx)) 1448 return; 1449 1450 itx = zil_itx_create(TX_REMOVE, sizeof 
(*lr) + namesize); 1451 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1452 sizeof (*lr) + namesize - sizeof (lr_t)); 1453 1454 itx->itx_oid = object; 1455 zil_itx_assign(zd->zd_zilog, itx, tx); 1456 } 1457 1458 static void 1459 ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) 1460 { 1461 itx_t *itx; 1462 itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); 1463 1464 if (zil_replaying(zd->zd_zilog, tx)) 1465 return; 1466 1467 if (lr->lr_length > ZIL_MAX_LOG_DATA) 1468 write_state = WR_INDIRECT; 1469 1470 itx = zil_itx_create(TX_WRITE, 1471 sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0)); 1472 1473 if (write_state == WR_COPIED && 1474 dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, 1475 ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { 1476 zil_itx_destroy(itx); 1477 itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 1478 write_state = WR_NEED_COPY; 1479 } 1480 itx->itx_private = zd; 1481 itx->itx_wr_state = write_state; 1482 itx->itx_sync = (ztest_random(8) == 0); 1483 1484 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1485 sizeof (*lr) - sizeof (lr_t)); 1486 1487 zil_itx_assign(zd->zd_zilog, itx, tx); 1488 } 1489 1490 static void 1491 ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) 1492 { 1493 itx_t *itx; 1494 1495 if (zil_replaying(zd->zd_zilog, tx)) 1496 return; 1497 1498 itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); 1499 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1500 sizeof (*lr) - sizeof (lr_t)); 1501 1502 itx->itx_sync = B_FALSE; 1503 zil_itx_assign(zd->zd_zilog, itx, tx); 1504 } 1505 1506 static void 1507 ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) 1508 { 1509 itx_t *itx; 1510 1511 if (zil_replaying(zd->zd_zilog, tx)) 1512 return; 1513 1514 itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); 1515 bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, 1516 sizeof (*lr) - sizeof (lr_t)); 1517 1518 itx->itx_sync = B_FALSE; 1519 zil_itx_assign(zd->zd_zilog, itx, tx); 1520 } 1521 
1522 /* 1523 * ZIL replay ops 1524 */ 1525 static int 1526 ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) 1527 { 1528 ztest_ds_t *zd = arg1; 1529 lr_create_t *lr = arg2; 1530 char *name = (void *)(lr + 1); /* name follows lr */ 1531 objset_t *os = zd->zd_os; 1532 ztest_block_tag_t *bbt; 1533 dmu_buf_t *db; 1534 dmu_tx_t *tx; 1535 uint64_t txg; 1536 int error = 0; 1537 int bonuslen; 1538 1539 if (byteswap) 1540 byteswap_uint64_array(lr, sizeof (*lr)); 1541 1542 ASSERT(lr->lr_doid == ZTEST_DIROBJ); 1543 ASSERT(name[0] != '\0'); 1544 1545 tx = dmu_tx_create(os); 1546 1547 dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); 1548 1549 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1550 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); 1551 } else { 1552 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1553 } 1554 1555 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1556 if (txg == 0) 1557 return (ENOSPC); 1558 1559 ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); 1560 bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); 1561 1562 if (lr->lrz_type == DMU_OT_ZAP_OTHER) { 1563 if (lr->lr_foid == 0) { 1564 lr->lr_foid = zap_create_dnsize(os, 1565 lr->lrz_type, lr->lrz_bonustype, 1566 bonuslen, lr->lrz_dnodesize, tx); 1567 } else { 1568 error = zap_create_claim_dnsize(os, lr->lr_foid, 1569 lr->lrz_type, lr->lrz_bonustype, 1570 bonuslen, lr->lrz_dnodesize, tx); 1571 } 1572 } else { 1573 if (lr->lr_foid == 0) { 1574 lr->lr_foid = dmu_object_alloc_dnsize(os, 1575 lr->lrz_type, 0, lr->lrz_bonustype, 1576 bonuslen, lr->lrz_dnodesize, tx); 1577 } else { 1578 error = dmu_object_claim_dnsize(os, lr->lr_foid, 1579 lr->lrz_type, 0, lr->lrz_bonustype, 1580 bonuslen, lr->lrz_dnodesize, tx); 1581 } 1582 } 1583 1584 if (error) { 1585 ASSERT3U(error, ==, EEXIST); 1586 ASSERT(zd->zd_zilog->zl_replay); 1587 dmu_tx_commit(tx); 1588 return (error); 1589 } 1590 1591 ASSERT(lr->lr_foid != 0); 1592 1593 if (lr->lrz_type != DMU_OT_ZAP_OTHER) 1594 VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, 
1595 lr->lrz_blocksize, lr->lrz_ibshift, tx)); 1596 1597 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1598 bbt = ztest_bt_bonus(db); 1599 dmu_buf_will_dirty(db, tx); 1600 ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, 1601 lr->lr_gen, txg, txg); 1602 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); 1603 dmu_buf_rele(db, FTAG); 1604 1605 VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, 1606 &lr->lr_foid, tx)); 1607 1608 (void) ztest_log_create(zd, tx, lr); 1609 1610 dmu_tx_commit(tx); 1611 1612 return (0); 1613 } 1614 1615 static int 1616 ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) 1617 { 1618 ztest_ds_t *zd = arg1; 1619 lr_remove_t *lr = arg2; 1620 char *name = (void *)(lr + 1); /* name follows lr */ 1621 objset_t *os = zd->zd_os; 1622 dmu_object_info_t doi; 1623 dmu_tx_t *tx; 1624 uint64_t object, txg; 1625 1626 if (byteswap) 1627 byteswap_uint64_array(lr, sizeof (*lr)); 1628 1629 ASSERT(lr->lr_doid == ZTEST_DIROBJ); 1630 ASSERT(name[0] != '\0'); 1631 1632 VERIFY3U(0, ==, 1633 zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); 1634 ASSERT(object != 0); 1635 1636 ztest_object_lock(zd, object, RL_WRITER); 1637 1638 VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); 1639 1640 tx = dmu_tx_create(os); 1641 1642 dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); 1643 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); 1644 1645 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1646 if (txg == 0) { 1647 ztest_object_unlock(zd, object); 1648 return (ENOSPC); 1649 } 1650 1651 if (doi.doi_type == DMU_OT_ZAP_OTHER) { 1652 VERIFY3U(0, ==, zap_destroy(os, object, tx)); 1653 } else { 1654 VERIFY3U(0, ==, dmu_object_free(os, object, tx)); 1655 } 1656 1657 VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); 1658 1659 (void) ztest_log_remove(zd, tx, lr, object); 1660 1661 dmu_tx_commit(tx); 1662 1663 ztest_object_unlock(zd, object); 1664 1665 return (0); 1666 } 1667 1668 static int 1669 
ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) 1670 { 1671 ztest_ds_t *zd = arg1; 1672 lr_write_t *lr = arg2; 1673 objset_t *os = zd->zd_os; 1674 void *data = lr + 1; /* data follows lr */ 1675 uint64_t offset, length; 1676 ztest_block_tag_t *bt = data; 1677 ztest_block_tag_t *bbt; 1678 uint64_t gen, txg, lrtxg, crtxg; 1679 dmu_object_info_t doi; 1680 dmu_tx_t *tx; 1681 dmu_buf_t *db; 1682 arc_buf_t *abuf = NULL; 1683 rl_t *rl; 1684 1685 if (byteswap) 1686 byteswap_uint64_array(lr, sizeof (*lr)); 1687 1688 offset = lr->lr_offset; 1689 length = lr->lr_length; 1690 1691 /* If it's a dmu_sync() block, write the whole block */ 1692 if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { 1693 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); 1694 if (length < blocksize) { 1695 offset -= offset % blocksize; 1696 length = blocksize; 1697 } 1698 } 1699 1700 if (bt->bt_magic == BSWAP_64(BT_MAGIC)) 1701 byteswap_uint64_array(bt, sizeof (*bt)); 1702 1703 if (bt->bt_magic != BT_MAGIC) 1704 bt = NULL; 1705 1706 ztest_object_lock(zd, lr->lr_foid, RL_READER); 1707 rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); 1708 1709 VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); 1710 1711 dmu_object_info_from_db(db, &doi); 1712 1713 bbt = ztest_bt_bonus(db); 1714 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1715 gen = bbt->bt_gen; 1716 crtxg = bbt->bt_crtxg; 1717 lrtxg = lr->lr_common.lrc_txg; 1718 1719 tx = dmu_tx_create(os); 1720 1721 dmu_tx_hold_write(tx, lr->lr_foid, offset, length); 1722 1723 if (ztest_random(8) == 0 && length == doi.doi_data_block_size && 1724 P2PHASE(offset, length) == 0) 1725 abuf = dmu_request_arcbuf(db, length); 1726 1727 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1728 if (txg == 0) { 1729 if (abuf != NULL) 1730 dmu_return_arcbuf(abuf); 1731 dmu_buf_rele(db, FTAG); 1732 ztest_range_unlock(rl); 1733 ztest_object_unlock(zd, lr->lr_foid); 1734 return (ENOSPC); 1735 } 1736 1737 if (bt != NULL) { 1738 /* 1739 * Usually, verify 
the old data before writing new data -- 1740 * but not always, because we also want to verify correct 1741 * behavior when the data was not recently read into cache. 1742 */ 1743 ASSERT(offset % doi.doi_data_block_size == 0); 1744 if (ztest_random(4) != 0) { 1745 int prefetch = ztest_random(2) ? 1746 DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; 1747 ztest_block_tag_t rbt; 1748 1749 VERIFY(dmu_read(os, lr->lr_foid, offset, 1750 sizeof (rbt), &rbt, prefetch) == 0); 1751 if (rbt.bt_magic == BT_MAGIC) { 1752 ztest_bt_verify(&rbt, os, lr->lr_foid, 0, 1753 offset, gen, txg, crtxg); 1754 } 1755 } 1756 1757 /* 1758 * Writes can appear to be newer than the bonus buffer because 1759 * the ztest_get_data() callback does a dmu_read() of the 1760 * open-context data, which may be different than the data 1761 * as it was when the write was generated. 1762 */ 1763 if (zd->zd_zilog->zl_replay) { 1764 ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, 1765 MAX(gen, bt->bt_gen), MAX(txg, lrtxg), 1766 bt->bt_crtxg); 1767 } 1768 1769 /* 1770 * Set the bt's gen/txg to the bonus buffer's gen/txg 1771 * so that all of the usual ASSERTs will work. 
1772 */ 1773 ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, 1774 crtxg); 1775 } 1776 1777 if (abuf == NULL) { 1778 dmu_write(os, lr->lr_foid, offset, length, data, tx); 1779 } else { 1780 bcopy(data, abuf->b_data, length); 1781 dmu_assign_arcbuf(db, offset, abuf, tx); 1782 } 1783 1784 (void) ztest_log_write(zd, tx, lr); 1785 1786 dmu_buf_rele(db, FTAG); 1787 1788 dmu_tx_commit(tx); 1789 1790 ztest_range_unlock(rl); 1791 ztest_object_unlock(zd, lr->lr_foid); 1792 1793 return (0); 1794 } 1795 1796 static int 1797 ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) 1798 { 1799 ztest_ds_t *zd = arg1; 1800 lr_truncate_t *lr = arg2; 1801 objset_t *os = zd->zd_os; 1802 dmu_tx_t *tx; 1803 uint64_t txg; 1804 rl_t *rl; 1805 1806 if (byteswap) 1807 byteswap_uint64_array(lr, sizeof (*lr)); 1808 1809 ztest_object_lock(zd, lr->lr_foid, RL_READER); 1810 rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, 1811 RL_WRITER); 1812 1813 tx = dmu_tx_create(os); 1814 1815 dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); 1816 1817 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1818 if (txg == 0) { 1819 ztest_range_unlock(rl); 1820 ztest_object_unlock(zd, lr->lr_foid); 1821 return (ENOSPC); 1822 } 1823 1824 VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, 1825 lr->lr_length, tx) == 0); 1826 1827 (void) ztest_log_truncate(zd, tx, lr); 1828 1829 dmu_tx_commit(tx); 1830 1831 ztest_range_unlock(rl); 1832 ztest_object_unlock(zd, lr->lr_foid); 1833 1834 return (0); 1835 } 1836 1837 static int 1838 ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) 1839 { 1840 ztest_ds_t *zd = arg1; 1841 lr_setattr_t *lr = arg2; 1842 objset_t *os = zd->zd_os; 1843 dmu_tx_t *tx; 1844 dmu_buf_t *db; 1845 ztest_block_tag_t *bbt; 1846 uint64_t txg, lrtxg, crtxg, dnodesize; 1847 1848 if (byteswap) 1849 byteswap_uint64_array(lr, sizeof (*lr)); 1850 1851 ztest_object_lock(zd, lr->lr_foid, RL_WRITER); 1852 1853 VERIFY3U(0, ==, dmu_bonus_hold(os, 
lr->lr_foid, FTAG, &db)); 1854 1855 tx = dmu_tx_create(os); 1856 dmu_tx_hold_bonus(tx, lr->lr_foid); 1857 1858 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 1859 if (txg == 0) { 1860 dmu_buf_rele(db, FTAG); 1861 ztest_object_unlock(zd, lr->lr_foid); 1862 return (ENOSPC); 1863 } 1864 1865 bbt = ztest_bt_bonus(db); 1866 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 1867 crtxg = bbt->bt_crtxg; 1868 lrtxg = lr->lr_common.lrc_txg; 1869 dnodesize = bbt->bt_dnodesize; 1870 1871 if (zd->zd_zilog->zl_replay) { 1872 ASSERT(lr->lr_size != 0); 1873 ASSERT(lr->lr_mode != 0); 1874 ASSERT(lrtxg != 0); 1875 } else { 1876 /* 1877 * Randomly change the size and increment the generation. 1878 */ 1879 lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * 1880 sizeof (*bbt); 1881 lr->lr_mode = bbt->bt_gen + 1; 1882 ASSERT(lrtxg == 0); 1883 } 1884 1885 /* 1886 * Verify that the current bonus buffer is not newer than our txg. 1887 */ 1888 ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 1889 MAX(txg, lrtxg), crtxg); 1890 1891 dmu_buf_will_dirty(db, tx); 1892 1893 ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); 1894 ASSERT3U(lr->lr_size, <=, db->db_size); 1895 VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); 1896 bbt = ztest_bt_bonus(db); 1897 1898 ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, 1899 txg, crtxg); 1900 ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); 1901 1902 dmu_buf_rele(db, FTAG); 1903 1904 (void) ztest_log_setattr(zd, tx, lr); 1905 1906 dmu_tx_commit(tx); 1907 1908 ztest_object_unlock(zd, lr->lr_foid); 1909 1910 return (0); 1911 } 1912 1913 zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { 1914 NULL, /* 0 no such transaction type */ 1915 ztest_replay_create, /* TX_CREATE */ 1916 NULL, /* TX_MKDIR */ 1917 NULL, /* TX_MKXATTR */ 1918 NULL, /* TX_SYMLINK */ 1919 ztest_replay_remove, /* TX_REMOVE */ 1920 NULL, /* TX_RMDIR */ 1921 NULL, /* TX_LINK */ 1922 NULL, /* TX_RENAME */ 1923 ztest_replay_write, /* TX_WRITE */ 1924 
ztest_replay_truncate, /* TX_TRUNCATE */ 1925 ztest_replay_setattr, /* TX_SETATTR */ 1926 NULL, /* TX_ACL */ 1927 NULL, /* TX_CREATE_ACL */ 1928 NULL, /* TX_CREATE_ATTR */ 1929 NULL, /* TX_CREATE_ACL_ATTR */ 1930 NULL, /* TX_MKDIR_ACL */ 1931 NULL, /* TX_MKDIR_ATTR */ 1932 NULL, /* TX_MKDIR_ACL_ATTR */ 1933 NULL, /* TX_WRITE2 */ 1934 }; 1935 1936 /* 1937 * ZIL get_data callbacks 1938 */ 1939 1940 /* ARGSUSED */ 1941 static void 1942 ztest_get_done(zgd_t *zgd, int error) 1943 { 1944 ztest_ds_t *zd = zgd->zgd_private; 1945 uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; 1946 1947 if (zgd->zgd_db) 1948 dmu_buf_rele(zgd->zgd_db, zgd); 1949 1950 ztest_range_unlock((rl_t *)zgd->zgd_lr); 1951 ztest_object_unlock(zd, object); 1952 1953 umem_free(zgd, sizeof (*zgd)); 1954 } 1955 1956 static int 1957 ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, 1958 zio_t *zio) 1959 { 1960 ztest_ds_t *zd = arg; 1961 objset_t *os = zd->zd_os; 1962 uint64_t object = lr->lr_foid; 1963 uint64_t offset = lr->lr_offset; 1964 uint64_t size = lr->lr_length; 1965 uint64_t txg = lr->lr_common.lrc_txg; 1966 uint64_t crtxg; 1967 dmu_object_info_t doi; 1968 dmu_buf_t *db; 1969 zgd_t *zgd; 1970 int error; 1971 1972 ASSERT3P(lwb, !=, NULL); 1973 ASSERT3P(zio, !=, NULL); 1974 ASSERT3U(size, !=, 0); 1975 1976 ztest_object_lock(zd, object, RL_READER); 1977 error = dmu_bonus_hold(os, object, FTAG, &db); 1978 if (error) { 1979 ztest_object_unlock(zd, object); 1980 return (error); 1981 } 1982 1983 crtxg = ztest_bt_bonus(db)->bt_crtxg; 1984 1985 if (crtxg == 0 || crtxg > txg) { 1986 dmu_buf_rele(db, FTAG); 1987 ztest_object_unlock(zd, object); 1988 return (ENOENT); 1989 } 1990 1991 dmu_object_info_from_db(db, &doi); 1992 dmu_buf_rele(db, FTAG); 1993 db = NULL; 1994 1995 zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); 1996 zgd->zgd_lwb = lwb; 1997 zgd->zgd_private = zd; 1998 1999 if (buf != NULL) { /* immediate write */ 2000 zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, 2001 
object, offset, size, RL_READER); 2002 2003 error = dmu_read(os, object, offset, size, buf, 2004 DMU_READ_NO_PREFETCH); 2005 ASSERT(error == 0); 2006 } else { 2007 size = doi.doi_data_block_size; 2008 if (ISP2(size)) { 2009 offset = P2ALIGN(offset, size); 2010 } else { 2011 ASSERT(offset < size); 2012 offset = 0; 2013 } 2014 2015 zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, 2016 object, offset, size, RL_READER); 2017 2018 error = dmu_buf_hold(os, object, offset, zgd, &db, 2019 DMU_READ_NO_PREFETCH); 2020 2021 if (error == 0) { 2022 blkptr_t *bp = &lr->lr_blkptr; 2023 2024 zgd->zgd_db = db; 2025 zgd->zgd_bp = bp; 2026 2027 ASSERT(db->db_offset == offset); 2028 ASSERT(db->db_size == size); 2029 2030 error = dmu_sync(zio, lr->lr_common.lrc_txg, 2031 ztest_get_done, zgd); 2032 2033 if (error == 0) 2034 return (0); 2035 } 2036 } 2037 2038 ztest_get_done(zgd, error); 2039 2040 return (error); 2041 } 2042 2043 static void * 2044 ztest_lr_alloc(size_t lrsize, char *name) 2045 { 2046 char *lr; 2047 size_t namesize = name ? strlen(name) + 1 : 0; 2048 2049 lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); 2050 2051 if (name) 2052 bcopy(name, lr + lrsize, namesize); 2053 2054 return (lr); 2055 } 2056 2057 void 2058 ztest_lr_free(void *lr, size_t lrsize, char *name) 2059 { 2060 size_t namesize = name ? strlen(name) + 1 : 0; 2061 2062 umem_free(lr, lrsize + namesize); 2063 } 2064 2065 /* 2066 * Lookup a bunch of objects. Returns the number of objects not found. 
2067 */ 2068 static int 2069 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) 2070 { 2071 int missing = 0; 2072 int error; 2073 2074 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2075 2076 for (int i = 0; i < count; i++, od++) { 2077 od->od_object = 0; 2078 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, 2079 sizeof (uint64_t), 1, &od->od_object); 2080 if (error) { 2081 ASSERT(error == ENOENT); 2082 ASSERT(od->od_object == 0); 2083 missing++; 2084 } else { 2085 dmu_buf_t *db; 2086 ztest_block_tag_t *bbt; 2087 dmu_object_info_t doi; 2088 2089 ASSERT(od->od_object != 0); 2090 ASSERT(missing == 0); /* there should be no gaps */ 2091 2092 ztest_object_lock(zd, od->od_object, RL_READER); 2093 VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, 2094 od->od_object, FTAG, &db)); 2095 dmu_object_info_from_db(db, &doi); 2096 bbt = ztest_bt_bonus(db); 2097 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); 2098 od->od_type = doi.doi_type; 2099 od->od_blocksize = doi.doi_data_block_size; 2100 od->od_gen = bbt->bt_gen; 2101 dmu_buf_rele(db, FTAG); 2102 ztest_object_unlock(zd, od->od_object); 2103 } 2104 } 2105 2106 return (missing); 2107 } 2108 2109 static int 2110 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) 2111 { 2112 int missing = 0; 2113 2114 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2115 2116 for (int i = 0; i < count; i++, od++) { 2117 if (missing) { 2118 od->od_object = 0; 2119 missing++; 2120 continue; 2121 } 2122 2123 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2124 2125 lr->lr_doid = od->od_dir; 2126 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ 2127 lr->lrz_type = od->od_crtype; 2128 lr->lrz_blocksize = od->od_crblocksize; 2129 lr->lrz_ibshift = ztest_random_ibshift(); 2130 lr->lrz_bonustype = DMU_OT_UINT64_OTHER; 2131 lr->lrz_dnodesize = od->od_crdnodesize; 2132 lr->lr_gen = od->od_crgen; 2133 lr->lr_crtime[0] = time(NULL); 2134 2135 if (ztest_replay_create(zd, lr, B_FALSE) != 0) { 2136 ASSERT(missing == 0); 2137 od->od_object = 0; 2138 
missing++; 2139 } else { 2140 od->od_object = lr->lr_foid; 2141 od->od_type = od->od_crtype; 2142 od->od_blocksize = od->od_crblocksize; 2143 od->od_gen = od->od_crgen; 2144 ASSERT(od->od_object != 0); 2145 } 2146 2147 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2148 } 2149 2150 return (missing); 2151 } 2152 2153 static int 2154 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) 2155 { 2156 int missing = 0; 2157 int error; 2158 2159 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); 2160 2161 od += count - 1; 2162 2163 for (int i = count - 1; i >= 0; i--, od--) { 2164 if (missing) { 2165 missing++; 2166 continue; 2167 } 2168 2169 /* 2170 * No object was found. 2171 */ 2172 if (od->od_object == 0) 2173 continue; 2174 2175 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); 2176 2177 lr->lr_doid = od->od_dir; 2178 2179 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { 2180 ASSERT3U(error, ==, ENOSPC); 2181 missing++; 2182 } else { 2183 od->od_object = 0; 2184 } 2185 ztest_lr_free(lr, sizeof (*lr), od->od_name); 2186 } 2187 2188 return (missing); 2189 } 2190 2191 static int 2192 ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, 2193 void *data) 2194 { 2195 lr_write_t *lr; 2196 int error; 2197 2198 lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); 2199 2200 lr->lr_foid = object; 2201 lr->lr_offset = offset; 2202 lr->lr_length = size; 2203 lr->lr_blkoff = 0; 2204 BP_ZERO(&lr->lr_blkptr); 2205 2206 bcopy(data, lr + 1, size); 2207 2208 error = ztest_replay_write(zd, lr, B_FALSE); 2209 2210 ztest_lr_free(lr, sizeof (*lr) + size, NULL); 2211 2212 return (error); 2213 } 2214 2215 static int 2216 ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2217 { 2218 lr_truncate_t *lr; 2219 int error; 2220 2221 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2222 2223 lr->lr_foid = object; 2224 lr->lr_offset = offset; 2225 lr->lr_length = size; 2226 2227 error = ztest_replay_truncate(zd, lr, B_FALSE); 2228 2229 
ztest_lr_free(lr, sizeof (*lr), NULL); 2230 2231 return (error); 2232 } 2233 2234 static int 2235 ztest_setattr(ztest_ds_t *zd, uint64_t object) 2236 { 2237 lr_setattr_t *lr; 2238 int error; 2239 2240 lr = ztest_lr_alloc(sizeof (*lr), NULL); 2241 2242 lr->lr_foid = object; 2243 lr->lr_size = 0; 2244 lr->lr_mode = 0; 2245 2246 error = ztest_replay_setattr(zd, lr, B_FALSE); 2247 2248 ztest_lr_free(lr, sizeof (*lr), NULL); 2249 2250 return (error); 2251 } 2252 2253 static void 2254 ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) 2255 { 2256 objset_t *os = zd->zd_os; 2257 dmu_tx_t *tx; 2258 uint64_t txg; 2259 rl_t *rl; 2260 2261 txg_wait_synced(dmu_objset_pool(os), 0); 2262 2263 ztest_object_lock(zd, object, RL_READER); 2264 rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); 2265 2266 tx = dmu_tx_create(os); 2267 2268 dmu_tx_hold_write(tx, object, offset, size); 2269 2270 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 2271 2272 if (txg != 0) { 2273 dmu_prealloc(os, object, offset, size, tx); 2274 dmu_tx_commit(tx); 2275 txg_wait_synced(dmu_objset_pool(os), txg); 2276 } else { 2277 (void) dmu_free_long_range(os, object, offset, size); 2278 } 2279 2280 ztest_range_unlock(rl); 2281 ztest_object_unlock(zd, object); 2282 } 2283 2284 static void 2285 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) 2286 { 2287 int err; 2288 ztest_block_tag_t wbt; 2289 dmu_object_info_t doi; 2290 enum ztest_io_type io_type; 2291 uint64_t blocksize; 2292 void *data; 2293 2294 VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); 2295 blocksize = doi.doi_data_block_size; 2296 data = umem_alloc(blocksize, UMEM_NOFAIL); 2297 2298 /* 2299 * Pick an i/o type at random, biased toward writing block tags. 
2300 */ 2301 io_type = ztest_random(ZTEST_IO_TYPES); 2302 if (ztest_random(2) == 0) 2303 io_type = ZTEST_IO_WRITE_TAG; 2304 2305 rw_enter(&zd->zd_zilog_lock, RW_READER); 2306 2307 switch (io_type) { 2308 2309 case ZTEST_IO_WRITE_TAG: 2310 ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, 2311 offset, 0, 0, 0); 2312 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); 2313 break; 2314 2315 case ZTEST_IO_WRITE_PATTERN: 2316 (void) memset(data, 'a' + (object + offset) % 5, blocksize); 2317 if (ztest_random(2) == 0) { 2318 /* 2319 * Induce fletcher2 collisions to ensure that 2320 * zio_ddt_collision() detects and resolves them 2321 * when using fletcher2-verify for deduplication. 2322 */ 2323 ((uint64_t *)data)[0] ^= 1ULL << 63; 2324 ((uint64_t *)data)[4] ^= 1ULL << 63; 2325 } 2326 (void) ztest_write(zd, object, offset, blocksize, data); 2327 break; 2328 2329 case ZTEST_IO_WRITE_ZEROES: 2330 bzero(data, blocksize); 2331 (void) ztest_write(zd, object, offset, blocksize, data); 2332 break; 2333 2334 case ZTEST_IO_TRUNCATE: 2335 (void) ztest_truncate(zd, object, offset, blocksize); 2336 break; 2337 2338 case ZTEST_IO_SETATTR: 2339 (void) ztest_setattr(zd, object); 2340 break; 2341 2342 case ZTEST_IO_REWRITE: 2343 rw_enter(&ztest_name_lock, RW_READER); 2344 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2345 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), 2346 B_FALSE); 2347 VERIFY(err == 0 || err == ENOSPC); 2348 err = ztest_dsl_prop_set_uint64(zd->zd_name, 2349 ZFS_PROP_COMPRESSION, 2350 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), 2351 B_FALSE); 2352 VERIFY(err == 0 || err == ENOSPC); 2353 rw_exit(&ztest_name_lock); 2354 2355 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, 2356 DMU_READ_NO_PREFETCH)); 2357 2358 (void) ztest_write(zd, object, offset, blocksize, data); 2359 break; 2360 } 2361 2362 rw_exit(&zd->zd_zilog_lock); 2363 2364 umem_free(data, blocksize); 2365 } 2366 2367 /* 2368 * Initialize an object description template. 
2369 */ 2370 static void 2371 ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, 2372 dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, 2373 uint64_t gen) 2374 { 2375 od->od_dir = ZTEST_DIROBJ; 2376 od->od_object = 0; 2377 2378 od->od_crtype = type; 2379 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); 2380 od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); 2381 od->od_crgen = gen; 2382 2383 od->od_type = DMU_OT_NONE; 2384 od->od_blocksize = 0; 2385 od->od_gen = 0; 2386 2387 (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", 2388 tag, (int64_t)id, index); 2389 } 2390 2391 /* 2392 * Lookup or create the objects for a test using the od template. 2393 * If the objects do not all exist, or if 'remove' is specified, 2394 * remove any existing objects and create new ones. Otherwise, 2395 * use the existing objects. 2396 */ 2397 static int 2398 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) 2399 { 2400 int count = size / sizeof (*od); 2401 int rv = 0; 2402 2403 mutex_enter(&zd->zd_dirobj_lock); 2404 if ((ztest_lookup(zd, od, count) != 0 || remove) && 2405 (ztest_remove(zd, od, count) != 0 || 2406 ztest_create(zd, od, count) != 0)) 2407 rv = -1; 2408 zd->zd_od = od; 2409 mutex_exit(&zd->zd_dirobj_lock); 2410 2411 return (rv); 2412 } 2413 2414 /* ARGSUSED */ 2415 void 2416 ztest_zil_commit(ztest_ds_t *zd, uint64_t id) 2417 { 2418 zilog_t *zilog = zd->zd_zilog; 2419 2420 rw_enter(&zd->zd_zilog_lock, RW_READER); 2421 2422 zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); 2423 2424 /* 2425 * Remember the committed values in zd, which is in parent/child 2426 * shared memory. If we die, the next iteration of ztest_run() 2427 * will verify that the log really does contain this record. 
2428 */ 2429 mutex_enter(&zilog->zl_lock); 2430 ASSERT(zd->zd_shared != NULL); 2431 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); 2432 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; 2433 mutex_exit(&zilog->zl_lock); 2434 2435 rw_exit(&zd->zd_zilog_lock); 2436 } 2437 2438 /* 2439 * This function is designed to simulate the operations that occur during a 2440 * mount/unmount operation. We hold the dataset across these operations in an 2441 * attempt to expose any implicit assumptions about ZIL management. 2442 */ 2443 /* ARGSUSED */ 2444 void 2445 ztest_zil_remount(ztest_ds_t *zd, uint64_t id) 2446 { 2447 objset_t *os = zd->zd_os; 2448 2449 /* 2450 * We grab the zd_dirobj_lock to ensure that no other thread is 2451 * updating the zil (i.e. adding in-memory log records) and the 2452 * zd_zilog_lock to block any I/O. 2453 */ 2454 mutex_enter(&zd->zd_dirobj_lock); 2455 rw_enter(&zd->zd_zilog_lock, RW_WRITER); 2456 2457 /* zfsvfs_teardown() */ 2458 zil_close(zd->zd_zilog); 2459 2460 /* zfsvfs_setup() */ 2461 VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); 2462 zil_replay(os, zd, ztest_replay_vector); 2463 2464 rw_exit(&zd->zd_zilog_lock); 2465 mutex_exit(&zd->zd_dirobj_lock); 2466 } 2467 2468 /* 2469 * Verify that we can't destroy an active pool, create an existing pool, 2470 * or create a pool with a bad vdev spec. 2471 */ 2472 /* ARGSUSED */ 2473 void 2474 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) 2475 { 2476 ztest_shared_opts_t *zo = &ztest_opts; 2477 spa_t *spa; 2478 nvlist_t *nvroot; 2479 2480 /* 2481 * Attempt to create using a bad file. 2482 */ 2483 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); 2484 VERIFY3U(ENOENT, ==, 2485 spa_create("ztest_bad_file", nvroot, NULL, NULL)); 2486 nvlist_free(nvroot); 2487 2488 /* 2489 * Attempt to create using a bad mirror. 
2490 */ 2491 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1); 2492 VERIFY3U(ENOENT, ==, 2493 spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); 2494 nvlist_free(nvroot); 2495 2496 /* 2497 * Attempt to create an existing pool. It shouldn't matter 2498 * what's in the nvroot; we should fail with EEXIST. 2499 */ 2500 rw_enter(&ztest_name_lock, RW_READER); 2501 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); 2502 VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); 2503 nvlist_free(nvroot); 2504 VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); 2505 VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); 2506 spa_close(spa, FTAG); 2507 2508 rw_exit(&ztest_name_lock); 2509 } 2510 2511 /* ARGSUSED */ 2512 void 2513 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) 2514 { 2515 spa_t *spa; 2516 uint64_t initial_version = SPA_VERSION_INITIAL; 2517 uint64_t version, newversion; 2518 nvlist_t *nvroot, *props; 2519 char *name; 2520 2521 mutex_enter(&ztest_vdev_lock); 2522 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); 2523 2524 /* 2525 * Clean up from previous runs. 2526 */ 2527 (void) spa_destroy(name); 2528 2529 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, 2530 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); 2531 2532 /* 2533 * If we're configuring a RAIDZ device then make sure that the 2534 * the initial version is capable of supporting that feature. 2535 */ 2536 switch (ztest_opts.zo_raidz_parity) { 2537 case 0: 2538 case 1: 2539 initial_version = SPA_VERSION_INITIAL; 2540 break; 2541 case 2: 2542 initial_version = SPA_VERSION_RAIDZ2; 2543 break; 2544 case 3: 2545 initial_version = SPA_VERSION_RAIDZ3; 2546 break; 2547 } 2548 2549 /* 2550 * Create a pool with a spa version that can be upgraded. Pick 2551 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 
2552 */ 2553 do { 2554 version = ztest_random_spa_version(initial_version); 2555 } while (version > SPA_VERSION_BEFORE_FEATURES); 2556 2557 props = fnvlist_alloc(); 2558 fnvlist_add_uint64(props, 2559 zpool_prop_to_name(ZPOOL_PROP_VERSION), version); 2560 VERIFY0(spa_create(name, nvroot, props, NULL)); 2561 fnvlist_free(nvroot); 2562 fnvlist_free(props); 2563 2564 VERIFY0(spa_open(name, &spa, FTAG)); 2565 VERIFY3U(spa_version(spa), ==, version); 2566 newversion = ztest_random_spa_version(version + 1); 2567 2568 if (ztest_opts.zo_verbose >= 4) { 2569 (void) printf("upgrading spa version from %llu to %llu\n", 2570 (u_longlong_t)version, (u_longlong_t)newversion); 2571 } 2572 2573 spa_upgrade(spa, newversion); 2574 VERIFY3U(spa_version(spa), >, version); 2575 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, 2576 zpool_prop_to_name(ZPOOL_PROP_VERSION))); 2577 spa_close(spa, FTAG); 2578 2579 strfree(name); 2580 mutex_exit(&ztest_vdev_lock); 2581 } 2582 2583 static void 2584 ztest_spa_checkpoint(spa_t *spa) 2585 { 2586 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 2587 2588 int error = spa_checkpoint(spa->spa_name); 2589 2590 switch (error) { 2591 case 0: 2592 case ZFS_ERR_DEVRM_IN_PROGRESS: 2593 case ZFS_ERR_DISCARDING_CHECKPOINT: 2594 case ZFS_ERR_CHECKPOINT_EXISTS: 2595 break; 2596 case ENOSPC: 2597 ztest_record_enospc(FTAG); 2598 break; 2599 default: 2600 fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); 2601 } 2602 } 2603 2604 static void 2605 ztest_spa_discard_checkpoint(spa_t *spa) 2606 { 2607 ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); 2608 2609 int error = spa_checkpoint_discard(spa->spa_name); 2610 2611 switch (error) { 2612 case 0: 2613 case ZFS_ERR_DISCARDING_CHECKPOINT: 2614 case ZFS_ERR_NO_CHECKPOINT: 2615 break; 2616 default: 2617 fatal(0, "spa_discard_checkpoint(%s) = %d", 2618 spa->spa_name, error); 2619 } 2620 2621 } 2622 2623 /* ARGSUSED */ 2624 void 2625 ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, uint64_t id) 2626 
{ 2627 spa_t *spa = ztest_spa; 2628 2629 mutex_enter(&ztest_checkpoint_lock); 2630 if (ztest_random(2) == 0) { 2631 ztest_spa_checkpoint(spa); 2632 } else { 2633 ztest_spa_discard_checkpoint(spa); 2634 } 2635 mutex_exit(&ztest_checkpoint_lock); 2636 } 2637 2638 2639 static vdev_t * 2640 vdev_lookup_by_path(vdev_t *vd, const char *path) 2641 { 2642 vdev_t *mvd; 2643 2644 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) 2645 return (vd); 2646 2647 for (int c = 0; c < vd->vdev_children; c++) 2648 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != 2649 NULL) 2650 return (mvd); 2651 2652 return (NULL); 2653 } 2654 2655 /* 2656 * Find the first available hole which can be used as a top-level. 2657 */ 2658 int 2659 find_vdev_hole(spa_t *spa) 2660 { 2661 vdev_t *rvd = spa->spa_root_vdev; 2662 int c; 2663 2664 ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); 2665 2666 for (c = 0; c < rvd->vdev_children; c++) { 2667 vdev_t *cvd = rvd->vdev_child[c]; 2668 2669 if (cvd->vdev_ishole) 2670 break; 2671 } 2672 return (c); 2673 } 2674 2675 /* 2676 * Verify that vdev_add() works as expected. 2677 */ 2678 /* ARGSUSED */ 2679 void 2680 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) 2681 { 2682 ztest_shared_t *zs = ztest_shared; 2683 spa_t *spa = ztest_spa; 2684 uint64_t leaves; 2685 uint64_t guid; 2686 nvlist_t *nvroot; 2687 int error; 2688 2689 mutex_enter(&ztest_vdev_lock); 2690 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; 2691 2692 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2693 2694 ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; 2695 2696 /* 2697 * If we have slogs then remove them 1/4 of the time. 2698 */ 2699 if (spa_has_slogs(spa) && ztest_random(4) == 0) { 2700 /* 2701 * Grab the guid from the head of the log class rotor. 
2702 */ 2703 guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; 2704 2705 spa_config_exit(spa, SCL_VDEV, FTAG); 2706 2707 /* 2708 * We have to grab the zs_name_lock as writer to 2709 * prevent a race between removing a slog (dmu_objset_find) 2710 * and destroying a dataset. Removing the slog will 2711 * grab a reference on the dataset which may cause 2712 * dmu_objset_destroy() to fail with EBUSY thus 2713 * leaving the dataset in an inconsistent state. 2714 */ 2715 rw_enter(&ztest_name_lock, RW_WRITER); 2716 error = spa_vdev_remove(spa, guid, B_FALSE); 2717 rw_exit(&ztest_name_lock); 2718 2719 switch (error) { 2720 case 0: 2721 case EEXIST: 2722 case ZFS_ERR_CHECKPOINT_EXISTS: 2723 case ZFS_ERR_DISCARDING_CHECKPOINT: 2724 break; 2725 default: 2726 fatal(0, "spa_vdev_remove() = %d", error); 2727 } 2728 } else { 2729 spa_config_exit(spa, SCL_VDEV, FTAG); 2730 2731 /* 2732 * Make 1/4 of the devices be log devices. 2733 */ 2734 nvroot = make_vdev_root(NULL, NULL, NULL, 2735 ztest_opts.zo_vdev_size, 0, 2736 ztest_random(4) == 0, ztest_opts.zo_raidz, 2737 zs->zs_mirrors, 1); 2738 2739 error = spa_vdev_add(spa, nvroot); 2740 nvlist_free(nvroot); 2741 2742 switch (error) { 2743 case 0: 2744 break; 2745 case ENOSPC: 2746 ztest_record_enospc("spa_vdev_add"); 2747 break; 2748 default: 2749 fatal(0, "spa_vdev_add() = %d", error); 2750 } 2751 } 2752 2753 mutex_exit(&ztest_vdev_lock); 2754 } 2755 2756 /* 2757 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. 
2758 */ 2759 /* ARGSUSED */ 2760 void 2761 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) 2762 { 2763 ztest_shared_t *zs = ztest_shared; 2764 spa_t *spa = ztest_spa; 2765 vdev_t *rvd = spa->spa_root_vdev; 2766 spa_aux_vdev_t *sav; 2767 char *aux; 2768 uint64_t guid = 0; 2769 int error; 2770 2771 if (ztest_random(2) == 0) { 2772 sav = &spa->spa_spares; 2773 aux = ZPOOL_CONFIG_SPARES; 2774 } else { 2775 sav = &spa->spa_l2cache; 2776 aux = ZPOOL_CONFIG_L2CACHE; 2777 } 2778 2779 mutex_enter(&ztest_vdev_lock); 2780 2781 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2782 2783 if (sav->sav_count != 0 && ztest_random(4) == 0) { 2784 /* 2785 * Pick a random device to remove. 2786 */ 2787 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; 2788 } else { 2789 /* 2790 * Find an unused device we can add. 2791 */ 2792 zs->zs_vdev_aux = 0; 2793 for (;;) { 2794 char path[MAXPATHLEN]; 2795 int c; 2796 (void) snprintf(path, sizeof (path), ztest_aux_template, 2797 ztest_opts.zo_dir, ztest_opts.zo_pool, aux, 2798 zs->zs_vdev_aux); 2799 for (c = 0; c < sav->sav_count; c++) 2800 if (strcmp(sav->sav_vdevs[c]->vdev_path, 2801 path) == 0) 2802 break; 2803 if (c == sav->sav_count && 2804 vdev_lookup_by_path(rvd, path) == NULL) 2805 break; 2806 zs->zs_vdev_aux++; 2807 } 2808 } 2809 2810 spa_config_exit(spa, SCL_VDEV, FTAG); 2811 2812 if (guid == 0) { 2813 /* 2814 * Add a new device. 2815 */ 2816 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, 2817 (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1); 2818 error = spa_vdev_add(spa, nvroot); 2819 2820 switch (error) { 2821 case 0: 2822 break; 2823 default: 2824 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); 2825 } 2826 nvlist_free(nvroot); 2827 } else { 2828 /* 2829 * Remove an existing device. Sometimes, dirty its 2830 * vdev state first to make sure we handle removal 2831 * of devices that have pending state changes. 
2832 */ 2833 if (ztest_random(2) == 0) 2834 (void) vdev_online(spa, guid, 0, NULL); 2835 2836 error = spa_vdev_remove(spa, guid, B_FALSE); 2837 2838 switch (error) { 2839 case 0: 2840 case EBUSY: 2841 case ZFS_ERR_CHECKPOINT_EXISTS: 2842 case ZFS_ERR_DISCARDING_CHECKPOINT: 2843 break; 2844 default: 2845 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); 2846 } 2847 } 2848 2849 mutex_exit(&ztest_vdev_lock); 2850 } 2851 2852 /* 2853 * split a pool if it has mirror tlvdevs 2854 */ 2855 /* ARGSUSED */ 2856 void 2857 ztest_split_pool(ztest_ds_t *zd, uint64_t id) 2858 { 2859 ztest_shared_t *zs = ztest_shared; 2860 spa_t *spa = ztest_spa; 2861 vdev_t *rvd = spa->spa_root_vdev; 2862 nvlist_t *tree, **child, *config, *split, **schild; 2863 uint_t c, children, schildren = 0, lastlogid = 0; 2864 int error = 0; 2865 2866 mutex_enter(&ztest_vdev_lock); 2867 2868 /* ensure we have a useable config; mirrors of raidz aren't supported */ 2869 if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { 2870 mutex_exit(&ztest_vdev_lock); 2871 return; 2872 } 2873 2874 /* clean up the old pool, if any */ 2875 (void) spa_destroy("splitp"); 2876 2877 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2878 2879 /* generate a config from the existing config */ 2880 mutex_enter(&spa->spa_props_lock); 2881 VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, 2882 &tree) == 0); 2883 mutex_exit(&spa->spa_props_lock); 2884 2885 VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, 2886 &children) == 0); 2887 2888 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); 2889 for (c = 0; c < children; c++) { 2890 vdev_t *tvd = rvd->vdev_child[c]; 2891 nvlist_t **mchild; 2892 uint_t mchildren; 2893 2894 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { 2895 VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, 2896 0) == 0); 2897 VERIFY(nvlist_add_string(schild[schildren], 2898 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); 2899 
VERIFY(nvlist_add_uint64(schild[schildren], 2900 ZPOOL_CONFIG_IS_HOLE, 1) == 0); 2901 if (lastlogid == 0) 2902 lastlogid = schildren; 2903 ++schildren; 2904 continue; 2905 } 2906 lastlogid = 0; 2907 VERIFY(nvlist_lookup_nvlist_array(child[c], 2908 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); 2909 VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); 2910 } 2911 2912 /* OK, create a config that can be used to split */ 2913 VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); 2914 VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, 2915 VDEV_TYPE_ROOT) == 0); 2916 VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, 2917 lastlogid != 0 ? lastlogid : schildren) == 0); 2918 2919 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); 2920 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); 2921 2922 for (c = 0; c < schildren; c++) 2923 nvlist_free(schild[c]); 2924 free(schild); 2925 nvlist_free(split); 2926 2927 spa_config_exit(spa, SCL_VDEV, FTAG); 2928 2929 rw_enter(&ztest_name_lock, RW_WRITER); 2930 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); 2931 rw_exit(&ztest_name_lock); 2932 2933 nvlist_free(config); 2934 2935 if (error == 0) { 2936 (void) printf("successful split - results:\n"); 2937 mutex_enter(&spa_namespace_lock); 2938 show_pool_stats(spa); 2939 show_pool_stats(spa_lookup("splitp")); 2940 mutex_exit(&spa_namespace_lock); 2941 ++zs->zs_splits; 2942 --zs->zs_mirrors; 2943 } 2944 mutex_exit(&ztest_vdev_lock); 2945 } 2946 2947 /* 2948 * Verify that we can attach and detach devices. 
2949 */ 2950 /* ARGSUSED */ 2951 void 2952 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) 2953 { 2954 ztest_shared_t *zs = ztest_shared; 2955 spa_t *spa = ztest_spa; 2956 spa_aux_vdev_t *sav = &spa->spa_spares; 2957 vdev_t *rvd = spa->spa_root_vdev; 2958 vdev_t *oldvd, *newvd, *pvd; 2959 nvlist_t *root; 2960 uint64_t leaves; 2961 uint64_t leaf, top; 2962 uint64_t ashift = ztest_get_ashift(); 2963 uint64_t oldguid, pguid; 2964 uint64_t oldsize, newsize; 2965 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; 2966 int replacing; 2967 int oldvd_has_siblings = B_FALSE; 2968 int newvd_is_spare = B_FALSE; 2969 int oldvd_is_log; 2970 int error, expected_error; 2971 2972 mutex_enter(&ztest_vdev_lock); 2973 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; 2974 2975 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2976 2977 /* 2978 * If a vdev is in the process of being removed, its removal may 2979 * finish while we are in progress, leading to an unexpected error 2980 * value. Don't bother trying to attach while we are in the middle 2981 * of removal. 2982 */ 2983 if (ztest_device_removal_active) { 2984 spa_config_exit(spa, SCL_ALL, FTAG); 2985 mutex_exit(&ztest_vdev_lock); 2986 return; 2987 } 2988 2989 /* 2990 * Decide whether to do an attach or a replace. 2991 */ 2992 replacing = ztest_random(2); 2993 2994 /* 2995 * Pick a random top-level vdev. 2996 */ 2997 top = ztest_random_vdev_top(spa, B_TRUE); 2998 2999 /* 3000 * Pick a random leaf within it. 3001 */ 3002 leaf = ztest_random(leaves); 3003 3004 /* 3005 * Locate this vdev. 
3006 */ 3007 oldvd = rvd->vdev_child[top]; 3008 if (zs->zs_mirrors >= 1) { 3009 ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); 3010 ASSERT(oldvd->vdev_children >= zs->zs_mirrors); 3011 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; 3012 } 3013 if (ztest_opts.zo_raidz > 1) { 3014 ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); 3015 ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); 3016 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; 3017 } 3018 3019 /* 3020 * If we're already doing an attach or replace, oldvd may be a 3021 * mirror vdev -- in which case, pick a random child. 3022 */ 3023 while (oldvd->vdev_children != 0) { 3024 oldvd_has_siblings = B_TRUE; 3025 ASSERT(oldvd->vdev_children >= 2); 3026 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; 3027 } 3028 3029 oldguid = oldvd->vdev_guid; 3030 oldsize = vdev_get_min_asize(oldvd); 3031 oldvd_is_log = oldvd->vdev_top->vdev_islog; 3032 (void) strcpy(oldpath, oldvd->vdev_path); 3033 pvd = oldvd->vdev_parent; 3034 pguid = pvd->vdev_guid; 3035 3036 /* 3037 * If oldvd has siblings, then half of the time, detach it. 3038 */ 3039 if (oldvd_has_siblings && ztest_random(2) == 0) { 3040 spa_config_exit(spa, SCL_ALL, FTAG); 3041 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); 3042 if (error != 0 && error != ENODEV && error != EBUSY && 3043 error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && 3044 error != ZFS_ERR_DISCARDING_CHECKPOINT) 3045 fatal(0, "detach (%s) returned %d", oldpath, error); 3046 mutex_exit(&ztest_vdev_lock); 3047 return; 3048 } 3049 3050 /* 3051 * For the new vdev, choose with equal probability between the two 3052 * standard paths (ending in either 'a' or 'b') or a random hot spare. 
3053 */ 3054 if (sav->sav_count != 0 && ztest_random(3) == 0) { 3055 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; 3056 newvd_is_spare = B_TRUE; 3057 (void) strcpy(newpath, newvd->vdev_path); 3058 } else { 3059 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, 3060 ztest_opts.zo_dir, ztest_opts.zo_pool, 3061 top * leaves + leaf); 3062 if (ztest_random(2) == 0) 3063 newpath[strlen(newpath) - 1] = 'b'; 3064 newvd = vdev_lookup_by_path(rvd, newpath); 3065 } 3066 3067 if (newvd) { 3068 /* 3069 * Reopen to ensure the vdev's asize field isn't stale. 3070 */ 3071 vdev_reopen(newvd); 3072 newsize = vdev_get_min_asize(newvd); 3073 } else { 3074 /* 3075 * Make newsize a little bigger or smaller than oldsize. 3076 * If it's smaller, the attach should fail. 3077 * If it's larger, and we're doing a replace, 3078 * we should get dynamic LUN growth when we're done. 3079 */ 3080 newsize = 10 * oldsize / (9 + ztest_random(3)); 3081 } 3082 3083 /* 3084 * If pvd is not a mirror or root, the attach should fail with ENOTSUP, 3085 * unless it's a replace; in that case any non-replacing parent is OK. 3086 * 3087 * If newvd is already part of the pool, it should fail with EBUSY. 3088 * 3089 * If newvd is too small, it should fail with EOVERFLOW. 3090 */ 3091 if (pvd->vdev_ops != &vdev_mirror_ops && 3092 pvd->vdev_ops != &vdev_root_ops && (!replacing || 3093 pvd->vdev_ops == &vdev_replacing_ops || 3094 pvd->vdev_ops == &vdev_spare_ops)) 3095 expected_error = ENOTSUP; 3096 else if (newvd_is_spare && (!replacing || oldvd_is_log)) 3097 expected_error = ENOTSUP; 3098 else if (newvd == oldvd) 3099 expected_error = replacing ? 
0 : EBUSY; 3100 else if (vdev_lookup_by_path(rvd, newpath) != NULL) 3101 expected_error = EBUSY; 3102 else if (newsize < oldsize) 3103 expected_error = EOVERFLOW; 3104 else if (ashift > oldvd->vdev_top->vdev_ashift) 3105 expected_error = EDOM; 3106 else 3107 expected_error = 0; 3108 3109 spa_config_exit(spa, SCL_ALL, FTAG); 3110 3111 /* 3112 * Build the nvlist describing newpath. 3113 */ 3114 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, 3115 ashift, 0, 0, 0, 1); 3116 3117 error = spa_vdev_attach(spa, oldguid, root, replacing); 3118 3119 nvlist_free(root); 3120 3121 /* 3122 * If our parent was the replacing vdev, but the replace completed, 3123 * then instead of failing with ENOTSUP we may either succeed, 3124 * fail with ENODEV, or fail with EOVERFLOW. 3125 */ 3126 if (expected_error == ENOTSUP && 3127 (error == 0 || error == ENODEV || error == EOVERFLOW)) 3128 expected_error = error; 3129 3130 /* 3131 * If someone grew the LUN, the replacement may be too small. 3132 */ 3133 if (error == EOVERFLOW || error == EBUSY) 3134 expected_error = error; 3135 3136 if (error == ZFS_ERR_CHECKPOINT_EXISTS || 3137 error == ZFS_ERR_DISCARDING_CHECKPOINT) 3138 expected_error = error; 3139 3140 /* XXX workaround 6690467 */ 3141 if (error != expected_error && expected_error != EBUSY) { 3142 fatal(0, "attach (%s %llu, %s %llu, %d) " 3143 "returned %d, expected %d", 3144 oldpath, oldsize, newpath, 3145 newsize, replacing, error, expected_error); 3146 } 3147 3148 mutex_exit(&ztest_vdev_lock); 3149 } 3150 3151 /* ARGSUSED */ 3152 void 3153 ztest_device_removal(ztest_ds_t *zd, uint64_t id) 3154 { 3155 spa_t *spa = ztest_spa; 3156 vdev_t *vd; 3157 uint64_t guid; 3158 int error; 3159 3160 mutex_enter(&ztest_vdev_lock); 3161 3162 if (ztest_device_removal_active) { 3163 mutex_exit(&ztest_vdev_lock); 3164 return; 3165 } 3166 3167 /* 3168 * Remove a random top-level vdev and wait for removal to finish. 
3169 */ 3170 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3171 vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE)); 3172 guid = vd->vdev_guid; 3173 spa_config_exit(spa, SCL_VDEV, FTAG); 3174 3175 error = spa_vdev_remove(spa, guid, B_FALSE); 3176 if (error == 0) { 3177 ztest_device_removal_active = B_TRUE; 3178 mutex_exit(&ztest_vdev_lock); 3179 3180 while (spa->spa_vdev_removal != NULL) 3181 txg_wait_synced(spa_get_dsl(spa), 0); 3182 } else { 3183 mutex_exit(&ztest_vdev_lock); 3184 return; 3185 } 3186 3187 /* 3188 * The pool needs to be scrubbed after completing device removal. 3189 * Failure to do so may result in checksum errors due to the 3190 * strategy employed by ztest_fault_inject() when selecting which 3191 * offset are redundant and can be damaged. 3192 */ 3193 error = spa_scan(spa, POOL_SCAN_SCRUB); 3194 if (error == 0) { 3195 while (dsl_scan_scrubbing(spa_get_dsl(spa))) 3196 txg_wait_synced(spa_get_dsl(spa), 0); 3197 } 3198 3199 mutex_enter(&ztest_vdev_lock); 3200 ztest_device_removal_active = B_FALSE; 3201 mutex_exit(&ztest_vdev_lock); 3202 } 3203 3204 /* 3205 * Callback function which expands the physical size of the vdev. 3206 */ 3207 vdev_t * 3208 grow_vdev(vdev_t *vd, void *arg) 3209 { 3210 spa_t *spa = vd->vdev_spa; 3211 size_t *newsize = arg; 3212 size_t fsize; 3213 int fd; 3214 3215 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 3216 ASSERT(vd->vdev_ops->vdev_op_leaf); 3217 3218 if ((fd = open(vd->vdev_path, O_RDWR)) == -1) 3219 return (vd); 3220 3221 fsize = lseek(fd, 0, SEEK_END); 3222 (void) ftruncate(fd, *newsize); 3223 3224 if (ztest_opts.zo_verbose >= 6) { 3225 (void) printf("%s grew from %lu to %lu bytes\n", 3226 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); 3227 } 3228 (void) close(fd); 3229 return (NULL); 3230 } 3231 3232 /* 3233 * Callback function which expands a given vdev by calling vdev_online(). 
3234 */ 3235 /* ARGSUSED */ 3236 vdev_t * 3237 online_vdev(vdev_t *vd, void *arg) 3238 { 3239 spa_t *spa = vd->vdev_spa; 3240 vdev_t *tvd = vd->vdev_top; 3241 uint64_t guid = vd->vdev_guid; 3242 uint64_t generation = spa->spa_config_generation + 1; 3243 vdev_state_t newstate = VDEV_STATE_UNKNOWN; 3244 int error; 3245 3246 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); 3247 ASSERT(vd->vdev_ops->vdev_op_leaf); 3248 3249 /* Calling vdev_online will initialize the new metaslabs */ 3250 spa_config_exit(spa, SCL_STATE, spa); 3251 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); 3252 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3253 3254 /* 3255 * If vdev_online returned an error or the underlying vdev_open 3256 * failed then we abort the expand. The only way to know that 3257 * vdev_open fails is by checking the returned newstate. 3258 */ 3259 if (error || newstate != VDEV_STATE_HEALTHY) { 3260 if (ztest_opts.zo_verbose >= 5) { 3261 (void) printf("Unable to expand vdev, state %llu, " 3262 "error %d\n", (u_longlong_t)newstate, error); 3263 } 3264 return (vd); 3265 } 3266 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); 3267 3268 /* 3269 * Since we dropped the lock we need to ensure that we're 3270 * still talking to the original vdev. It's possible this 3271 * vdev may have been detached/replaced while we were 3272 * trying to online it. 3273 */ 3274 if (generation != spa->spa_config_generation) { 3275 if (ztest_opts.zo_verbose >= 5) { 3276 (void) printf("vdev configuration has changed, " 3277 "guid %llu, state %llu, expected gen %llu, " 3278 "got gen %llu\n", 3279 (u_longlong_t)guid, 3280 (u_longlong_t)tvd->vdev_state, 3281 (u_longlong_t)generation, 3282 (u_longlong_t)spa->spa_config_generation); 3283 } 3284 return (vd); 3285 } 3286 return (NULL); 3287 } 3288 3289 /* 3290 * Traverse the vdev tree calling the supplied function. 
3291 * We continue to walk the tree until we either have walked all 3292 * children or we receive a non-NULL return from the callback. 3293 * If a NULL callback is passed, then we just return back the first 3294 * leaf vdev we encounter. 3295 */ 3296 vdev_t * 3297 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) 3298 { 3299 if (vd->vdev_ops->vdev_op_leaf) { 3300 if (func == NULL) 3301 return (vd); 3302 else 3303 return (func(vd, arg)); 3304 } 3305 3306 for (uint_t c = 0; c < vd->vdev_children; c++) { 3307 vdev_t *cvd = vd->vdev_child[c]; 3308 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) 3309 return (cvd); 3310 } 3311 return (NULL); 3312 } 3313 3314 /* 3315 * Verify that dynamic LUN growth works as expected. 3316 */ 3317 /* ARGSUSED */ 3318 void 3319 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) 3320 { 3321 spa_t *spa = ztest_spa; 3322 vdev_t *vd, *tvd; 3323 metaslab_class_t *mc; 3324 metaslab_group_t *mg; 3325 size_t psize, newsize; 3326 uint64_t top; 3327 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; 3328 3329 mutex_enter(&ztest_checkpoint_lock); 3330 mutex_enter(&ztest_vdev_lock); 3331 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3332 3333 /* 3334 * If there is a vdev removal in progress, it could complete while 3335 * we are running, in which case we would not be able to verify 3336 * that the metaslab_class space increased (because it decreases 3337 * when the device removal completes). 
3338 */ 3339 if (ztest_device_removal_active) { 3340 spa_config_exit(spa, SCL_STATE, spa); 3341 mutex_exit(&ztest_vdev_lock); 3342 mutex_exit(&ztest_checkpoint_lock); 3343 return; 3344 } 3345 3346 top = ztest_random_vdev_top(spa, B_TRUE); 3347 3348 tvd = spa->spa_root_vdev->vdev_child[top]; 3349 mg = tvd->vdev_mg; 3350 mc = mg->mg_class; 3351 old_ms_count = tvd->vdev_ms_count; 3352 old_class_space = metaslab_class_get_space(mc); 3353 3354 /* 3355 * Determine the size of the first leaf vdev associated with 3356 * our top-level device. 3357 */ 3358 vd = vdev_walk_tree(tvd, NULL, NULL); 3359 ASSERT3P(vd, !=, NULL); 3360 ASSERT(vd->vdev_ops->vdev_op_leaf); 3361 3362 psize = vd->vdev_psize; 3363 3364 /* 3365 * We only try to expand the vdev if it's healthy, less than 4x its 3366 * original size, and it has a valid psize. 3367 */ 3368 if (tvd->vdev_state != VDEV_STATE_HEALTHY || 3369 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { 3370 spa_config_exit(spa, SCL_STATE, spa); 3371 mutex_exit(&ztest_vdev_lock); 3372 mutex_exit(&ztest_checkpoint_lock); 3373 return; 3374 } 3375 ASSERT(psize > 0); 3376 newsize = psize + psize / 8; 3377 ASSERT3U(newsize, >, psize); 3378 3379 if (ztest_opts.zo_verbose >= 6) { 3380 (void) printf("Expanding LUN %s from %lu to %lu\n", 3381 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); 3382 } 3383 3384 /* 3385 * Growing the vdev is a two step process: 3386 * 1). expand the physical size (i.e. relabel) 3387 * 2). 
online the vdev to create the new metaslabs 3388 */ 3389 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || 3390 vdev_walk_tree(tvd, online_vdev, NULL) != NULL || 3391 tvd->vdev_state != VDEV_STATE_HEALTHY) { 3392 if (ztest_opts.zo_verbose >= 5) { 3393 (void) printf("Could not expand LUN because " 3394 "the vdev configuration changed.\n"); 3395 } 3396 spa_config_exit(spa, SCL_STATE, spa); 3397 mutex_exit(&ztest_vdev_lock); 3398 mutex_exit(&ztest_checkpoint_lock); 3399 return; 3400 } 3401 3402 spa_config_exit(spa, SCL_STATE, spa); 3403 3404 /* 3405 * Expanding the LUN will update the config asynchronously, 3406 * thus we must wait for the async thread to complete any 3407 * pending tasks before proceeding. 3408 */ 3409 for (;;) { 3410 boolean_t done; 3411 mutex_enter(&spa->spa_async_lock); 3412 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); 3413 mutex_exit(&spa->spa_async_lock); 3414 if (done) 3415 break; 3416 txg_wait_synced(spa_get_dsl(spa), 0); 3417 (void) poll(NULL, 0, 100); 3418 } 3419 3420 spa_config_enter(spa, SCL_STATE, spa, RW_READER); 3421 3422 tvd = spa->spa_root_vdev->vdev_child[top]; 3423 new_ms_count = tvd->vdev_ms_count; 3424 new_class_space = metaslab_class_get_space(mc); 3425 3426 if (tvd->vdev_mg != mg || mg->mg_class != mc) { 3427 if (ztest_opts.zo_verbose >= 5) { 3428 (void) printf("Could not verify LUN expansion due to " 3429 "intervening vdev offline or remove.\n"); 3430 } 3431 spa_config_exit(spa, SCL_STATE, spa); 3432 mutex_exit(&ztest_vdev_lock); 3433 mutex_exit(&ztest_checkpoint_lock); 3434 return; 3435 } 3436 3437 /* 3438 * Make sure we were able to grow the vdev. 3439 */ 3440 if (new_ms_count <= old_ms_count) { 3441 fatal(0, "LUN expansion failed: ms_count %llu < %llu\n", 3442 old_ms_count, new_ms_count); 3443 } 3444 3445 /* 3446 * Make sure we were able to grow the pool. 
3447 */ 3448 if (new_class_space <= old_class_space) { 3449 fatal(0, "LUN expansion failed: class_space %llu < %llu\n", 3450 old_class_space, new_class_space); 3451 } 3452 3453 if (ztest_opts.zo_verbose >= 5) { 3454 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; 3455 3456 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); 3457 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); 3458 (void) printf("%s grew from %s to %s\n", 3459 spa->spa_name, oldnumbuf, newnumbuf); 3460 } 3461 3462 spa_config_exit(spa, SCL_STATE, spa); 3463 mutex_exit(&ztest_vdev_lock); 3464 mutex_exit(&ztest_checkpoint_lock); 3465 } 3466 3467 /* 3468 * Verify that dmu_objset_{create,destroy,open,close} work as expected. 3469 */ 3470 /* ARGSUSED */ 3471 static void 3472 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) 3473 { 3474 /* 3475 * Create the objects common to all ztest datasets. 3476 */ 3477 VERIFY(zap_create_claim(os, ZTEST_DIROBJ, 3478 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); 3479 } 3480 3481 static int 3482 ztest_dataset_create(char *dsname) 3483 { 3484 uint64_t zilset = ztest_random(100); 3485 int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, 3486 ztest_objset_create_cb, NULL); 3487 3488 if (err || zilset < 80) 3489 return (err); 3490 3491 if (ztest_opts.zo_verbose >= 6) 3492 (void) printf("Setting dataset %s to sync always\n", dsname); 3493 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, 3494 ZFS_SYNC_ALWAYS, B_FALSE)); 3495 } 3496 3497 /* ARGSUSED */ 3498 static int 3499 ztest_objset_destroy_cb(const char *name, void *arg) 3500 { 3501 objset_t *os; 3502 dmu_object_info_t doi; 3503 int error; 3504 3505 /* 3506 * Verify that the dataset contains a directory object. 
3507 */ 3508 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os)); 3509 error = dmu_object_info(os, ZTEST_DIROBJ, &doi); 3510 if (error != ENOENT) { 3511 /* We could have crashed in the middle of destroying it */ 3512 ASSERT0(error); 3513 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); 3514 ASSERT3S(doi.doi_physical_blocks_512, >=, 0); 3515 } 3516 dmu_objset_disown(os, FTAG); 3517 3518 /* 3519 * Destroy the dataset. 3520 */ 3521 if (strchr(name, '@') != NULL) { 3522 VERIFY0(dsl_destroy_snapshot(name, B_TRUE)); 3523 } else { 3524 error = dsl_destroy_head(name); 3525 /* There could be a hold on this dataset */ 3526 if (error != EBUSY) 3527 ASSERT0(error); 3528 } 3529 return (0); 3530 } 3531 3532 static boolean_t 3533 ztest_snapshot_create(char *osname, uint64_t id) 3534 { 3535 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 3536 int error; 3537 3538 (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); 3539 3540 error = dmu_objset_snapshot_one(osname, snapname); 3541 if (error == ENOSPC) { 3542 ztest_record_enospc(FTAG); 3543 return (B_FALSE); 3544 } 3545 if (error != 0 && error != EEXIST) { 3546 fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, 3547 snapname, error); 3548 } 3549 return (B_TRUE); 3550 } 3551 3552 static boolean_t 3553 ztest_snapshot_destroy(char *osname, uint64_t id) 3554 { 3555 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 3556 int error; 3557 3558 (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, 3559 (u_longlong_t)id); 3560 3561 error = dsl_destroy_snapshot(snapname, B_FALSE); 3562 if (error != 0 && error != ENOENT) 3563 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); 3564 return (B_TRUE); 3565 } 3566 3567 /* ARGSUSED */ 3568 void 3569 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) 3570 { 3571 ztest_ds_t zdtmp; 3572 int iters; 3573 int error; 3574 objset_t *os, *os2; 3575 char name[ZFS_MAX_DATASET_NAME_LEN]; 3576 zilog_t *zilog; 3577 3578 rw_enter(&ztest_name_lock, RW_READER); 3579 3580 
(void) snprintf(name, sizeof (name), "%s/temp_%llu", 3581 ztest_opts.zo_pool, (u_longlong_t)id); 3582 3583 /* 3584 * If this dataset exists from a previous run, process its replay log 3585 * half of the time. If we don't replay it, then dmu_objset_destroy() 3586 * (invoked from ztest_objset_destroy_cb()) should just throw it away. 3587 */ 3588 if (ztest_random(2) == 0 && 3589 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { 3590 ztest_zd_init(&zdtmp, NULL, os); 3591 zil_replay(os, &zdtmp, ztest_replay_vector); 3592 ztest_zd_fini(&zdtmp); 3593 dmu_objset_disown(os, FTAG); 3594 } 3595 3596 /* 3597 * There may be an old instance of the dataset we're about to 3598 * create lying around from a previous run. If so, destroy it 3599 * and all of its snapshots. 3600 */ 3601 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 3602 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 3603 3604 /* 3605 * Verify that the destroyed dataset is no longer in the namespace. 3606 */ 3607 VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, 3608 FTAG, &os)); 3609 3610 /* 3611 * Verify that we can create a new dataset. 3612 */ 3613 error = ztest_dataset_create(name); 3614 if (error) { 3615 if (error == ENOSPC) { 3616 ztest_record_enospc(FTAG); 3617 rw_exit(&ztest_name_lock); 3618 return; 3619 } 3620 fatal(0, "dmu_objset_create(%s) = %d", name, error); 3621 } 3622 3623 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); 3624 3625 ztest_zd_init(&zdtmp, NULL, os); 3626 3627 /* 3628 * Open the intent log for it. 3629 */ 3630 zilog = zil_open(os, ztest_get_data); 3631 3632 /* 3633 * Put some objects in there, do a little I/O to them, 3634 * and randomly take a couple of snapshots along the way. 
3635 */ 3636 iters = ztest_random(5); 3637 for (int i = 0; i < iters; i++) { 3638 ztest_dmu_object_alloc_free(&zdtmp, id); 3639 if (ztest_random(iters) == 0) 3640 (void) ztest_snapshot_create(name, i); 3641 } 3642 3643 /* 3644 * Verify that we cannot create an existing dataset. 3645 */ 3646 VERIFY3U(EEXIST, ==, 3647 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL)); 3648 3649 /* 3650 * Verify that we can hold an objset that is also owned. 3651 */ 3652 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); 3653 dmu_objset_rele(os2, FTAG); 3654 3655 /* 3656 * Verify that we cannot own an objset that is already owned. 3657 */ 3658 VERIFY3U(EBUSY, ==, 3659 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2)); 3660 3661 zil_close(zilog); 3662 dmu_objset_disown(os, FTAG); 3663 ztest_zd_fini(&zdtmp); 3664 3665 rw_exit(&ztest_name_lock); 3666 } 3667 3668 /* 3669 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. 3670 */ 3671 void 3672 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) 3673 { 3674 rw_enter(&ztest_name_lock, RW_READER); 3675 (void) ztest_snapshot_destroy(zd->zd_name, id); 3676 (void) ztest_snapshot_create(zd->zd_name, id); 3677 rw_exit(&ztest_name_lock); 3678 } 3679 3680 /* 3681 * Cleanup non-standard snapshots and clones. 
3682 */ 3683 void 3684 ztest_dsl_dataset_cleanup(char *osname, uint64_t id) 3685 { 3686 char snap1name[ZFS_MAX_DATASET_NAME_LEN]; 3687 char clone1name[ZFS_MAX_DATASET_NAME_LEN]; 3688 char snap2name[ZFS_MAX_DATASET_NAME_LEN]; 3689 char clone2name[ZFS_MAX_DATASET_NAME_LEN]; 3690 char snap3name[ZFS_MAX_DATASET_NAME_LEN]; 3691 int error; 3692 3693 (void) snprintf(snap1name, sizeof (snap1name), 3694 "%s@s1_%llu", osname, id); 3695 (void) snprintf(clone1name, sizeof (clone1name), 3696 "%s/c1_%llu", osname, id); 3697 (void) snprintf(snap2name, sizeof (snap2name), 3698 "%s@s2_%llu", clone1name, id); 3699 (void) snprintf(clone2name, sizeof (clone2name), 3700 "%s/c2_%llu", osname, id); 3701 (void) snprintf(snap3name, sizeof (snap3name), 3702 "%s@s3_%llu", clone1name, id); 3703 3704 error = dsl_destroy_head(clone2name); 3705 if (error && error != ENOENT) 3706 fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); 3707 error = dsl_destroy_snapshot(snap3name, B_FALSE); 3708 if (error && error != ENOENT) 3709 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); 3710 error = dsl_destroy_snapshot(snap2name, B_FALSE); 3711 if (error && error != ENOENT) 3712 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); 3713 error = dsl_destroy_head(clone1name); 3714 if (error && error != ENOENT) 3715 fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); 3716 error = dsl_destroy_snapshot(snap1name, B_FALSE); 3717 if (error && error != ENOENT) 3718 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); 3719 } 3720 3721 /* 3722 * Verify dsl_dataset_promote handles EBUSY 3723 */ 3724 void 3725 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) 3726 { 3727 objset_t *os; 3728 char snap1name[ZFS_MAX_DATASET_NAME_LEN]; 3729 char clone1name[ZFS_MAX_DATASET_NAME_LEN]; 3730 char snap2name[ZFS_MAX_DATASET_NAME_LEN]; 3731 char clone2name[ZFS_MAX_DATASET_NAME_LEN]; 3732 char snap3name[ZFS_MAX_DATASET_NAME_LEN]; 3733 char *osname = zd->zd_name; 3734 int error; 3735 
3736 rw_enter(&ztest_name_lock, RW_READER); 3737 3738 ztest_dsl_dataset_cleanup(osname, id); 3739 3740 (void) snprintf(snap1name, sizeof (snap1name), 3741 "%s@s1_%llu", osname, id); 3742 (void) snprintf(clone1name, sizeof (clone1name), 3743 "%s/c1_%llu", osname, id); 3744 (void) snprintf(snap2name, sizeof (snap2name), 3745 "%s@s2_%llu", clone1name, id); 3746 (void) snprintf(clone2name, sizeof (clone2name), 3747 "%s/c2_%llu", osname, id); 3748 (void) snprintf(snap3name, sizeof (snap3name), 3749 "%s@s3_%llu", clone1name, id); 3750 3751 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); 3752 if (error && error != EEXIST) { 3753 if (error == ENOSPC) { 3754 ztest_record_enospc(FTAG); 3755 goto out; 3756 } 3757 fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); 3758 } 3759 3760 error = dmu_objset_clone(clone1name, snap1name); 3761 if (error) { 3762 if (error == ENOSPC) { 3763 ztest_record_enospc(FTAG); 3764 goto out; 3765 } 3766 fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); 3767 } 3768 3769 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); 3770 if (error && error != EEXIST) { 3771 if (error == ENOSPC) { 3772 ztest_record_enospc(FTAG); 3773 goto out; 3774 } 3775 fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); 3776 } 3777 3778 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); 3779 if (error && error != EEXIST) { 3780 if (error == ENOSPC) { 3781 ztest_record_enospc(FTAG); 3782 goto out; 3783 } 3784 fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); 3785 } 3786 3787 error = dmu_objset_clone(clone2name, snap3name); 3788 if (error) { 3789 if (error == ENOSPC) { 3790 ztest_record_enospc(FTAG); 3791 goto out; 3792 } 3793 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); 3794 } 3795 3796 error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os); 3797 if (error) 3798 fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); 3799 error = dsl_dataset_promote(clone2name, 
NULL); 3800 if (error == ENOSPC) { 3801 dmu_objset_disown(os, FTAG); 3802 ztest_record_enospc(FTAG); 3803 goto out; 3804 } 3805 if (error != EBUSY) 3806 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, 3807 error); 3808 dmu_objset_disown(os, FTAG); 3809 3810 out: 3811 ztest_dsl_dataset_cleanup(osname, id); 3812 3813 rw_exit(&ztest_name_lock); 3814 } 3815 3816 /* 3817 * Verify that dmu_object_{alloc,free} work as expected. 3818 */ 3819 void 3820 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) 3821 { 3822 ztest_od_t od[4]; 3823 int batchsize = sizeof (od) / sizeof (od[0]); 3824 3825 for (int b = 0; b < batchsize; b++) { 3826 ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 3827 0, 0, 0); 3828 } 3829 3830 /* 3831 * Destroy the previous batch of objects, create a new batch, 3832 * and do some I/O on the new objects. 3833 */ 3834 if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) 3835 return; 3836 3837 while (ztest_random(4 * batchsize) != 0) 3838 ztest_io(zd, od[ztest_random(batchsize)].od_object, 3839 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 3840 } 3841 3842 /* 3843 * Rewind the global allocator to verify object allocation backfilling. 3844 */ 3845 void 3846 ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) 3847 { 3848 objset_t *os = zd->zd_os; 3849 int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; 3850 uint64_t object; 3851 3852 /* 3853 * Rewind the global allocator randomly back to a lower object number 3854 * to force backfilling and reclamation of recently freed dnodes. 3855 */ 3856 mutex_enter(&os->os_obj_lock); 3857 object = ztest_random(os->os_obj_next_chunk); 3858 os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); 3859 mutex_exit(&os->os_obj_lock); 3860 } 3861 3862 /* 3863 * Verify that dmu_{read,write} work as expected. 
3864 */ 3865 void 3866 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) 3867 { 3868 objset_t *os = zd->zd_os; 3869 ztest_od_t od[2]; 3870 dmu_tx_t *tx; 3871 int i, freeit, error; 3872 uint64_t n, s, txg; 3873 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; 3874 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 3875 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); 3876 uint64_t regions = 997; 3877 uint64_t stride = 123456789ULL; 3878 uint64_t width = 40; 3879 int free_percent = 5; 3880 3881 /* 3882 * This test uses two objects, packobj and bigobj, that are always 3883 * updated together (i.e. in the same tx) so that their contents are 3884 * in sync and can be compared. Their contents relate to each other 3885 * in a simple way: packobj is a dense array of 'bufwad' structures, 3886 * while bigobj is a sparse array of the same bufwads. Specifically, 3887 * for any index n, there are three bufwads that should be identical: 3888 * 3889 * packobj, at offset n * sizeof (bufwad_t) 3890 * bigobj, at the head of the nth chunk 3891 * bigobj, at the tail of the nth chunk 3892 * 3893 * The chunk size is arbitrary. It doesn't have to be a power of two, 3894 * and it doesn't have any relation to the object blocksize. 3895 * The only requirement is that it can hold at least two bufwads. 3896 * 3897 * Normally, we write the bufwad to each of these locations. 3898 * However, free_percent of the time we instead write zeroes to 3899 * packobj and perform a dmu_free_range() on bigobj. By comparing 3900 * bigobj to packobj, we can verify that the DMU is correctly 3901 * tracking which parts of an object are allocated and free, 3902 * and that the contents of the allocated blocks are correct. 3903 */ 3904 3905 /* 3906 * Read the directory info. If it's the first time, set things up. 
3907 */ 3908 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 3909 chunksize); 3910 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 3911 chunksize); 3912 3913 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 3914 return; 3915 3916 bigobj = od[0].od_object; 3917 packobj = od[1].od_object; 3918 chunksize = od[0].od_gen; 3919 ASSERT(chunksize == od[1].od_gen); 3920 3921 /* 3922 * Prefetch a random chunk of the big object. 3923 * Our aim here is to get some async reads in flight 3924 * for blocks that we may free below; the DMU should 3925 * handle this race correctly. 3926 */ 3927 n = ztest_random(regions) * stride + ztest_random(width); 3928 s = 1 + ztest_random(2 * width - 1); 3929 dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, 3930 ZIO_PRIORITY_SYNC_READ); 3931 3932 /* 3933 * Pick a random index and compute the offsets into packobj and bigobj. 3934 */ 3935 n = ztest_random(regions) * stride + ztest_random(width); 3936 s = 1 + ztest_random(width - 1); 3937 3938 packoff = n * sizeof (bufwad_t); 3939 packsize = s * sizeof (bufwad_t); 3940 3941 bigoff = n * chunksize; 3942 bigsize = s * chunksize; 3943 3944 packbuf = umem_alloc(packsize, UMEM_NOFAIL); 3945 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); 3946 3947 /* 3948 * free_percent of the time, free a range of bigobj rather than 3949 * overwriting it. 3950 */ 3951 freeit = (ztest_random(100) < free_percent); 3952 3953 /* 3954 * Read the current contents of our objects. 3955 */ 3956 error = dmu_read(os, packobj, packoff, packsize, packbuf, 3957 DMU_READ_PREFETCH); 3958 ASSERT0(error); 3959 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, 3960 DMU_READ_PREFETCH); 3961 ASSERT0(error); 3962 3963 /* 3964 * Get a tx for the mods to both packobj and bigobj. 
3965 */ 3966 tx = dmu_tx_create(os); 3967 3968 dmu_tx_hold_write(tx, packobj, packoff, packsize); 3969 3970 if (freeit) 3971 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); 3972 else 3973 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 3974 3975 /* This accounts for setting the checksum/compression. */ 3976 dmu_tx_hold_bonus(tx, bigobj); 3977 3978 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 3979 if (txg == 0) { 3980 umem_free(packbuf, packsize); 3981 umem_free(bigbuf, bigsize); 3982 return; 3983 } 3984 3985 enum zio_checksum cksum; 3986 do { 3987 cksum = (enum zio_checksum) 3988 ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); 3989 } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); 3990 dmu_object_set_checksum(os, bigobj, cksum, tx); 3991 3992 enum zio_compress comp; 3993 do { 3994 comp = (enum zio_compress) 3995 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); 3996 } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); 3997 dmu_object_set_compress(os, bigobj, comp, tx); 3998 3999 /* 4000 * For each index from n to n + s, verify that the existing bufwad 4001 * in packobj matches the bufwads at the head and tail of the 4002 * corresponding chunk in bigobj. Then update all three bufwads 4003 * with the new values we want to write out. 
4004 */ 4005 for (i = 0; i < s; i++) { 4006 /* LINTED */ 4007 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4008 /* LINTED */ 4009 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4010 /* LINTED */ 4011 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4012 4013 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 4014 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 4015 4016 if (pack->bw_txg > txg) 4017 fatal(0, "future leak: got %llx, open txg is %llx", 4018 pack->bw_txg, txg); 4019 4020 if (pack->bw_data != 0 && pack->bw_index != n + i) 4021 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 4022 pack->bw_index, n, i); 4023 4024 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4025 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 4026 4027 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4028 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 4029 4030 if (freeit) { 4031 bzero(pack, sizeof (bufwad_t)); 4032 } else { 4033 pack->bw_index = n + i; 4034 pack->bw_txg = txg; 4035 pack->bw_data = 1 + ztest_random(-2ULL); 4036 } 4037 *bigH = *pack; 4038 *bigT = *pack; 4039 } 4040 4041 /* 4042 * We've verified all the old bufwads, and made new ones. 4043 * Now write them out. 4044 */ 4045 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4046 4047 if (freeit) { 4048 if (ztest_opts.zo_verbose >= 7) { 4049 (void) printf("freeing offset %llx size %llx" 4050 " txg %llx\n", 4051 (u_longlong_t)bigoff, 4052 (u_longlong_t)bigsize, 4053 (u_longlong_t)txg); 4054 } 4055 VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); 4056 } else { 4057 if (ztest_opts.zo_verbose >= 7) { 4058 (void) printf("writing offset %llx size %llx" 4059 " txg %llx\n", 4060 (u_longlong_t)bigoff, 4061 (u_longlong_t)bigsize, 4062 (u_longlong_t)txg); 4063 } 4064 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); 4065 } 4066 4067 dmu_tx_commit(tx); 4068 4069 /* 4070 * Sanity check the stuff we just wrote. 
4071 */ 4072 { 4073 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4074 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4075 4076 VERIFY(0 == dmu_read(os, packobj, packoff, 4077 packsize, packcheck, DMU_READ_PREFETCH)); 4078 VERIFY(0 == dmu_read(os, bigobj, bigoff, 4079 bigsize, bigcheck, DMU_READ_PREFETCH)); 4080 4081 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 4082 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 4083 4084 umem_free(packcheck, packsize); 4085 umem_free(bigcheck, bigsize); 4086 } 4087 4088 umem_free(packbuf, packsize); 4089 umem_free(bigbuf, bigsize); 4090 } 4091 4092 void 4093 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, 4094 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) 4095 { 4096 uint64_t i; 4097 bufwad_t *pack; 4098 bufwad_t *bigH; 4099 bufwad_t *bigT; 4100 4101 /* 4102 * For each index from n to n + s, verify that the existing bufwad 4103 * in packobj matches the bufwads at the head and tail of the 4104 * corresponding chunk in bigobj. Then update all three bufwads 4105 * with the new values we want to write out. 
4106 */ 4107 for (i = 0; i < s; i++) { 4108 /* LINTED */ 4109 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); 4110 /* LINTED */ 4111 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); 4112 /* LINTED */ 4113 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; 4114 4115 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); 4116 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); 4117 4118 if (pack->bw_txg > txg) 4119 fatal(0, "future leak: got %llx, open txg is %llx", 4120 pack->bw_txg, txg); 4121 4122 if (pack->bw_data != 0 && pack->bw_index != n + i) 4123 fatal(0, "wrong index: got %llx, wanted %llx+%llx", 4124 pack->bw_index, n, i); 4125 4126 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) 4127 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); 4128 4129 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) 4130 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); 4131 4132 pack->bw_index = n + i; 4133 pack->bw_txg = txg; 4134 pack->bw_data = 1 + ztest_random(-2ULL); 4135 4136 *bigH = *pack; 4137 *bigT = *pack; 4138 } 4139 } 4140 4141 void 4142 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) 4143 { 4144 objset_t *os = zd->zd_os; 4145 ztest_od_t od[2]; 4146 dmu_tx_t *tx; 4147 uint64_t i; 4148 int error; 4149 uint64_t n, s, txg; 4150 bufwad_t *packbuf, *bigbuf; 4151 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; 4152 uint64_t blocksize = ztest_random_blocksize(); 4153 uint64_t chunksize = blocksize; 4154 uint64_t regions = 997; 4155 uint64_t stride = 123456789ULL; 4156 uint64_t width = 9; 4157 dmu_buf_t *bonus_db; 4158 arc_buf_t **bigbuf_arcbufs; 4159 dmu_object_info_t doi; 4160 4161 /* 4162 * This test uses two objects, packobj and bigobj, that are always 4163 * updated together (i.e. in the same tx) so that their contents are 4164 * in sync and can be compared. Their contents relate to each other 4165 * in a simple way: packobj is a dense array of 'bufwad' structures, 4166 * while bigobj is a sparse array of the same bufwads. 
Specifically, 4167 * for any index n, there are three bufwads that should be identical: 4168 * 4169 * packobj, at offset n * sizeof (bufwad_t) 4170 * bigobj, at the head of the nth chunk 4171 * bigobj, at the tail of the nth chunk 4172 * 4173 * The chunk size is set equal to bigobj block size so that 4174 * dmu_assign_arcbuf() can be tested for object updates. 4175 */ 4176 4177 /* 4178 * Read the directory info. If it's the first time, set things up. 4179 */ 4180 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 4181 0, 0); 4182 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, 4183 chunksize); 4184 4185 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4186 return; 4187 4188 bigobj = od[0].od_object; 4189 packobj = od[1].od_object; 4190 blocksize = od[0].od_blocksize; 4191 chunksize = blocksize; 4192 ASSERT(chunksize == od[1].od_gen); 4193 4194 VERIFY(dmu_object_info(os, bigobj, &doi) == 0); 4195 VERIFY(ISP2(doi.doi_data_block_size)); 4196 VERIFY(chunksize == doi.doi_data_block_size); 4197 VERIFY(chunksize >= 2 * sizeof (bufwad_t)); 4198 4199 /* 4200 * Pick a random index and compute the offsets into packobj and bigobj. 4201 */ 4202 n = ztest_random(regions) * stride + ztest_random(width); 4203 s = 1 + ztest_random(width - 1); 4204 4205 packoff = n * sizeof (bufwad_t); 4206 packsize = s * sizeof (bufwad_t); 4207 4208 bigoff = n * chunksize; 4209 bigsize = s * chunksize; 4210 4211 packbuf = umem_zalloc(packsize, UMEM_NOFAIL); 4212 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); 4213 4214 VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); 4215 4216 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); 4217 4218 /* 4219 * Iteration 0 test zcopy for DB_UNCACHED dbufs. 4220 * Iteration 1 test zcopy to already referenced dbufs. 4221 * Iteration 2 test zcopy to dirty dbuf in the same txg. 4222 * Iteration 3 test zcopy to dbuf dirty in previous txg. 4223 * Iteration 4 test zcopy when dbuf is no longer dirty. 
4224 * Iteration 5 test zcopy when it can't be done. 4225 * Iteration 6 one more zcopy write. 4226 */ 4227 for (i = 0; i < 7; i++) { 4228 uint64_t j; 4229 uint64_t off; 4230 4231 /* 4232 * In iteration 5 (i == 5) use arcbufs 4233 * that don't match bigobj blksz to test 4234 * dmu_assign_arcbuf() when it can't directly 4235 * assign an arcbuf to a dbuf. 4236 */ 4237 for (j = 0; j < s; j++) { 4238 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 4239 bigbuf_arcbufs[j] = 4240 dmu_request_arcbuf(bonus_db, chunksize); 4241 } else { 4242 bigbuf_arcbufs[2 * j] = 4243 dmu_request_arcbuf(bonus_db, chunksize / 2); 4244 bigbuf_arcbufs[2 * j + 1] = 4245 dmu_request_arcbuf(bonus_db, chunksize / 2); 4246 } 4247 } 4248 4249 /* 4250 * Get a tx for the mods to both packobj and bigobj. 4251 */ 4252 tx = dmu_tx_create(os); 4253 4254 dmu_tx_hold_write(tx, packobj, packoff, packsize); 4255 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); 4256 4257 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4258 if (txg == 0) { 4259 umem_free(packbuf, packsize); 4260 umem_free(bigbuf, bigsize); 4261 for (j = 0; j < s; j++) { 4262 if (i != 5 || 4263 chunksize < (SPA_MINBLOCKSIZE * 2)) { 4264 dmu_return_arcbuf(bigbuf_arcbufs[j]); 4265 } else { 4266 dmu_return_arcbuf( 4267 bigbuf_arcbufs[2 * j]); 4268 dmu_return_arcbuf( 4269 bigbuf_arcbufs[2 * j + 1]); 4270 } 4271 } 4272 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 4273 dmu_buf_rele(bonus_db, FTAG); 4274 return; 4275 } 4276 4277 /* 4278 * 50% of the time don't read objects in the 1st iteration to 4279 * test dmu_assign_arcbuf() for the case when there're no 4280 * existing dbufs for the specified offsets. 
4281 */ 4282 if (i != 0 || ztest_random(2) != 0) { 4283 error = dmu_read(os, packobj, packoff, 4284 packsize, packbuf, DMU_READ_PREFETCH); 4285 ASSERT0(error); 4286 error = dmu_read(os, bigobj, bigoff, bigsize, 4287 bigbuf, DMU_READ_PREFETCH); 4288 ASSERT0(error); 4289 } 4290 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, 4291 n, chunksize, txg); 4292 4293 /* 4294 * We've verified all the old bufwads, and made new ones. 4295 * Now write them out. 4296 */ 4297 dmu_write(os, packobj, packoff, packsize, packbuf, tx); 4298 if (ztest_opts.zo_verbose >= 7) { 4299 (void) printf("writing offset %llx size %llx" 4300 " txg %llx\n", 4301 (u_longlong_t)bigoff, 4302 (u_longlong_t)bigsize, 4303 (u_longlong_t)txg); 4304 } 4305 for (off = bigoff, j = 0; j < s; j++, off += chunksize) { 4306 dmu_buf_t *dbt; 4307 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 4308 bcopy((caddr_t)bigbuf + (off - bigoff), 4309 bigbuf_arcbufs[j]->b_data, chunksize); 4310 } else { 4311 bcopy((caddr_t)bigbuf + (off - bigoff), 4312 bigbuf_arcbufs[2 * j]->b_data, 4313 chunksize / 2); 4314 bcopy((caddr_t)bigbuf + (off - bigoff) + 4315 chunksize / 2, 4316 bigbuf_arcbufs[2 * j + 1]->b_data, 4317 chunksize / 2); 4318 } 4319 4320 if (i == 1) { 4321 VERIFY(dmu_buf_hold(os, bigobj, off, 4322 FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); 4323 } 4324 if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { 4325 dmu_assign_arcbuf(bonus_db, off, 4326 bigbuf_arcbufs[j], tx); 4327 } else { 4328 dmu_assign_arcbuf(bonus_db, off, 4329 bigbuf_arcbufs[2 * j], tx); 4330 dmu_assign_arcbuf(bonus_db, 4331 off + chunksize / 2, 4332 bigbuf_arcbufs[2 * j + 1], tx); 4333 } 4334 if (i == 1) { 4335 dmu_buf_rele(dbt, FTAG); 4336 } 4337 } 4338 dmu_tx_commit(tx); 4339 4340 /* 4341 * Sanity check the stuff we just wrote. 
4342 */ 4343 { 4344 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); 4345 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); 4346 4347 VERIFY(0 == dmu_read(os, packobj, packoff, 4348 packsize, packcheck, DMU_READ_PREFETCH)); 4349 VERIFY(0 == dmu_read(os, bigobj, bigoff, 4350 bigsize, bigcheck, DMU_READ_PREFETCH)); 4351 4352 ASSERT(bcmp(packbuf, packcheck, packsize) == 0); 4353 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); 4354 4355 umem_free(packcheck, packsize); 4356 umem_free(bigcheck, bigsize); 4357 } 4358 if (i == 2) { 4359 txg_wait_open(dmu_objset_pool(os), 0); 4360 } else if (i == 3) { 4361 txg_wait_synced(dmu_objset_pool(os), 0); 4362 } 4363 } 4364 4365 dmu_buf_rele(bonus_db, FTAG); 4366 umem_free(packbuf, packsize); 4367 umem_free(bigbuf, bigsize); 4368 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); 4369 } 4370 4371 /* ARGSUSED */ 4372 void 4373 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) 4374 { 4375 ztest_od_t od[1]; 4376 uint64_t offset = (1ULL << (ztest_random(20) + 43)) + 4377 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4378 4379 /* 4380 * Have multiple threads write to large offsets in an object 4381 * to verify that parallel writes to an object -- even to the 4382 * same blocks within the object -- doesn't cause any trouble. 
4383 */ 4384 ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 4385 0, 0, 0); 4386 4387 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4388 return; 4389 4390 while (ztest_random(10) != 0) 4391 ztest_io(zd, od[0].od_object, offset); 4392 } 4393 4394 void 4395 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) 4396 { 4397 ztest_od_t od[1]; 4398 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + 4399 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 4400 uint64_t count = ztest_random(20) + 1; 4401 uint64_t blocksize = ztest_random_blocksize(); 4402 void *data; 4403 4404 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 4405 0, 0); 4406 4407 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4408 return; 4409 4410 if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0) 4411 return; 4412 4413 ztest_prealloc(zd, od[0].od_object, offset, count * blocksize); 4414 4415 data = umem_zalloc(blocksize, UMEM_NOFAIL); 4416 4417 while (ztest_random(count) != 0) { 4418 uint64_t randoff = offset + (ztest_random(count) * blocksize); 4419 if (ztest_write(zd, od[0].od_object, randoff, blocksize, 4420 data) != 0) 4421 break; 4422 while (ztest_random(4) != 0) 4423 ztest_io(zd, od[0].od_object, randoff); 4424 } 4425 4426 umem_free(data, blocksize); 4427 } 4428 4429 /* 4430 * Verify that zap_{create,destroy,add,remove,update} work as expected. 
4431 */ 4432 #define ZTEST_ZAP_MIN_INTS 1 4433 #define ZTEST_ZAP_MAX_INTS 4 4434 #define ZTEST_ZAP_MAX_PROPS 1000 4435 4436 void 4437 ztest_zap(ztest_ds_t *zd, uint64_t id) 4438 { 4439 objset_t *os = zd->zd_os; 4440 ztest_od_t od[1]; 4441 uint64_t object; 4442 uint64_t txg, last_txg; 4443 uint64_t value[ZTEST_ZAP_MAX_INTS]; 4444 uint64_t zl_ints, zl_intsize, prop; 4445 int i, ints; 4446 dmu_tx_t *tx; 4447 char propname[100], txgname[100]; 4448 int error; 4449 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; 4450 4451 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 4452 4453 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4454 return; 4455 4456 object = od[0].od_object; 4457 4458 /* 4459 * Generate a known hash collision, and verify that 4460 * we can lookup and remove both entries. 4461 */ 4462 tx = dmu_tx_create(os); 4463 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4464 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4465 if (txg == 0) 4466 return; 4467 for (i = 0; i < 2; i++) { 4468 value[i] = i; 4469 VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), 4470 1, &value[i], tx)); 4471 } 4472 for (i = 0; i < 2; i++) { 4473 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], 4474 sizeof (uint64_t), 1, &value[i], tx)); 4475 VERIFY3U(0, ==, 4476 zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); 4477 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4478 ASSERT3U(zl_ints, ==, 1); 4479 } 4480 for (i = 0; i < 2; i++) { 4481 VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); 4482 } 4483 dmu_tx_commit(tx); 4484 4485 /* 4486 * Generate a buch of random entries. 
4487 */ 4488 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); 4489 4490 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 4491 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 4492 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 4493 bzero(value, sizeof (value)); 4494 last_txg = 0; 4495 4496 /* 4497 * If these zap entries already exist, validate their contents. 4498 */ 4499 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 4500 if (error == 0) { 4501 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4502 ASSERT3U(zl_ints, ==, 1); 4503 4504 VERIFY(zap_lookup(os, object, txgname, zl_intsize, 4505 zl_ints, &last_txg) == 0); 4506 4507 VERIFY(zap_length(os, object, propname, &zl_intsize, 4508 &zl_ints) == 0); 4509 4510 ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); 4511 ASSERT3U(zl_ints, ==, ints); 4512 4513 VERIFY(zap_lookup(os, object, propname, zl_intsize, 4514 zl_ints, value) == 0); 4515 4516 for (i = 0; i < ints; i++) { 4517 ASSERT3U(value[i], ==, last_txg + object + i); 4518 } 4519 } else { 4520 ASSERT3U(error, ==, ENOENT); 4521 } 4522 4523 /* 4524 * Atomically update two entries in our zap object. 4525 * The first is named txg_%llu, and contains the txg 4526 * in which the property was last updated. The second 4527 * is named prop_%llu, and the nth element of its value 4528 * should be txg + object + n. 4529 */ 4530 tx = dmu_tx_create(os); 4531 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4532 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4533 if (txg == 0) 4534 return; 4535 4536 if (last_txg > txg) 4537 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); 4538 4539 for (i = 0; i < ints; i++) 4540 value[i] = txg + object + i; 4541 4542 VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), 4543 1, &txg, tx)); 4544 VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), 4545 ints, value, tx)); 4546 4547 dmu_tx_commit(tx); 4548 4549 /* 4550 * Remove a random pair of entries. 
4551 */ 4552 prop = ztest_random(ZTEST_ZAP_MAX_PROPS); 4553 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); 4554 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); 4555 4556 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); 4557 4558 if (error == ENOENT) 4559 return; 4560 4561 ASSERT0(error); 4562 4563 tx = dmu_tx_create(os); 4564 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4565 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4566 if (txg == 0) 4567 return; 4568 VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); 4569 VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); 4570 dmu_tx_commit(tx); 4571 } 4572 4573 /* 4574 * Testcase to test the upgrading of a microzap to fatzap. 4575 */ 4576 void 4577 ztest_fzap(ztest_ds_t *zd, uint64_t id) 4578 { 4579 objset_t *os = zd->zd_os; 4580 ztest_od_t od[1]; 4581 uint64_t object, txg; 4582 4583 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); 4584 4585 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) 4586 return; 4587 4588 object = od[0].od_object; 4589 4590 /* 4591 * Add entries to this ZAP and make sure it spills over 4592 * and gets upgraded to a fatzap. Also, since we are adding 4593 * 2050 entries we should see ptrtbl growth and leaf-block split. 
4594 */ 4595 for (int i = 0; i < 2050; i++) { 4596 char name[ZFS_MAX_DATASET_NAME_LEN]; 4597 uint64_t value = i; 4598 dmu_tx_t *tx; 4599 int error; 4600 4601 (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", 4602 id, value); 4603 4604 tx = dmu_tx_create(os); 4605 dmu_tx_hold_zap(tx, object, B_TRUE, name); 4606 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4607 if (txg == 0) 4608 return; 4609 error = zap_add(os, object, name, sizeof (uint64_t), 1, 4610 &value, tx); 4611 ASSERT(error == 0 || error == EEXIST); 4612 dmu_tx_commit(tx); 4613 } 4614 } 4615 4616 /* ARGSUSED */ 4617 void 4618 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) 4619 { 4620 objset_t *os = zd->zd_os; 4621 ztest_od_t od[1]; 4622 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; 4623 dmu_tx_t *tx; 4624 int i, namelen, error; 4625 int micro = ztest_random(2); 4626 char name[20], string_value[20]; 4627 void *data; 4628 4629 ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 4630 0, 0, 0); 4631 4632 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4633 return; 4634 4635 object = od[0].od_object; 4636 4637 /* 4638 * Generate a random name of the form 'xxx.....' where each 4639 * x is a random printable character and the dots are dots. 4640 * There are 94 such characters, and the name length goes from 4641 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. 4642 */ 4643 namelen = ztest_random(sizeof (name) - 5) + 5 + 1; 4644 4645 for (i = 0; i < 3; i++) 4646 name[i] = '!' + ztest_random('~' - '!' + 1); 4647 for (; i < namelen - 1; i++) 4648 name[i] = '.'; 4649 name[i] = '\0'; 4650 4651 if ((namelen & 1) || micro) { 4652 wsize = sizeof (txg); 4653 wc = 1; 4654 data = &txg; 4655 } else { 4656 wsize = 1; 4657 wc = namelen; 4658 data = string_value; 4659 } 4660 4661 count = -1ULL; 4662 VERIFY0(zap_count(os, object, &count)); 4663 ASSERT(count != -1ULL); 4664 4665 /* 4666 * Select an operation: length, lookup, add, update, remove. 
4667 */ 4668 i = ztest_random(5); 4669 4670 if (i >= 2) { 4671 tx = dmu_tx_create(os); 4672 dmu_tx_hold_zap(tx, object, B_TRUE, NULL); 4673 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); 4674 if (txg == 0) 4675 return; 4676 bcopy(name, string_value, namelen); 4677 } else { 4678 tx = NULL; 4679 txg = 0; 4680 bzero(string_value, namelen); 4681 } 4682 4683 switch (i) { 4684 4685 case 0: 4686 error = zap_length(os, object, name, &zl_wsize, &zl_wc); 4687 if (error == 0) { 4688 ASSERT3U(wsize, ==, zl_wsize); 4689 ASSERT3U(wc, ==, zl_wc); 4690 } else { 4691 ASSERT3U(error, ==, ENOENT); 4692 } 4693 break; 4694 4695 case 1: 4696 error = zap_lookup(os, object, name, wsize, wc, data); 4697 if (error == 0) { 4698 if (data == string_value && 4699 bcmp(name, data, namelen) != 0) 4700 fatal(0, "name '%s' != val '%s' len %d", 4701 name, data, namelen); 4702 } else { 4703 ASSERT3U(error, ==, ENOENT); 4704 } 4705 break; 4706 4707 case 2: 4708 error = zap_add(os, object, name, wsize, wc, data, tx); 4709 ASSERT(error == 0 || error == EEXIST); 4710 break; 4711 4712 case 3: 4713 VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); 4714 break; 4715 4716 case 4: 4717 error = zap_remove(os, object, name, tx); 4718 ASSERT(error == 0 || error == ENOENT); 4719 break; 4720 } 4721 4722 if (tx != NULL) 4723 dmu_tx_commit(tx); 4724 } 4725 4726 /* 4727 * Commit callback data. 
4728 */ 4729 typedef struct ztest_cb_data { 4730 list_node_t zcd_node; 4731 uint64_t zcd_txg; 4732 int zcd_expected_err; 4733 boolean_t zcd_added; 4734 boolean_t zcd_called; 4735 spa_t *zcd_spa; 4736 } ztest_cb_data_t; 4737 4738 /* This is the actual commit callback function */ 4739 static void 4740 ztest_commit_callback(void *arg, int error) 4741 { 4742 ztest_cb_data_t *data = arg; 4743 uint64_t synced_txg; 4744 4745 VERIFY(data != NULL); 4746 VERIFY3S(data->zcd_expected_err, ==, error); 4747 VERIFY(!data->zcd_called); 4748 4749 synced_txg = spa_last_synced_txg(data->zcd_spa); 4750 if (data->zcd_txg > synced_txg) 4751 fatal(0, "commit callback of txg %" PRIu64 " called prematurely" 4752 ", last synced txg = %" PRIu64 "\n", data->zcd_txg, 4753 synced_txg); 4754 4755 data->zcd_called = B_TRUE; 4756 4757 if (error == ECANCELED) { 4758 ASSERT0(data->zcd_txg); 4759 ASSERT(!data->zcd_added); 4760 4761 /* 4762 * The private callback data should be destroyed here, but 4763 * since we are going to check the zcd_called field after 4764 * dmu_tx_abort(), we will destroy it there. 4765 */ 4766 return; 4767 } 4768 4769 /* Was this callback added to the global callback list? 
*/ 4770 if (!data->zcd_added) 4771 goto out; 4772 4773 ASSERT3U(data->zcd_txg, !=, 0); 4774 4775 /* Remove our callback from the list */ 4776 mutex_enter(&zcl.zcl_callbacks_lock); 4777 list_remove(&zcl.zcl_callbacks, data); 4778 mutex_exit(&zcl.zcl_callbacks_lock); 4779 4780 out: 4781 umem_free(data, sizeof (ztest_cb_data_t)); 4782 } 4783 4784 /* Allocate and initialize callback data structure */ 4785 static ztest_cb_data_t * 4786 ztest_create_cb_data(objset_t *os, uint64_t txg) 4787 { 4788 ztest_cb_data_t *cb_data; 4789 4790 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); 4791 4792 cb_data->zcd_txg = txg; 4793 cb_data->zcd_spa = dmu_objset_spa(os); 4794 4795 return (cb_data); 4796 } 4797 4798 /* 4799 * If a number of txgs equal to this threshold have been created after a commit 4800 * callback has been registered but not called, then we assume there is an 4801 * implementation bug. 4802 */ 4803 #define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) 4804 4805 /* 4806 * Commit callback test. 4807 */ 4808 void 4809 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) 4810 { 4811 objset_t *os = zd->zd_os; 4812 ztest_od_t od[1]; 4813 dmu_tx_t *tx; 4814 ztest_cb_data_t *cb_data[3], *tmp_cb; 4815 uint64_t old_txg, txg; 4816 int i, error; 4817 4818 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 4819 4820 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 4821 return; 4822 4823 tx = dmu_tx_create(os); 4824 4825 cb_data[0] = ztest_create_cb_data(os, 0); 4826 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); 4827 4828 dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); 4829 4830 /* Every once in a while, abort the transaction on purpose */ 4831 if (ztest_random(100) == 0) 4832 error = -1; 4833 4834 if (!error) 4835 error = dmu_tx_assign(tx, TXG_NOWAIT); 4836 4837 txg = error ? 
0 : dmu_tx_get_txg(tx); 4838 4839 cb_data[0]->zcd_txg = txg; 4840 cb_data[1] = ztest_create_cb_data(os, txg); 4841 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); 4842 4843 if (error) { 4844 /* 4845 * It's not a strict requirement to call the registered 4846 * callbacks from inside dmu_tx_abort(), but that's what 4847 * it's supposed to happen in the current implementation 4848 * so we will check for that. 4849 */ 4850 for (i = 0; i < 2; i++) { 4851 cb_data[i]->zcd_expected_err = ECANCELED; 4852 VERIFY(!cb_data[i]->zcd_called); 4853 } 4854 4855 dmu_tx_abort(tx); 4856 4857 for (i = 0; i < 2; i++) { 4858 VERIFY(cb_data[i]->zcd_called); 4859 umem_free(cb_data[i], sizeof (ztest_cb_data_t)); 4860 } 4861 4862 return; 4863 } 4864 4865 cb_data[2] = ztest_create_cb_data(os, txg); 4866 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); 4867 4868 /* 4869 * Read existing data to make sure there isn't a future leak. 4870 */ 4871 VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), 4872 &old_txg, DMU_READ_PREFETCH)); 4873 4874 if (old_txg > txg) 4875 fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, 4876 old_txg, txg); 4877 4878 dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); 4879 4880 mutex_enter(&zcl.zcl_callbacks_lock); 4881 4882 /* 4883 * Since commit callbacks don't have any ordering requirement and since 4884 * it is theoretically possible for a commit callback to be called 4885 * after an arbitrary amount of time has elapsed since its txg has been 4886 * synced, it is difficult to reliably determine whether a commit 4887 * callback hasn't been called due to high load or due to a flawed 4888 * implementation. 4889 * 4890 * In practice, we will assume that if after a certain number of txgs a 4891 * commit callback hasn't been called, then most likely there's an 4892 * implementation bug.. 
4893 */ 4894 tmp_cb = list_head(&zcl.zcl_callbacks); 4895 if (tmp_cb != NULL && 4896 (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) { 4897 fatal(0, "Commit callback threshold exceeded, oldest txg: %" 4898 PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); 4899 } 4900 4901 /* 4902 * Let's find the place to insert our callbacks. 4903 * 4904 * Even though the list is ordered by txg, it is possible for the 4905 * insertion point to not be the end because our txg may already be 4906 * quiescing at this point and other callbacks in the open txg 4907 * (from other objsets) may have sneaked in. 4908 */ 4909 tmp_cb = list_tail(&zcl.zcl_callbacks); 4910 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) 4911 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); 4912 4913 /* Add the 3 callbacks to the list */ 4914 for (i = 0; i < 3; i++) { 4915 if (tmp_cb == NULL) 4916 list_insert_head(&zcl.zcl_callbacks, cb_data[i]); 4917 else 4918 list_insert_after(&zcl.zcl_callbacks, tmp_cb, 4919 cb_data[i]); 4920 4921 cb_data[i]->zcd_added = B_TRUE; 4922 VERIFY(!cb_data[i]->zcd_called); 4923 4924 tmp_cb = cb_data[i]; 4925 } 4926 4927 mutex_exit(&zcl.zcl_callbacks_lock); 4928 4929 dmu_tx_commit(tx); 4930 } 4931 4932 /* 4933 * Visit each object in the dataset. Verify that its properties 4934 * are consistent what was stored in the block tag when it was created, 4935 * and that its unused bonus buffer space has not been overwritten. 
4936 */ 4937 void 4938 ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) 4939 { 4940 objset_t *os = zd->zd_os; 4941 uint64_t obj; 4942 int err = 0; 4943 4944 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { 4945 ztest_block_tag_t *bt = NULL; 4946 dmu_object_info_t doi; 4947 dmu_buf_t *db; 4948 4949 if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) 4950 continue; 4951 4952 dmu_object_info_from_db(db, &doi); 4953 if (doi.doi_bonus_size >= sizeof (*bt)) 4954 bt = ztest_bt_bonus(db); 4955 4956 if (bt && bt->bt_magic == BT_MAGIC) { 4957 ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, 4958 bt->bt_offset, bt->bt_gen, bt->bt_txg, 4959 bt->bt_crtxg); 4960 ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); 4961 } 4962 4963 dmu_buf_rele(db, FTAG); 4964 } 4965 } 4966 4967 /* ARGSUSED */ 4968 void 4969 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) 4970 { 4971 zfs_prop_t proplist[] = { 4972 ZFS_PROP_CHECKSUM, 4973 ZFS_PROP_COMPRESSION, 4974 ZFS_PROP_COPIES, 4975 ZFS_PROP_DEDUP 4976 }; 4977 4978 rw_enter(&ztest_name_lock, RW_READER); 4979 4980 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) 4981 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], 4982 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); 4983 4984 rw_exit(&ztest_name_lock); 4985 } 4986 4987 /* ARGSUSED */ 4988 void 4989 ztest_remap_blocks(ztest_ds_t *zd, uint64_t id) 4990 { 4991 rw_enter(&ztest_name_lock, RW_READER); 4992 4993 int error = dmu_objset_remap_indirects(zd->zd_name); 4994 if (error == ENOSPC) 4995 error = 0; 4996 ASSERT0(error); 4997 4998 rw_exit(&ztest_name_lock); 4999 } 5000 5001 /* ARGSUSED */ 5002 void 5003 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) 5004 { 5005 nvlist_t *props = NULL; 5006 5007 rw_enter(&ztest_name_lock, RW_READER); 5008 5009 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, 5010 ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); 5011 5012 VERIFY0(spa_prop_get(ztest_spa, &props)); 5013 5014 if 
(ztest_opts.zo_verbose >= 6) 5015 dump_nvlist(props, 4); 5016 5017 nvlist_free(props); 5018 5019 rw_exit(&ztest_name_lock); 5020 } 5021 5022 static int 5023 user_release_one(const char *snapname, const char *holdname) 5024 { 5025 nvlist_t *snaps, *holds; 5026 int error; 5027 5028 snaps = fnvlist_alloc(); 5029 holds = fnvlist_alloc(); 5030 fnvlist_add_boolean(holds, holdname); 5031 fnvlist_add_nvlist(snaps, snapname, holds); 5032 fnvlist_free(holds); 5033 error = dsl_dataset_user_release(snaps, NULL); 5034 fnvlist_free(snaps); 5035 return (error); 5036 } 5037 5038 /* 5039 * Test snapshot hold/release and deferred destroy. 5040 */ 5041 void 5042 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) 5043 { 5044 int error; 5045 objset_t *os = zd->zd_os; 5046 objset_t *origin; 5047 char snapname[100]; 5048 char fullname[100]; 5049 char clonename[100]; 5050 char tag[100]; 5051 char osname[ZFS_MAX_DATASET_NAME_LEN]; 5052 nvlist_t *holds; 5053 5054 rw_enter(&ztest_name_lock, RW_READER); 5055 5056 dmu_objset_name(os, osname); 5057 5058 (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id); 5059 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); 5060 (void) snprintf(clonename, sizeof (clonename), 5061 "%s/ch1_%llu", osname, id); 5062 (void) snprintf(tag, sizeof (tag), "tag_%llu", id); 5063 5064 /* 5065 * Clean up from any previous run. 5066 */ 5067 error = dsl_destroy_head(clonename); 5068 if (error != ENOENT) 5069 ASSERT0(error); 5070 error = user_release_one(fullname, tag); 5071 if (error != ESRCH && error != ENOENT) 5072 ASSERT0(error); 5073 error = dsl_destroy_snapshot(fullname, B_FALSE); 5074 if (error != ENOENT) 5075 ASSERT0(error); 5076 5077 /* 5078 * Create snapshot, clone it, mark snap for deferred destroy, 5079 * destroy clone, verify snap was also destroyed. 
5080 */ 5081 error = dmu_objset_snapshot_one(osname, snapname); 5082 if (error) { 5083 if (error == ENOSPC) { 5084 ztest_record_enospc("dmu_objset_snapshot"); 5085 goto out; 5086 } 5087 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 5088 } 5089 5090 error = dmu_objset_clone(clonename, fullname); 5091 if (error) { 5092 if (error == ENOSPC) { 5093 ztest_record_enospc("dmu_objset_clone"); 5094 goto out; 5095 } 5096 fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); 5097 } 5098 5099 error = dsl_destroy_snapshot(fullname, B_TRUE); 5100 if (error) { 5101 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5102 fullname, error); 5103 } 5104 5105 error = dsl_destroy_head(clonename); 5106 if (error) 5107 fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); 5108 5109 error = dmu_objset_hold(fullname, FTAG, &origin); 5110 if (error != ENOENT) 5111 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); 5112 5113 /* 5114 * Create snapshot, add temporary hold, verify that we can't 5115 * destroy a held snapshot, mark for deferred destroy, 5116 * release hold, verify snapshot was destroyed. 
5117 */ 5118 error = dmu_objset_snapshot_one(osname, snapname); 5119 if (error) { 5120 if (error == ENOSPC) { 5121 ztest_record_enospc("dmu_objset_snapshot"); 5122 goto out; 5123 } 5124 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); 5125 } 5126 5127 holds = fnvlist_alloc(); 5128 fnvlist_add_string(holds, fullname, tag); 5129 error = dsl_dataset_user_hold(holds, 0, NULL); 5130 fnvlist_free(holds); 5131 5132 if (error == ENOSPC) { 5133 ztest_record_enospc("dsl_dataset_user_hold"); 5134 goto out; 5135 } else if (error) { 5136 fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", 5137 fullname, tag, error); 5138 } 5139 5140 error = dsl_destroy_snapshot(fullname, B_FALSE); 5141 if (error != EBUSY) { 5142 fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", 5143 fullname, error); 5144 } 5145 5146 error = dsl_destroy_snapshot(fullname, B_TRUE); 5147 if (error) { 5148 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", 5149 fullname, error); 5150 } 5151 5152 error = user_release_one(fullname, tag); 5153 if (error) 5154 fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); 5155 5156 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); 5157 5158 out: 5159 rw_exit(&ztest_name_lock); 5160 } 5161 5162 /* 5163 * Inject random faults into the on-disk data. 
5164 */ 5165 /* ARGSUSED */ 5166 void 5167 ztest_fault_inject(ztest_ds_t *zd, uint64_t id) 5168 { 5169 ztest_shared_t *zs = ztest_shared; 5170 spa_t *spa = ztest_spa; 5171 int fd; 5172 uint64_t offset; 5173 uint64_t leaves; 5174 uint64_t bad = 0x1990c0ffeedecade; 5175 uint64_t top, leaf; 5176 char path0[MAXPATHLEN]; 5177 char pathrand[MAXPATHLEN]; 5178 size_t fsize; 5179 int bshift = SPA_MAXBLOCKSHIFT + 2; 5180 int iters = 1000; 5181 int maxfaults; 5182 int mirror_save; 5183 vdev_t *vd0 = NULL; 5184 uint64_t guid0 = 0; 5185 boolean_t islog = B_FALSE; 5186 5187 mutex_enter(&ztest_vdev_lock); 5188 5189 /* 5190 * Device removal is in progress, fault injection must be disabled 5191 * until it completes and the pool is scrubbed. The fault injection 5192 * strategy for damaging blocks does not take in to account evacuated 5193 * blocks which may have already been damaged. 5194 */ 5195 if (ztest_device_removal_active) { 5196 mutex_exit(&ztest_vdev_lock); 5197 return; 5198 } 5199 5200 maxfaults = MAXFAULTS(); 5201 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; 5202 mirror_save = zs->zs_mirrors; 5203 mutex_exit(&ztest_vdev_lock); 5204 5205 ASSERT(leaves >= 1); 5206 5207 /* 5208 * Grab the name lock as reader. There are some operations 5209 * which don't like to have their vdevs changed while 5210 * they are in progress (i.e. spa_change_guid). Those 5211 * operations will have grabbed the name lock as writer. 5212 */ 5213 rw_enter(&ztest_name_lock, RW_READER); 5214 5215 /* 5216 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. 5217 */ 5218 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5219 5220 if (ztest_random(2) == 0) { 5221 /* 5222 * Inject errors on a normal data device or slog device. 5223 */ 5224 top = ztest_random_vdev_top(spa, B_TRUE); 5225 leaf = ztest_random(leaves) + zs->zs_splits; 5226 5227 /* 5228 * Generate paths to the first leaf in this top-level vdev, 5229 * and to the random leaf we selected. 
We'll induce transient 5230 * write failures and random online/offline activity on leaf 0, 5231 * and we'll write random garbage to the randomly chosen leaf. 5232 */ 5233 (void) snprintf(path0, sizeof (path0), ztest_dev_template, 5234 ztest_opts.zo_dir, ztest_opts.zo_pool, 5235 top * leaves + zs->zs_splits); 5236 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, 5237 ztest_opts.zo_dir, ztest_opts.zo_pool, 5238 top * leaves + leaf); 5239 5240 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); 5241 if (vd0 != NULL && vd0->vdev_top->vdev_islog) 5242 islog = B_TRUE; 5243 5244 /* 5245 * If the top-level vdev needs to be resilvered 5246 * then we only allow faults on the device that is 5247 * resilvering. 5248 */ 5249 if (vd0 != NULL && maxfaults != 1 && 5250 (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || 5251 vd0->vdev_resilver_txg != 0)) { 5252 /* 5253 * Make vd0 explicitly claim to be unreadable, 5254 * or unwriteable, or reach behind its back 5255 * and close the underlying fd. We can do this if 5256 * maxfaults == 0 because we'll fail and reexecute, 5257 * and we can do it if maxfaults >= 2 because we'll 5258 * have enough redundancy. If maxfaults == 1, the 5259 * combination of this with injection of random data 5260 * corruption below exceeds the pool's fault tolerance. 5261 */ 5262 vdev_file_t *vf = vd0->vdev_tsd; 5263 5264 zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d", 5265 (long long)vd0->vdev_id, (int)maxfaults); 5266 5267 if (vf != NULL && ztest_random(3) == 0) { 5268 (void) close(vf->vf_vnode->v_fd); 5269 vf->vf_vnode->v_fd = -1; 5270 } else if (ztest_random(2) == 0) { 5271 vd0->vdev_cant_read = B_TRUE; 5272 } else { 5273 vd0->vdev_cant_write = B_TRUE; 5274 } 5275 guid0 = vd0->vdev_guid; 5276 } 5277 } else { 5278 /* 5279 * Inject errors on an l2cache device. 
5280 */ 5281 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5282 5283 if (sav->sav_count == 0) { 5284 spa_config_exit(spa, SCL_STATE, FTAG); 5285 rw_exit(&ztest_name_lock); 5286 return; 5287 } 5288 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; 5289 guid0 = vd0->vdev_guid; 5290 (void) strcpy(path0, vd0->vdev_path); 5291 (void) strcpy(pathrand, vd0->vdev_path); 5292 5293 leaf = 0; 5294 leaves = 1; 5295 maxfaults = INT_MAX; /* no limit on cache devices */ 5296 } 5297 5298 spa_config_exit(spa, SCL_STATE, FTAG); 5299 rw_exit(&ztest_name_lock); 5300 5301 /* 5302 * If we can tolerate two or more faults, or we're dealing 5303 * with a slog, randomly online/offline vd0. 5304 */ 5305 if ((maxfaults >= 2 || islog) && guid0 != 0) { 5306 if (ztest_random(10) < 6) { 5307 int flags = (ztest_random(2) == 0 ? 5308 ZFS_OFFLINE_TEMPORARY : 0); 5309 5310 /* 5311 * We have to grab the zs_name_lock as writer to 5312 * prevent a race between offlining a slog and 5313 * destroying a dataset. Offlining the slog will 5314 * grab a reference on the dataset which may cause 5315 * dmu_objset_destroy() to fail with EBUSY thus 5316 * leaving the dataset in an inconsistent state. 5317 */ 5318 if (islog) 5319 rw_enter(&ztest_name_lock, RW_WRITER); 5320 5321 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); 5322 5323 if (islog) 5324 rw_exit(&ztest_name_lock); 5325 } else { 5326 /* 5327 * Ideally we would like to be able to randomly 5328 * call vdev_[on|off]line without holding locks 5329 * to force unpredictable failures but the side 5330 * effects of vdev_[on|off]line prevent us from 5331 * doing so. We grab the ztest_vdev_lock here to 5332 * prevent a race between injection testing and 5333 * aux_vdev removal. 5334 */ 5335 mutex_enter(&ztest_vdev_lock); 5336 (void) vdev_online(spa, guid0, 0, NULL); 5337 mutex_exit(&ztest_vdev_lock); 5338 } 5339 } 5340 5341 if (maxfaults == 0) 5342 return; 5343 5344 /* 5345 * We have at least single-fault tolerance, so inject data corruption. 
5346 */ 5347 fd = open(pathrand, O_RDWR); 5348 5349 if (fd == -1) /* we hit a gap in the device namespace */ 5350 return; 5351 5352 fsize = lseek(fd, 0, SEEK_END); 5353 5354 while (--iters != 0) { 5355 /* 5356 * The offset must be chosen carefully to ensure that 5357 * we do not inject a given logical block with errors 5358 * on two different leaf devices, because ZFS can not 5359 * tolerate that (if maxfaults==1). 5360 * 5361 * We divide each leaf into chunks of size 5362 * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk 5363 * there is a series of ranges to which we can inject errors. 5364 * Each range can accept errors on only a single leaf vdev. 5365 * The error injection ranges are separated by ranges 5366 * which we will not inject errors on any device (DMZs). 5367 * Each DMZ must be large enough such that a single block 5368 * can not straddle it, so that a single block can not be 5369 * a target in two different injection ranges (on different 5370 * leaf vdevs). 5371 * 5372 * For example, with 3 leaves, each chunk looks like: 5373 * 0 to 32M: injection range for leaf 0 5374 * 32M to 64M: DMZ - no injection allowed 5375 * 64M to 96M: injection range for leaf 1 5376 * 96M to 128M: DMZ - no injection allowed 5377 * 128M to 160M: injection range for leaf 2 5378 * 160M to 192M: DMZ - no injection allowed 5379 */ 5380 offset = ztest_random(fsize / (leaves << bshift)) * 5381 (leaves << bshift) + (leaf << bshift) + 5382 (ztest_random(1ULL << (bshift - 1)) & -8ULL); 5383 5384 /* 5385 * Only allow damage to the labels at one end of the vdev. 5386 * 5387 * If all labels are damaged, the device will be totally 5388 * inaccessible, which will result in loss of data, 5389 * because we also damage (parts of) the other side of 5390 * the mirror/raidz. 5391 * 5392 * Additionally, we will always have both an even and an 5393 * odd label, so that we can handle crashes in the 5394 * middle of vdev_config_sync(). 
5395 */ 5396 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) 5397 continue; 5398 5399 /* 5400 * The two end labels are stored at the "end" of the disk, but 5401 * the end of the disk (vdev_psize) is aligned to 5402 * sizeof (vdev_label_t). 5403 */ 5404 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); 5405 if ((leaf & 1) == 1 && 5406 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) 5407 continue; 5408 5409 mutex_enter(&ztest_vdev_lock); 5410 if (mirror_save != zs->zs_mirrors) { 5411 mutex_exit(&ztest_vdev_lock); 5412 (void) close(fd); 5413 return; 5414 } 5415 5416 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) 5417 fatal(1, "can't inject bad word at 0x%llx in %s", 5418 offset, pathrand); 5419 5420 mutex_exit(&ztest_vdev_lock); 5421 5422 if (ztest_opts.zo_verbose >= 7) 5423 (void) printf("injected bad word into %s," 5424 " offset 0x%llx\n", pathrand, (u_longlong_t)offset); 5425 } 5426 5427 (void) close(fd); 5428 } 5429 5430 /* 5431 * Verify that DDT repair works as expected. 5432 */ 5433 void 5434 ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) 5435 { 5436 ztest_shared_t *zs = ztest_shared; 5437 spa_t *spa = ztest_spa; 5438 objset_t *os = zd->zd_os; 5439 ztest_od_t od[1]; 5440 uint64_t object, blocksize, txg, pattern, psize; 5441 enum zio_checksum checksum = spa_dedup_checksum(spa); 5442 dmu_buf_t *db; 5443 dmu_tx_t *tx; 5444 abd_t *abd; 5445 blkptr_t blk; 5446 int copies = 2 * ZIO_DEDUPDITTO_MIN; 5447 5448 blocksize = ztest_random_blocksize(); 5449 blocksize = MIN(blocksize, 2048); /* because we write so many */ 5450 5451 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 5452 0, 0); 5453 5454 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) 5455 return; 5456 5457 /* 5458 * Take the name lock as writer to prevent anyone else from changing 5459 * the pool and dataset properies we need to maintain during this test. 
5460 */ 5461 rw_enter(&ztest_name_lock, RW_WRITER); 5462 5463 if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, 5464 B_FALSE) != 0 || 5465 ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, 5466 B_FALSE) != 0) { 5467 rw_exit(&ztest_name_lock); 5468 return; 5469 } 5470 5471 dmu_objset_stats_t dds; 5472 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 5473 dmu_objset_fast_stat(os, &dds); 5474 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 5475 5476 object = od[0].od_object; 5477 blocksize = od[0].od_blocksize; 5478 pattern = zs->zs_guid ^ dds.dds_guid; 5479 5480 ASSERT(object != 0); 5481 5482 tx = dmu_tx_create(os); 5483 dmu_tx_hold_write(tx, object, 0, copies * blocksize); 5484 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); 5485 if (txg == 0) { 5486 rw_exit(&ztest_name_lock); 5487 return; 5488 } 5489 5490 /* 5491 * Write all the copies of our block. 5492 */ 5493 for (int i = 0; i < copies; i++) { 5494 uint64_t offset = i * blocksize; 5495 int error = dmu_buf_hold(os, object, offset, FTAG, &db, 5496 DMU_READ_NO_PREFETCH); 5497 if (error != 0) { 5498 fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", 5499 os, (long long)object, (long long) offset, error); 5500 } 5501 ASSERT(db->db_offset == offset); 5502 ASSERT(db->db_size == blocksize); 5503 ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || 5504 ztest_pattern_match(db->db_data, db->db_size, 0ULL)); 5505 dmu_buf_will_fill(db, tx); 5506 ztest_pattern_set(db->db_data, db->db_size, pattern); 5507 dmu_buf_rele(db, FTAG); 5508 } 5509 5510 dmu_tx_commit(tx); 5511 txg_wait_synced(spa_get_dsl(spa), txg); 5512 5513 /* 5514 * Find out what block we got. 5515 */ 5516 VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, 5517 DMU_READ_NO_PREFETCH)); 5518 blk = *((dmu_buf_impl_t *)db)->db_blkptr; 5519 dmu_buf_rele(db, FTAG); 5520 5521 /* 5522 * Damage the block. Dedup-ditto will save us when we read it later. 
5523 */ 5524 psize = BP_GET_PSIZE(&blk); 5525 abd = abd_alloc_linear(psize, B_TRUE); 5526 ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); 5527 5528 (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, 5529 abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, 5530 ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); 5531 5532 abd_free(abd); 5533 5534 rw_exit(&ztest_name_lock); 5535 } 5536 5537 /* 5538 * Scrub the pool. 5539 */ 5540 /* ARGSUSED */ 5541 void 5542 ztest_scrub(ztest_ds_t *zd, uint64_t id) 5543 { 5544 spa_t *spa = ztest_spa; 5545 5546 /* 5547 * Scrub in progress by device removal. 5548 */ 5549 if (ztest_device_removal_active) 5550 return; 5551 5552 (void) spa_scan(spa, POOL_SCAN_SCRUB); 5553 (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ 5554 (void) spa_scan(spa, POOL_SCAN_SCRUB); 5555 } 5556 5557 /* 5558 * Change the guid for the pool. 5559 */ 5560 /* ARGSUSED */ 5561 void 5562 ztest_reguid(ztest_ds_t *zd, uint64_t id) 5563 { 5564 spa_t *spa = ztest_spa; 5565 uint64_t orig, load; 5566 int error; 5567 5568 orig = spa_guid(spa); 5569 load = spa_load_guid(spa); 5570 5571 rw_enter(&ztest_name_lock, RW_WRITER); 5572 error = spa_change_guid(spa); 5573 rw_exit(&ztest_name_lock); 5574 5575 if (error != 0) 5576 return; 5577 5578 if (ztest_opts.zo_verbose >= 4) { 5579 (void) printf("Changed guid old %llu -> %llu\n", 5580 (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); 5581 } 5582 5583 VERIFY3U(orig, !=, spa_guid(spa)); 5584 VERIFY3U(load, ==, spa_load_guid(spa)); 5585 } 5586 5587 static vdev_t * 5588 ztest_random_concrete_vdev_leaf(vdev_t *vd) 5589 { 5590 if (vd == NULL) 5591 return (NULL); 5592 5593 if (vd->vdev_children == 0) 5594 return (vd); 5595 5596 vdev_t *eligible[vd->vdev_children]; 5597 int eligible_idx = 0, i; 5598 for (i = 0; i < vd->vdev_children; i++) { 5599 vdev_t *cvd = vd->vdev_child[i]; 5600 if (cvd->vdev_top->vdev_removing) 5601 continue; 5602 if (cvd->vdev_children > 0 || 5603 (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { 
5604 eligible[eligible_idx++] = cvd; 5605 } 5606 } 5607 VERIFY(eligible_idx > 0); 5608 5609 uint64_t child_no = ztest_random(eligible_idx); 5610 return (ztest_random_concrete_vdev_leaf(eligible[child_no])); 5611 } 5612 5613 /* ARGSUSED */ 5614 void 5615 ztest_initialize(ztest_ds_t *zd, uint64_t id) 5616 { 5617 spa_t *spa = ztest_spa; 5618 int error = 0; 5619 5620 mutex_enter(&ztest_vdev_lock); 5621 5622 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 5623 5624 /* Random leaf vdev */ 5625 vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); 5626 if (rand_vd == NULL) { 5627 spa_config_exit(spa, SCL_VDEV, FTAG); 5628 mutex_exit(&ztest_vdev_lock); 5629 return; 5630 } 5631 5632 /* 5633 * The random vdev we've selected may change as soon as we 5634 * drop the spa_config_lock. We create local copies of things 5635 * we're interested in. 5636 */ 5637 uint64_t guid = rand_vd->vdev_guid; 5638 char *path = strdup(rand_vd->vdev_path); 5639 boolean_t active = rand_vd->vdev_initialize_thread != NULL; 5640 5641 zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); 5642 spa_config_exit(spa, SCL_VDEV, FTAG); 5643 5644 uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); 5645 error = spa_vdev_initialize(spa, guid, cmd); 5646 switch (cmd) { 5647 case POOL_INITIALIZE_CANCEL: 5648 if (ztest_opts.zo_verbose >= 4) { 5649 (void) printf("Cancel initialize %s", path); 5650 if (!active) 5651 (void) printf(" failed (no initialize active)"); 5652 (void) printf("\n"); 5653 } 5654 break; 5655 case POOL_INITIALIZE_DO: 5656 if (ztest_opts.zo_verbose >= 4) { 5657 (void) printf("Start initialize %s", path); 5658 if (active && error == 0) 5659 (void) printf(" failed (already active)"); 5660 else if (error != 0) 5661 (void) printf(" failed (error %d)", error); 5662 (void) printf("\n"); 5663 } 5664 break; 5665 case POOL_INITIALIZE_SUSPEND: 5666 if (ztest_opts.zo_verbose >= 4) { 5667 (void) printf("Suspend initialize %s", path); 5668 if (!active) 5669 (void) printf(" failed (no initialize 
active)"); 5670 (void) printf("\n"); 5671 } 5672 break; 5673 } 5674 free(path); 5675 mutex_exit(&ztest_vdev_lock); 5676 } 5677 5678 /* 5679 * Verify pool integrity by running zdb. 5680 */ 5681 static void 5682 ztest_run_zdb(char *pool) 5683 { 5684 int status; 5685 char zdb[MAXPATHLEN + MAXNAMELEN + 20]; 5686 char zbuf[1024]; 5687 char *bin; 5688 char *ztest; 5689 char *isa; 5690 int isalen; 5691 FILE *fp; 5692 5693 (void) realpath(getexecname(), zdb); 5694 5695 /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ 5696 bin = strstr(zdb, "/usr/bin/"); 5697 ztest = strstr(bin, "/ztest"); 5698 isa = bin + 8; 5699 isalen = ztest - isa; 5700 isa = strdup(isa); 5701 /* LINTED */ 5702 (void) sprintf(bin, 5703 "/usr/sbin%.*s/zdb -bcc%s%s -G -d -U %s %s", 5704 isalen, 5705 isa, 5706 ztest_opts.zo_verbose >= 3 ? "s" : "", 5707 ztest_opts.zo_verbose >= 4 ? "v" : "", 5708 spa_config_path, 5709 pool); 5710 free(isa); 5711 5712 if (ztest_opts.zo_verbose >= 5) 5713 (void) printf("Executing %s\n", strstr(zdb, "zdb ")); 5714 5715 fp = popen(zdb, "r"); 5716 5717 while (fgets(zbuf, sizeof (zbuf), fp) != NULL) 5718 if (ztest_opts.zo_verbose >= 3) 5719 (void) printf("%s", zbuf); 5720 5721 status = pclose(fp); 5722 5723 if (status == 0) 5724 return; 5725 5726 ztest_dump_core = 0; 5727 if (WIFEXITED(status)) 5728 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); 5729 else 5730 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); 5731 } 5732 5733 static void 5734 ztest_walk_pool_directory(char *header) 5735 { 5736 spa_t *spa = NULL; 5737 5738 if (ztest_opts.zo_verbose >= 6) 5739 (void) printf("%s\n", header); 5740 5741 mutex_enter(&spa_namespace_lock); 5742 while ((spa = spa_next(spa)) != NULL) 5743 if (ztest_opts.zo_verbose >= 6) 5744 (void) printf("\t%s\n", spa_name(spa)); 5745 mutex_exit(&spa_namespace_lock); 5746 } 5747 5748 static void 5749 ztest_spa_import_export(char *oldname, char *newname) 5750 { 5751 nvlist_t *config, *newconfig; 5752 uint64_t pool_guid; 
5753 spa_t *spa; 5754 int error; 5755 5756 if (ztest_opts.zo_verbose >= 4) { 5757 (void) printf("import/export: old = %s, new = %s\n", 5758 oldname, newname); 5759 } 5760 5761 /* 5762 * Clean up from previous runs. 5763 */ 5764 (void) spa_destroy(newname); 5765 5766 /* 5767 * Get the pool's configuration and guid. 5768 */ 5769 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); 5770 5771 /* 5772 * Kick off a scrub to tickle scrub/export races. 5773 */ 5774 if (ztest_random(2) == 0) 5775 (void) spa_scan(spa, POOL_SCAN_SCRUB); 5776 5777 pool_guid = spa_guid(spa); 5778 spa_close(spa, FTAG); 5779 5780 ztest_walk_pool_directory("pools before export"); 5781 5782 /* 5783 * Export it. 5784 */ 5785 VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); 5786 5787 ztest_walk_pool_directory("pools after export"); 5788 5789 /* 5790 * Try to import it. 5791 */ 5792 newconfig = spa_tryimport(config); 5793 ASSERT(newconfig != NULL); 5794 nvlist_free(newconfig); 5795 5796 /* 5797 * Import it under the new name. 5798 */ 5799 error = spa_import(newname, config, NULL, 0); 5800 if (error != 0) { 5801 dump_nvlist(config, 0); 5802 fatal(B_FALSE, "couldn't import pool %s as %s: error %u", 5803 oldname, newname, error); 5804 } 5805 5806 ztest_walk_pool_directory("pools after import"); 5807 5808 /* 5809 * Try to import it again -- should fail with EEXIST. 5810 */ 5811 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); 5812 5813 /* 5814 * Try to import it under a different name -- should fail with EEXIST. 5815 */ 5816 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); 5817 5818 /* 5819 * Verify that the pool is no longer visible under the old name. 5820 */ 5821 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); 5822 5823 /* 5824 * Verify that we can open and close the pool using the new name. 
5825 */ 5826 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); 5827 ASSERT(pool_guid == spa_guid(spa)); 5828 spa_close(spa, FTAG); 5829 5830 nvlist_free(config); 5831 } 5832 5833 static void 5834 ztest_resume(spa_t *spa) 5835 { 5836 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) 5837 (void) printf("resuming from suspended state\n"); 5838 spa_vdev_state_enter(spa, SCL_NONE); 5839 vdev_clear(spa, NULL); 5840 (void) spa_vdev_state_exit(spa, NULL, 0); 5841 (void) zio_resume(spa); 5842 } 5843 5844 static void * 5845 ztest_resume_thread(void *arg) 5846 { 5847 spa_t *spa = arg; 5848 5849 while (!ztest_exiting) { 5850 if (spa_suspended(spa)) 5851 ztest_resume(spa); 5852 (void) poll(NULL, 0, 100); 5853 5854 /* 5855 * Periodically change the zfs_compressed_arc_enabled setting. 5856 */ 5857 if (ztest_random(10) == 0) 5858 zfs_compressed_arc_enabled = ztest_random(2); 5859 5860 /* 5861 * Periodically change the zfs_abd_scatter_enabled setting. 5862 */ 5863 if (ztest_random(10) == 0) 5864 zfs_abd_scatter_enabled = ztest_random(2); 5865 } 5866 return (NULL); 5867 } 5868 5869 static void * 5870 ztest_deadman_thread(void *arg) 5871 { 5872 ztest_shared_t *zs = arg; 5873 spa_t *spa = ztest_spa; 5874 hrtime_t delta, total = 0; 5875 5876 for (;;) { 5877 delta = zs->zs_thread_stop - zs->zs_thread_start + 5878 MSEC2NSEC(zfs_deadman_synctime_ms); 5879 5880 (void) poll(NULL, 0, (int)NSEC2MSEC(delta)); 5881 5882 /* 5883 * If the pool is suspended then fail immediately. Otherwise, 5884 * check to see if the pool is making any progress. If 5885 * vdev_deadman() discovers that there hasn't been any recent 5886 * I/Os then it will end up aborting the tests. 
5887 */ 5888 if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { 5889 fatal(0, "aborting test after %llu seconds because " 5890 "pool has transitioned to a suspended state.", 5891 zfs_deadman_synctime_ms / 1000); 5892 return (NULL); 5893 } 5894 vdev_deadman(spa->spa_root_vdev); 5895 5896 total += zfs_deadman_synctime_ms/1000; 5897 (void) printf("ztest has been running for %lld seconds\n", 5898 total); 5899 } 5900 } 5901 5902 static void 5903 ztest_execute(int test, ztest_info_t *zi, uint64_t id) 5904 { 5905 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; 5906 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); 5907 hrtime_t functime = gethrtime(); 5908 5909 for (int i = 0; i < zi->zi_iters; i++) 5910 zi->zi_func(zd, id); 5911 5912 functime = gethrtime() - functime; 5913 5914 atomic_add_64(&zc->zc_count, 1); 5915 atomic_add_64(&zc->zc_time, functime); 5916 5917 if (ztest_opts.zo_verbose >= 4) { 5918 Dl_info dli; 5919 (void) dladdr((void *)zi->zi_func, &dli); 5920 (void) printf("%6.2f sec in %s\n", 5921 (double)functime / NANOSEC, dli.dli_sname); 5922 } 5923 } 5924 5925 static void * 5926 ztest_thread(void *arg) 5927 { 5928 int rand; 5929 uint64_t id = (uintptr_t)arg; 5930 ztest_shared_t *zs = ztest_shared; 5931 uint64_t call_next; 5932 hrtime_t now; 5933 ztest_info_t *zi; 5934 ztest_shared_callstate_t *zc; 5935 5936 while ((now = gethrtime()) < zs->zs_thread_stop) { 5937 /* 5938 * See if it's time to force a crash. 5939 */ 5940 if (now > zs->zs_thread_kill) 5941 ztest_kill(zs); 5942 5943 /* 5944 * If we're getting ENOSPC with some regularity, stop. 5945 */ 5946 if (zs->zs_enospc_count > 10) 5947 break; 5948 5949 /* 5950 * Pick a random function to execute. 
5951 */ 5952 rand = ztest_random(ZTEST_FUNCS); 5953 zi = &ztest_info[rand]; 5954 zc = ZTEST_GET_SHARED_CALLSTATE(rand); 5955 call_next = zc->zc_next; 5956 5957 if (now >= call_next && 5958 atomic_cas_64(&zc->zc_next, call_next, call_next + 5959 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { 5960 ztest_execute(rand, zi, id); 5961 } 5962 } 5963 5964 return (NULL); 5965 } 5966 5967 static void 5968 ztest_dataset_name(char *dsname, char *pool, int d) 5969 { 5970 (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); 5971 } 5972 5973 static void 5974 ztest_dataset_destroy(int d) 5975 { 5976 char name[ZFS_MAX_DATASET_NAME_LEN]; 5977 5978 ztest_dataset_name(name, ztest_opts.zo_pool, d); 5979 5980 if (ztest_opts.zo_verbose >= 3) 5981 (void) printf("Destroying %s to free up space\n", name); 5982 5983 /* 5984 * Cleanup any non-standard clones and snapshots. In general, 5985 * ztest thread t operates on dataset (t % zopt_datasets), 5986 * so there may be more than one thing to clean up. 5987 */ 5988 for (int t = d; t < ztest_opts.zo_threads; 5989 t += ztest_opts.zo_datasets) { 5990 ztest_dsl_dataset_cleanup(name, t); 5991 } 5992 5993 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, 5994 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 5995 } 5996 5997 static void 5998 ztest_dataset_dirobj_verify(ztest_ds_t *zd) 5999 { 6000 uint64_t usedobjs, dirobjs, scratch; 6001 6002 /* 6003 * ZTEST_DIROBJ is the object directory for the entire dataset. 6004 * Therefore, the number of objects in use should equal the 6005 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. 6006 * If not, we have an object leak. 6007 * 6008 * Note that we can only check this in ztest_dataset_open(), 6009 * when the open-context and syncing-context values agree. 6010 * That's because zap_count() returns the open-context value, 6011 * while dmu_objset_space() returns the rootbp fill count. 
6012 */ 6013 VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); 6014 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); 6015 ASSERT3U(dirobjs + 1, ==, usedobjs); 6016 } 6017 6018 static int 6019 ztest_dataset_open(int d) 6020 { 6021 ztest_ds_t *zd = &ztest_ds[d]; 6022 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; 6023 objset_t *os; 6024 zilog_t *zilog; 6025 char name[ZFS_MAX_DATASET_NAME_LEN]; 6026 int error; 6027 6028 ztest_dataset_name(name, ztest_opts.zo_pool, d); 6029 6030 rw_enter(&ztest_name_lock, RW_READER); 6031 6032 error = ztest_dataset_create(name); 6033 if (error == ENOSPC) { 6034 rw_exit(&ztest_name_lock); 6035 ztest_record_enospc(FTAG); 6036 return (error); 6037 } 6038 ASSERT(error == 0 || error == EEXIST); 6039 6040 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os)); 6041 rw_exit(&ztest_name_lock); 6042 6043 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); 6044 6045 zilog = zd->zd_zilog; 6046 6047 if (zilog->zl_header->zh_claim_lr_seq != 0 && 6048 zilog->zl_header->zh_claim_lr_seq < committed_seq) 6049 fatal(0, "missing log records: claimed %llu < committed %llu", 6050 zilog->zl_header->zh_claim_lr_seq, committed_seq); 6051 6052 ztest_dataset_dirobj_verify(zd); 6053 6054 zil_replay(os, zd, ztest_replay_vector); 6055 6056 ztest_dataset_dirobj_verify(zd); 6057 6058 if (ztest_opts.zo_verbose >= 6) 6059 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", 6060 zd->zd_name, 6061 (u_longlong_t)zilog->zl_parse_blk_count, 6062 (u_longlong_t)zilog->zl_parse_lr_count, 6063 (u_longlong_t)zilog->zl_replaying_seq); 6064 6065 zilog = zil_open(os, ztest_get_data); 6066 6067 if (zilog->zl_replaying_seq != 0 && 6068 zilog->zl_replaying_seq < committed_seq) 6069 fatal(0, "missing log records: replayed %llu < committed %llu", 6070 zilog->zl_replaying_seq, committed_seq); 6071 6072 return (0); 6073 } 6074 6075 static void 6076 ztest_dataset_close(int d) 6077 { 6078 ztest_ds_t *zd = &ztest_ds[d]; 6079 6080 
zil_close(zd->zd_zilog); 6081 dmu_objset_disown(zd->zd_os, zd); 6082 6083 ztest_zd_fini(zd); 6084 } 6085 6086 /* 6087 * Kick off threads to run tests on all datasets in parallel. 6088 */ 6089 static void 6090 ztest_run(ztest_shared_t *zs) 6091 { 6092 thread_t *tid; 6093 spa_t *spa; 6094 objset_t *os; 6095 thread_t resume_tid; 6096 int error; 6097 6098 ztest_exiting = B_FALSE; 6099 6100 /* 6101 * Initialize parent/child shared state. 6102 */ 6103 mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); 6104 mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); 6105 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); 6106 6107 zs->zs_thread_start = gethrtime(); 6108 zs->zs_thread_stop = 6109 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; 6110 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); 6111 zs->zs_thread_kill = zs->zs_thread_stop; 6112 if (ztest_random(100) < ztest_opts.zo_killrate) { 6113 zs->zs_thread_kill -= 6114 ztest_random(ztest_opts.zo_passtime * NANOSEC); 6115 } 6116 6117 mutex_init(&zcl.zcl_callbacks_lock, NULL, USYNC_THREAD, NULL); 6118 6119 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), 6120 offsetof(ztest_cb_data_t, zcd_node)); 6121 6122 /* 6123 * Open our pool. 6124 */ 6125 kernel_init(FREAD | FWRITE); 6126 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6127 metaslab_preload_limit = ztest_random(20) + 1; 6128 ztest_spa = spa; 6129 6130 dmu_objset_stats_t dds; 6131 VERIFY0(dmu_objset_own(ztest_opts.zo_pool, 6132 DMU_OST_ANY, B_TRUE, FTAG, &os)); 6133 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 6134 dmu_objset_fast_stat(os, &dds); 6135 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 6136 zs->zs_guid = dds.dds_guid; 6137 dmu_objset_disown(os, FTAG); 6138 6139 spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; 6140 6141 /* 6142 * We don't expect the pool to suspend unless maxfaults == 0, 6143 * in which case ztest_fault_inject() temporarily takes away 6144 * the only valid replica. 
6145 */ 6146 if (MAXFAULTS() == 0) 6147 spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; 6148 else 6149 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; 6150 6151 /* 6152 * Create a thread to periodically resume suspended I/O. 6153 */ 6154 VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, 6155 &resume_tid) == 0); 6156 6157 /* 6158 * Create a deadman thread to abort() if we hang. 6159 */ 6160 VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, 6161 NULL) == 0); 6162 6163 /* 6164 * Verify that we can safely inquire about any object, 6165 * whether it's allocated or not. To make it interesting, 6166 * we probe a 5-wide window around each power of two. 6167 * This hits all edge cases, including zero and the max. 6168 */ 6169 for (int t = 0; t < 64; t++) { 6170 for (int d = -5; d <= 5; d++) { 6171 error = dmu_object_info(spa->spa_meta_objset, 6172 (1ULL << t) + d, NULL); 6173 ASSERT(error == 0 || error == ENOENT || 6174 error == EINVAL); 6175 } 6176 } 6177 6178 /* 6179 * If we got any ENOSPC errors on the previous run, destroy something. 6180 */ 6181 if (zs->zs_enospc_count != 0) { 6182 int d = ztest_random(ztest_opts.zo_datasets); 6183 ztest_dataset_destroy(d); 6184 } 6185 zs->zs_enospc_count = 0; 6186 6187 tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t), 6188 UMEM_NOFAIL); 6189 6190 if (ztest_opts.zo_verbose >= 4) 6191 (void) printf("starting main threads...\n"); 6192 6193 /* 6194 * Kick off all the tests that run in parallel. 6195 */ 6196 for (int t = 0; t < ztest_opts.zo_threads; t++) { 6197 if (t < ztest_opts.zo_datasets && 6198 ztest_dataset_open(t) != 0) 6199 return; 6200 VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, 6201 THR_BOUND, &tid[t]) == 0); 6202 } 6203 6204 /* 6205 * Wait for all of the tests to complete. We go in reverse order 6206 * so we don't close datasets while threads are still using them. 
6207 */ 6208 for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) { 6209 VERIFY(thr_join(tid[t], NULL, NULL) == 0); 6210 if (t < ztest_opts.zo_datasets) 6211 ztest_dataset_close(t); 6212 } 6213 6214 txg_wait_synced(spa_get_dsl(spa), 0); 6215 6216 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 6217 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); 6218 zfs_dbgmsg_print(FTAG); 6219 6220 umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t)); 6221 6222 /* Kill the resume thread */ 6223 ztest_exiting = B_TRUE; 6224 VERIFY(thr_join(resume_tid, NULL, NULL) == 0); 6225 ztest_resume(spa); 6226 6227 /* 6228 * Right before closing the pool, kick off a bunch of async I/O; 6229 * spa_close() should wait for it to complete. 6230 */ 6231 for (uint64_t object = 1; object < 50; object++) { 6232 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, 6233 ZIO_PRIORITY_SYNC_READ); 6234 } 6235 6236 spa_close(spa, FTAG); 6237 6238 /* 6239 * Verify that we can loop over all pools. 6240 */ 6241 mutex_enter(&spa_namespace_lock); 6242 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) 6243 if (ztest_opts.zo_verbose > 3) 6244 (void) printf("spa_next: found %s\n", spa_name(spa)); 6245 mutex_exit(&spa_namespace_lock); 6246 6247 /* 6248 * Verify that we can export the pool and reimport it under a 6249 * different name. 
6250 */ 6251 if (ztest_random(2) == 0) { 6252 char name[ZFS_MAX_DATASET_NAME_LEN]; 6253 (void) snprintf(name, sizeof (name), "%s_import", 6254 ztest_opts.zo_pool); 6255 ztest_spa_import_export(ztest_opts.zo_pool, name); 6256 ztest_spa_import_export(name, ztest_opts.zo_pool); 6257 } 6258 6259 kernel_fini(); 6260 6261 list_destroy(&zcl.zcl_callbacks); 6262 6263 mutex_destroy(&zcl.zcl_callbacks_lock); 6264 6265 rw_destroy(&ztest_name_lock); 6266 mutex_destroy(&ztest_vdev_lock); 6267 mutex_destroy(&ztest_checkpoint_lock); 6268 } 6269 6270 static void 6271 ztest_freeze(void) 6272 { 6273 ztest_ds_t *zd = &ztest_ds[0]; 6274 spa_t *spa; 6275 int numloops = 0; 6276 6277 if (ztest_opts.zo_verbose >= 3) 6278 (void) printf("testing spa_freeze()...\n"); 6279 6280 kernel_init(FREAD | FWRITE); 6281 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6282 VERIFY3U(0, ==, ztest_dataset_open(0)); 6283 ztest_spa = spa; 6284 6285 /* 6286 * Force the first log block to be transactionally allocated. 6287 * We have to do this before we freeze the pool -- otherwise 6288 * the log chain won't be anchored. 6289 */ 6290 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { 6291 ztest_dmu_object_alloc_free(zd, 0); 6292 zil_commit(zd->zd_zilog, 0); 6293 } 6294 6295 txg_wait_synced(spa_get_dsl(spa), 0); 6296 6297 /* 6298 * Freeze the pool. This stops spa_sync() from doing anything, 6299 * so that the only way to record changes from now on is the ZIL. 6300 */ 6301 spa_freeze(spa); 6302 6303 /* 6304 * Because it is hard to predict how much space a write will actually 6305 * require beforehand, we leave ourselves some fudge space to write over 6306 * capacity. 6307 */ 6308 uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; 6309 6310 /* 6311 * Run tests that generate log records but don't alter the pool config 6312 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). 
6313 * We do a txg_wait_synced() after each iteration to force the txg 6314 * to increase well beyond the last synced value in the uberblock. 6315 * The ZIL should be OK with that. 6316 * 6317 * Run a random number of times less than zo_maxloops and ensure we do 6318 * not run out of space on the pool. 6319 */ 6320 while (ztest_random(10) != 0 && 6321 numloops++ < ztest_opts.zo_maxloops && 6322 metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { 6323 ztest_od_t od; 6324 ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); 6325 VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); 6326 ztest_io(zd, od.od_object, 6327 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); 6328 txg_wait_synced(spa_get_dsl(spa), 0); 6329 } 6330 6331 /* 6332 * Commit all of the changes we just generated. 6333 */ 6334 zil_commit(zd->zd_zilog, 0); 6335 txg_wait_synced(spa_get_dsl(spa), 0); 6336 6337 /* 6338 * Close our dataset and close the pool. 6339 */ 6340 ztest_dataset_close(0); 6341 spa_close(spa, FTAG); 6342 kernel_fini(); 6343 6344 /* 6345 * Open and close the pool and dataset to induce log replay. 
6346 */ 6347 kernel_init(FREAD | FWRITE); 6348 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6349 ASSERT(spa_freeze_txg(spa) == UINT64_MAX); 6350 VERIFY3U(0, ==, ztest_dataset_open(0)); 6351 ztest_dataset_close(0); 6352 6353 ztest_spa = spa; 6354 txg_wait_synced(spa_get_dsl(spa), 0); 6355 ztest_reguid(NULL, 0); 6356 6357 spa_close(spa, FTAG); 6358 kernel_fini(); 6359 } 6360 6361 void 6362 print_time(hrtime_t t, char *timebuf) 6363 { 6364 hrtime_t s = t / NANOSEC; 6365 hrtime_t m = s / 60; 6366 hrtime_t h = m / 60; 6367 hrtime_t d = h / 24; 6368 6369 s -= m * 60; 6370 m -= h * 60; 6371 h -= d * 24; 6372 6373 timebuf[0] = '\0'; 6374 6375 if (d) 6376 (void) sprintf(timebuf, 6377 "%llud%02lluh%02llum%02llus", d, h, m, s); 6378 else if (h) 6379 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); 6380 else if (m) 6381 (void) sprintf(timebuf, "%llum%02llus", m, s); 6382 else 6383 (void) sprintf(timebuf, "%llus", s); 6384 } 6385 6386 static nvlist_t * 6387 make_random_props() 6388 { 6389 nvlist_t *props; 6390 6391 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); 6392 if (ztest_random(2) == 0) 6393 return (props); 6394 VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); 6395 6396 return (props); 6397 } 6398 6399 /* 6400 * Create a storage pool with the given name and initial vdev size. 6401 * Then test spa_freeze() functionality. 6402 */ 6403 static void 6404 ztest_init(ztest_shared_t *zs) 6405 { 6406 spa_t *spa; 6407 nvlist_t *nvroot, *props; 6408 6409 mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL); 6410 mutex_init(&ztest_checkpoint_lock, NULL, USYNC_THREAD, NULL); 6411 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL); 6412 6413 kernel_init(FREAD | FWRITE); 6414 6415 /* 6416 * Create the storage pool. 
6417 */ 6418 (void) spa_destroy(ztest_opts.zo_pool); 6419 ztest_shared->zs_vdev_next_leaf = 0; 6420 zs->zs_splits = 0; 6421 zs->zs_mirrors = ztest_opts.zo_mirrors; 6422 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, 6423 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); 6424 props = make_random_props(); 6425 for (int i = 0; i < SPA_FEATURES; i++) { 6426 char buf[1024]; 6427 (void) snprintf(buf, sizeof (buf), "feature@%s", 6428 spa_feature_table[i].fi_uname); 6429 VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); 6430 } 6431 VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL)); 6432 nvlist_free(nvroot); 6433 nvlist_free(props); 6434 6435 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); 6436 zs->zs_metaslab_sz = 6437 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; 6438 6439 spa_close(spa, FTAG); 6440 6441 kernel_fini(); 6442 6443 ztest_run_zdb(ztest_opts.zo_pool); 6444 6445 ztest_freeze(); 6446 6447 ztest_run_zdb(ztest_opts.zo_pool); 6448 6449 rw_destroy(&ztest_name_lock); 6450 mutex_destroy(&ztest_vdev_lock); 6451 mutex_destroy(&ztest_checkpoint_lock); 6452 } 6453 6454 static void 6455 setup_data_fd(void) 6456 { 6457 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; 6458 6459 ztest_fd_data = mkstemp(ztest_name_data); 6460 ASSERT3S(ztest_fd_data, >=, 0); 6461 (void) unlink(ztest_name_data); 6462 } 6463 6464 6465 static int 6466 shared_data_size(ztest_shared_hdr_t *hdr) 6467 { 6468 int size; 6469 6470 size = hdr->zh_hdr_size; 6471 size += hdr->zh_opts_size; 6472 size += hdr->zh_size; 6473 size += hdr->zh_stats_size * hdr->zh_stats_count; 6474 size += hdr->zh_ds_size * hdr->zh_ds_count; 6475 6476 return (size); 6477 } 6478 6479 static void 6480 setup_hdr(void) 6481 { 6482 int size; 6483 ztest_shared_hdr_t *hdr; 6484 6485 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 6486 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 6487 ASSERT(hdr != MAP_FAILED); 6488 6489 VERIFY3U(0, ==, 
ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); 6490 6491 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); 6492 hdr->zh_opts_size = sizeof (ztest_shared_opts_t); 6493 hdr->zh_size = sizeof (ztest_shared_t); 6494 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); 6495 hdr->zh_stats_count = ZTEST_FUNCS; 6496 hdr->zh_ds_size = sizeof (ztest_shared_ds_t); 6497 hdr->zh_ds_count = ztest_opts.zo_datasets; 6498 6499 size = shared_data_size(hdr); 6500 VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); 6501 6502 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 6503 } 6504 6505 static void 6506 setup_data(void) 6507 { 6508 int size, offset; 6509 ztest_shared_hdr_t *hdr; 6510 uint8_t *buf; 6511 6512 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), 6513 PROT_READ, MAP_SHARED, ztest_fd_data, 0); 6514 ASSERT(hdr != MAP_FAILED); 6515 6516 size = shared_data_size(hdr); 6517 6518 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); 6519 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), 6520 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); 6521 ASSERT(hdr != MAP_FAILED); 6522 buf = (uint8_t *)hdr; 6523 6524 offset = hdr->zh_hdr_size; 6525 ztest_shared_opts = (void *)&buf[offset]; 6526 offset += hdr->zh_opts_size; 6527 ztest_shared = (void *)&buf[offset]; 6528 offset += hdr->zh_size; 6529 ztest_shared_callstate = (void *)&buf[offset]; 6530 offset += hdr->zh_stats_size * hdr->zh_stats_count; 6531 ztest_shared_ds = (void *)&buf[offset]; 6532 } 6533 6534 static boolean_t 6535 exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) 6536 { 6537 pid_t pid; 6538 int status; 6539 char *cmdbuf = NULL; 6540 6541 pid = fork(); 6542 6543 if (cmd == NULL) { 6544 cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); 6545 (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); 6546 cmd = cmdbuf; 6547 } 6548 6549 if (pid == -1) 6550 fatal(1, "fork failed"); 6551 6552 if (pid == 0) { /* child */ 6553 char 
*emptyargv[2] = { cmd, NULL }; 6554 char fd_data_str[12]; 6555 6556 struct rlimit rl = { 1024, 1024 }; 6557 (void) setrlimit(RLIMIT_NOFILE, &rl); 6558 6559 (void) close(ztest_fd_rand); 6560 VERIFY3U(11, >=, 6561 snprintf(fd_data_str, 12, "%d", ztest_fd_data)); 6562 VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); 6563 6564 (void) enable_extended_FILE_stdio(-1, -1); 6565 if (libpath != NULL) 6566 VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); 6567 (void) execv(cmd, emptyargv); 6568 ztest_dump_core = B_FALSE; 6569 fatal(B_TRUE, "exec failed: %s", cmd); 6570 } 6571 6572 if (cmdbuf != NULL) { 6573 umem_free(cmdbuf, MAXPATHLEN); 6574 cmd = NULL; 6575 } 6576 6577 while (waitpid(pid, &status, 0) != pid) 6578 continue; 6579 if (statusp != NULL) 6580 *statusp = status; 6581 6582 if (WIFEXITED(status)) { 6583 if (WEXITSTATUS(status) != 0) { 6584 (void) fprintf(stderr, "child exited with code %d\n", 6585 WEXITSTATUS(status)); 6586 exit(2); 6587 } 6588 return (B_FALSE); 6589 } else if (WIFSIGNALED(status)) { 6590 if (!ignorekill || WTERMSIG(status) != SIGKILL) { 6591 (void) fprintf(stderr, "child died with signal %d\n", 6592 WTERMSIG(status)); 6593 exit(3); 6594 } 6595 return (B_TRUE); 6596 } else { 6597 (void) fprintf(stderr, "something strange happened to child\n"); 6598 exit(4); 6599 /* NOTREACHED */ 6600 } 6601 } 6602 6603 static void 6604 ztest_run_init(void) 6605 { 6606 ztest_shared_t *zs = ztest_shared; 6607 6608 ASSERT(ztest_opts.zo_init != 0); 6609 6610 /* 6611 * Blow away any existing copy of zpool.cache 6612 */ 6613 (void) remove(spa_config_path); 6614 6615 /* 6616 * Create and initialize our storage pool. 
6617 */ 6618 for (int i = 1; i <= ztest_opts.zo_init; i++) { 6619 bzero(zs, sizeof (ztest_shared_t)); 6620 if (ztest_opts.zo_verbose >= 3 && 6621 ztest_opts.zo_init != 1) { 6622 (void) printf("ztest_init(), pass %d\n", i); 6623 } 6624 ztest_init(zs); 6625 } 6626 } 6627 6628 int 6629 main(int argc, char **argv) 6630 { 6631 int kills = 0; 6632 int iters = 0; 6633 int older = 0; 6634 int newer = 0; 6635 ztest_shared_t *zs; 6636 ztest_info_t *zi; 6637 ztest_shared_callstate_t *zc; 6638 char timebuf[100]; 6639 char numbuf[NN_NUMBUF_SZ]; 6640 char *cmd; 6641 boolean_t hasalt; 6642 char *fd_data_str = getenv("ZTEST_FD_DATA"); 6643 6644 (void) setvbuf(stdout, NULL, _IOLBF, 0); 6645 6646 dprintf_setup(&argc, argv); 6647 zfs_deadman_synctime_ms = 300000; 6648 /* 6649 * As two-word space map entries may not come up often (especially 6650 * if pool and vdev sizes are small) we want to force at least some 6651 * of them so the feature get tested. 6652 */ 6653 zfs_force_some_double_word_sm_entries = B_TRUE; 6654 6655 ztest_fd_rand = open("/dev/urandom", O_RDONLY); 6656 ASSERT3S(ztest_fd_rand, >=, 0); 6657 6658 if (!fd_data_str) { 6659 process_options(argc, argv); 6660 6661 setup_data_fd(); 6662 setup_hdr(); 6663 setup_data(); 6664 bcopy(&ztest_opts, ztest_shared_opts, 6665 sizeof (*ztest_shared_opts)); 6666 } else { 6667 ztest_fd_data = atoi(fd_data_str); 6668 setup_data(); 6669 bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); 6670 } 6671 ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); 6672 6673 /* Override location of zpool.cache */ 6674 VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache", 6675 ztest_opts.zo_dir), !=, -1); 6676 6677 ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), 6678 UMEM_NOFAIL); 6679 zs = ztest_shared; 6680 6681 if (fd_data_str) { 6682 metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; 6683 metaslab_df_alloc_threshold = 6684 zs->zs_metaslab_df_alloc_threshold; 6685 6686 if 
(zs->zs_do_init) 6687 ztest_run_init(); 6688 else 6689 ztest_run(zs); 6690 exit(0); 6691 } 6692 6693 hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); 6694 6695 if (ztest_opts.zo_verbose >= 1) { 6696 (void) printf("%llu vdevs, %d datasets, %d threads," 6697 " %llu seconds...\n", 6698 (u_longlong_t)ztest_opts.zo_vdevs, 6699 ztest_opts.zo_datasets, 6700 ztest_opts.zo_threads, 6701 (u_longlong_t)ztest_opts.zo_time); 6702 } 6703 6704 cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); 6705 (void) strlcpy(cmd, getexecname(), MAXNAMELEN); 6706 6707 zs->zs_do_init = B_TRUE; 6708 if (strlen(ztest_opts.zo_alt_ztest) != 0) { 6709 if (ztest_opts.zo_verbose >= 1) { 6710 (void) printf("Executing older ztest for " 6711 "initialization: %s\n", ztest_opts.zo_alt_ztest); 6712 } 6713 VERIFY(!exec_child(ztest_opts.zo_alt_ztest, 6714 ztest_opts.zo_alt_libpath, B_FALSE, NULL)); 6715 } else { 6716 VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); 6717 } 6718 zs->zs_do_init = B_FALSE; 6719 6720 zs->zs_proc_start = gethrtime(); 6721 zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; 6722 6723 for (int f = 0; f < ZTEST_FUNCS; f++) { 6724 zi = &ztest_info[f]; 6725 zc = ZTEST_GET_SHARED_CALLSTATE(f); 6726 if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) 6727 zc->zc_next = UINT64_MAX; 6728 else 6729 zc->zc_next = zs->zs_proc_start + 6730 ztest_random(2 * zi->zi_interval[0] + 1); 6731 } 6732 6733 /* 6734 * Run the tests in a loop. These tests include fault injection 6735 * to verify that self-healing data works, and forced crashes 6736 * to verify that we never lose on-disk consistency. 6737 */ 6738 while (gethrtime() < zs->zs_proc_stop) { 6739 int status; 6740 boolean_t killed; 6741 6742 /* 6743 * Initialize the workload counters for each function. 
6744 */ 6745 for (int f = 0; f < ZTEST_FUNCS; f++) { 6746 zc = ZTEST_GET_SHARED_CALLSTATE(f); 6747 zc->zc_count = 0; 6748 zc->zc_time = 0; 6749 } 6750 6751 /* Set the allocation switch size */ 6752 zs->zs_metaslab_df_alloc_threshold = 6753 ztest_random(zs->zs_metaslab_sz / 4) + 1; 6754 6755 if (!hasalt || ztest_random(2) == 0) { 6756 if (hasalt && ztest_opts.zo_verbose >= 1) { 6757 (void) printf("Executing newer ztest: %s\n", 6758 cmd); 6759 } 6760 newer++; 6761 killed = exec_child(cmd, NULL, B_TRUE, &status); 6762 } else { 6763 if (hasalt && ztest_opts.zo_verbose >= 1) { 6764 (void) printf("Executing older ztest: %s\n", 6765 ztest_opts.zo_alt_ztest); 6766 } 6767 older++; 6768 killed = exec_child(ztest_opts.zo_alt_ztest, 6769 ztest_opts.zo_alt_libpath, B_TRUE, &status); 6770 } 6771 6772 if (killed) 6773 kills++; 6774 iters++; 6775 6776 if (ztest_opts.zo_verbose >= 1) { 6777 hrtime_t now = gethrtime(); 6778 6779 now = MIN(now, zs->zs_proc_stop); 6780 print_time(zs->zs_proc_stop - now, timebuf); 6781 nicenum(zs->zs_space, numbuf, sizeof (numbuf)); 6782 6783 (void) printf("Pass %3d, %8s, %3llu ENOSPC, " 6784 "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", 6785 iters, 6786 WIFEXITED(status) ? 
"Complete" : "SIGKILL", 6787 (u_longlong_t)zs->zs_enospc_count, 6788 100.0 * zs->zs_alloc / zs->zs_space, 6789 numbuf, 6790 100.0 * (now - zs->zs_proc_start) / 6791 (ztest_opts.zo_time * NANOSEC), timebuf); 6792 } 6793 6794 if (ztest_opts.zo_verbose >= 2) { 6795 (void) printf("\nWorkload summary:\n\n"); 6796 (void) printf("%7s %9s %s\n", 6797 "Calls", "Time", "Function"); 6798 (void) printf("%7s %9s %s\n", 6799 "-----", "----", "--------"); 6800 for (int f = 0; f < ZTEST_FUNCS; f++) { 6801 Dl_info dli; 6802 6803 zi = &ztest_info[f]; 6804 zc = ZTEST_GET_SHARED_CALLSTATE(f); 6805 print_time(zc->zc_time, timebuf); 6806 (void) dladdr((void *)zi->zi_func, &dli); 6807 (void) printf("%7llu %9s %s\n", 6808 (u_longlong_t)zc->zc_count, timebuf, 6809 dli.dli_sname); 6810 } 6811 (void) printf("\n"); 6812 } 6813 6814 ztest_run_zdb(ztest_opts.zo_pool); 6815 } 6816 6817 if (ztest_opts.zo_verbose >= 1) { 6818 if (hasalt) { 6819 (void) printf("%d runs of older ztest: %s\n", older, 6820 ztest_opts.zo_alt_ztest); 6821 (void) printf("%d runs of newer ztest: %s\n", newer, 6822 cmd); 6823 } 6824 (void) printf("%d killed, %d completed, %.0f%% kill rate\n", 6825 kills, iters - kills, (100.0 * kills) / MAX(1, iters)); 6826 } 6827 6828 umem_free(cmd, MAXNAMELEN); 6829 6830 return (0); 6831 } 6832