xref: /illumos-gate/usr/src/cmd/zinject/zinject.c (revision b853d39a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright (c) 2017, Intel Corporation.
25  */
26 
27 /*
28  * ZFS Fault Injector
29  *
30  * This userland component takes a set of options and uses libzpool to translate
31  * from a user-visible object type and name to an internal representation.
32  * There are two basic types of faults: device faults and data faults.
33  *
34  *
35  * DEVICE FAULTS
36  *
37  * Errors can be injected into a particular vdev using the '-d' option.  This
38  * option takes a path or vdev GUID to uniquely identify the device within a
39  * pool.  There are two types of errors that can be injected, EIO and ENXIO,
40  * that can be controlled through the '-e' option.  The default is ENXIO.  For
41  * EIO failures, any attempt to read data from the device will return EIO, but
42  * a subsequent attempt to reopen the device will succeed.  For ENXIO failures,
43  * any attempt to read from the device will return EIO, and any attempt to
44  * reopen the device will return ENXIO.
45  * For label faults, the -L option must be specified. This allows faults
46  * to be injected into either the nvlist, uberblock, pad1, or pad2 region
47  * of all the labels for the specified device.
48  *
49  * This form of the command looks like:
50  *
51  *	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
52  *
53  *
54  * DATA FAULTS
55  *
56  * We begin with a tuple of the form:
57  *
58  *	<type,level,range,object>
59  *
60  *	type	A string describing the type of data to target.  Each type
61  *		implicitly describes how to interpret 'object'. Currently,
62  *		the following values are supported:
63  *
64  *		data		User data for a file
65  *		dnode		Dnode for a file or directory
66  *
67  *		The following MOS objects are special.  Instead of injecting
68  *		errors on a particular object or blkid, we inject errors across
69  *		all objects of the given type.
70  *
71  *		mos		Any data in the MOS
72  *		mosdir		object directory
73  *		config		pool configuration
74  *		bpobj		blkptr list
75  *		spacemap	spacemap
76  *		metaslab	metaslab
77  *		errlog		persistent error log
78  *
79  *	level	Object level.  Defaults to '0', not applicable to all types.  If
80  *		a range is given, this corresponds to the indirect block
81  *		corresponding to the specific range.
82  *
83  *	range	A numerical range [start,end) within the object.  Defaults to
84  *		the full size of the file.
85  *
86  *	object	A string describing the logical location of the object.  For
87  *		files and directories (currently the only supported types),
88  *		this is the path of the object on disk.
89  *
90  * This is translated, via libzpool, into the following internal representation:
91  *
92  *	<type,objset,object,level,range>
93  *
94  * These types should be self-explanatory.  This tuple is then passed to the
95  * kernel via a special ioctl() to initiate fault injection for the given
96  * object.  Note that 'type' is not strictly necessary for fault injection, but
97  * is used when translating existing faults into a human-readable string.
98  *
99  *
100  * The command itself takes one of the forms:
101  *
102  *	zinject
103  *	zinject <-a | -u pool>
104  *	zinject -c <id|all>
105  *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
106  *	    [-r range] <object>
107  *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
108  *
109  * With no arguments, the command prints all currently registered injection
110  * handlers, with their numeric identifiers.
111  *
112  * The '-c' option will clear the given handler, or all handlers if 'all' is
113  * specified.
114  *
115  * The '-e' option takes a string describing the errno to simulate.  This must
116  * be one of 'io', 'checksum', or 'decrypt'.  In most cases this will result
117  * in the same behavior, but RAID-Z will produce a different set of ereports
118  * for this situation.
119  *
120  * The '-a', '-u', and '-m' flags toggle internal flush behavior.  If '-a' is
121  * specified, then the ARC cache is flushed appropriately.  If '-u' is
122  * specified, then the underlying SPA is unloaded.  Either of these flags can be
123  * specified independently of any other handlers.  The '-m' flag automatically
124  * does an unmount and remount of the underlying dataset to aid in flushing the
125  * cache.
126  *
127  * The '-f' flag controls the frequency of errors injected, expressed as a
128  * real number percentage between 0.0001 and 100.  The default is 100.
129  *
130  * The '-t' form is responsible for actually injecting the handler into the
131  * framework.  It takes the arguments described above, translates them to the
132  * internal tuple using libzpool, and then issues an ioctl() to register the
133  * handler.
134  *
135  * The final form can target a specific bookmark, regardless of whether a
136  * human-readable interface has been designed.  It allows developers to specify
137  * a particular block by number.
138  */
139 
140 #include <errno.h>
141 #include <fcntl.h>
142 #include <stdio.h>
143 #include <stdlib.h>
144 #include <strings.h>
145 #include <unistd.h>
146 
147 #include <sys/fs/zfs.h>
148 #include <sys/mount.h>
149 
150 #include <libzfs.h>
151 
152 #undef verify	/* both libzfs.h and zfs_context.h want to define this */
153 
154 #include "zinject.h"
155 
156 libzfs_handle_t *g_zfs;
157 int zfs_fd;
158 
159 #define	ECKSUM	EBADE
160 
161 static const char *errtable[TYPE_INVAL] = {
162 	"data",
163 	"dnode",
164 	"mos",
165 	"mosdir",
166 	"metaslab",
167 	"config",
168 	"bpobj",
169 	"spacemap",
170 	"errlog",
171 	"uber",
172 	"nvlist",
173 	"pad1",
174 	"pad2"
175 };
176 
177 static err_type_t
178 name_to_type(const char *arg)
179 {
180 	int i;
181 	for (i = 0; i < TYPE_INVAL; i++)
182 		if (strcmp(errtable[i], arg) == 0)
183 			return (i);
184 
185 	return (TYPE_INVAL);
186 }
187 
188 static const char *
189 type_to_name(uint64_t type)
190 {
191 	switch (type) {
192 	case DMU_OT_OBJECT_DIRECTORY:
193 		return ("mosdir");
194 	case DMU_OT_OBJECT_ARRAY:
195 		return ("metaslab");
196 	case DMU_OT_PACKED_NVLIST:
197 		return ("config");
198 	case DMU_OT_BPOBJ:
199 		return ("bpobj");
200 	case DMU_OT_SPACE_MAP:
201 		return ("spacemap");
202 	case DMU_OT_ERROR_LOG:
203 		return ("errlog");
204 	default:
205 		return ("-");
206 	}
207 }
208 
209 
/*
 * Print the usage message to stdout.
 *
 * The text is a single printf() of adjacent string literals, one literal per
 * output line (or part of a line).  The list of '-e' values shown for data
 * injection is kept in step with the values the option parser in main()
 * actually accepts.
 */
void
usage(void)
{
	(void) printf(
	    "usage:\n"
	    "\n"
	    "\tzinject\n"
	    "\n"
	    "\t\tList all active injection records.\n"
	    "\n"
	    "\tzinject -c <id|all>\n"
	    "\n"
	    "\t\tClear the particular record (if given a numeric ID), or\n"
	    "\t\tall records if 'all' is specified.\n"
	    "\n"
	    "\tzinject -p <function name> pool\n"
	    "\n"
	    "\t\tInject a panic fault at the specified function. Only \n"
	    "\t\tfunctions which call spa_vdev_config_exit(), or \n"
	    "\t\tspa_vdev_exit() will trigger a panic.\n"
	    "\n"
	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
	    "\t    [-T <read|write|free|claim|all>] [-f frequency] pool\n"
	    "\n"
	    "\t\tInject a fault into a particular device or the device's\n"
	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n"
	    "\t\t'pad1', or 'pad2'.\n"
	    "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
	    "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
	    "\t\tdevice error injection to a percentage of the IOs.\n"
	    "\n"
	    "\tzinject -d device -A <degrade|fault> pool\n"
	    "\n"
	    "\t\tPerform a specific action on a particular device\n"
	    "\n"
	    "\tzinject -d device -D latency:lanes pool\n"
	    "\n"
	    "\t\tAdd an artificial delay to IO requests on a particular\n"
	    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
	    "\t\tmilliseconds to complete. Each delay has an associated\n"
	    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
	    "\t\tIO requests that can be processed.\n"
	    "\n"
	    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
	    "\t\tthe device will only be able to service a single IO request\n"
	    "\t\tat a time with each request taking 10 ms to complete. So,\n"
	    "\t\tif only a single request is submitted every 10 ms, the\n"
	    "\t\taverage latency will be 10 ms; but if more than one request\n"
	    "\t\tis submitted every 10 ms, the average latency will be more\n"
	    "\t\tthan 10 ms.\n"
	    "\n"
	    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
	    "\t\tlanes (-D 10:2), then the device will be able to service\n"
	    "\t\ttwo requests at a time, each with a minimum latency of\n"
	    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
	    "\t\tthe average latency will be 10 ms; but if more than two\n"
	    "\t\trequests are submitted every 10 ms, the average latency\n"
	    "\t\twill be more than 10 ms.\n"
	    "\n"
	    "\t\tAlso note, these delays are additive. So two invocations\n"
	    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
	    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
	    "\t\tlanes with differing target latencies. For example, an\n"
	    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
	    "\n"
	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
	    "\n"
	    "\t\tCause the pool to stop writing blocks yet not\n"
	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
	    "\t\tthat fails to honor cache flush requests.\n"
	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
	    "\t\tat the end of the duration.\n"
	    "\n"
	    "\tzinject -b objset:object:level:blkid pool\n"
	    "\n"
	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
	    "\t\tspecified by the remaining tuple.  Each number is in\n"
	    "\t\thexadecimal, and only one block can be specified.\n"
	    "\n"
	    "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
	    "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
	    "\n"
	    "\t\tInject an error into the object specified by the '-t' option\n"
	    "\t\tand the object descriptor.  The 'object' parameter is\n"
	    "\t\tinterpreted depending on the '-t' option.\n"
	    "\n"
	    "\t\t-q\tQuiet mode.  Only print out the handler number added.\n"
	    "\t\t-e\tInject a specific error.  Must be one of 'io', "
	    "'checksum',\n"
	    "\t\t\tor 'decrypt'.  Default is 'io'.\n"
	    "\t\t-C\tInject the given error only into specific DVAs. The\n"
	    "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
	    "\t\t\tseparated by commas (ex. '0,2').\n"
	    "\t\t-l\tInject error at a particular block level. Default is "
	    "0.\n"
	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
	    "\t\t-r\tInject error over a particular logical range of an\n"
	    "\t\t\tobject.  Will be translated to the appropriate blkid\n"
	    "\t\t\trange according to the object's properties.\n"
	    "\t\t-a\tFlush the ARC cache.  Can be specified without any\n"
	    "\t\t\tassociated object.\n"
	    "\t\t-u\tUnload the associated pool.  Can be specified with only\n"
	    "\t\t\ta pool object.\n"
	    "\t\t-f\tOnly inject errors a fraction of the time.  Expressed as\n"
	    "\t\t\ta percentage between 0.0001 and 100.\n"
	    "\n"
	    "\t-t data\t\tInject an error into the plain file contents of a\n"
	    "\t\t\tfile.  The object must be specified as a complete path\n"
	    "\t\t\tto a file on a ZFS filesystem.\n"
	    "\n"
	    "\t-t dnode\tInject an error into the metadnode in the block\n"
	    "\t\t\tcorresponding to the dnode for a file or directory.  The\n"
	    "\t\t\t'-r' option is incompatible with this mode.  The object\n"
	    "\t\t\tis specified as a complete path to a file or directory\n"
	    "\t\t\ton a ZFS filesystem.\n"
	    "\n"
	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
	    "\t\t\tthe poolname.\n");
}
336 
337 static int
338 iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
339     void *data)
340 {
341 	zfs_cmd_t zc = { 0 };
342 	int ret;
343 
344 	while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
345 		if ((ret = func((int)zc.zc_guid, zc.zc_name,
346 		    &zc.zc_inject_record, data)) != 0)
347 			return (ret);
348 
349 	if (errno != ENOENT) {
350 		(void) fprintf(stderr, "Unable to list handlers: %s\n",
351 		    strerror(errno));
352 		return (-1);
353 	}
354 
355 	return (0);
356 }
357 
358 static int
359 print_data_handler(int id, const char *pool, zinject_record_t *record,
360     void *data)
361 {
362 	int *count = data;
363 
364 	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
365 		return (0);
366 
367 	if (*count == 0) {
368 		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  ",
369 		    "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
370 		    "LVL", "DVAs", "RANGE");
371 		(void) printf("---  ---------------  ------  "
372 		    "------  --------  ---  ---- ----------------\n");
373 	}
374 
375 	*count += 1;
376 
377 	(void) printf("%3d  %-15s  %-6llu  %-6llu  %-8s  %-3d  0x%02x  ",
378 	    id, pool, (u_longlong_t)record->zi_objset,
379 	    (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
380 	    record->zi_level, record->zi_dvas);
381 
382 	if (record->zi_start == 0 &&
383 	    record->zi_end == -1ULL)
384 		(void) printf("all\n");
385 	else
386 		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
387 		    (u_longlong_t)record->zi_end);
388 
389 	return (0);
390 }
391 
392 static int
393 print_device_handler(int id, const char *pool, zinject_record_t *record,
394     void *data)
395 {
396 	int *count = data;
397 
398 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
399 		return (0);
400 
401 	if (record->zi_cmd == ZINJECT_DELAY_IO)
402 		return (0);
403 
404 	if (*count == 0) {
405 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
406 		(void) printf("---  ---------------  ----------------\n");
407 	}
408 
409 	*count += 1;
410 
411 	(void) printf("%3d  %-15s  %llx\n", id, pool,
412 	    (u_longlong_t)record->zi_guid);
413 
414 	return (0);
415 }
416 
417 static int
418 print_delay_handler(int id, const char *pool, zinject_record_t *record,
419     void *data)
420 {
421 	int *count = data;
422 
423 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
424 		return (0);
425 
426 	if (record->zi_cmd != ZINJECT_DELAY_IO)
427 		return (0);
428 
429 	if (*count == 0) {
430 		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
431 		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
432 		(void) printf("---  ---------------  ---------------  "
433 		    "---------------  ----------------\n");
434 	}
435 
436 	*count += 1;
437 
438 	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
439 	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
440 	    (u_longlong_t)record->zi_nlanes,
441 	    (u_longlong_t)record->zi_guid);
442 
443 	return (0);
444 }
445 
446 static int
447 print_panic_handler(int id, const char *pool, zinject_record_t *record,
448     void *data)
449 {
450 	int *count = data;
451 
452 	if (record->zi_func[0] == '\0')
453 		return (0);
454 
455 	if (*count == 0) {
456 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
457 		(void) printf("---  ---------------  ----------------\n");
458 	}
459 
460 	*count += 1;
461 
462 	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
463 
464 	return (0);
465 }
466 
467 /*
468  * Print all registered error handlers.  Returns the number of handlers
469  * registered.
470  */
471 static int
472 print_all_handlers(void)
473 {
474 	int count = 0, total = 0;
475 
476 	(void) iter_handlers(print_device_handler, &count);
477 	if (count > 0) {
478 		total += count;
479 		(void) printf("\n");
480 		count = 0;
481 	}
482 
483 	(void) iter_handlers(print_delay_handler, &count);
484 	if (count > 0) {
485 		total += count;
486 		(void) printf("\n");
487 		count = 0;
488 	}
489 
490 	(void) iter_handlers(print_data_handler, &count);
491 	if (count > 0) {
492 		total += count;
493 		(void) printf("\n");
494 		count = 0;
495 	}
496 
497 	(void) iter_handlers(print_panic_handler, &count);
498 
499 	return (count + total);
500 }
501 
502 /* ARGSUSED */
503 static int
504 cancel_one_handler(int id, const char *pool, zinject_record_t *record,
505     void *data)
506 {
507 	zfs_cmd_t zc = { 0 };
508 
509 	zc.zc_guid = (uint64_t)id;
510 
511 	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
512 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
513 		    id, strerror(errno));
514 		return (1);
515 	}
516 
517 	return (0);
518 }
519 
520 /*
521  * Remove all fault injection handlers.
522  */
523 static int
524 cancel_all_handlers(void)
525 {
526 	int ret = iter_handlers(cancel_one_handler, NULL);
527 
528 	if (ret == 0)
529 		(void) printf("removed all registered handlers\n");
530 
531 	return (ret);
532 }
533 
534 /*
535  * Remove a specific fault injection handler.
536  */
537 static int
538 cancel_handler(int id)
539 {
540 	zfs_cmd_t zc = { 0 };
541 
542 	zc.zc_guid = (uint64_t)id;
543 
544 	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
545 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
546 		    id, strerror(errno));
547 		return (1);
548 	}
549 
550 	(void) printf("removed handler %d\n", id);
551 
552 	return (0);
553 }
554 
555 /*
556  * Register a new fault injection handler.
557  */
558 static int
559 register_handler(const char *pool, int flags, zinject_record_t *record,
560     int quiet)
561 {
562 	zfs_cmd_t zc = { 0 };
563 
564 	(void) strcpy(zc.zc_name, pool);
565 	zc.zc_inject_record = *record;
566 	zc.zc_guid = flags;
567 
568 	if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
569 		(void) fprintf(stderr, "failed to add handler: %s\n",
570 		    errno == EDOM ? "block level exceeds max level of object" :
571 		    strerror(errno));
572 		return (1);
573 	}
574 
575 	if (flags & ZINJECT_NULL)
576 		return (0);
577 
578 	if (quiet) {
579 		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
580 	} else {
581 		(void) printf("Added handler %llu with the following "
582 		    "properties:\n", (u_longlong_t)zc.zc_guid);
583 		(void) printf("  pool: %s\n", pool);
584 		if (record->zi_guid) {
585 			(void) printf("  vdev: %llx\n",
586 			    (u_longlong_t)record->zi_guid);
587 		} else if (record->zi_func[0] != '\0') {
588 			(void) printf("  panic function: %s\n",
589 			    record->zi_func);
590 		} else if (record->zi_duration > 0) {
591 			(void) printf(" time: %lld seconds\n",
592 			    (u_longlong_t)record->zi_duration);
593 		} else if (record->zi_duration < 0) {
594 			(void) printf(" txgs: %lld \n",
595 			    (u_longlong_t)-record->zi_duration);
596 		} else {
597 			(void) printf("objset: %llu\n",
598 			    (u_longlong_t)record->zi_objset);
599 			(void) printf("object: %llu\n",
600 			    (u_longlong_t)record->zi_object);
601 			(void) printf("  type: %llu\n",
602 			    (u_longlong_t)record->zi_type);
603 			(void) printf(" level: %d\n", record->zi_level);
604 			if (record->zi_start == 0 &&
605 			    record->zi_end == -1ULL)
606 				(void) printf(" range: all\n");
607 			else
608 				(void) printf(" range: [%llu, %llu)\n",
609 				    (u_longlong_t)record->zi_start,
610 				    (u_longlong_t)record->zi_end);
611 			(void) printf("  dvas: 0x%x\n", record->zi_dvas);
612 		}
613 	}
614 
615 	return (0);
616 }
617 
618 int
619 perform_action(const char *pool, zinject_record_t *record, int cmd)
620 {
621 	zfs_cmd_t zc = { 0 };
622 
623 	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
624 	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
625 	zc.zc_guid = record->zi_guid;
626 	zc.zc_cookie = cmd;
627 
628 	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
629 		return (0);
630 
631 	return (1);
632 }
633 
/*
 * Parse a '-D' argument of the form "<milliseconds>:<lanes>".
 *
 * Both fields must be plain unsigned decimal numbers with nothing before,
 * between (other than the ':') or after them.  On success the delay,
 * converted to nanoseconds, is stored in *delay, the lane count in *nlanes,
 * and 0 is returned.  Any malformed input, a zero delay, or a delay too
 * large to express in nanoseconds yields 1 and leaves the outputs untouched.
 */
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
	/*
	 * Milliseconds-to-nanoseconds factor.  Kept as an unsigned constant
	 * so the overflow check and the multiplication agree.
	 */
	const uint64_t nsec_per_msec = 1000000ULL;
	unsigned long long scan_delay;
	unsigned long long scan_nlanes;
	char *lanes_str;
	char *end;

	/* insist on a digit so strtoull() cannot accept a sign or blanks */
	if (str == NULL || *str < '0' || *str > '9')
		return (1);

	errno = 0;
	scan_delay = strtoull(str, &end, 10);
	if (errno != 0 || *end != ':')
		return (1);

	lanes_str = end + 1;
	if (*lanes_str < '0' || *lanes_str > '9')
		return (1);

	errno = 0;
	scan_nlanes = strtoull(lanes_str, &end, 10);
	if (errno != 0 || *end != '\0')
		return (1);

	/*
	 * We explicitly disallow a delay of zero here, because we key
	 * off this value being non-zero in translate_device(), to
	 * determine if the fault is a ZINJECT_DELAY_IO fault or not.
	 */
	if (scan_delay == 0)
		return (1);

	/* reject delays whose nanosecond value would wrap */
	if (scan_delay > (uint64_t)-1 / nsec_per_msec)
		return (1);

	/*
	 * The units for the CLI delay parameter is milliseconds, but
	 * the data passed to the kernel is interpreted as nanoseconds.
	 * Thus we scale the milliseconds to nanoseconds here, and this
	 * nanosecond value is used to pass the delay to the kernel.
	 */
	*delay = (uint64_t)scan_delay * nsec_per_msec;
	*nlanes = (uint64_t)scan_nlanes;

	return (0);
}
662 
663 static int
664 parse_frequency(const char *str, uint32_t *percent)
665 {
666 	double val;
667 	char *post;
668 
669 	val = strtod(str, &post);
670 	if (post == NULL || *post != '\0')
671 		return (EINVAL);
672 
673 	/* valid range is [0.0001, 100.0] */
674 	val /= 100.0f;
675 	if (val < 0.000001f || val > 1.0f)
676 		return (ERANGE);
677 
678 	/* convert to an integer for use by kernel */
679 	*percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX));
680 
681 	return (0);
682 }
683 
/*
 * This function converts a string specifier for DVAs into a bit mask.
 * The dva's provided by the user should be 0 indexed and separated by
 * a comma. For example:
 *     "1"     -> 0b0010  (0x2)
 *     "0,1"   -> 0b0011  (0x3)
 *     "0,1,2" -> 0b0111  (0x7)
 *
 * The string must be a strict list: it starts and ends with an index,
 * indices are '0', '1' or '2', each appears at most once, and exactly one
 * comma separates neighbours.  Returns 0 and stores the mask in *dvas_out
 * on success; returns EINVAL, leaving *dvas_out untouched, otherwise.
 */
static int
parse_dvas(const char *str, uint32_t *dvas_out)
{
	const char *c;
	uint32_t mask = 0;
	uint32_t bit;
	int need_index = 1;	/* the next character must be a DVA index */
	size_t len = strlen(str);

	/* max string length is 5 ("0,1,2") */
	if (len == 0 || len > 5)
		return (EINVAL);

	for (c = str; *c != '\0'; c++) {
		if (*c == ',') {
			/* a comma is only valid directly after an index */
			if (need_index)
				return (EINVAL);
			need_index = 1;
			continue;
		}

		/* check for invalid character */
		if (*c < '0' || *c > '2')
			return (EINVAL);

		/* check for missing comma between DVAs */
		if (!need_index)
			return (EINVAL);

		/* check if this DVA has been set already */
		bit = 1U << (*c - '0');
		if (mask & bit)
			return (EINVAL);

		mask |= bit;
		need_index = 0;
	}

	/* check for dangling delimiter */
	if (need_index)
		return (EINVAL);

	*dvas_out = mask;
	return (0);
}
736 
737 int
738 main(int argc, char **argv)
739 {
740 	int c;
741 	char *range = NULL;
742 	char *cancel = NULL;
743 	char *end;
744 	char *raw = NULL;
745 	char *device = NULL;
746 	int level = 0;
747 	int quiet = 0;
748 	int error = 0;
749 	int domount = 0;
750 	int io_type = ZIO_TYPES;
751 	int action = VDEV_STATE_UNKNOWN;
752 	err_type_t type = TYPE_INVAL;
753 	err_type_t label = TYPE_INVAL;
754 	zinject_record_t record = { 0 };
755 	char pool[MAXNAMELEN];
756 	char dataset[MAXNAMELEN];
757 	zfs_handle_t *zhp;
758 	int nowrites = 0;
759 	int dur_txg = 0;
760 	int dur_secs = 0;
761 	int ret;
762 	int flags = 0;
763 	uint32_t dvas = 0;
764 
765 	if ((g_zfs = libzfs_init()) == NULL) {
766 		(void) fprintf(stderr, "internal error: failed to "
767 		    "initialize ZFS library\n");
768 		return (1);
769 	}
770 
771 	libzfs_print_on_error(g_zfs, B_TRUE);
772 
773 	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
774 		(void) fprintf(stderr, "failed to open ZFS device\n");
775 		return (1);
776 	}
777 
778 	if (argc == 1) {
779 		/*
780 		 * No arguments.  Print the available handlers.  If there are no
781 		 * available handlers, direct the user to '-h' for help
782 		 * information.
783 		 */
784 		if (print_all_handlers() == 0) {
785 			(void) printf("No handlers registered.\n");
786 			(void) printf("Run 'zinject -h' for usage "
787 			    "information.\n");
788 		}
789 
790 		return (0);
791 	}
792 
793 	while ((c = getopt(argc, argv,
794 	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
795 		switch (c) {
796 		case 'a':
797 			flags |= ZINJECT_FLUSH_ARC;
798 			break;
799 		case 'A':
800 			if (strcasecmp(optarg, "degrade") == 0) {
801 				action = VDEV_STATE_DEGRADED;
802 			} else if (strcasecmp(optarg, "fault") == 0) {
803 				action = VDEV_STATE_FAULTED;
804 			} else {
805 				(void) fprintf(stderr, "invalid action '%s': "
806 				    "must be 'degrade' or 'fault'\n", optarg);
807 				usage();
808 				return (1);
809 			}
810 			break;
811 		case 'b':
812 			raw = optarg;
813 			break;
814 		case 'c':
815 			cancel = optarg;
816 			break;
817 		case 'C':
818 			ret = parse_dvas(optarg, &dvas);
819 			if (ret != 0) {
820 				(void) fprintf(stderr, "invalid DVA list '%s': "
821 				    "DVAs should be 0 indexed and separated by "
822 				    "commas.\n", optarg);
823 				usage();
824 				libzfs_fini(g_zfs);
825 				return (1);
826 			}
827 			break;
828 		case 'd':
829 			device = optarg;
830 			break;
831 		case 'D':
832 			ret = parse_delay(optarg, &record.zi_timer,
833 			    &record.zi_nlanes);
834 			if (ret != 0) {
835 				(void) fprintf(stderr, "invalid i/o delay "
836 				    "value: '%s'\n", optarg);
837 				usage();
838 				return (1);
839 			}
840 			break;
841 		case 'e':
842 			if (strcasecmp(optarg, "io") == 0) {
843 				error = EIO;
844 			} else if (strcasecmp(optarg, "checksum") == 0) {
845 				error = ECKSUM;
846 			} else if (strcasecmp(optarg, "decrypt") == 0) {
847 				error = EACCES;
848 			} else if (strcasecmp(optarg, "nxio") == 0) {
849 				error = ENXIO;
850 			} else if (strcasecmp(optarg, "dtl") == 0) {
851 				error = ECHILD;
852 			} else {
853 				(void) fprintf(stderr, "invalid error type "
854 				    "'%s': must be 'io', 'checksum' or "
855 				    "'nxio'\n", optarg);
856 				usage();
857 				return (1);
858 			}
859 			break;
860 		case 'f':
861 			ret = parse_frequency(optarg, &record.zi_freq);
862 			if (ret != 0) {
863 				(void) fprintf(stderr, "%sfrequency value must "
864 				    "be in the range [0.0001, 100.0]\n",
865 				    ret == EINVAL ? "invalid value: " :
866 				    ret == ERANGE ? "out of range: " : "");
867 				return (1);
868 			}
869 			break;
870 		case 'F':
871 			record.zi_failfast = B_TRUE;
872 			break;
873 		case 'g':
874 			dur_txg = 1;
875 			record.zi_duration = (int)strtol(optarg, &end, 10);
876 			if (record.zi_duration <= 0 || *end != '\0') {
877 				(void) fprintf(stderr, "invalid duration '%s': "
878 				    "must be a positive integer\n", optarg);
879 				usage();
880 				return (1);
881 			}
882 			/* store duration of txgs as its negative */
883 			record.zi_duration *= -1;
884 			break;
885 		case 'h':
886 			usage();
887 			return (0);
888 		case 'I':
889 			/* default duration, if one hasn't yet been defined */
890 			nowrites = 1;
891 			if (dur_secs == 0 && dur_txg == 0)
892 				record.zi_duration = 30;
893 			break;
894 		case 'l':
895 			level = (int)strtol(optarg, &end, 10);
896 			if (*end != '\0') {
897 				(void) fprintf(stderr, "invalid level '%s': "
898 				    "must be an integer\n", optarg);
899 				usage();
900 				return (1);
901 			}
902 			break;
903 		case 'm':
904 			domount = 1;
905 			break;
906 		case 'p':
907 			(void) strlcpy(record.zi_func, optarg,
908 			    sizeof (record.zi_func));
909 			record.zi_cmd = ZINJECT_PANIC;
910 			break;
911 		case 'q':
912 			quiet = 1;
913 			break;
914 		case 'r':
915 			range = optarg;
916 			flags |= ZINJECT_CALC_RANGE;
917 			break;
918 		case 's':
919 			dur_secs = 1;
920 			record.zi_duration = (int)strtol(optarg, &end, 10);
921 			if (record.zi_duration <= 0 || *end != '\0') {
922 				(void) fprintf(stderr, "invalid duration '%s': "
923 				    "must be a positive integer\n", optarg);
924 				usage();
925 				return (1);
926 			}
927 			break;
928 		case 'T':
929 			if (strcasecmp(optarg, "read") == 0) {
930 				io_type = ZIO_TYPE_READ;
931 			} else if (strcasecmp(optarg, "write") == 0) {
932 				io_type = ZIO_TYPE_WRITE;
933 			} else if (strcasecmp(optarg, "free") == 0) {
934 				io_type = ZIO_TYPE_FREE;
935 			} else if (strcasecmp(optarg, "claim") == 0) {
936 				io_type = ZIO_TYPE_CLAIM;
937 			} else if (strcasecmp(optarg, "all") == 0) {
938 				io_type = ZIO_TYPES;
939 			} else {
940 				(void) fprintf(stderr, "invalid I/O type "
941 				    "'%s': must be 'read', 'write', 'free', "
942 				    "'claim' or 'all'\n", optarg);
943 				usage();
944 				return (1);
945 			}
946 			break;
947 		case 't':
948 			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
949 			    !MOS_TYPE(type)) {
950 				(void) fprintf(stderr, "invalid type '%s'\n",
951 				    optarg);
952 				usage();
953 				return (1);
954 			}
955 			break;
956 		case 'u':
957 			flags |= ZINJECT_UNLOAD_SPA;
958 			break;
959 		case 'L':
960 			if ((label = name_to_type(optarg)) == TYPE_INVAL &&
961 			    !LABEL_TYPE(type)) {
962 				(void) fprintf(stderr, "invalid label type "
963 				    "'%s'\n", optarg);
964 				usage();
965 				return (1);
966 			}
967 			break;
968 		case ':':
969 			(void) fprintf(stderr, "option -%c requires an "
970 			    "operand\n", optopt);
971 			usage();
972 			return (1);
973 		case '?':
974 			(void) fprintf(stderr, "invalid option '%c'\n",
975 			    optopt);
976 			usage();
977 			return (2);
978 		}
979 	}
980 
981 	argc -= optind;
982 	argv += optind;
983 
984 	if (record.zi_duration != 0)
985 		record.zi_cmd = ZINJECT_IGNORED_WRITES;
986 
987 	if (cancel != NULL) {
988 		/*
989 		 * '-c' is invalid with any other options.
990 		 */
991 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
992 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
993 		    record.zi_freq > 0 || dvas != 0) {
994 			(void) fprintf(stderr, "cancel (-c) incompatible with "
995 			    "any other options\n");
996 			usage();
997 			return (2);
998 		}
999 		if (argc != 0) {
1000 			(void) fprintf(stderr, "extraneous argument to '-c'\n");
1001 			usage();
1002 			return (2);
1003 		}
1004 
1005 		if (strcmp(cancel, "all") == 0) {
1006 			return (cancel_all_handlers());
1007 		} else {
1008 			int id = (int)strtol(cancel, &end, 10);
1009 			if (*end != '\0') {
1010 				(void) fprintf(stderr, "invalid handle id '%s':"
1011 				    " must be an integer or 'all'\n", cancel);
1012 				usage();
1013 				return (1);
1014 			}
1015 			return (cancel_handler(id));
1016 		}
1017 	}
1018 
1019 	if (device != NULL) {
1020 		/*
1021 		 * Device (-d) injection uses a completely different mechanism
1022 		 * for doing injection, so handle it separately here.
1023 		 */
1024 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1025 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
1026 		    dvas != 0) {
1027 			(void) fprintf(stderr, "device (-d) incompatible with "
1028 			    "data error injection\n");
1029 			usage();
1030 			return (2);
1031 		}
1032 
1033 		if (argc != 1) {
1034 			(void) fprintf(stderr, "device (-d) injection requires "
1035 			    "a single pool name\n");
1036 			usage();
1037 			return (2);
1038 		}
1039 
1040 		(void) strcpy(pool, argv[0]);
1041 		dataset[0] = '\0';
1042 
1043 		if (error == ECKSUM) {
1044 			(void) fprintf(stderr, "device error type must be "
1045 			    "'io' or 'nxio'\n");
1046 			return (1);
1047 		}
1048 
1049 		record.zi_iotype = io_type;
1050 		if (translate_device(pool, device, label, &record) != 0)
1051 			return (1);
1052 		if (!error)
1053 			error = ENXIO;
1054 
1055 		if (action != VDEV_STATE_UNKNOWN)
1056 			return (perform_action(pool, &record, action));
1057 
1058 	} else if (raw != NULL) {
1059 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
1060 		    record.zi_cmd != ZINJECT_UNINITIALIZED ||
1061 		    record.zi_freq > 0 || dvas != 0) {
1062 			(void) fprintf(stderr, "raw (-b) format with "
1063 			    "any other options\n");
1064 			usage();
1065 			return (2);
1066 		}
1067 
1068 		if (argc != 1) {
1069 			(void) fprintf(stderr, "raw (-b) format expects a "
1070 			    "single pool name\n");
1071 			usage();
1072 			return (2);
1073 		}
1074 
1075 		(void) strcpy(pool, argv[0]);
1076 		dataset[0] = '\0';
1077 
1078 		if (error == ENXIO) {
1079 			(void) fprintf(stderr, "data error type must be "
1080 			    "'checksum' or 'io'\n");
1081 			return (1);
1082 		}
1083 
1084 		record.zi_cmd = ZINJECT_DATA_FAULT;
1085 		if (translate_raw(raw, &record) != 0)
1086 			return (1);
1087 		if (!error)
1088 			error = EIO;
1089 	} else if (record.zi_cmd == ZINJECT_PANIC) {
1090 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1091 		    level != 0 || device != NULL || record.zi_freq > 0 ||
1092 		    dvas != 0) {
1093 			(void) fprintf(stderr, "panic (-p) incompatible with "
1094 			    "other options\n");
1095 			usage();
1096 			return (2);
1097 		}
1098 
1099 		if (argc < 1 || argc > 2) {
1100 			(void) fprintf(stderr, "panic (-p) injection requires "
1101 			    "a single pool name and an optional id\n");
1102 			usage();
1103 			return (2);
1104 		}
1105 
1106 		(void) strcpy(pool, argv[0]);
1107 		if (argv[1] != NULL)
1108 			record.zi_type = atoi(argv[1]);
1109 		dataset[0] = '\0';
1110 	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
1111 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1112 		    level != 0 || record.zi_freq > 0 || dvas != 0) {
1113 			(void) fprintf(stderr, "hardware failure (-I) "
1114 			    "incompatible with other options\n");
1115 			usage();
1116 			libzfs_fini(g_zfs);
1117 			return (2);
1118 		}
1119 
1120 		if (nowrites == 0) {
1121 			(void) fprintf(stderr, "-s or -g meaningless "
1122 			    "without -I (ignore writes)\n");
1123 			usage();
1124 			return (2);
1125 		} else if (dur_secs && dur_txg) {
1126 			(void) fprintf(stderr, "choose a duration either "
1127 			    "in seconds (-s) or a number of txgs (-g) "
1128 			    "but not both\n");
1129 			usage();
1130 			return (2);
1131 		} else if (argc != 1) {
1132 			(void) fprintf(stderr, "ignore writes (-I) "
1133 			    "injection requires a single pool name\n");
1134 			usage();
1135 			return (2);
1136 		}
1137 
1138 		(void) strcpy(pool, argv[0]);
1139 		dataset[0] = '\0';
1140 	} else if (type == TYPE_INVAL) {
1141 		if (flags == 0) {
1142 			(void) fprintf(stderr, "at least one of '-b', '-d', "
1143 			    "'-t', '-a', '-p', '-I' or '-u' "
1144 			    "must be specified\n");
1145 			usage();
1146 			return (2);
1147 		}
1148 
1149 		if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
1150 			(void) strcpy(pool, argv[0]);
1151 			dataset[0] = '\0';
1152 		} else if (argc != 0) {
1153 			(void) fprintf(stderr, "extraneous argument for "
1154 			    "'-f'\n");
1155 			usage();
1156 			return (2);
1157 		}
1158 
1159 		flags |= ZINJECT_NULL;
1160 	} else {
1161 		if (argc != 1) {
1162 			(void) fprintf(stderr, "missing object\n");
1163 			usage();
1164 			return (2);
1165 		}
1166 
1167 		if (error == ENXIO) {
1168 			(void) fprintf(stderr, "data error type must be "
1169 			    "'checksum' or 'io'\n");
1170 			return (1);
1171 		}
1172 
1173 		if (dvas != 0) {
1174 			if (error == EACCES || error == EINVAL) {
1175 				(void) fprintf(stderr, "the '-C' option may "
1176 				    "not be used with logical data errors "
1177 				    "'decrypt' and 'decompress'\n");
1178 				record.zi_dvas = dvas;
1179 			}
1180 		}
1181 
1182 		record.zi_cmd = ZINJECT_DATA_FAULT;
1183 
1184 		if (error == EACCES) {
1185 			if (type != TYPE_DATA) {
1186 				(void) fprintf(stderr, "decryption errors "
1187 				    "may only be injected for 'data' types\n");
1188 				libzfs_fini(g_zfs);
1189 				return (1);
1190 			}
1191 
1192 			record.zi_cmd = ZINJECT_DECRYPT_FAULT;
1193 			/*
1194 			 * Internally, ZFS actually uses ECKSUM for decryption
1195 			 * errors since EACCES is used to indicate the key was
1196 			 * not found.
1197 			 */
1198 			error = ECKSUM;
1199 		}
1200 
1201 		if (translate_record(type, argv[0], range, level, &record, pool,
1202 		    dataset) != 0)
1203 			return (1);
1204 		if (!error)
1205 			error = EIO;
1206 	}
1207 
1208 	/*
1209 	 * If this is pool-wide metadata, unmount everything.  The ioctl() will
1210 	 * unload the pool, so that we trigger spa-wide reopen of metadata next
1211 	 * time we access the pool.
1212 	 */
1213 	if (dataset[0] != '\0' && domount) {
1214 		if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL)
1215 			return (1);
1216 
1217 		if (zfs_unmount(zhp, NULL, 0) != 0)
1218 			return (1);
1219 	}
1220 
1221 	record.zi_error = error;
1222 
1223 	ret = register_handler(pool, flags, &record, quiet);
1224 
1225 	if (dataset[0] != '\0' && domount)
1226 		ret = (zfs_mount(zhp, NULL, 0) != 0);
1227 
1228 	libzfs_fini(g_zfs);
1229 
1230 	return (ret);
1231 }
1232