xref: /illumos-gate/usr/src/cmd/zinject/zinject.c (revision 3382f241)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  * Copyright (c) 2017, Intel Corporation.
25  */
26 
27 /*
28  * ZFS Fault Injector
29  *
30  * This userland component takes a set of options and uses libzpool to translate
31  * from a user-visible object type and name to an internal representation.
32  * There are two basic types of faults: device faults and data faults.
33  *
34  *
35  * DEVICE FAULTS
36  *
37  * Errors can be injected into a particular vdev using the '-d' option.  This
38  * option takes a path or vdev GUID to uniquely identify the device within a
39  * pool.  There are two types of errors that can be injected, EIO and ENXIO,
40  * that can be controlled through the '-e' option.  The default is ENXIO.  For
41  * EIO failures, any attempt to read data from the device will return EIO, but
 * subsequent attempts to reopen the device will succeed.  For ENXIO failures,
43  * any attempt to read from the device will return EIO, but any attempt to
44  * reopen the device will also return ENXIO.
45  * For label faults, the -L option must be specified. This allows faults
46  * to be injected into either the nvlist, uberblock, pad1, or pad2 region
47  * of all the labels for the specified device.
48  *
49  * This form of the command looks like:
50  *
51  *	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
52  *
53  *
54  * DATA FAULTS
55  *
56  * We begin with a tuple of the form:
57  *
58  *	<type,level,range,object>
59  *
60  *	type	A string describing the type of data to target.  Each type
61  *		implicitly describes how to interpret 'object'. Currently,
62  *		the following values are supported:
63  *
64  *		data		User data for a file
65  *		dnode		Dnode for a file or directory
66  *
67  *		The following MOS objects are special.  Instead of injecting
68  *		errors on a particular object or blkid, we inject errors across
69  *		all objects of the given type.
70  *
71  *		mos		Any data in the MOS
72  *		mosdir		object directory
73  *		config		pool configuration
74  *		bpobj		blkptr list
75  *		spacemap	spacemap
76  *		metaslab	metaslab
77  *		errlog		persistent error log
78  *
79  *	level	Object level.  Defaults to '0', not applicable to all types.  If
80  *		a range is given, this corresponds to the indirect block
81  *		corresponding to the specific range.
82  *
83  *	range	A numerical range [start,end) within the object.  Defaults to
84  *		the full size of the file.
85  *
86  *	object	A string describing the logical location of the object.  For
87  *		files and directories (currently the only supported types),
88  *		this is the path of the object on disk.
89  *
90  * This is translated, via libzpool, into the following internal representation:
91  *
92  *	<type,objset,object,level,range>
93  *
94  * These types should be self-explanatory.  This tuple is then passed to the
95  * kernel via a special ioctl() to initiate fault injection for the given
96  * object.  Note that 'type' is not strictly necessary for fault injection, but
97  * is used when translating existing faults into a human-readable string.
98  *
99  *
100  * The command itself takes one of the forms:
101  *
102  *	zinject
103  *	zinject <-a | -u pool>
104  *	zinject -c <id|all>
105  *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
106  *	    [-r range] <object>
107  *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
108  *
109  * With no arguments, the command prints all currently registered injection
110  * handlers, with their numeric identifiers.
111  *
112  * The '-c' option will clear the given handler, or all handlers if 'all' is
113  * specified.
114  *
115  * The '-e' option takes a string describing the errno to simulate.  This must
116  * be one of 'io', 'checksum', or 'decrypt'.  In most cases this will result
117  * in the same behavior, but RAID-Z will produce a different set of ereports
118  * for this situation.
119  *
120  * The '-a', '-u', and '-m' flags toggle internal flush behavior.  If '-a' is
121  * specified, then the ARC cache is flushed appropriately.  If '-u' is
122  * specified, then the underlying SPA is unloaded.  Either of these flags can be
123  * specified independently of any other handlers.  The '-m' flag automatically
124  * does an unmount and remount of the underlying dataset to aid in flushing the
125  * cache.
126  *
127  * The '-f' flag controls the frequency of errors injected, expressed as a
128  * real number percentage between 0.0001 and 100.  The default is 100.
129  *
 * The fourth form is responsible for actually injecting the handler into the
131  * framework.  It takes the arguments described above, translates them to the
132  * internal tuple using libzpool, and then issues an ioctl() to register the
133  * handler.
134  *
135  * The final form can target a specific bookmark, regardless of whether a
136  * human-readable interface has been designed.  It allows developers to specify
137  * a particular block by number.
138  */
139 
140 #include <errno.h>
141 #include <fcntl.h>
142 #include <stdio.h>
143 #include <stdlib.h>
144 #include <strings.h>
145 #include <unistd.h>
146 
147 #include <sys/fs/zfs.h>
148 #include <sys/mount.h>
149 
150 #include <libzfs.h>
151 
152 #undef verify	/* both libzfs.h and zfs_context.h want to define this */
153 
154 #include "zinject.h"
155 
/* libzfs handle; set from libzfs_init() at the start of main(). */
libzfs_handle_t *g_zfs;
/* Descriptor for ZFS_DEV, opened in main(); the ioctl()s in this file use it. */
int zfs_fd;

/* errno value selected by '-e checksum' to request a checksum error. */
#define	ECKSUM	EBADE
160 
/*
 * Names accepted by the '-t' and '-L' options.  name_to_type() returns the
 * index of the matching entry as an err_type_t, so the order of this table
 * presumably has to match the err_type_t enumeration (declared outside this
 * file) -- confirm before reordering or inserting entries.
 */
static const char *errtable[TYPE_INVAL] = {
	"data",
	"dnode",
	"mos",
	"mosdir",
	"metaslab",
	"config",
	"bpobj",
	"spacemap",
	"errlog",
	"uber",
	"nvlist",
	"pad1",
	"pad2"
};
176 
177 static err_type_t
178 name_to_type(const char *arg)
179 {
180 	int i;
181 	for (i = 0; i < TYPE_INVAL; i++)
182 		if (strcmp(errtable[i], arg) == 0)
183 			return (i);
184 
185 	return (TYPE_INVAL);
186 }
187 
188 static const char *
189 type_to_name(uint64_t type)
190 {
191 	switch (type) {
192 	case DMU_OT_OBJECT_DIRECTORY:
193 		return ("mosdir");
194 	case DMU_OT_OBJECT_ARRAY:
195 		return ("metaslab");
196 	case DMU_OT_PACKED_NVLIST:
197 		return ("config");
198 	case DMU_OT_BPOBJ:
199 		return ("bpobj");
200 	case DMU_OT_SPACE_MAP:
201 		return ("spacemap");
202 	case DMU_OT_ERROR_LOG:
203 		return ("errlog");
204 	default:
205 		return ("-");
206 	}
207 }
208 
209 
/*
 * Print the usage message to stdout.
 *
 * The text is a single string literal split across source lines; every
 * fragment must end its output line with "\n" and nothing after it, or the
 * following line of help is misaligned.  The '-e' values listed here should
 * be kept in sync with the names the '-e' case in main() accepts.
 */
void
usage(void)
{
	(void) printf(
	    "usage:\n"
	    "\n"
	    "\tzinject\n"
	    "\n"
	    "\t\tList all active injection records.\n"
	    "\n"
	    "\tzinject -c <id|all>\n"
	    "\n"
	    "\t\tClear the particular record (if given a numeric ID), or\n"
	    "\t\tall records if 'all' is specified.\n"
	    "\n"
	    "\tzinject -p <function name> pool\n"
	    "\n"
	    "\t\tInject a panic fault at the specified function. Only \n"
	    "\t\tfunctions which call spa_vdev_config_exit(), or \n"
	    "\t\tspa_vdev_exit() will trigger a panic.\n"
	    "\n"
	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
	    "\t    [-T <read|write|free|claim|all>] [-f frequency] pool\n"
	    "\n"
	    "\t\tInject a fault into a particular device or the device's\n"
	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n"
	    "\t\t'pad1', or 'pad2'.\n"
	    "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
	    "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
	    "\t\tdevice error injection to a percentage of the IOs.\n"
	    "\n"
	    "\tzinject -d device -A <degrade|fault> pool\n"
	    "\n"
	    "\t\tPerform a specific action on a particular device\n"
	    "\n"
	    "\tzinject -d device -D latency:lanes pool\n"
	    "\n"
	    "\t\tAdd an artificial delay to IO requests on a particular\n"
	    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
	    "\t\tmilliseconds to complete. Each delay has an associated\n"
	    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
	    "\t\tIO requests that can be processed.\n"
	    "\n"
	    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
	    "\t\tthe device will only be able to service a single IO request\n"
	    "\t\tat a time with each request taking 10 ms to complete. So,\n"
	    "\t\tif only a single request is submitted every 10 ms, the\n"
	    "\t\taverage latency will be 10 ms; but if more than one request\n"
	    "\t\tis submitted every 10 ms, the average latency will be more\n"
	    "\t\tthan 10 ms.\n"
	    "\n"
	    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
	    "\t\tlanes (-D 10:2), then the device will be able to service\n"
	    "\t\ttwo requests at a time, each with a minimum latency of\n"
	    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
	    "\t\tthe average latency will be 10 ms; but if more than two\n"
	    "\t\trequests are submitted every 10 ms, the average latency\n"
	    "\t\twill be more than 10 ms.\n"
	    "\n"
	    "\t\tAlso note, these delays are additive. So two invocations\n"
	    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
	    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
	    "\t\tlanes with differing target latencies. For example, an\n"
	    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
	    "\n"
	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
	    "\n"
	    "\t\tCause the pool to stop writing blocks yet not\n"
	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
	    "\t\tthat fails to honor cache flush requests.\n"
	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
	    "\t\tat the end of the duration.\n"
	    "\n"
	    "\tzinject -b objset:object:level:blkid pool\n"
	    "\n"
	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
	    "\t\tspecified by the remaining tuple.  Each number is in\n"
	    "\t\thexadecimal, and only one block can be specified.\n"
	    "\n"
	    "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
	    "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
	    "\n"
	    "\t\tInject an error into the object specified by the '-t' option\n"
	    "\t\tand the object descriptor.  The 'object' parameter is\n"
	    "\t\tinterpreted depending on the '-t' option.\n"
	    "\n"
	    "\t\t-q\tQuiet mode.  Only print out the handler number added.\n"
	    "\t\t-e\tInject a specific error.  Must be one of 'io', "
	    "'checksum',\n"
	    "\t\t\tor 'decrypt'.  Default is 'io'.\n"
	    "\t\t-C\tInject the given error only into specific DVAs. The\n"
	    "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
	    "\t\t\tseparated by commas (ex. '0,2').\n"
	    "\t\t-l\tInject error at a particular block level. Default is "
	    "0.\n"
	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
	    "\t\t-r\tInject error over a particular logical range of an\n"
	    "\t\t\tobject.  Will be translated to the appropriate blkid\n"
	    "\t\t\trange according to the object's properties.\n"
	    "\t\t-a\tFlush the ARC cache.  Can be specified without any\n"
	    "\t\t\tassociated object.\n"
	    "\t\t-u\tUnload the associated pool.  Can be specified with only\n"
	    "\t\t\ta pool object.\n"
	    "\t\t-f\tOnly inject errors a fraction of the time.  Expressed as\n"
	    "\t\t\ta percentage between 0.0001 and 100.\n"
	    "\n"
	    "\t-t data\t\tInject an error into the plain file contents of a\n"
	    "\t\t\tfile.  The object must be specified as a complete path\n"
	    "\t\t\tto a file on a ZFS filesystem.\n"
	    "\n"
	    "\t-t dnode\tInject an error into the metadnode in the block\n"
	    "\t\t\tcorresponding to the dnode for a file or directory.  The\n"
	    "\t\t\t'-r' option is incompatible with this mode.  The object\n"
	    "\t\t\tis specified as a complete path to a file or directory\n"
	    "\t\t\ton a ZFS filesystem.\n"
	    "\n"
	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
	    "\t\t\tthe poolname.\n");
}
336 
337 static int
338 iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
339     void *data)
340 {
341 	zfs_cmd_t zc = { 0 };
342 	int ret;
343 
344 	while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
345 		if ((ret = func((int)zc.zc_guid, zc.zc_name,
346 		    &zc.zc_inject_record, data)) != 0)
347 			return (ret);
348 
349 	if (errno != ENOENT) {
350 		(void) fprintf(stderr, "Unable to list handlers: %s\n",
351 		    strerror(errno));
352 		return (-1);
353 	}
354 
355 	return (0);
356 }
357 
358 static int
359 print_data_handler(int id, const char *pool, zinject_record_t *record,
360     void *data)
361 {
362 	int *count = data;
363 
364 	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
365 		return (0);
366 
367 	if (*count == 0) {
368 		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  ",
369 		    "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
370 		    "LVL", "DVAs", "RANGE");
371 		(void) printf("---  ---------------  ------  "
372 		    "------  --------  ---  ---- ----------------\n");
373 	}
374 
375 	*count += 1;
376 
377 	(void) printf("%3d  %-15s  %-6llu  %-6llu  %-8s  %-3d  0x%02x  ",
378 	    id, pool, (u_longlong_t)record->zi_objset,
379 	    (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
380 	    record->zi_level, record->zi_dvas);
381 
382 	if (record->zi_start == 0 &&
383 	    record->zi_end == -1ULL)
384 		(void) printf("all\n");
385 	else
386 		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
387 		    (u_longlong_t)record->zi_end);
388 
389 	return (0);
390 }
391 
392 static int
393 print_device_handler(int id, const char *pool, zinject_record_t *record,
394     void *data)
395 {
396 	int *count = data;
397 
398 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
399 		return (0);
400 
401 	if (record->zi_cmd == ZINJECT_DELAY_IO)
402 		return (0);
403 
404 	if (*count == 0) {
405 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
406 		(void) printf("---  ---------------  ----------------\n");
407 	}
408 
409 	*count += 1;
410 
411 	(void) printf("%3d  %-15s  %llx\n", id, pool,
412 	    (u_longlong_t)record->zi_guid);
413 
414 	return (0);
415 }
416 
417 static int
418 print_delay_handler(int id, const char *pool, zinject_record_t *record,
419     void *data)
420 {
421 	int *count = data;
422 
423 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
424 		return (0);
425 
426 	if (record->zi_cmd != ZINJECT_DELAY_IO)
427 		return (0);
428 
429 	if (*count == 0) {
430 		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
431 		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
432 		(void) printf("---  ---------------  ---------------  "
433 		    "---------------  ----------------\n");
434 	}
435 
436 	*count += 1;
437 
438 	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
439 	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
440 	    (u_longlong_t)record->zi_nlanes,
441 	    (u_longlong_t)record->zi_guid);
442 
443 	return (0);
444 }
445 
446 static int
447 print_panic_handler(int id, const char *pool, zinject_record_t *record,
448     void *data)
449 {
450 	int *count = data;
451 
452 	if (record->zi_func[0] == '\0')
453 		return (0);
454 
455 	if (*count == 0) {
456 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
457 		(void) printf("---  ---------------  ----------------\n");
458 	}
459 
460 	*count += 1;
461 
462 	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
463 
464 	return (0);
465 }
466 
467 /*
468  * Print all registered error handlers.  Returns the number of handlers
469  * registered.
470  */
471 static int
472 print_all_handlers(void)
473 {
474 	int count = 0, total = 0;
475 
476 	(void) iter_handlers(print_device_handler, &count);
477 	if (count > 0) {
478 		total += count;
479 		(void) printf("\n");
480 		count = 0;
481 	}
482 
483 	(void) iter_handlers(print_delay_handler, &count);
484 	if (count > 0) {
485 		total += count;
486 		(void) printf("\n");
487 		count = 0;
488 	}
489 
490 	(void) iter_handlers(print_data_handler, &count);
491 	if (count > 0) {
492 		total += count;
493 		(void) printf("\n");
494 		count = 0;
495 	}
496 
497 	(void) iter_handlers(print_panic_handler, &count);
498 
499 	return (count + total);
500 }
501 
502 /* ARGSUSED */
503 static int
504 cancel_one_handler(int id, const char *pool, zinject_record_t *record,
505     void *data)
506 {
507 	zfs_cmd_t zc = { 0 };
508 
509 	zc.zc_guid = (uint64_t)id;
510 
511 	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
512 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
513 		    id, strerror(errno));
514 		return (1);
515 	}
516 
517 	return (0);
518 }
519 
520 /*
521  * Remove all fault injection handlers.
522  */
523 static int
524 cancel_all_handlers(void)
525 {
526 	int ret = iter_handlers(cancel_one_handler, NULL);
527 
528 	if (ret == 0)
529 		(void) printf("removed all registered handlers\n");
530 
531 	return (ret);
532 }
533 
/*
 * Remove the single fault injection handler identified by 'id'.  The
 * removal itself (ioctl and failure message) is shared with the
 * iter_handlers() callback; on success a confirmation is printed here.
 * Returns 0 on success and 1 on failure.
 */
static int
cancel_handler(int id)
{
	if (cancel_one_handler(id, NULL, NULL, NULL) != 0)
		return (1);

	(void) printf("removed handler %d\n", id);

	return (0);
}
554 
555 /*
556  * Register a new fault injection handler.
557  */
558 static int
559 register_handler(const char *pool, int flags, zinject_record_t *record,
560     int quiet)
561 {
562 	zfs_cmd_t zc = { 0 };
563 
564 	(void) strcpy(zc.zc_name, pool);
565 	zc.zc_inject_record = *record;
566 	zc.zc_guid = flags;
567 
568 	if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
569 		(void) fprintf(stderr, "failed to add handler: %s\n",
570 		    errno == EDOM ? "block level exceeds max level of object" :
571 		    strerror(errno));
572 		return (1);
573 	}
574 
575 	if (flags & ZINJECT_NULL)
576 		return (0);
577 
578 	if (quiet) {
579 		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
580 	} else {
581 		(void) printf("Added handler %llu with the following "
582 		    "properties:\n", (u_longlong_t)zc.zc_guid);
583 		(void) printf("  pool: %s\n", pool);
584 		if (record->zi_guid) {
585 			(void) printf("  vdev: %llx\n",
586 			    (u_longlong_t)record->zi_guid);
587 		} else if (record->zi_func[0] != '\0') {
588 			(void) printf("  panic function: %s\n",
589 			    record->zi_func);
590 		} else if (record->zi_duration > 0) {
591 			(void) printf(" time: %lld seconds\n",
592 			    (u_longlong_t)record->zi_duration);
593 		} else if (record->zi_duration < 0) {
594 			(void) printf(" txgs: %lld \n",
595 			    (u_longlong_t)-record->zi_duration);
596 		} else {
597 			(void) printf("objset: %llu\n",
598 			    (u_longlong_t)record->zi_objset);
599 			(void) printf("object: %llu\n",
600 			    (u_longlong_t)record->zi_object);
601 			(void) printf("  type: %llu\n",
602 			    (u_longlong_t)record->zi_type);
603 			(void) printf(" level: %d\n", record->zi_level);
604 			if (record->zi_start == 0 &&
605 			    record->zi_end == -1ULL)
606 				(void) printf(" range: all\n");
607 			else
608 				(void) printf(" range: [%llu, %llu)\n",
609 				    (u_longlong_t)record->zi_start,
610 				    (u_longlong_t)record->zi_end);
611 			(void) printf("  dvas: 0x%x\n", record->zi_dvas);
612 		}
613 	}
614 
615 	return (0);
616 }
617 
618 int
619 perform_action(const char *pool, zinject_record_t *record, int cmd)
620 {
621 	zfs_cmd_t zc = { 0 };
622 
623 	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
624 	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
625 	zc.zc_guid = record->zi_guid;
626 	zc.zc_cookie = cmd;
627 
628 	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
629 		return (0);
630 
631 	return (1);
632 }
633 
/*
 * Parse a "-D latency:lanes" operand.  Both fields are read as unsigned
 * decimal numbers.  On success stores the latency, converted from
 * milliseconds to nanoseconds, in *delay and the lane count in *nlanes,
 * and returns 0.  Returns 1, leaving the outputs untouched, if either
 * field is missing or the latency is zero.
 */
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
	unsigned long msecs;
	unsigned long lanes;

	/*
	 * A delay of zero is explicitly disallowed, because
	 * translate_device() keys off this value being non-zero to
	 * determine if the fault is a ZINJECT_DELAY_IO fault or not.
	 */
	if (sscanf(str, "%lu:%lu", &msecs, &lanes) != 2 || msecs == 0)
		return (1);

	/*
	 * The CLI takes the delay in milliseconds, but the kernel
	 * interprets the value as nanoseconds, so scale it here.
	 */
	*nlanes = lanes;
	*delay = MSEC2NSEC(msecs);

	return (0);
}
662 
663 static int
664 parse_frequency(const char *str, uint32_t *percent)
665 {
666 	double val;
667 	char *post;
668 
669 	val = strtod(str, &post);
670 	if (post == NULL || *post != '\0')
671 		return (EINVAL);
672 
673 	/* valid range is [0.0001, 100.0] */
674 	val /= 100.0f;
675 	if (val < 0.000001f || val > 1.0f)
676 		return (ERANGE);
677 
678 	/* convert to an integer for use by kernel */
679 	*percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX));
680 
681 	return (0);
682 }
683 
/*
 * This function converts a string specifier for DVAs into a bit mask.
 * The DVAs provided by the user should be 0 indexed and separated by
 * a comma. For example:
 *     "1"     -> 0b0010  (0x2)
 *     "0,1"   -> 0b0011  (0x3)
 *     "0,1,2" -> 0b0111  (0x7)
 *
 * Only the digits '0', '1' and '2' are accepted, each at most once, and a
 * comma is accepted only directly after a digit, so leading, trailing and
 * doubled commas are all rejected.
 *
 * Returns 0 and stores the mask in *dvas_out on success.  Returns EINVAL,
 * leaving *dvas_out untouched, for any malformed string.
 */
static int
parse_dvas(const char *str, uint32_t *dvas_out)
{
	const char *c;
	uint32_t mask = 0;
	uint32_t bit;
	int need_delim = 0;	/* set once a digit has just been consumed */
	size_t len = strlen(str);

	/* max string length is 5 ("0,1,2") */
	if (len == 0 || len > 5)
		return (EINVAL);

	for (c = str; *c != '\0'; c++) {
		switch (*c) {
		case '0':
		case '1':
		case '2':
			/* two DVAs must be separated by a comma */
			if (need_delim)
				return (EINVAL);

			/* check if this DVA has been set already */
			bit = 1U << (*c - '0');
			if (mask & bit)
				return (EINVAL);

			mask |= bit;
			need_delim = 1;
			break;
		case ',':
			/* a comma is only valid directly after a DVA */
			if (!need_delim)
				return (EINVAL);
			need_delim = 0;
			break;
		default:
			/* check for invalid character */
			return (EINVAL);
		}
	}

	/* check for dangling delimiter */
	if (!need_delim)
		return (EINVAL);

	*dvas_out = mask;
	return (0);
}
736 
737 int
738 main(int argc, char **argv)
739 {
740 	int c;
741 	char *range = NULL;
742 	char *cancel = NULL;
743 	char *end;
744 	char *raw = NULL;
745 	char *device = NULL;
746 	int level = 0;
747 	int quiet = 0;
748 	int error = 0;
749 	int domount = 0;
750 	int io_type = ZIO_TYPES;
751 	int action = VDEV_STATE_UNKNOWN;
752 	err_type_t type = TYPE_INVAL;
753 	err_type_t label = TYPE_INVAL;
754 	zinject_record_t record = { 0 };
755 	char pool[MAXNAMELEN];
756 	char dataset[MAXNAMELEN];
757 	zfs_handle_t *zhp;
758 	int nowrites = 0;
759 	int dur_txg = 0;
760 	int dur_secs = 0;
761 	int ret;
762 	int flags = 0;
763 	uint32_t dvas = 0;
764 
765 	if ((g_zfs = libzfs_init()) == NULL) {
766 		(void) fprintf(stderr, "internal error: failed to "
767 		    "initialize ZFS library\n");
768 		return (1);
769 	}
770 
771 	libzfs_print_on_error(g_zfs, B_TRUE);
772 
773 	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
774 		(void) fprintf(stderr, "failed to open ZFS device\n");
775 		libzfs_fini(g_zfs);
776 		return (1);
777 	}
778 
779 	if (argc == 1) {
780 		/*
781 		 * No arguments.  Print the available handlers.  If there are no
782 		 * available handlers, direct the user to '-h' for help
783 		 * information.
784 		 */
785 		if (print_all_handlers() == 0) {
786 			(void) printf("No handlers registered.\n");
787 			(void) printf("Run 'zinject -h' for usage "
788 			    "information.\n");
789 		}
790 
791 		libzfs_fini(g_zfs);
792 		return (0);
793 	}
794 
795 	while ((c = getopt(argc, argv,
796 	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
797 		switch (c) {
798 		case 'a':
799 			flags |= ZINJECT_FLUSH_ARC;
800 			break;
801 		case 'A':
802 			if (strcasecmp(optarg, "degrade") == 0) {
803 				action = VDEV_STATE_DEGRADED;
804 			} else if (strcasecmp(optarg, "fault") == 0) {
805 				action = VDEV_STATE_FAULTED;
806 			} else {
807 				(void) fprintf(stderr, "invalid action '%s': "
808 				    "must be 'degrade' or 'fault'\n", optarg);
809 				usage();
810 				libzfs_fini(g_zfs);
811 				return (1);
812 			}
813 			break;
814 		case 'b':
815 			raw = optarg;
816 			break;
817 		case 'c':
818 			cancel = optarg;
819 			break;
820 		case 'C':
821 			ret = parse_dvas(optarg, &dvas);
822 			if (ret != 0) {
823 				(void) fprintf(stderr, "invalid DVA list '%s': "
824 				    "DVAs should be 0 indexed and separated by "
825 				    "commas.\n", optarg);
826 				usage();
827 				libzfs_fini(g_zfs);
828 				return (1);
829 			}
830 			break;
831 		case 'd':
832 			device = optarg;
833 			break;
834 		case 'D':
835 			ret = parse_delay(optarg, &record.zi_timer,
836 			    &record.zi_nlanes);
837 			if (ret != 0) {
838 				(void) fprintf(stderr, "invalid i/o delay "
839 				    "value: '%s'\n", optarg);
840 				usage();
841 				libzfs_fini(g_zfs);
842 				return (1);
843 			}
844 			break;
845 		case 'e':
846 			if (strcasecmp(optarg, "io") == 0) {
847 				error = EIO;
848 			} else if (strcasecmp(optarg, "checksum") == 0) {
849 				error = ECKSUM;
850 			} else if (strcasecmp(optarg, "decrypt") == 0) {
851 				error = EACCES;
852 			} else if (strcasecmp(optarg, "nxio") == 0) {
853 				error = ENXIO;
854 			} else if (strcasecmp(optarg, "dtl") == 0) {
855 				error = ECHILD;
856 			} else {
857 				(void) fprintf(stderr, "invalid error type "
858 				    "'%s': must be 'io', 'checksum' or "
859 				    "'nxio'\n", optarg);
860 				usage();
861 				return (1);
862 			}
863 			break;
864 		case 'f':
865 			ret = parse_frequency(optarg, &record.zi_freq);
866 			if (ret != 0) {
867 				(void) fprintf(stderr, "%sfrequency value must "
868 				    "be in the range [0.0001, 100.0]\n",
869 				    ret == EINVAL ? "invalid value: " :
870 				    ret == ERANGE ? "out of range: " : "");
871 				libzfs_fini(g_zfs);
872 				return (1);
873 			}
874 			break;
875 		case 'F':
876 			record.zi_failfast = B_TRUE;
877 			break;
878 		case 'g':
879 			dur_txg = 1;
880 			record.zi_duration = (int)strtol(optarg, &end, 10);
881 			if (record.zi_duration <= 0 || *end != '\0') {
882 				(void) fprintf(stderr, "invalid duration '%s': "
883 				    "must be a positive integer\n", optarg);
884 				usage();
885 				libzfs_fini(g_zfs);
886 				return (1);
887 			}
888 			/* store duration of txgs as its negative */
889 			record.zi_duration *= -1;
890 			break;
891 		case 'h':
892 			usage();
893 			libzfs_fini(g_zfs);
894 			return (0);
895 		case 'I':
896 			/* default duration, if one hasn't yet been defined */
897 			nowrites = 1;
898 			if (dur_secs == 0 && dur_txg == 0)
899 				record.zi_duration = 30;
900 			break;
901 		case 'l':
902 			level = (int)strtol(optarg, &end, 10);
903 			if (*end != '\0') {
904 				(void) fprintf(stderr, "invalid level '%s': "
905 				    "must be an integer\n", optarg);
906 				usage();
907 				libzfs_fini(g_zfs);
908 				return (1);
909 			}
910 			break;
911 		case 'm':
912 			domount = 1;
913 			break;
914 		case 'p':
915 			(void) strlcpy(record.zi_func, optarg,
916 			    sizeof (record.zi_func));
917 			record.zi_cmd = ZINJECT_PANIC;
918 			break;
919 		case 'q':
920 			quiet = 1;
921 			break;
922 		case 'r':
923 			range = optarg;
924 			flags |= ZINJECT_CALC_RANGE;
925 			break;
926 		case 's':
927 			dur_secs = 1;
928 			record.zi_duration = (int)strtol(optarg, &end, 10);
929 			if (record.zi_duration <= 0 || *end != '\0') {
930 				(void) fprintf(stderr, "invalid duration '%s': "
931 				    "must be a positive integer\n", optarg);
932 				usage();
933 				libzfs_fini(g_zfs);
934 				return (1);
935 			}
936 			break;
937 		case 'T':
938 			if (strcasecmp(optarg, "read") == 0) {
939 				io_type = ZIO_TYPE_READ;
940 			} else if (strcasecmp(optarg, "write") == 0) {
941 				io_type = ZIO_TYPE_WRITE;
942 			} else if (strcasecmp(optarg, "free") == 0) {
943 				io_type = ZIO_TYPE_FREE;
944 			} else if (strcasecmp(optarg, "claim") == 0) {
945 				io_type = ZIO_TYPE_CLAIM;
946 			} else if (strcasecmp(optarg, "all") == 0) {
947 				io_type = ZIO_TYPES;
948 			} else {
949 				(void) fprintf(stderr, "invalid I/O type "
950 				    "'%s': must be 'read', 'write', 'free', "
951 				    "'claim' or 'all'\n", optarg);
952 				usage();
953 				libzfs_fini(g_zfs);
954 				return (1);
955 			}
956 			break;
957 		case 't':
958 			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
959 			    !MOS_TYPE(type)) {
960 				(void) fprintf(stderr, "invalid type '%s'\n",
961 				    optarg);
962 				usage();
963 				libzfs_fini(g_zfs);
964 				return (1);
965 			}
966 			break;
967 		case 'u':
968 			flags |= ZINJECT_UNLOAD_SPA;
969 			break;
970 		case 'L':
971 			if ((label = name_to_type(optarg)) == TYPE_INVAL &&
972 			    !LABEL_TYPE(type)) {
973 				(void) fprintf(stderr, "invalid label type "
974 				    "'%s'\n", optarg);
975 				usage();
976 				libzfs_fini(g_zfs);
977 				return (1);
978 			}
979 			break;
980 		case ':':
981 			(void) fprintf(stderr, "option -%c requires an "
982 			    "operand\n", optopt);
983 			usage();
984 			libzfs_fini(g_zfs);
985 			return (1);
986 		case '?':
987 			(void) fprintf(stderr, "invalid option '%c'\n",
988 			    optopt);
989 			usage();
990 			libzfs_fini(g_zfs);
991 			return (2);
992 		}
993 	}
994 
995 	argc -= optind;
996 	argv += optind;
997 
998 	if (record.zi_duration != 0)
999 		record.zi_cmd = ZINJECT_IGNORED_WRITES;
1000 
1001 	if (cancel != NULL) {
1002 		/*
1003 		 * '-c' is invalid with any other options.
1004 		 */
1005 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1006 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
1007 		    record.zi_freq > 0 || dvas != 0) {
1008 			(void) fprintf(stderr, "cancel (-c) incompatible with "
1009 			    "any other options\n");
1010 			usage();
1011 			libzfs_fini(g_zfs);
1012 			return (2);
1013 		}
1014 		if (argc != 0) {
1015 			(void) fprintf(stderr, "extraneous argument to '-c'\n");
1016 			usage();
1017 			libzfs_fini(g_zfs);
1018 			return (2);
1019 		}
1020 
1021 		if (strcmp(cancel, "all") == 0) {
1022 			return (cancel_all_handlers());
1023 		} else {
1024 			int id = (int)strtol(cancel, &end, 10);
1025 			if (*end != '\0') {
1026 				(void) fprintf(stderr, "invalid handle id '%s':"
1027 				    " must be an integer or 'all'\n", cancel);
1028 				usage();
1029 				libzfs_fini(g_zfs);
1030 				return (1);
1031 			}
1032 			return (cancel_handler(id));
1033 		}
1034 	}
1035 
1036 	if (device != NULL) {
1037 		/*
1038 		 * Device (-d) injection uses a completely different mechanism
1039 		 * for doing injection, so handle it separately here.
1040 		 */
1041 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1042 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
1043 		    dvas != 0) {
1044 			(void) fprintf(stderr, "device (-d) incompatible with "
1045 			    "data error injection\n");
1046 			usage();
1047 			libzfs_fini(g_zfs);
1048 			return (2);
1049 		}
1050 
1051 		if (argc != 1) {
1052 			(void) fprintf(stderr, "device (-d) injection requires "
1053 			    "a single pool name\n");
1054 			usage();
1055 			libzfs_fini(g_zfs);
1056 			return (2);
1057 		}
1058 
1059 		(void) strlcpy(pool, argv[0], sizeof (pool));
1060 		dataset[0] = '\0';
1061 
1062 		if (error == ECKSUM) {
1063 			(void) fprintf(stderr, "device error type must be "
1064 			    "'io' or 'nxio'\n");
1065 			libzfs_fini(g_zfs);
1066 			return (1);
1067 		}
1068 
1069 		record.zi_iotype = io_type;
1070 		if (translate_device(pool, device, label, &record) != 0) {
1071 			libzfs_fini(g_zfs);
1072 			return (1);
1073 		}
1074 		if (!error)
1075 			error = ENXIO;
1076 
1077 		if (action != VDEV_STATE_UNKNOWN)
1078 			return (perform_action(pool, &record, action));
1079 
1080 	} else if (raw != NULL) {
1081 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
1082 		    record.zi_cmd != ZINJECT_UNINITIALIZED ||
1083 		    record.zi_freq > 0 || dvas != 0) {
1084 			(void) fprintf(stderr, "raw (-b) format with "
1085 			    "any other options\n");
1086 			usage();
1087 			libzfs_fini(g_zfs);
1088 			return (2);
1089 		}
1090 
1091 		if (argc != 1) {
1092 			(void) fprintf(stderr, "raw (-b) format expects a "
1093 			    "single pool name\n");
1094 			usage();
1095 			libzfs_fini(g_zfs);
1096 			return (2);
1097 		}
1098 
1099 		(void) strlcpy(pool, argv[0], sizeof (pool));
1100 		dataset[0] = '\0';
1101 
1102 		if (error == ENXIO) {
1103 			(void) fprintf(stderr, "data error type must be "
1104 			    "'checksum' or 'io'\n");
1105 			libzfs_fini(g_zfs);
1106 			return (1);
1107 		}
1108 
1109 		record.zi_cmd = ZINJECT_DATA_FAULT;
1110 		if (translate_raw(raw, &record) != 0) {
1111 			libzfs_fini(g_zfs);
1112 			return (1);
1113 		}
1114 		if (!error)
1115 			error = EIO;
1116 	} else if (record.zi_cmd == ZINJECT_PANIC) {
1117 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1118 		    level != 0 || device != NULL || record.zi_freq > 0 ||
1119 		    dvas != 0) {
1120 			(void) fprintf(stderr, "panic (-p) incompatible with "
1121 			    "other options\n");
1122 			usage();
1123 			libzfs_fini(g_zfs);
1124 			return (2);
1125 		}
1126 
1127 		if (argc < 1 || argc > 2) {
1128 			(void) fprintf(stderr, "panic (-p) injection requires "
1129 			    "a single pool name and an optional id\n");
1130 			usage();
1131 			libzfs_fini(g_zfs);
1132 			return (2);
1133 		}
1134 
1135 		(void) strlcpy(pool, argv[0], sizeof (pool));
1136 		if (argv[1] != NULL)
1137 			record.zi_type = atoi(argv[1]);
1138 		dataset[0] = '\0';
1139 	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
1140 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1141 		    level != 0 || record.zi_freq > 0 || dvas != 0) {
1142 			(void) fprintf(stderr, "hardware failure (-I) "
1143 			    "incompatible with other options\n");
1144 			usage();
1145 			libzfs_fini(g_zfs);
1146 			return (2);
1147 		}
1148 
1149 		if (nowrites == 0) {
1150 			(void) fprintf(stderr, "-s or -g meaningless "
1151 			    "without -I (ignore writes)\n");
1152 			usage();
1153 			libzfs_fini(g_zfs);
1154 			return (2);
1155 		} else if (dur_secs && dur_txg) {
1156 			(void) fprintf(stderr, "choose a duration either "
1157 			    "in seconds (-s) or a number of txgs (-g) "
1158 			    "but not both\n");
1159 			usage();
1160 			libzfs_fini(g_zfs);
1161 			return (2);
1162 		} else if (argc != 1) {
1163 			(void) fprintf(stderr, "ignore writes (-I) "
1164 			    "injection requires a single pool name\n");
1165 			usage();
1166 			libzfs_fini(g_zfs);
1167 			return (2);
1168 		}
1169 
1170 		(void) strlcpy(pool, argv[0], sizeof (pool));
1171 		dataset[0] = '\0';
1172 	} else if (type == TYPE_INVAL) {
1173 		if (flags == 0) {
1174 			(void) fprintf(stderr, "at least one of '-b', '-d', "
1175 			    "'-t', '-a', '-p', '-I' or '-u' "
1176 			    "must be specified\n");
1177 			usage();
1178 			libzfs_fini(g_zfs);
1179 			return (2);
1180 		}
1181 
1182 		if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
1183 			(void) strlcpy(pool, argv[0], sizeof (pool));
1184 			dataset[0] = '\0';
1185 		} else if (argc != 0) {
1186 			(void) fprintf(stderr, "extraneous argument for "
1187 			    "'-f'\n");
1188 			usage();
1189 			libzfs_fini(g_zfs);
1190 			return (2);
1191 		}
1192 
1193 		flags |= ZINJECT_NULL;
1194 	} else {
1195 		if (argc != 1) {
1196 			(void) fprintf(stderr, "missing object\n");
1197 			usage();
1198 			libzfs_fini(g_zfs);
1199 			return (2);
1200 		}
1201 
1202 		if (error == ENXIO) {
1203 			(void) fprintf(stderr, "data error type must be "
1204 			    "'checksum' or 'io'\n");
1205 			libzfs_fini(g_zfs);
1206 			return (1);
1207 		}
1208 
1209 		if (dvas != 0) {
1210 			if (error == EACCES || error == EINVAL) {
1211 				(void) fprintf(stderr, "the '-C' option may "
1212 				    "not be used with logical data errors "
1213 				    "'decrypt' and 'decompress'\n");
1214 				record.zi_dvas = dvas;
1215 			}
1216 		}
1217 
1218 		record.zi_cmd = ZINJECT_DATA_FAULT;
1219 
1220 		if (error == EACCES) {
1221 			if (type != TYPE_DATA) {
1222 				(void) fprintf(stderr, "decryption errors "
1223 				    "may only be injected for 'data' types\n");
1224 				libzfs_fini(g_zfs);
1225 				return (1);
1226 			}
1227 
1228 			record.zi_cmd = ZINJECT_DECRYPT_FAULT;
1229 			/*
1230 			 * Internally, ZFS actually uses ECKSUM for decryption
1231 			 * errors since EACCES is used to indicate the key was
1232 			 * not found.
1233 			 */
1234 			error = ECKSUM;
1235 		}
1236 
1237 		if (translate_record(type, argv[0], range, level, &record, pool,
1238 		    dataset) != 0) {
1239 			libzfs_fini(g_zfs);
1240 			return (1);
1241 		}
1242 		if (!error)
1243 			error = EIO;
1244 	}
1245 
1246 	/*
1247 	 * If this is pool-wide metadata, unmount everything.  The ioctl() will
1248 	 * unload the pool, so that we trigger spa-wide reopen of metadata next
1249 	 * time we access the pool.
1250 	 */
1251 	if (dataset[0] != '\0' && domount) {
1252 		if ((zhp = zfs_open(g_zfs, dataset,
1253 		    ZFS_TYPE_DATASET)) == NULL) {
1254 			libzfs_fini(g_zfs);
1255 			return (1);
1256 		}
1257 
1258 		if (zfs_unmount(zhp, NULL, 0) != 0) {
1259 			libzfs_fini(g_zfs);
1260 			return (1);
1261 		}
1262 	}
1263 
1264 	record.zi_error = error;
1265 
1266 	ret = register_handler(pool, flags, &record, quiet);
1267 
1268 	if (dataset[0] != '\0' && domount)
1269 		ret = (zfs_mount(zhp, NULL, 0) != 0);
1270 
1271 	libzfs_fini(g_zfs);
1272 
1273 	return (ret);
1274 }
1275