1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright (c) 2017, Intel Corporation.
25 */
26
27/*
28 * ZFS Fault Injector
29 *
30 * This userland component takes a set of options and uses libzpool to translate
31 * from a user-visible object type and name to an internal representation.
32 * There are two basic types of faults: device faults and data faults.
33 *
34 *
35 * DEVICE FAULTS
36 *
37 * Errors can be injected into a particular vdev using the '-d' option.  This
38 * option takes a path or vdev GUID to uniquely identify the device within a
39 * pool.  There are two types of errors that can be injected, EIO and ENXIO,
40 * that can be controlled through the '-e' option.  The default is ENXIO.  For
 * EIO failures, any attempt to read data from the device will return EIO,
 * but a subsequent attempt to reopen the device will succeed.  For ENXIO
 * failures, any attempt to read from the device will return EIO, and any
 * attempt to reopen the device will return ENXIO.
45 * For label faults, the -L option must be specified. This allows faults
46 * to be injected into either the nvlist, uberblock, pad1, or pad2 region
47 * of all the labels for the specified device.
48 *
49 * This form of the command looks like:
50 *
51 *	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
52 *
53 *
54 * DATA FAULTS
55 *
56 * We begin with a tuple of the form:
57 *
58 *	<type,level,range,object>
59 *
60 *	type	A string describing the type of data to target.  Each type
61 *		implicitly describes how to interpret 'object'. Currently,
62 *		the following values are supported:
63 *
64 *		data		User data for a file
65 *		dnode		Dnode for a file or directory
66 *
67 *		The following MOS objects are special.  Instead of injecting
68 *		errors on a particular object or blkid, we inject errors across
69 *		all objects of the given type.
70 *
71 *		mos		Any data in the MOS
72 *		mosdir		object directory
73 *		config		pool configuration
74 *		bpobj		blkptr list
75 *		spacemap	spacemap
76 *		metaslab	metaslab
77 *		errlog		persistent error log
78 *
79 *	level	Object level.  Defaults to '0', not applicable to all types.  If
80 *		a range is given, this corresponds to the indirect block
81 *		corresponding to the specific range.
82 *
83 *	range	A numerical range [start,end) within the object.  Defaults to
84 *		the full size of the file.
85 *
86 *	object	A string describing the logical location of the object.  For
87 *		files and directories (currently the only supported types),
88 *		this is the path of the object on disk.
89 *
90 * This is translated, via libzpool, into the following internal representation:
91 *
92 *	<type,objset,object,level,range>
93 *
94 * These types should be self-explanatory.  This tuple is then passed to the
95 * kernel via a special ioctl() to initiate fault injection for the given
96 * object.  Note that 'type' is not strictly necessary for fault injection, but
97 * is used when translating existing faults into a human-readable string.
98 *
99 *
100 * The command itself takes one of the forms:
101 *
102 *	zinject
103 *	zinject <-a | -u pool>
104 *	zinject -c <id|all>
105 *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
106 *	    [-r range] <object>
107 *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
108 *
109 * With no arguments, the command prints all currently registered injection
110 * handlers, with their numeric identifiers.
111 *
112 * The '-c' option will clear the given handler, or all handlers if 'all' is
113 * specified.
114 *
115 * The '-e' option takes a string describing the errno to simulate.  This must
116 * be one of 'io', 'checksum', or 'decrypt'.  In most cases this will result
117 * in the same behavior, but RAID-Z will produce a different set of ereports
118 * for this situation.
119 *
120 * The '-a', '-u', and '-m' flags toggle internal flush behavior.  If '-a' is
121 * specified, then the ARC cache is flushed appropriately.  If '-u' is
122 * specified, then the underlying SPA is unloaded.  Either of these flags can be
123 * specified independently of any other handlers.  The '-m' flag automatically
124 * does an unmount and remount of the underlying dataset to aid in flushing the
125 * cache.
126 *
127 * The '-f' flag controls the frequency of errors injected, expressed as a
128 * real number percentage between 0.0001 and 100.  The default is 100.
129 *
 * The fourth form is responsible for actually injecting the handler into the
131 * framework.  It takes the arguments described above, translates them to the
132 * internal tuple using libzpool, and then issues an ioctl() to register the
133 * handler.
134 *
135 * The final form can target a specific bookmark, regardless of whether a
136 * human-readable interface has been designed.  It allows developers to specify
137 * a particular block by number.
138 */
139
140#include <errno.h>
141#include <fcntl.h>
142#include <stdio.h>
143#include <stdlib.h>
144#include <strings.h>
145#include <unistd.h>
146
147#include <sys/fs/zfs.h>
148#include <sys/mount.h>
149
150#include <libzfs.h>
151
152#undef verify	/* both libzfs.h and zfs_context.h want to define this */
153
154#include "zinject.h"
155
/* libzfs library handle; assigned from libzfs_init() in main(). */
libzfs_handle_t *g_zfs;
/* Descriptor for ZFS_DEV, opened in main(); target of this file's ioctl()s. */
int zfs_fd;

/* errno value this tool uses to stand for a checksum error ('-e checksum'). */
#define	ECKSUM	EBADE

/*
 * User-visible names of the fault types accepted by '-t' and '-L'.
 * name_to_type() returns the index of the matching entry as the err_type_t,
 * so the order here presumably has to mirror the err_type_t enumeration
 * declared in zinject.h -- TODO confirm against that header.
 */
static const char *errtable[TYPE_INVAL] = {
	"data",
	"dnode",
	"mos",
	"mosdir",
	"metaslab",
	"config",
	"bpobj",
	"spacemap",
	"errlog",
	"uber",
	"nvlist",
	"pad1",
	"pad2"
};
176
177static err_type_t
178name_to_type(const char *arg)
179{
180	int i;
181	for (i = 0; i < TYPE_INVAL; i++)
182		if (strcmp(errtable[i], arg) == 0)
183			return (i);
184
185	return (TYPE_INVAL);
186}
187
188static const char *
189type_to_name(uint64_t type)
190{
191	switch (type) {
192	case DMU_OT_OBJECT_DIRECTORY:
193		return ("mosdir");
194	case DMU_OT_OBJECT_ARRAY:
195		return ("metaslab");
196	case DMU_OT_PACKED_NVLIST:
197		return ("config");
198	case DMU_OT_BPOBJ:
199		return ("bpobj");
200	case DMU_OT_SPACE_MAP:
201		return ("spacemap");
202	case DMU_OT_ERROR_LOG:
203		return ("errlog");
204	default:
205		return ("-");
206	}
207}
208
209
/*
 * Print the complete usage message to stdout.
 *
 * The text is one string literal built from adjacent pieces, so every piece
 * must end with its own "\n" where a line break is wanted.  The '-e' help for
 * data faults lists only the error names that main()'s '-e' parsing accepts
 * for that mode.
 */
void
usage(void)
{
	(void) printf(
	    "usage:\n"
	    "\n"
	    "\tzinject\n"
	    "\n"
	    "\t\tList all active injection records.\n"
	    "\n"
	    "\tzinject -c <id|all>\n"
	    "\n"
	    "\t\tClear the particular record (if given a numeric ID), or\n"
	    "\t\tall records if 'all' is specified.\n"
	    "\n"
	    "\tzinject -p <function name> pool\n"
	    "\n"
	    "\t\tInject a panic fault at the specified function. Only \n"
	    "\t\tfunctions which call spa_vdev_config_exit(), or \n"
	    "\t\tspa_vdev_exit() will trigger a panic.\n"
	    "\n"
	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
	    "\t    [-T <read|write|free|claim|all>] [-f frequency] pool\n"
	    "\n"
	    "\t\tInject a fault into a particular device or the device's\n"
	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n"
	    "\t\t'pad1', or 'pad2'.\n"
	    "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
	    "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n"
	    "\t\tdevice error injection to a percentage of the IOs.\n"
	    "\n"
	    "\tzinject -d device -A <degrade|fault> pool\n"
	    "\n"
	    "\t\tPerform a specific action on a particular device\n"
	    "\n"
	    "\tzinject -d device -D latency:lanes pool\n"
	    "\n"
	    "\t\tAdd an artificial delay to IO requests on a particular\n"
	    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
	    "\t\tmilliseconds to complete. Each delay has an associated\n"
	    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
	    "\t\tIO requests that can be processed.\n"
	    "\n"
	    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
	    "\t\tthe device will only be able to service a single IO request\n"
	    "\t\tat a time with each request taking 10 ms to complete. So,\n"
	    "\t\tif only a single request is submitted every 10 ms, the\n"
	    "\t\taverage latency will be 10 ms; but if more than one request\n"
	    "\t\tis submitted every 10 ms, the average latency will be more\n"
	    "\t\tthan 10 ms.\n"
	    "\n"
	    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
	    "\t\tlanes (-D 10:2), then the device will be able to service\n"
	    "\t\ttwo requests at a time, each with a minimum latency of\n"
	    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
	    "\t\tthe average latency will be 10 ms; but if more than two\n"
	    "\t\trequests are submitted every 10 ms, the average latency\n"
	    "\t\twill be more than 10 ms.\n"
	    "\n"
	    "\t\tAlso note, these delays are additive. So two invocations\n"
	    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
	    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
	    "\t\tlanes with differing target latencies. For example, an\n"
	    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
	    "\n"
	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
	    "\n"
	    "\t\tCause the pool to stop writing blocks yet not\n"
	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
	    "\t\tthat fails to honor cache flush requests.\n"
	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
	    "\t\tat the end of the duration.\n"
	    "\n"
	    "\tzinject -b objset:object:level:blkid pool\n"
	    "\n"
	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
	    "\t\tspecified by the remaining tuple.  Each number is in\n"
	    "\t\thexadecimal, and only one block can be specified.\n"
	    "\n"
	    "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
	    "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
	    "\n"
	    "\t\tInject an error into the object specified by the '-t' option\n"
	    "\t\tand the object descriptor.  The 'object' parameter is\n"
	    "\t\tinterpreted depending on the '-t' option.\n"
	    "\n"
	    "\t\t-q\tQuiet mode.  Only print out the handler number added.\n"
	    "\t\t-e\tInject a specific error.  Must be one of 'io', "
	    "'checksum',\n"
	    "\t\t\tor 'decrypt'.  Default is 'io'.\n"
	    "\t\t-C\tInject the given error only into specific DVAs. The\n"
	    "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
	    "\t\t\tseparated by commas (ex. '0,2').\n"
	    "\t\t-l\tInject error at a particular block level. Default is "
	    "0.\n"
	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
	    "\t\t-r\tInject error over a particular logical range of an\n"
	    "\t\t\tobject.  Will be translated to the appropriate blkid\n"
	    "\t\t\trange according to the object's properties.\n"
	    "\t\t-a\tFlush the ARC cache.  Can be specified without any\n"
	    "\t\t\tassociated object.\n"
	    "\t\t-u\tUnload the associated pool.  Can be specified with only\n"
	    "\t\t\ta pool object.\n"
	    "\t\t-f\tOnly inject errors a fraction of the time.  Expressed as\n"
	    "\t\t\ta percentage between 0.0001 and 100.\n"
	    "\n"
	    "\t-t data\t\tInject an error into the plain file contents of a\n"
	    "\t\t\tfile.  The object must be specified as a complete path\n"
	    "\t\t\tto a file on a ZFS filesystem.\n"
	    "\n"
	    "\t-t dnode\tInject an error into the metadnode in the block\n"
	    "\t\t\tcorresponding to the dnode for a file or directory.  The\n"
	    "\t\t\t'-r' option is incompatible with this mode.  The object\n"
	    "\t\t\tis specified as a complete path to a file or directory\n"
	    "\t\t\ton a ZFS filesystem.\n"
	    "\n"
	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
	    "\t\t\tthe poolname.\n");
}
336
337static int
338iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
339    void *data)
340{
341	zfs_cmd_t zc = { 0 };
342	int ret;
343
344	while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
345		if ((ret = func((int)zc.zc_guid, zc.zc_name,
346		    &zc.zc_inject_record, data)) != 0)
347			return (ret);
348
349	if (errno != ENOENT) {
350		(void) fprintf(stderr, "Unable to list handlers: %s\n",
351		    strerror(errno));
352		return (-1);
353	}
354
355	return (0);
356}
357
358static int
359print_data_handler(int id, const char *pool, zinject_record_t *record,
360    void *data)
361{
362	int *count = data;
363
364	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
365		return (0);
366
367	if (*count == 0) {
368		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  ",
369		    "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
370		    "LVL", "DVAs", "RANGE");
371		(void) printf("---  ---------------  ------  "
372		    "------  --------  ---  ---- ----------------\n");
373	}
374
375	*count += 1;
376
377	(void) printf("%3d  %-15s  %-6llu  %-6llu  %-8s  %-3d  0x%02x  ",
378	    id, pool, (u_longlong_t)record->zi_objset,
379	    (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
380	    record->zi_level, record->zi_dvas);
381
382	if (record->zi_start == 0 &&
383	    record->zi_end == -1ULL)
384		(void) printf("all\n");
385	else
386		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
387		    (u_longlong_t)record->zi_end);
388
389	return (0);
390}
391
392static int
393print_device_handler(int id, const char *pool, zinject_record_t *record,
394    void *data)
395{
396	int *count = data;
397
398	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
399		return (0);
400
401	if (record->zi_cmd == ZINJECT_DELAY_IO)
402		return (0);
403
404	if (*count == 0) {
405		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
406		(void) printf("---  ---------------  ----------------\n");
407	}
408
409	*count += 1;
410
411	(void) printf("%3d  %-15s  %llx\n", id, pool,
412	    (u_longlong_t)record->zi_guid);
413
414	return (0);
415}
416
417static int
418print_delay_handler(int id, const char *pool, zinject_record_t *record,
419    void *data)
420{
421	int *count = data;
422
423	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
424		return (0);
425
426	if (record->zi_cmd != ZINJECT_DELAY_IO)
427		return (0);
428
429	if (*count == 0) {
430		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
431		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
432		(void) printf("---  ---------------  ---------------  "
433		    "---------------  ----------------\n");
434	}
435
436	*count += 1;
437
438	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
439	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
440	    (u_longlong_t)record->zi_nlanes,
441	    (u_longlong_t)record->zi_guid);
442
443	return (0);
444}
445
446static int
447print_panic_handler(int id, const char *pool, zinject_record_t *record,
448    void *data)
449{
450	int *count = data;
451
452	if (record->zi_func[0] == '\0')
453		return (0);
454
455	if (*count == 0) {
456		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
457		(void) printf("---  ---------------  ----------------\n");
458	}
459
460	*count += 1;
461
462	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
463
464	return (0);
465}
466
467/*
468 * Print all registered error handlers.  Returns the number of handlers
469 * registered.
470 */
471static int
472print_all_handlers(void)
473{
474	int count = 0, total = 0;
475
476	(void) iter_handlers(print_device_handler, &count);
477	if (count > 0) {
478		total += count;
479		(void) printf("\n");
480		count = 0;
481	}
482
483	(void) iter_handlers(print_delay_handler, &count);
484	if (count > 0) {
485		total += count;
486		(void) printf("\n");
487		count = 0;
488	}
489
490	(void) iter_handlers(print_data_handler, &count);
491	if (count > 0) {
492		total += count;
493		(void) printf("\n");
494		count = 0;
495	}
496
497	(void) iter_handlers(print_panic_handler, &count);
498
499	return (count + total);
500}
501
502/* ARGSUSED */
503static int
504cancel_one_handler(int id, const char *pool, zinject_record_t *record,
505    void *data)
506{
507	zfs_cmd_t zc = { 0 };
508
509	zc.zc_guid = (uint64_t)id;
510
511	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
512		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
513		    id, strerror(errno));
514		return (1);
515	}
516
517	return (0);
518}
519
520/*
521 * Remove all fault injection handlers.
522 */
523static int
524cancel_all_handlers(void)
525{
526	int ret = iter_handlers(cancel_one_handler, NULL);
527
528	if (ret == 0)
529		(void) printf("removed all registered handlers\n");
530
531	return (ret);
532}
533
534/*
535 * Remove a specific fault injection handler.
536 */
537static int
538cancel_handler(int id)
539{
540	zfs_cmd_t zc = { 0 };
541
542	zc.zc_guid = (uint64_t)id;
543
544	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
545		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
546		    id, strerror(errno));
547		return (1);
548	}
549
550	(void) printf("removed handler %d\n", id);
551
552	return (0);
553}
554
555/*
556 * Register a new fault injection handler.
557 */
558static int
559register_handler(const char *pool, int flags, zinject_record_t *record,
560    int quiet)
561{
562	zfs_cmd_t zc = { 0 };
563
564	(void) strcpy(zc.zc_name, pool);
565	zc.zc_inject_record = *record;
566	zc.zc_guid = flags;
567
568	if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
569		(void) fprintf(stderr, "failed to add handler: %s\n",
570		    errno == EDOM ? "block level exceeds max level of object" :
571		    strerror(errno));
572		return (1);
573	}
574
575	if (flags & ZINJECT_NULL)
576		return (0);
577
578	if (quiet) {
579		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
580	} else {
581		(void) printf("Added handler %llu with the following "
582		    "properties:\n", (u_longlong_t)zc.zc_guid);
583		(void) printf("  pool: %s\n", pool);
584		if (record->zi_guid) {
585			(void) printf("  vdev: %llx\n",
586			    (u_longlong_t)record->zi_guid);
587		} else if (record->zi_func[0] != '\0') {
588			(void) printf("  panic function: %s\n",
589			    record->zi_func);
590		} else if (record->zi_duration > 0) {
591			(void) printf(" time: %lld seconds\n",
592			    (u_longlong_t)record->zi_duration);
593		} else if (record->zi_duration < 0) {
594			(void) printf(" txgs: %lld \n",
595			    (u_longlong_t)-record->zi_duration);
596		} else {
597			(void) printf("objset: %llu\n",
598			    (u_longlong_t)record->zi_objset);
599			(void) printf("object: %llu\n",
600			    (u_longlong_t)record->zi_object);
601			(void) printf("  type: %llu\n",
602			    (u_longlong_t)record->zi_type);
603			(void) printf(" level: %d\n", record->zi_level);
604			if (record->zi_start == 0 &&
605			    record->zi_end == -1ULL)
606				(void) printf(" range: all\n");
607			else
608				(void) printf(" range: [%llu, %llu)\n",
609				    (u_longlong_t)record->zi_start,
610				    (u_longlong_t)record->zi_end);
611			(void) printf("  dvas: 0x%x\n", record->zi_dvas);
612		}
613	}
614
615	return (0);
616}
617
618int
619perform_action(const char *pool, zinject_record_t *record, int cmd)
620{
621	zfs_cmd_t zc = { 0 };
622
623	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
624	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
625	zc.zc_guid = record->zi_guid;
626	zc.zc_cookie = cmd;
627
628	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
629		return (0);
630
631	return (1);
632}
633
/*
 * Parse a '-D' argument of the form "latency:lanes".
 *
 *	str	the option argument; may contain only decimal digits and ':'
 *	delay	receives the latency, converted from milliseconds to
 *		nanoseconds
 *	nlanes	receives the number of lanes
 *
 * Returns 0 on success and 1 if the string is malformed or the latency is
 * zero.  The outputs are only written on success.
 */
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
	unsigned long scan_delay;
	unsigned long scan_nlanes;
	const char *p;
	char extra;

	/*
	 * Accept nothing but digits and the separator.  This keeps signs and
	 * whitespace away from the %lu conversions, which would otherwise
	 * accept them (a leading '-' wraps to a huge unsigned value).
	 */
	for (p = str; *p != '\0'; p++) {
		if ((*p < '0' || *p > '9') && *p != ':')
			return (1);
	}

	/*
	 * The trailing %c only converts if something follows the lane count,
	 * so a result of exactly 2 means the whole string was consumed.
	 */
	if (sscanf(str, "%lu:%lu%c", &scan_delay, &scan_nlanes, &extra) != 2)
		return (1);

	/*
	 * We explicitly disallow a delay of zero here, because we key
	 * off this value being non-zero in translate_device(), to
	 * determine if the fault is a ZINJECT_DELAY_IO fault or not.
	 */
	if (scan_delay == 0)
		return (1);

	/*
	 * The units for the CLI delay parameter is milliseconds, but
	 * the data passed to the kernel is interpreted as nanoseconds.
	 * Thus we scale the milliseconds to nanoseconds here, and this
	 * nanosecond value is used to pass the delay to the kernel.
	 */
	*delay = MSEC2NSEC(scan_delay);
	*nlanes = scan_nlanes;

	return (0);
}
662
663static int
664parse_frequency(const char *str, uint32_t *percent)
665{
666	double val;
667	char *post;
668
669	val = strtod(str, &post);
670	if (post == NULL || *post != '\0')
671		return (EINVAL);
672
673	/* valid range is [0.0001, 100.0] */
674	val /= 100.0f;
675	if (val < 0.000001f || val > 1.0f)
676		return (ERANGE);
677
678	/* convert to an integer for use by kernel */
679	*percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX));
680
681	return (0);
682}
683
/*
 * This function converts a string specifier for DVAs into a bit mask.
 * The dva's provided by the user should be 0 indexed and separated by
 * a comma. For example:
 *     "1"     -> 0b0010  (0x2)
 *     "0,1"   -> 0b0011  (0x3)
 *     "0,1,2" -> 0b0111  (0x7)
 *
 * Only the indices 0, 1 and 2 are accepted, each at most once, and every
 * comma must sit between two indices: leading, trailing and doubled commas
 * are rejected.
 *
 * Returns 0 and stores the mask in *dvas_out on success; returns EINVAL and
 * leaves *dvas_out untouched otherwise.
 */
static int
parse_dvas(const char *str, uint32_t *dvas_out)
{
	const char *c;
	uint32_t mask = 0;
	uint32_t bit;
	size_t len = strlen(str);
	int need_delim = 0;	/* non-zero right after a DVA index */

	/* max string length is 5 ("0,1,2") */
	if (len == 0 || len > 5)
		return (EINVAL);

	for (c = str; *c != '\0'; c++) {
		if (*c == ',') {
			/* a comma is only valid directly after an index */
			if (!need_delim)
				return (EINVAL);
			need_delim = 0;
			continue;
		}

		/* check for invalid character */
		if (*c < '0' || *c > '2')
			return (EINVAL);

		/* check for a missing comma between DVAs */
		if (need_delim)
			return (EINVAL);

		/* check if this DVA has been set already */
		bit = 1U << (*c - '0');
		if (mask & bit)
			return (EINVAL);

		mask |= bit;
		need_delim = 1;
	}

	/* check for dangling delimiter */
	if (!need_delim)
		return (EINVAL);

	*dvas_out = mask;
	return (0);
}
736
737int
738main(int argc, char **argv)
739{
740	int c;
741	char *range = NULL;
742	char *cancel = NULL;
743	char *end;
744	char *raw = NULL;
745	char *device = NULL;
746	int level = 0;
747	int quiet = 0;
748	int error = 0;
749	int domount = 0;
750	int io_type = ZIO_TYPES;
751	int action = VDEV_STATE_UNKNOWN;
752	err_type_t type = TYPE_INVAL;
753	err_type_t label = TYPE_INVAL;
754	zinject_record_t record = { 0 };
755	char pool[MAXNAMELEN];
756	char dataset[MAXNAMELEN];
757	zfs_handle_t *zhp;
758	int nowrites = 0;
759	int dur_txg = 0;
760	int dur_secs = 0;
761	int ret;
762	int flags = 0;
763	uint32_t dvas = 0;
764
765	if ((g_zfs = libzfs_init()) == NULL) {
766		(void) fprintf(stderr, "internal error: failed to "
767		    "initialize ZFS library\n");
768		return (1);
769	}
770
771	libzfs_print_on_error(g_zfs, B_TRUE);
772
773	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
774		(void) fprintf(stderr, "failed to open ZFS device\n");
775		libzfs_fini(g_zfs);
776		return (1);
777	}
778
779	if (argc == 1) {
780		/*
781		 * No arguments.  Print the available handlers.  If there are no
782		 * available handlers, direct the user to '-h' for help
783		 * information.
784		 */
785		if (print_all_handlers() == 0) {
786			(void) printf("No handlers registered.\n");
787			(void) printf("Run 'zinject -h' for usage "
788			    "information.\n");
789		}
790
791		libzfs_fini(g_zfs);
792		return (0);
793	}
794
795	while ((c = getopt(argc, argv,
796	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
797		switch (c) {
798		case 'a':
799			flags |= ZINJECT_FLUSH_ARC;
800			break;
801		case 'A':
802			if (strcasecmp(optarg, "degrade") == 0) {
803				action = VDEV_STATE_DEGRADED;
804			} else if (strcasecmp(optarg, "fault") == 0) {
805				action = VDEV_STATE_FAULTED;
806			} else {
807				(void) fprintf(stderr, "invalid action '%s': "
808				    "must be 'degrade' or 'fault'\n", optarg);
809				usage();
810				libzfs_fini(g_zfs);
811				return (1);
812			}
813			break;
814		case 'b':
815			raw = optarg;
816			break;
817		case 'c':
818			cancel = optarg;
819			break;
820		case 'C':
821			ret = parse_dvas(optarg, &dvas);
822			if (ret != 0) {
823				(void) fprintf(stderr, "invalid DVA list '%s': "
824				    "DVAs should be 0 indexed and separated by "
825				    "commas.\n", optarg);
826				usage();
827				libzfs_fini(g_zfs);
828				return (1);
829			}
830			break;
831		case 'd':
832			device = optarg;
833			break;
834		case 'D':
835			ret = parse_delay(optarg, &record.zi_timer,
836			    &record.zi_nlanes);
837			if (ret != 0) {
838				(void) fprintf(stderr, "invalid i/o delay "
839				    "value: '%s'\n", optarg);
840				usage();
841				libzfs_fini(g_zfs);
842				return (1);
843			}
844			break;
845		case 'e':
846			if (strcasecmp(optarg, "io") == 0) {
847				error = EIO;
848			} else if (strcasecmp(optarg, "checksum") == 0) {
849				error = ECKSUM;
850			} else if (strcasecmp(optarg, "decrypt") == 0) {
851				error = EACCES;
852			} else if (strcasecmp(optarg, "nxio") == 0) {
853				error = ENXIO;
854			} else if (strcasecmp(optarg, "dtl") == 0) {
855				error = ECHILD;
856			} else {
857				(void) fprintf(stderr, "invalid error type "
858				    "'%s': must be 'io', 'checksum' or "
859				    "'nxio'\n", optarg);
860				usage();
861				return (1);
862			}
863			break;
864		case 'f':
865			ret = parse_frequency(optarg, &record.zi_freq);
866			if (ret != 0) {
867				(void) fprintf(stderr, "%sfrequency value must "
868				    "be in the range [0.0001, 100.0]\n",
869				    ret == EINVAL ? "invalid value: " :
870				    ret == ERANGE ? "out of range: " : "");
871				libzfs_fini(g_zfs);
872				return (1);
873			}
874			break;
875		case 'F':
876			record.zi_failfast = B_TRUE;
877			break;
878		case 'g':
879			dur_txg = 1;
880			record.zi_duration = (int)strtol(optarg, &end, 10);
881			if (record.zi_duration <= 0 || *end != '\0') {
882				(void) fprintf(stderr, "invalid duration '%s': "
883				    "must be a positive integer\n", optarg);
884				usage();
885				libzfs_fini(g_zfs);
886				return (1);
887			}
888			/* store duration of txgs as its negative */
889			record.zi_duration *= -1;
890			break;
891		case 'h':
892			usage();
893			libzfs_fini(g_zfs);
894			return (0);
895		case 'I':
896			/* default duration, if one hasn't yet been defined */
897			nowrites = 1;
898			if (dur_secs == 0 && dur_txg == 0)
899				record.zi_duration = 30;
900			break;
901		case 'l':
902			level = (int)strtol(optarg, &end, 10);
903			if (*end != '\0') {
904				(void) fprintf(stderr, "invalid level '%s': "
905				    "must be an integer\n", optarg);
906				usage();
907				libzfs_fini(g_zfs);
908				return (1);
909			}
910			break;
911		case 'm':
912			domount = 1;
913			break;
914		case 'p':
915			(void) strlcpy(record.zi_func, optarg,
916			    sizeof (record.zi_func));
917			record.zi_cmd = ZINJECT_PANIC;
918			break;
919		case 'q':
920			quiet = 1;
921			break;
922		case 'r':
923			range = optarg;
924			flags |= ZINJECT_CALC_RANGE;
925			break;
926		case 's':
927			dur_secs = 1;
928			record.zi_duration = (int)strtol(optarg, &end, 10);
929			if (record.zi_duration <= 0 || *end != '\0') {
930				(void) fprintf(stderr, "invalid duration '%s': "
931				    "must be a positive integer\n", optarg);
932				usage();
933				libzfs_fini(g_zfs);
934				return (1);
935			}
936			break;
937		case 'T':
938			if (strcasecmp(optarg, "read") == 0) {
939				io_type = ZIO_TYPE_READ;
940			} else if (strcasecmp(optarg, "write") == 0) {
941				io_type = ZIO_TYPE_WRITE;
942			} else if (strcasecmp(optarg, "free") == 0) {
943				io_type = ZIO_TYPE_FREE;
944			} else if (strcasecmp(optarg, "claim") == 0) {
945				io_type = ZIO_TYPE_CLAIM;
946			} else if (strcasecmp(optarg, "all") == 0) {
947				io_type = ZIO_TYPES;
948			} else {
949				(void) fprintf(stderr, "invalid I/O type "
950				    "'%s': must be 'read', 'write', 'free', "
951				    "'claim' or 'all'\n", optarg);
952				usage();
953				libzfs_fini(g_zfs);
954				return (1);
955			}
956			break;
957		case 't':
958			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
959			    !MOS_TYPE(type)) {
960				(void) fprintf(stderr, "invalid type '%s'\n",
961				    optarg);
962				usage();
963				libzfs_fini(g_zfs);
964				return (1);
965			}
966			break;
967		case 'u':
968			flags |= ZINJECT_UNLOAD_SPA;
969			break;
970		case 'L':
971			if ((label = name_to_type(optarg)) == TYPE_INVAL &&
972			    !LABEL_TYPE(type)) {
973				(void) fprintf(stderr, "invalid label type "
974				    "'%s'\n", optarg);
975				usage();
976				libzfs_fini(g_zfs);
977				return (1);
978			}
979			break;
980		case ':':
981			(void) fprintf(stderr, "option -%c requires an "
982			    "operand\n", optopt);
983			usage();
984			libzfs_fini(g_zfs);
985			return (1);
986		case '?':
987			(void) fprintf(stderr, "invalid option '%c'\n",
988			    optopt);
989			usage();
990			libzfs_fini(g_zfs);
991			return (2);
992		}
993	}
994
995	argc -= optind;
996	argv += optind;
997
998	if (record.zi_duration != 0)
999		record.zi_cmd = ZINJECT_IGNORED_WRITES;
1000
1001	if (cancel != NULL) {
1002		/*
1003		 * '-c' is invalid with any other options.
1004		 */
1005		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1006		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
1007		    record.zi_freq > 0 || dvas != 0) {
1008			(void) fprintf(stderr, "cancel (-c) incompatible with "
1009			    "any other options\n");
1010			usage();
1011			libzfs_fini(g_zfs);
1012			return (2);
1013		}
1014		if (argc != 0) {
1015			(void) fprintf(stderr, "extraneous argument to '-c'\n");
1016			usage();
1017			libzfs_fini(g_zfs);
1018			return (2);
1019		}
1020
1021		if (strcmp(cancel, "all") == 0) {
1022			return (cancel_all_handlers());
1023		} else {
1024			int id = (int)strtol(cancel, &end, 10);
1025			if (*end != '\0') {
1026				(void) fprintf(stderr, "invalid handle id '%s':"
1027				    " must be an integer or 'all'\n", cancel);
1028				usage();
1029				libzfs_fini(g_zfs);
1030				return (1);
1031			}
1032			return (cancel_handler(id));
1033		}
1034	}
1035
1036	if (device != NULL) {
1037		/*
1038		 * Device (-d) injection uses a completely different mechanism
1039		 * for doing injection, so handle it separately here.
1040		 */
1041		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1042		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
1043		    dvas != 0) {
1044			(void) fprintf(stderr, "device (-d) incompatible with "
1045			    "data error injection\n");
1046			usage();
1047			libzfs_fini(g_zfs);
1048			return (2);
1049		}
1050
1051		if (argc != 1) {
1052			(void) fprintf(stderr, "device (-d) injection requires "
1053			    "a single pool name\n");
1054			usage();
1055			libzfs_fini(g_zfs);
1056			return (2);
1057		}
1058
1059		(void) strlcpy(pool, argv[0], sizeof (pool));
1060		dataset[0] = '\0';
1061
1062		if (error == ECKSUM) {
1063			(void) fprintf(stderr, "device error type must be "
1064			    "'io' or 'nxio'\n");
1065			libzfs_fini(g_zfs);
1066			return (1);
1067		}
1068
1069		record.zi_iotype = io_type;
1070		if (translate_device(pool, device, label, &record) != 0) {
1071			libzfs_fini(g_zfs);
1072			return (1);
1073		}
1074		if (!error)
1075			error = ENXIO;
1076
1077		if (action != VDEV_STATE_UNKNOWN)
1078			return (perform_action(pool, &record, action));
1079
1080	} else if (raw != NULL) {
1081		if (range != NULL || type != TYPE_INVAL || level != 0 ||
1082		    record.zi_cmd != ZINJECT_UNINITIALIZED ||
1083		    record.zi_freq > 0 || dvas != 0) {
1084			(void) fprintf(stderr, "raw (-b) format with "
1085			    "any other options\n");
1086			usage();
1087			libzfs_fini(g_zfs);
1088			return (2);
1089		}
1090
1091		if (argc != 1) {
1092			(void) fprintf(stderr, "raw (-b) format expects a "
1093			    "single pool name\n");
1094			usage();
1095			libzfs_fini(g_zfs);
1096			return (2);
1097		}
1098
1099		(void) strlcpy(pool, argv[0], sizeof (pool));
1100		dataset[0] = '\0';
1101
1102		if (error == ENXIO) {
1103			(void) fprintf(stderr, "data error type must be "
1104			    "'checksum' or 'io'\n");
1105			libzfs_fini(g_zfs);
1106			return (1);
1107		}
1108
1109		record.zi_cmd = ZINJECT_DATA_FAULT;
1110		if (translate_raw(raw, &record) != 0) {
1111			libzfs_fini(g_zfs);
1112			return (1);
1113		}
1114		if (!error)
1115			error = EIO;
1116	} else if (record.zi_cmd == ZINJECT_PANIC) {
1117		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1118		    level != 0 || device != NULL || record.zi_freq > 0 ||
1119		    dvas != 0) {
1120			(void) fprintf(stderr, "panic (-p) incompatible with "
1121			    "other options\n");
1122			usage();
1123			libzfs_fini(g_zfs);
1124			return (2);
1125		}
1126
1127		if (argc < 1 || argc > 2) {
1128			(void) fprintf(stderr, "panic (-p) injection requires "
1129			    "a single pool name and an optional id\n");
1130			usage();
1131			libzfs_fini(g_zfs);
1132			return (2);
1133		}
1134
1135		(void) strlcpy(pool, argv[0], sizeof (pool));
1136		if (argv[1] != NULL)
1137			record.zi_type = atoi(argv[1]);
1138		dataset[0] = '\0';
1139	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
1140		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1141		    level != 0 || record.zi_freq > 0 || dvas != 0) {
1142			(void) fprintf(stderr, "hardware failure (-I) "
1143			    "incompatible with other options\n");
1144			usage();
1145			libzfs_fini(g_zfs);
1146			return (2);
1147		}
1148
1149		if (nowrites == 0) {
1150			(void) fprintf(stderr, "-s or -g meaningless "
1151			    "without -I (ignore writes)\n");
1152			usage();
1153			libzfs_fini(g_zfs);
1154			return (2);
1155		} else if (dur_secs && dur_txg) {
1156			(void) fprintf(stderr, "choose a duration either "
1157			    "in seconds (-s) or a number of txgs (-g) "
1158			    "but not both\n");
1159			usage();
1160			libzfs_fini(g_zfs);
1161			return (2);
1162		} else if (argc != 1) {
1163			(void) fprintf(stderr, "ignore writes (-I) "
1164			    "injection requires a single pool name\n");
1165			usage();
1166			libzfs_fini(g_zfs);
1167			return (2);
1168		}
1169
1170		(void) strlcpy(pool, argv[0], sizeof (pool));
1171		dataset[0] = '\0';
1172	} else if (type == TYPE_INVAL) {
1173		if (flags == 0) {
1174			(void) fprintf(stderr, "at least one of '-b', '-d', "
1175			    "'-t', '-a', '-p', '-I' or '-u' "
1176			    "must be specified\n");
1177			usage();
1178			libzfs_fini(g_zfs);
1179			return (2);
1180		}
1181
1182		if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
1183			(void) strlcpy(pool, argv[0], sizeof (pool));
1184			dataset[0] = '\0';
1185		} else if (argc != 0) {
1186			(void) fprintf(stderr, "extraneous argument for "
1187			    "'-f'\n");
1188			usage();
1189			libzfs_fini(g_zfs);
1190			return (2);
1191		}
1192
1193		flags |= ZINJECT_NULL;
1194	} else {
1195		if (argc != 1) {
1196			(void) fprintf(stderr, "missing object\n");
1197			usage();
1198			libzfs_fini(g_zfs);
1199			return (2);
1200		}
1201
1202		if (error == ENXIO) {
1203			(void) fprintf(stderr, "data error type must be "
1204			    "'checksum' or 'io'\n");
1205			libzfs_fini(g_zfs);
1206			return (1);
1207		}
1208
1209		if (dvas != 0) {
1210			if (error == EACCES || error == EINVAL) {
1211				(void) fprintf(stderr, "the '-C' option may "
1212				    "not be used with logical data errors "
1213				    "'decrypt' and 'decompress'\n");
1214				record.zi_dvas = dvas;
1215			}
1216		}
1217
1218		record.zi_cmd = ZINJECT_DATA_FAULT;
1219
1220		if (error == EACCES) {
1221			if (type != TYPE_DATA) {
1222				(void) fprintf(stderr, "decryption errors "
1223				    "may only be injected for 'data' types\n");
1224				libzfs_fini(g_zfs);
1225				return (1);
1226			}
1227
1228			record.zi_cmd = ZINJECT_DECRYPT_FAULT;
1229			/*
1230			 * Internally, ZFS actually uses ECKSUM for decryption
1231			 * errors since EACCES is used to indicate the key was
1232			 * not found.
1233			 */
1234			error = ECKSUM;
1235		}
1236
1237		if (translate_record(type, argv[0], range, level, &record, pool,
1238		    dataset) != 0) {
1239			libzfs_fini(g_zfs);
1240			return (1);
1241		}
1242		if (!error)
1243			error = EIO;
1244	}
1245
1246	/*
1247	 * If this is pool-wide metadata, unmount everything.  The ioctl() will
1248	 * unload the pool, so that we trigger spa-wide reopen of metadata next
1249	 * time we access the pool.
1250	 */
1251	if (dataset[0] != '\0' && domount) {
1252		if ((zhp = zfs_open(g_zfs, dataset,
1253		    ZFS_TYPE_DATASET)) == NULL) {
1254			libzfs_fini(g_zfs);
1255			return (1);
1256		}
1257
1258		if (zfs_unmount(zhp, NULL, 0) != 0) {
1259			libzfs_fini(g_zfs);
1260			return (1);
1261		}
1262	}
1263
1264	record.zi_error = error;
1265
1266	ret = register_handler(pool, flags, &record, quiet);
1267
1268	if (dataset[0] != '\0' && domount)
1269		ret = (zfs_mount(zhp, NULL, 0) != 0);
1270
1271	libzfs_fini(g_zfs);
1272
1273	return (ret);
1274}
1275