1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa.h>
30#include <sys/vdev_impl.h>
31#include <sys/vdev_disk.h>
32#include <sys/vdev_file.h>
33#include <sys/vdev_raidz.h>
34#include <sys/zio.h>
35#include <sys/zio_checksum.h>
36#include <sys/fs/zfs.h>
37#include <sys/fm/fs/zfs.h>
38
39/*
40 * Virtual device vector for RAID-Z.
41 *
42 * This vdev supports single, double, and triple parity. For single parity,
43 * we use a simple XOR of all the data columns. For double or triple parity,
44 * we use a special case of Reed-Solomon coding. This extends the
45 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
46 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
47 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
48 * former is also based. The latter is designed to provide higher performance
49 * for writes.
50 *
51 * Note that the Plank paper claimed to support arbitrary N+M, but was then
52 * amended six years later identifying a critical flaw that invalidates its
53 * claims. Nevertheless, the technique can be adapted to work for up to
54 * triple parity. For additional parity, the amendment "Note: Correction to
55 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
56 * is viable, but the additional complexity means that write performance will
57 * suffer.
58 *
 * All of the methods above operate on a Galois field with 2^N elements,
 * GF(2^N). In our case we choose N=8, i.e. GF(2^8), so that every element
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
63 *
64 *   o addition (+) is represented by a bitwise XOR
65 *   o subtraction (-) is therefore identical to addition: A + B = A - B
66 *   o multiplication of A by 2 is defined by the following bitwise expression:
67 *
68 *	(A * 2)_7 = A_6
69 *	(A * 2)_6 = A_5
70 *	(A * 2)_5 = A_4
71 *	(A * 2)_4 = A_3 + A_7
72 *	(A * 2)_3 = A_2 + A_7
73 *	(A * 2)_2 = A_1 + A_7
74 *	(A * 2)_1 = A_0
75 *	(A * 2)_0 = A_7
76 *
77 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
78 * As an aside, this multiplication is derived from the error correcting
79 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
80 *
81 * Observe that any number in the field (except for 0) can be expressed as a
82 * power of 2 -- a generator for the field. We store a table of the powers of
83 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
84 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
85 * than field addition). The inverse of a field element A (A^-1) is therefore
86 * A ^ (255 - 1) = A^254.
87 *
88 * The up-to-three parity columns, P, Q, R over several data columns,
89 * D_0, ... D_n-1, can be expressed by field operations:
90 *
91 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
92 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
93 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
94 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
95 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
96 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
98 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
99 * independent coefficients. (There are no additional coefficients that have
100 * this property which is why the uncorrected Plank method breaks down.)
101 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
104 */
105
/*
 * Per-column state for a single RAID-Z I/O.  A raidz_map_t carries an array
 * of these in rm_col[]: the parity columns come first (indices below
 * rm_firstdatacol), followed by the data columns.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
116
/*
 * Describes how one I/O is laid out across the columns of a RAID-Z vdev.
 * The structure is allocated with room for rm_scols entries in rm_col[]
 * (see vdev_raidz_map_alloc()), so it must be sized and freed using
 * offsetof(raidz_map_t, rm_col[rm_scols]) rather than sizeof.
 *
 * The map can outlive its zio: rm_reports counts the checksum reports that
 * still reference it and rm_freed records that the zio has let go of it.
 * Whichever of the two events happens last frees the map.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
133
/* Indices of the P, Q and R parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single field element (one byte) by 2 or by 4.  The shift is
 * performed after integer promotion, so the intermediate value can carry a
 * ninth bit; the result is masked back down to eight bits so that it is a
 * valid field element even when it is not immediately stored in a uint8_t.
 * Note that the argument is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	\
	((((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) & 0xff)
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
140
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Both x and mask must be modifiable 64-bit lvalues: x is updated in place
 * and mask is used as scratch space.  Each argument is evaluated several
 * times, so neither may have side effects.  The macros expand to a single
 * statement (do { } while (0)) so that they can be used safely as the body
 * of an unbraced if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	/* isolate the top bit of every byte ... */ \
	(mask) = (x) & 0x8080808080808080ULL; \
	/* ... and smear it into 0xff for each byte that had it set */ \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	/* shift each byte left by one without carrying into its neighbour */ \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

/* Multiply each byte of x by 4 by applying the multiplication by 2 twice. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
160
/*
 * Shift an offset forward by VDEV_LABEL_START_SIZE (defined outside this
 * file).  The argument is parenthesized so that compound expressions such
 * as "a ? b : c" or "a | b" are offset as a whole.
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
162
/*
 * Force reconstruction to use the general purpose method.
 *
 * This is a global with external linkage and no explicit initializer, so it
 * starts out as 0.  NOTE(review): the code that consults it lies outside
 * this part of the file; presumably a non-zero value bypasses the
 * special-case P, Q and PQ reconstruction routines -- confirm there.
 */
int vdev_raidz_default_to_general;
167
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] is 2^i.
 * The final entry (index 255) repeats the value of entry 0 (0x01), so an
 * exponent of exactly 255 can be looked up without being reduced first;
 * vdev_raidz_exp2() depends on this because it only reduces exponents that
 * are greater than 255.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above: for a non-zero element A,
 * vdev_raidz_log2[A] is the exponent i for which vdev_raidz_pow2[i] == A.
 * Zero has no logarithm; entry 0 is only a placeholder (it holds 0, the
 * same value as the genuine entry for 1), so callers must handle A == 0
 * themselves, as vdev_raidz_exp2() does.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
238
239static void vdev_raidz_generate_parity(raidz_map_t *rm);
240
241/*
242 * Multiply a given number by 2 raised to the given power.
243 */
244static uint8_t
245vdev_raidz_exp2(uint_t a, int exp)
246{
247	if (a == 0)
248		return (0);
249
250	ASSERT(exp >= 0);
251	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
252
253	exp += vdev_raidz_log2[a];
254	if (exp > 255)
255		exp -= 255;
256
257	return (vdev_raidz_pow2[exp]);
258}
259
260static void
261vdev_raidz_map_free(raidz_map_t *rm)
262{
263	int c;
264	size_t size;
265
266	for (c = 0; c < rm->rm_firstdatacol; c++) {
267		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
268
269		if (rm->rm_col[c].rc_gdata != NULL)
270			zio_buf_free(rm->rm_col[c].rc_gdata,
271			    rm->rm_col[c].rc_size);
272	}
273
274	size = 0;
275	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
276		size += rm->rm_col[c].rc_size;
277
278	if (rm->rm_datacopy != NULL)
279		zio_buf_free(rm->rm_datacopy, size);
280
281	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
282}
283
284static void
285vdev_raidz_map_free_vsd(zio_t *zio)
286{
287	raidz_map_t *rm = zio->io_vsd;
288
289	ASSERT0(rm->rm_freed);
290	rm->rm_freed = 1;
291
292	if (rm->rm_reports == 0)
293		vdev_raidz_map_free(rm);
294}
295
296/*ARGSUSED*/
297static void
298vdev_raidz_cksum_free(void *arg, size_t ignored)
299{
300	raidz_map_t *rm = arg;
301
302	ASSERT3U(rm->rm_reports, >, 0);
303
304	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
305		vdev_raidz_map_free(rm);
306}
307
308static void
309vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
310{
311	raidz_map_t *rm = zcr->zcr_cbdata;
312	size_t c = zcr->zcr_cbinfo;
313	size_t x;
314
315	const char *good = NULL;
316	const char *bad = rm->rm_col[c].rc_data;
317
318	if (good_data == NULL) {
319		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
320		return;
321	}
322
323	if (c < rm->rm_firstdatacol) {
324		/*
325		 * The first time through, calculate the parity blocks for
326		 * the good data (this relies on the fact that the good
327		 * data never changes for a given logical ZIO)
328		 */
329		if (rm->rm_col[0].rc_gdata == NULL) {
330			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
331			char *buf;
332
333			/*
334			 * Set up the rm_col[]s to generate the parity for
335			 * good_data, first saving the parity bufs and
336			 * replacing them with buffers to hold the result.
337			 */
338			for (x = 0; x < rm->rm_firstdatacol; x++) {
339				bad_parity[x] = rm->rm_col[x].rc_data;
340				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
341				    zio_buf_alloc(rm->rm_col[x].rc_size);
342			}
343
344			/* fill in the data columns from good_data */
345			buf = (char *)good_data;
346			for (; x < rm->rm_cols; x++) {
347				rm->rm_col[x].rc_data = buf;
348				buf += rm->rm_col[x].rc_size;
349			}
350
351			/*
352			 * Construct the parity from the good data.
353			 */
354			vdev_raidz_generate_parity(rm);
355
356			/* restore everything back to its original state */
357			for (x = 0; x < rm->rm_firstdatacol; x++)
358				rm->rm_col[x].rc_data = bad_parity[x];
359
360			buf = rm->rm_datacopy;
361			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
362				rm->rm_col[x].rc_data = buf;
363				buf += rm->rm_col[x].rc_size;
364			}
365		}
366
367		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
368		good = rm->rm_col[c].rc_gdata;
369	} else {
370		/* adjust good_data to point at the start of our column */
371		good = good_data;
372
373		for (x = rm->rm_firstdatacol; x < c; x++)
374			good += rm->rm_col[x].rc_size;
375	}
376
377	/* we drop the ereport if it ends up that the data was good */
378	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
379}
380
381/*
382 * Invoked indirectly by zfs_ereport_start_checksum(), called
383 * below when our read operation fails completely.  The main point
384 * is to keep a copy of everything we read from disk, so that at
385 * vdev_raidz_cksum_finish() time we can compare it with the good data.
386 */
387static void
388vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
389{
390	size_t c = (size_t)(uintptr_t)arg;
391	caddr_t buf;
392
393	raidz_map_t *rm = zio->io_vsd;
394	size_t size;
395
396	/* set up the report and bump the refcount  */
397	zcr->zcr_cbdata = rm;
398	zcr->zcr_cbinfo = c;
399	zcr->zcr_finish = vdev_raidz_cksum_finish;
400	zcr->zcr_free = vdev_raidz_cksum_free;
401
402	rm->rm_reports++;
403	ASSERT3U(rm->rm_reports, >, 0);
404
405	if (rm->rm_datacopy != NULL)
406		return;
407
408	/*
409	 * It's the first time we're called for this raidz_map_t, so we need
410	 * to copy the data aside; there's no guarantee that our zio's buffer
411	 * won't be re-used for something else.
412	 *
413	 * Our parity data is already in separate buffers, so there's no need
414	 * to copy them.
415	 */
416
417	size = 0;
418	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
419		size += rm->rm_col[c].rc_size;
420
421	buf = rm->rm_datacopy = zio_buf_alloc(size);
422
423	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
424		raidz_col_t *col = &rm->rm_col[c];
425
426		bcopy(col->rc_data, buf, col->rc_size);
427		col->rc_data = buf;
428
429		buf += col->rc_size;
430	}
431	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
432}
433
/*
 * Operations vector for the raidz map: first the routine that releases the
 * map when the zio is done with it, then the routine that prepares a
 * checksum report.  The initializer is positional, so the order must match
 * the member order of zio_vsd_ops_t (declared outside this file).
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
438
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Arguments:
 *	data		buffer holding the I/O's data; the data columns are
 *			pointed at consecutive pieces of it (it is not copied)
 *	size		size of the I/O in bytes
 *	offset		byte offset of the I/O on the RAID-Z vdev
 *	unit_shift	log2 of the sector size used to convert size and
 *			offset into sectors
 *	dcols		number of columns (data plus parity) to spread across
 *	nparity		number of parity columns
 *
 * Returns a newly allocated map (KM_SLEEP allocation) with freshly allocated
 * parity buffers; it is released with vdev_raidz_map_free().
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	/* rm_col[] is sized for scols entries; see vdev_raidz_map_free(). */
	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/*
	 * Assign each column its child device and offset.  Columns that wrap
	 * past the last child continue on the first child, one sector further
	 * in.
	 */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		/* skipped columns carry no data; big columns get one extra */
		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get buffers of their own. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/* c is now the first data column, which starts at the caller's data */
	rm->rm_col[c].rc_data = data;

	/* each later data column begins where the previous one ends */
	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	/* Only the device and offset are exchanged; the buffers stay put. */
	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
585
586static void
587vdev_raidz_generate_parity_p(raidz_map_t *rm)
588{
589	uint64_t *p, *src, pcount, ccount, i;
590	int c;
591
592	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
593
594	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
595		src = rm->rm_col[c].rc_data;
596		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
597		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
598
599		if (c == rm->rm_firstdatacol) {
600			ASSERT(ccount == pcount);
601			for (i = 0; i < ccount; i++, src++, p++) {
602				*p = *src;
603			}
604		} else {
605			ASSERT(ccount <= pcount);
606			for (i = 0; i < ccount; i++, src++, p++) {
607				*p ^= *src;
608			}
609		}
610	}
611}
612
613static void
614vdev_raidz_generate_parity_pq(raidz_map_t *rm)
615{
616	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
617	int c;
618
619	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
620	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
621	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
622
623	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
624		src = rm->rm_col[c].rc_data;
625		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
626		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
627
628		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
629
630		if (c == rm->rm_firstdatacol) {
631			ASSERT(ccnt == pcnt || ccnt == 0);
632			for (i = 0; i < ccnt; i++, src++, p++, q++) {
633				*p = *src;
634				*q = *src;
635			}
636			for (; i < pcnt; i++, src++, p++, q++) {
637				*p = 0;
638				*q = 0;
639			}
640		} else {
641			ASSERT(ccnt <= pcnt);
642
643			/*
644			 * Apply the algorithm described above by multiplying
645			 * the previous result and adding in the new value.
646			 */
647			for (i = 0; i < ccnt; i++, src++, p++, q++) {
648				*p ^= *src;
649
650				VDEV_RAIDZ_64MUL_2(*q, mask);
651				*q ^= *src;
652			}
653
654			/*
655			 * Treat short columns as though they are full of 0s.
656			 * Note that there's therefore nothing needed for P.
657			 */
658			for (; i < pcnt; i++, q++) {
659				VDEV_RAIDZ_64MUL_2(*q, mask);
660			}
661		}
662	}
663}
664
665static void
666vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
667{
668	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
669	int c;
670
671	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
672	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
673	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
674	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
675	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
676
677	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
678		src = rm->rm_col[c].rc_data;
679		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
680		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
681		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
682
683		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
684
685		if (c == rm->rm_firstdatacol) {
686			ASSERT(ccnt == pcnt || ccnt == 0);
687			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
688				*p = *src;
689				*q = *src;
690				*r = *src;
691			}
692			for (; i < pcnt; i++, src++, p++, q++, r++) {
693				*p = 0;
694				*q = 0;
695				*r = 0;
696			}
697		} else {
698			ASSERT(ccnt <= pcnt);
699
700			/*
701			 * Apply the algorithm described above by multiplying
702			 * the previous result and adding in the new value.
703			 */
704			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
705				*p ^= *src;
706
707				VDEV_RAIDZ_64MUL_2(*q, mask);
708				*q ^= *src;
709
710				VDEV_RAIDZ_64MUL_4(*r, mask);
711				*r ^= *src;
712			}
713
714			/*
715			 * Treat short columns as though they are full of 0s.
716			 * Note that there's therefore nothing needed for P.
717			 */
718			for (; i < pcnt; i++, q++, r++) {
719				VDEV_RAIDZ_64MUL_2(*q, mask);
720				VDEV_RAIDZ_64MUL_4(*r, mask);
721			}
722		}
723	}
724}
725
726/*
727 * Generate RAID parity in the first virtual columns according to the number of
728 * parity columns available.
729 */
730static void
731vdev_raidz_generate_parity(raidz_map_t *rm)
732{
733	switch (rm->rm_firstdatacol) {
734	case 1:
735		vdev_raidz_generate_parity_p(rm);
736		break;
737	case 2:
738		vdev_raidz_generate_parity_pq(rm);
739		break;
740	case 3:
741		vdev_raidz_generate_parity_pqr(rm);
742		break;
743	default:
744		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
745	}
746}
747
748static int
749vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
750{
751	uint64_t *dst, *src, xcount, ccount, count, i;
752	int x = tgts[0];
753	int c;
754
755	ASSERT(ntgts == 1);
756	ASSERT(x >= rm->rm_firstdatacol);
757	ASSERT(x < rm->rm_cols);
758
759	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
760	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
761	ASSERT(xcount > 0);
762
763	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
764	dst = rm->rm_col[x].rc_data;
765	for (i = 0; i < xcount; i++, dst++, src++) {
766		*dst = *src;
767	}
768
769	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
770		src = rm->rm_col[c].rc_data;
771		dst = rm->rm_col[x].rc_data;
772
773		if (c == x)
774			continue;
775
776		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
777		count = MIN(ccount, xcount);
778
779		for (i = 0; i < count; i++, dst++, src++) {
780			*dst ^= *src;
781		}
782	}
783
784	return (1 << VDEV_RAIDZ_P);
785}
786
787static int
788vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
789{
790	uint64_t *dst, *src, xcount, ccount, count, mask, i;
791	uint8_t *b;
792	int x = tgts[0];
793	int c, j, exp;
794
795	ASSERT(ntgts == 1);
796
797	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
798	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
799
800	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
801		src = rm->rm_col[c].rc_data;
802		dst = rm->rm_col[x].rc_data;
803
804		if (c == x)
805			ccount = 0;
806		else
807			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
808
809		count = MIN(ccount, xcount);
810
811		if (c == rm->rm_firstdatacol) {
812			for (i = 0; i < count; i++, dst++, src++) {
813				*dst = *src;
814			}
815			for (; i < xcount; i++, dst++) {
816				*dst = 0;
817			}
818
819		} else {
820			for (i = 0; i < count; i++, dst++, src++) {
821				VDEV_RAIDZ_64MUL_2(*dst, mask);
822				*dst ^= *src;
823			}
824
825			for (; i < xcount; i++, dst++) {
826				VDEV_RAIDZ_64MUL_2(*dst, mask);
827			}
828		}
829	}
830
831	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
832	dst = rm->rm_col[x].rc_data;
833	exp = 255 - (rm->rm_cols - 1 - x);
834
835	for (i = 0; i < xcount; i++, dst++, src++) {
836		*dst ^= *src;
837		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
838			*b = vdev_raidz_exp2(*b, exp);
839		}
840	}
841
842	return (1 << VDEV_RAIDZ_Q);
843}
844
/*
 * Rebuild two missing data columns, tgts[0] (x) and tgts[1] (y) with x < y,
 * using both P and Q.  The map's P and Q buffers are swapped out for
 * scratch buffers while the computation runs and are put back before
 * returning.  Returns a bitmask naming the parity columns that were used.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	/* The scratch parity is computed; give x and y their sizes back. */
	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 *
	 * Here ndevs is rm_cols.  Negative powers of 2 are looked up as
	 * 2^(255 - k), and the inverse of an element is 2^(255 - log).
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	/* tmp is the exponent of (2^(x - y) + 1)^-1 */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	/* aexp and bexp are the logs of A and B from the comment above */
	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	/* Solve a byte at a time; y may be shorter than x. */
	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
935
936/* BEGIN CSTYLED */
937/*
938 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
940 * the contents of the data and parity disks. This can be expressed with
941 * vectors for the original data (D) and the actual data (d) and parity (p)
942 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
943 *
944 *            __   __                     __     __
945 *            |     |         __     __   |  p_0  |
946 *            |  V  |         |  D_0  |   | p_m-1 |
947 *            |     |    x    |   :   | = |  d_0  |
948 *            |  I  |         | D_n-1 |   |   :   |
949 *            |     |         ~~     ~~   | d_n-1 |
950 *            ~~   ~~                     ~~     ~~
951 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
954 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
955 * computation as well as linear separability.
956 *
957 *      __               __               __     __
958 *      |   1   ..  1 1 1 |               |  p_0  |
959 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
960 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
961 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
962 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
963 *      |   :       : : : |   |   :   |   |  d_2  |
964 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
965 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
966 *      |   0   ..  0 0 1 |               | d_n-1 |
967 *      ~~               ~~               ~~     ~~
968 *
969 * Note that I, V, d, and p are known. To compute D, we must invert the
970 * matrix and use the known data and parity values to reconstruct the unknown
971 * data values. We begin by removing the rows in V|I and d|p that correspond
972 * to failed or missing columns; we then make V|I square (n x n) and d|p
973 * sized n by removing rows corresponding to unused parity from the bottom up
974 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
975 * using Gauss-Jordan elimination. In the example below we use m=3 parity
976 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
977 *           __                               __
978 *           |  1   1   1   1   1   1   1   1  |
979 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
980 *           |  19 205 116  29  64  16  4   1  |      / /
981 *           |  1   0   0   0   0   0   0   0  |     / /
982 *           |  0   1   0   0   0   0   0   0  | <--' /
983 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
984 *           |  0   0   0   1   0   0   0   0  |
985 *           |  0   0   0   0   1   0   0   0  |
986 *           |  0   0   0   0   0   1   0   0  |
987 *           |  0   0   0   0   0   0   1   0  |
988 *           |  0   0   0   0   0   0   0   1  |
989 *           ~~                               ~~
990 *           __                               __
991 *           |  1   1   1   1   1   1   1   1  |
992 *           |  19 205 116  29  64  16  4   1  |
993 *           |  1   0   0   0   0   0   0   0  |
994 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
995 *           |  0   0   0   0   1   0   0   0  |
996 *           |  0   0   0   0   0   1   0   0  |
997 *           |  0   0   0   0   0   0   1   0  |
998 *           |  0   0   0   0   0   0   0   1  |
999 *           ~~                               ~~
1000 *
1001 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003 * matrix is not singular.
1004 * __                                                                 __
1005 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1006 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1007 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1008 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1009 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1010 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1011 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1012 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1013 * ~~                                                                 ~~
1014 * __                                                                 __
1015 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1016 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1019 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1020 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1021 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1022 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1023 * ~~                                                                 ~~
1024 * __                                                                 __
1025 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1026 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1027 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1028 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1029 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1030 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1031 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1032 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1033 * ~~                                                                 ~~
1034 * __                                                                 __
1035 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1036 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1037 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1038 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1039 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1040 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1041 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1042 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1043 * ~~                                                                 ~~
1044 * __                                                                 __
1045 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1046 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1047 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1048 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1049 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1050 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1051 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1052 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1053 * ~~                                                                 ~~
1054 * __                                                                 __
1055 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1056 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1057 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1058 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1059 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1060 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1061 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1062 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1063 * ~~                                                                 ~~
1064 *                   __                               __
1065 *                   |  0   0   1   0   0   0   0   0  |
1066 *                   | 167 100  5   41 159 169 217 208 |
1067 *                   | 166 100  4   40 158 168 216 209 |
1068 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1069 *                   |  0   0   0   0   1   0   0   0  |
1070 *                   |  0   0   0   0   0   1   0   0  |
1071 *                   |  0   0   0   0   0   0   1   0  |
1072 *                   |  0   0   0   0   0   0   0   1  |
1073 *                   ~~                               ~~
1074 *
1075 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076 * of the missing data.
1077 *
1078 * As is apparent from the example above, the only non-trivial rows in the
1079 * inverse matrix correspond to the data disks that we're trying to
1080 * reconstruct. Indeed, those are the only rows we need as the others would
1081 * only be useful for reconstructing data known or assumed to be valid. For
1082 * that reason, we only build the coefficients in the rows that correspond to
1083 * targeted columns.
1084 */
1085/* END CSTYLED */
1086
1087static void
1088vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1089    uint8_t **rows)
1090{
1091	int i, j;
1092	int pow;
1093
1094	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1095
1096	/*
1097	 * Fill in the missing rows of interest.
1098	 */
1099	for (i = 0; i < nmap; i++) {
1100		ASSERT3S(0, <=, map[i]);
1101		ASSERT3S(map[i], <=, 2);
1102
1103		pow = map[i] * n;
1104		if (pow > 255)
1105			pow -= 255;
1106		ASSERT(pow <= 255);
1107
1108		for (j = 0; j < n; j++) {
1109			pow -= map[i];
1110			if (pow < 0)
1111				pow += 255;
1112			rows[i][j] = vdev_raidz_pow2[pow];
1113		}
1114	}
1115}
1116
1117static void
1118vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1119    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1120{
1121	int i, j, ii, jj;
1122	uint8_t log;
1123
1124	/*
1125	 * Assert that the first nmissing entries from the array of used
1126	 * columns correspond to parity columns and that subsequent entries
1127	 * correspond to data columns.
1128	 */
1129	for (i = 0; i < nmissing; i++) {
1130		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1131	}
1132	for (; i < n; i++) {
1133		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1134	}
1135
1136	/*
1137	 * First initialize the storage where we'll compute the inverse rows.
1138	 */
1139	for (i = 0; i < nmissing; i++) {
1140		for (j = 0; j < n; j++) {
1141			invrows[i][j] = (i == j) ? 1 : 0;
1142		}
1143	}
1144
1145	/*
1146	 * Subtract all trivial rows from the rows of consequence.
1147	 */
1148	for (i = 0; i < nmissing; i++) {
1149		for (j = nmissing; j < n; j++) {
1150			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1151			jj = used[j] - rm->rm_firstdatacol;
1152			ASSERT3S(jj, <, n);
1153			invrows[i][j] = rows[i][jj];
1154			rows[i][jj] = 0;
1155		}
1156	}
1157
1158	/*
1159	 * For each of the rows of interest, we must normalize it and subtract
1160	 * a multiple of it from the other rows.
1161	 */
1162	for (i = 0; i < nmissing; i++) {
1163		for (j = 0; j < missing[i]; j++) {
1164			ASSERT0(rows[i][j]);
1165		}
1166		ASSERT3U(rows[i][missing[i]], !=, 0);
1167
1168		/*
1169		 * Compute the inverse of the first element and multiply each
1170		 * element in the row by that value.
1171		 */
1172		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1173
1174		for (j = 0; j < n; j++) {
1175			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1176			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1177		}
1178
1179		for (ii = 0; ii < nmissing; ii++) {
1180			if (i == ii)
1181				continue;
1182
1183			ASSERT3U(rows[ii][missing[i]], !=, 0);
1184
1185			log = vdev_raidz_log2[rows[ii][missing[i]]];
1186
1187			for (j = 0; j < n; j++) {
1188				rows[ii][j] ^=
1189				    vdev_raidz_exp2(rows[i][j], log);
1190				invrows[ii][j] ^=
1191				    vdev_raidz_exp2(invrows[i][j], log);
1192			}
1193		}
1194	}
1195
1196	/*
1197	 * Verify that the data that is left in the rows are properly part of
1198	 * an identity matrix.
1199	 */
1200	for (i = 0; i < nmissing; i++) {
1201		for (j = 0; j < n; j++) {
1202			if (j == missing[i]) {
1203				ASSERT3U(rows[i][j], ==, 1);
1204			} else {
1205				ASSERT0(rows[i][j]);
1206			}
1207		}
1208	}
1209}
1210
1211static void
1212vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1213    int *missing, uint8_t **invrows, const uint8_t *used)
1214{
1215	int i, j, x, cc, c;
1216	uint8_t *src;
1217	uint64_t ccount;
1218	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1219	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1220	uint8_t log = 0;
1221	uint8_t val;
1222	int ll;
1223	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1224	uint8_t *p, *pp;
1225	size_t psize;
1226
1227	psize = sizeof (invlog[0][0]) * n * nmissing;
1228	p = kmem_alloc(psize, KM_SLEEP);
1229
1230	for (pp = p, i = 0; i < nmissing; i++) {
1231		invlog[i] = pp;
1232		pp += n;
1233	}
1234
1235	for (i = 0; i < nmissing; i++) {
1236		for (j = 0; j < n; j++) {
1237			ASSERT3U(invrows[i][j], !=, 0);
1238			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1239		}
1240	}
1241
1242	for (i = 0; i < n; i++) {
1243		c = used[i];
1244		ASSERT3U(c, <, rm->rm_cols);
1245
1246		src = rm->rm_col[c].rc_data;
1247		ccount = rm->rm_col[c].rc_size;
1248		for (j = 0; j < nmissing; j++) {
1249			cc = missing[j] + rm->rm_firstdatacol;
1250			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1251			ASSERT3U(cc, <, rm->rm_cols);
1252			ASSERT3U(cc, !=, c);
1253
1254			dst[j] = rm->rm_col[cc].rc_data;
1255			dcount[j] = rm->rm_col[cc].rc_size;
1256		}
1257
1258		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1259
1260		for (x = 0; x < ccount; x++, src++) {
1261			if (*src != 0)
1262				log = vdev_raidz_log2[*src];
1263
1264			for (cc = 0; cc < nmissing; cc++) {
1265				if (x >= dcount[cc])
1266					continue;
1267
1268				if (*src == 0) {
1269					val = 0;
1270				} else {
1271					if ((ll = log + invlog[cc][i]) >= 255)
1272						ll -= 255;
1273					val = vdev_raidz_pow2[ll];
1274				}
1275
1276				if (i == 0)
1277					dst[cc][x] = val;
1278				else
1279					dst[cc][x] ^= val;
1280			}
1281		}
1282	}
1283
1284	kmem_free(p, psize);
1285}
1286
1287static int
1288vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1289{
1290	int n, i, c, t, tt;
1291	int nmissing_rows;
1292	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1293	int parity_map[VDEV_RAIDZ_MAXPARITY];
1294
1295	uint8_t *p, *pp;
1296	size_t psize;
1297
1298	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1299	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1300	uint8_t *used;
1301
1302	int code = 0;
1303
1304
1305	n = rm->rm_cols - rm->rm_firstdatacol;
1306
1307	/*
1308	 * Figure out which data columns are missing.
1309	 */
1310	nmissing_rows = 0;
1311	for (t = 0; t < ntgts; t++) {
1312		if (tgts[t] >= rm->rm_firstdatacol) {
1313			missing_rows[nmissing_rows++] =
1314			    tgts[t] - rm->rm_firstdatacol;
1315		}
1316	}
1317
1318	/*
1319	 * Figure out which parity columns to use to help generate the missing
1320	 * data columns.
1321	 */
1322	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1323		ASSERT(tt < ntgts);
1324		ASSERT(c < rm->rm_firstdatacol);
1325
1326		/*
1327		 * Skip any targeted parity columns.
1328		 */
1329		if (c == tgts[tt]) {
1330			tt++;
1331			continue;
1332		}
1333
1334		code |= 1 << c;
1335
1336		parity_map[i] = c;
1337		i++;
1338	}
1339
1340	ASSERT(code != 0);
1341	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1342
1343	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1344	    nmissing_rows * n + sizeof (used[0]) * n;
1345	p = kmem_alloc(psize, KM_SLEEP);
1346
1347	for (pp = p, i = 0; i < nmissing_rows; i++) {
1348		rows[i] = pp;
1349		pp += n;
1350		invrows[i] = pp;
1351		pp += n;
1352	}
1353	used = pp;
1354
1355	for (i = 0; i < nmissing_rows; i++) {
1356		used[i] = parity_map[i];
1357	}
1358
1359	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1360		if (tt < nmissing_rows &&
1361		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1362			tt++;
1363			continue;
1364		}
1365
1366		ASSERT3S(i, <, n);
1367		used[i] = c;
1368		i++;
1369	}
1370
1371	/*
1372	 * Initialize the interesting rows of the matrix.
1373	 */
1374	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1375
1376	/*
1377	 * Invert the matrix.
1378	 */
1379	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1380	    invrows, used);
1381
1382	/*
1383	 * Reconstruct the missing data using the generated matrix.
1384	 */
1385	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1386	    invrows, used);
1387
1388	kmem_free(p, psize);
1389
1390	return (code);
1391}
1392
1393static int
1394vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1395{
1396	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1397	int ntgts;
1398	int i, c;
1399	int code;
1400	int nbadparity, nbaddata;
1401	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1402
1403	/*
1404	 * The tgts list must already be sorted.
1405	 */
1406	for (i = 1; i < nt; i++) {
1407		ASSERT(t[i] > t[i - 1]);
1408	}
1409
1410	nbadparity = rm->rm_firstdatacol;
1411	nbaddata = rm->rm_cols - nbadparity;
1412	ntgts = 0;
1413	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1414		if (c < rm->rm_firstdatacol)
1415			parity_valid[c] = B_FALSE;
1416
1417		if (i < nt && c == t[i]) {
1418			tgts[ntgts++] = c;
1419			i++;
1420		} else if (rm->rm_col[c].rc_error != 0) {
1421			tgts[ntgts++] = c;
1422		} else if (c >= rm->rm_firstdatacol) {
1423			nbaddata--;
1424		} else {
1425			parity_valid[c] = B_TRUE;
1426			nbadparity--;
1427		}
1428	}
1429
1430	ASSERT(ntgts >= nt);
1431	ASSERT(nbaddata >= 0);
1432	ASSERT(nbaddata + nbadparity == ntgts);
1433
1434	dt = &tgts[nbadparity];
1435
1436	/*
1437	 * See if we can use any of our optimized reconstruction routines.
1438	 */
1439	if (!vdev_raidz_default_to_general) {
1440		switch (nbaddata) {
1441		case 1:
1442			if (parity_valid[VDEV_RAIDZ_P])
1443				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1444
1445			ASSERT(rm->rm_firstdatacol > 1);
1446
1447			if (parity_valid[VDEV_RAIDZ_Q])
1448				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1449
1450			ASSERT(rm->rm_firstdatacol > 2);
1451			break;
1452
1453		case 2:
1454			ASSERT(rm->rm_firstdatacol > 1);
1455
1456			if (parity_valid[VDEV_RAIDZ_P] &&
1457			    parity_valid[VDEV_RAIDZ_Q])
1458				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1459
1460			ASSERT(rm->rm_firstdatacol > 2);
1461
1462			break;
1463		}
1464	}
1465
1466	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1467	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1468	ASSERT(code > 0);
1469	return (code);
1470}
1471
1472static int
1473vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1474    uint64_t *ashift)
1475{
1476	vdev_t *cvd;
1477	uint64_t nparity = vd->vdev_nparity;
1478	int c;
1479	int lasterror = 0;
1480	int numerrors = 0;
1481
1482	ASSERT(nparity > 0);
1483
1484	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1485	    vd->vdev_children < nparity + 1) {
1486		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1487		return (SET_ERROR(EINVAL));
1488	}
1489
1490	vdev_open_children(vd);
1491
1492	for (c = 0; c < vd->vdev_children; c++) {
1493		cvd = vd->vdev_child[c];
1494
1495		if (cvd->vdev_open_error != 0) {
1496			lasterror = cvd->vdev_open_error;
1497			numerrors++;
1498			continue;
1499		}
1500
1501		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1502		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1503		*ashift = MAX(*ashift, cvd->vdev_ashift);
1504	}
1505
1506	*asize *= vd->vdev_children;
1507	*max_asize *= vd->vdev_children;
1508
1509	if (numerrors > nparity) {
1510		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1511		return (lasterror);
1512	}
1513
1514	return (0);
1515}
1516
1517static void
1518vdev_raidz_close(vdev_t *vd)
1519{
1520	int c;
1521
1522	for (c = 0; c < vd->vdev_children; c++)
1523		vdev_close(vd->vdev_child[c]);
1524}
1525
1526/*
1527 * Handle a read or write I/O to a RAID-Z dump device.
1528 *
1529 * The dump device is in a unique situation compared to other ZFS datasets:
1530 * writing to this device should be as simple and fast as possible.  In
1531 * addition, durability matters much less since the dump will be extracted
1532 * once the machine reboots.  For that reason, this function eschews parity for
1533 * performance and simplicity.  The dump device uses the checksum setting
1534 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1535 * dataset.
1536 *
1537 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1538 * 128 KB will not fill an entire block; in addition, they may not be properly
1539 * aligned.  In that case, this function uses the preallocated 128 KB block and
1540 * omits reading or writing any "empty" portions of that block, as opposed to
1541 * allocating a fresh appropriately-sized block.
1542 *
1543 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1544 *
1545 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1546 *
1547 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1548 * allocated which spans all five child vdevs.  8 KB of data would be written to
1549 * each of four vdevs, with the fifth containing the parity bits.
1550 *
1551 *       parity    data     data     data     data
1552 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1553 *         ^        ^        ^        ^        ^
1554 *         |        |        |        |        |
1555 *   8 KB parity    ------8 KB data blocks------
1556 *
1557 * However, when writing to the dump device, the behavior is different:
1558 *
1559 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1560 *
1561 * Unlike the normal RAID-Z case in which the block is allocated based on the
1562 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1563 * I/O size is less than 128 KB, only the actual portions of data are written.
1564 * In this example the data is written to the third data vdev since that vdev
1565 * contains the offset [64 KB, 96 KB).
1566 *
1567 *       parity    data     data     data     data
1568 *     |        |        |        |   XX   |        |
1569 *                                    ^
1570 *                                    |
1571 *                             32 KB data block
1572 *
1573 * As a result, an individual I/O may not span all child vdevs; moreover, a
1574 * small I/O may only operate on a single child vdev.
1575 *
1576 * Note that since there are no parity bits calculated or written, this format
1577 * remains the same no matter how many parity bits are used in a normal RAID-Z
1578 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1579 * would look like:
1580 *
1581 *       parity   parity   parity    data     data     data     data
1582 *     |        |        |        |        |        |   XX   |        |
1583 *                                                      ^
1584 *                                                      |
1585 *                                               32 KB data block
1586 */
1587int
1588vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1589    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1590{
1591	vdev_t *tvd = vd->vdev_top;
1592	vdev_t *cvd;
1593	raidz_map_t *rm;
1594	raidz_col_t *rc;
1595	int c, err = 0;
1596
1597	uint64_t start, end, colstart, colend;
1598	uint64_t coloffset, colsize, colskip;
1599
1600	int flags = doread ? B_READ : B_WRITE;
1601
1602#ifdef	_KERNEL
1603
1604	/*
1605	 * Don't write past the end of the block
1606	 */
1607	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1608
1609	start = offset;
1610	end = start + size;
1611
1612	/*
1613	 * Allocate a RAID-Z map for this block.  Note that this block starts
1614	 * from the "original" offset, this is, the offset of the extent which
1615	 * contains the requisite offset of the data being read or written.
1616	 *
1617	 * Even if this I/O operation doesn't span the full block size, let's
1618	 * treat the on-disk format as if the only blocks are the complete 128
1619	 * KB size.
1620	 */
1621	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1622	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1623	    vd->vdev_children, vd->vdev_nparity);
1624
1625	coloffset = origoffset;
1626
1627	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1628	    c++, coloffset += rc->rc_size) {
1629		rc = &rm->rm_col[c];
1630		cvd = vd->vdev_child[rc->rc_devidx];
1631
1632		/*
1633		 * Find the start and end of this column in the RAID-Z map,
1634		 * keeping in mind that the stated size and offset of the
1635		 * operation may not fill the entire column for this vdev.
1636		 *
1637		 * If any portion of the data spans this column, issue the
1638		 * appropriate operation to the vdev.
1639		 */
1640		if (coloffset + rc->rc_size <= start)
1641			continue;
1642		if (coloffset >= end)
1643			continue;
1644
1645		colstart = MAX(coloffset, start);
1646		colend = MIN(end, coloffset + rc->rc_size);
1647		colsize = colend - colstart;
1648		colskip = colstart - coloffset;
1649
1650		VERIFY3U(colsize, <=, rc->rc_size);
1651		VERIFY3U(colskip, <=, rc->rc_size);
1652
1653		/*
1654		 * Note that the child vdev will have a vdev label at the start
1655		 * of its range of offsets, hence the need for
1656		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1657		 * example of why this calculation is needed.
1658		 */
1659		if ((err = vdev_disk_physio(cvd,
1660		    ((char *)rc->rc_data) + colskip, colsize,
1661		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1662		    flags, isdump)) != 0)
1663			break;
1664	}
1665
1666	vdev_raidz_map_free(rm);
1667#endif	/* KERNEL */
1668
1669	return (err);
1670}
1671
1672static uint64_t
1673vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1674{
1675	uint64_t asize;
1676	uint64_t ashift = vd->vdev_top->vdev_ashift;
1677	uint64_t cols = vd->vdev_children;
1678	uint64_t nparity = vd->vdev_nparity;
1679
1680	asize = ((psize - 1) >> ashift) + 1;
1681	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1682	asize = roundup(asize, nparity + 1) << ashift;
1683
1684	return (asize);
1685}
1686
1687static void
1688vdev_raidz_child_done(zio_t *zio)
1689{
1690	raidz_col_t *rc = zio->io_private;
1691
1692	rc->rc_error = zio->io_error;
1693	rc->rc_tried = 1;
1694	rc->rc_skipped = 0;
1695}
1696
1697/*
1698 * Start an IO operation on a RAIDZ VDev
1699 *
1700 * Outline:
1701 * - For write operations:
1702 *   1. Generate the parity data
1703 *   2. Create child zio write operations to each column's vdev, for both
1704 *      data and parity.
1705 *   3. If the column skips any sectors for padding, create optional dummy
1706 *      write zio children for those areas to improve aggregation continuity.
1707 * - For read operations:
1708 *   1. Create child zio read operations to each data column's vdev to read
1709 *      the range of data required for zio.
1710 *   2. If this is a scrub or resilver operation, or if any of the data
1711 *      vdevs have had errors, then create zio read operations to the parity
1712 *      columns' VDevs as well.
1713 */
1714static void
1715vdev_raidz_io_start(zio_t *zio)
1716{
1717	vdev_t *vd = zio->io_vd;
1718	vdev_t *tvd = vd->vdev_top;
1719	vdev_t *cvd;
1720	raidz_map_t *rm;
1721	raidz_col_t *rc;
1722	int c, i;
1723
1724	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1725	    tvd->vdev_ashift, vd->vdev_children,
1726	    vd->vdev_nparity);
1727
1728	zio->io_vsd = rm;
1729	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1730
1731	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1732
1733	if (zio->io_type == ZIO_TYPE_WRITE) {
1734		vdev_raidz_generate_parity(rm);
1735
1736		for (c = 0; c < rm->rm_cols; c++) {
1737			rc = &rm->rm_col[c];
1738			cvd = vd->vdev_child[rc->rc_devidx];
1739			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1740			    rc->rc_offset, rc->rc_data, rc->rc_size,
1741			    zio->io_type, zio->io_priority, 0,
1742			    vdev_raidz_child_done, rc));
1743		}
1744
1745		/*
1746		 * Generate optional I/Os for any skipped sectors to improve
1747		 * aggregation contiguity.
1748		 */
1749		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1750			ASSERT(c <= rm->rm_scols);
1751			if (c == rm->rm_scols)
1752				c = 0;
1753			rc = &rm->rm_col[c];
1754			cvd = vd->vdev_child[rc->rc_devidx];
1755			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1756			    rc->rc_offset + rc->rc_size, NULL,
1757			    1 << tvd->vdev_ashift,
1758			    zio->io_type, zio->io_priority,
1759			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1760		}
1761
1762		zio_execute(zio);
1763		return;
1764	}
1765
1766	ASSERT(zio->io_type == ZIO_TYPE_READ);
1767
1768	/*
1769	 * Iterate over the columns in reverse order so that we hit the parity
1770	 * last -- any errors along the way will force us to read the parity.
1771	 */
1772	for (c = rm->rm_cols - 1; c >= 0; c--) {
1773		rc = &rm->rm_col[c];
1774		cvd = vd->vdev_child[rc->rc_devidx];
1775		if (!vdev_readable(cvd)) {
1776			if (c >= rm->rm_firstdatacol)
1777				rm->rm_missingdata++;
1778			else
1779				rm->rm_missingparity++;
1780			rc->rc_error = SET_ERROR(ENXIO);
1781			rc->rc_tried = 1;	/* don't even try */
1782			rc->rc_skipped = 1;
1783			continue;
1784		}
1785		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1786			if (c >= rm->rm_firstdatacol)
1787				rm->rm_missingdata++;
1788			else
1789				rm->rm_missingparity++;
1790			rc->rc_error = SET_ERROR(ESTALE);
1791			rc->rc_skipped = 1;
1792			continue;
1793		}
1794		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1795		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1796			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1797			    rc->rc_offset, rc->rc_data, rc->rc_size,
1798			    zio->io_type, zio->io_priority, 0,
1799			    vdev_raidz_child_done, rc));
1800		}
1801	}
1802
1803	zio_execute(zio);
1804}
1805
1806
1807/*
1808 * Report a checksum error for a child of a RAID-Z device.
1809 */
1810static void
1811raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1812{
1813	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1814
1815	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1816		zio_bad_cksum_t zbc;
1817		raidz_map_t *rm = zio->io_vsd;
1818
1819		mutex_enter(&vd->vdev_stat_lock);
1820		vd->vdev_stat.vs_checksum_errors++;
1821		mutex_exit(&vd->vdev_stat_lock);
1822
1823		zbc.zbc_has_cksum = 0;
1824		zbc.zbc_injected = rm->rm_ecksuminjected;
1825
1826		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1827		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1828		    &zbc);
1829	}
1830}
1831
1832/*
1833 * We keep track of whether or not there were any injected errors, so that
1834 * any ereports we generate can note it.
1835 */
1836static int
1837raidz_checksum_verify(zio_t *zio)
1838{
1839	zio_bad_cksum_t zbc;
1840	raidz_map_t *rm = zio->io_vsd;
1841
1842	int ret = zio_checksum_error(zio, &zbc);
1843	if (ret != 0 && zbc.zbc_injected != 0)
1844		rm->rm_ecksuminjected = 1;
1845
1846	return (ret);
1847}
1848
1849/*
1850 * Generate the parity from the data columns. If we tried and were able to
1851 * read the parity without error, verify that the generated parity matches the
1852 * data we read. If it doesn't, we fire off a checksum error. Return the
1853 * number such failures.
1854 */
1855static int
1856raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1857{
1858	void *orig[VDEV_RAIDZ_MAXPARITY];
1859	int c, ret = 0;
1860	raidz_col_t *rc;
1861
1862	blkptr_t *bp = zio->io_bp;
1863	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1864	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1865
1866	if (checksum == ZIO_CHECKSUM_NOPARITY)
1867		return (ret);
1868
1869	for (c = 0; c < rm->rm_firstdatacol; c++) {
1870		rc = &rm->rm_col[c];
1871		if (!rc->rc_tried || rc->rc_error != 0)
1872			continue;
1873		orig[c] = zio_buf_alloc(rc->rc_size);
1874		bcopy(rc->rc_data, orig[c], rc->rc_size);
1875	}
1876
1877	vdev_raidz_generate_parity(rm);
1878
1879	for (c = 0; c < rm->rm_firstdatacol; c++) {
1880		rc = &rm->rm_col[c];
1881		if (!rc->rc_tried || rc->rc_error != 0)
1882			continue;
1883		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1884			raidz_checksum_error(zio, rc, orig[c]);
1885			rc->rc_error = SET_ERROR(ECKSUM);
1886			ret++;
1887		}
1888		zio_buf_free(orig[c], rc->rc_size);
1889	}
1890
1891	return (ret);
1892}
1893
/*
 * Keep statistics on all the ways that we used parity to correct data.
 *
 * The array holds one counter for each value below 1 << VDEV_RAIDZ_MAXPARITY.
 * NOTE(review): presumably indexed by the parity-usage code returned from
 * vdev_raidz_reconstruct() -- confirm against the code that updates it.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1898
1899static int
1900vdev_raidz_worst_error(raidz_map_t *rm)
1901{
1902	int error = 0;
1903
1904	for (int c = 0; c < rm->rm_cols; c++)
1905		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1906
1907	return (error);
1908}
1909
1910/*
1911 * Iterate over all combinations of bad data and attempt a reconstruction.
1912 * Note that the algorithm below is non-optimal because it doesn't take into
1913 * account how reconstruction is actually performed. For example, with
1914 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1915 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1916 * cases we'd only use parity information in column 0.
1917 */
1918static int
1919vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1920{
1921	raidz_map_t *rm = zio->io_vsd;
1922	raidz_col_t *rc;
1923	void *orig[VDEV_RAIDZ_MAXPARITY];
1924	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1925	int *tgts = &tstore[1];
1926	int current, next, i, c, n;
1927	int code, ret = 0;
1928
1929	ASSERT(total_errors < rm->rm_firstdatacol);
1930
1931	/*
1932	 * This simplifies one edge condition.
1933	 */
1934	tgts[-1] = -1;
1935
1936	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1937		/*
1938		 * Initialize the targets array by finding the first n columns
1939		 * that contain no error.
1940		 *
1941		 * If there were no data errors, we need to ensure that we're
1942		 * always explicitly attempting to reconstruct at least one
1943		 * data column. To do this, we simply push the highest target
1944		 * up into the data columns.
1945		 */
1946		for (c = 0, i = 0; i < n; i++) {
1947			if (i == n - 1 && data_errors == 0 &&
1948			    c < rm->rm_firstdatacol) {
1949				c = rm->rm_firstdatacol;
1950			}
1951
1952			while (rm->rm_col[c].rc_error != 0) {
1953				c++;
1954				ASSERT3S(c, <, rm->rm_cols);
1955			}
1956
1957			tgts[i] = c++;
1958		}
1959
1960		/*
1961		 * Setting tgts[n] simplifies the other edge condition.
1962		 */
1963		tgts[n] = rm->rm_cols;
1964
1965		/*
1966		 * These buffers were allocated in previous iterations.
1967		 */
1968		for (i = 0; i < n - 1; i++) {
1969			ASSERT(orig[i] != NULL);
1970		}
1971
1972		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1973
1974		current = 0;
1975		next = tgts[current];
1976
1977		while (current != n) {
1978			tgts[current] = next;
1979			current = 0;
1980
1981			/*
1982			 * Save off the original data that we're going to
1983			 * attempt to reconstruct.
1984			 */
1985			for (i = 0; i < n; i++) {
1986				ASSERT(orig[i] != NULL);
1987				c = tgts[i];
1988				ASSERT3S(c, >=, 0);
1989				ASSERT3S(c, <, rm->rm_cols);
1990				rc = &rm->rm_col[c];
1991				bcopy(rc->rc_data, orig[i], rc->rc_size);
1992			}
1993
1994			/*
1995			 * Attempt a reconstruction and exit the outer loop on
1996			 * success.
1997			 */
1998			code = vdev_raidz_reconstruct(rm, tgts, n);
1999			if (raidz_checksum_verify(zio) == 0) {
2000				atomic_inc_64(&raidz_corrected[code]);
2001
2002				for (i = 0; i < n; i++) {
2003					c = tgts[i];
2004					rc = &rm->rm_col[c];
2005					ASSERT(rc->rc_error == 0);
2006					if (rc->rc_tried)
2007						raidz_checksum_error(zio, rc,
2008						    orig[i]);
2009					rc->rc_error = SET_ERROR(ECKSUM);
2010				}
2011
2012				ret = code;
2013				goto done;
2014			}
2015
2016			/*
2017			 * Restore the original data.
2018			 */
2019			for (i = 0; i < n; i++) {
2020				c = tgts[i];
2021				rc = &rm->rm_col[c];
2022				bcopy(orig[i], rc->rc_data, rc->rc_size);
2023			}
2024
2025			do {
2026				/*
2027				 * Find the next valid column after the current
2028				 * position..
2029				 */
2030				for (next = tgts[current] + 1;
2031				    next < rm->rm_cols &&
2032				    rm->rm_col[next].rc_error != 0; next++)
2033					continue;
2034
2035				ASSERT(next <= tgts[current + 1]);
2036
2037				/*
2038				 * If that spot is available, we're done here.
2039				 */
2040				if (next != tgts[current + 1])
2041					break;
2042
2043				/*
2044				 * Otherwise, find the next valid column after
2045				 * the previous position.
2046				 */
2047				for (c = tgts[current - 1] + 1;
2048				    rm->rm_col[c].rc_error != 0; c++)
2049					continue;
2050
2051				tgts[current] = c;
2052				current++;
2053
2054			} while (current != n);
2055		}
2056	}
2057	n--;
2058done:
2059	for (i = 0; i < n; i++) {
2060		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2061	}
2062
2063	return (ret);
2064}
2065
2066/*
2067 * Complete an IO operation on a RAIDZ VDev
2068 *
2069 * Outline:
2070 * - For write operations:
2071 *   1. Check for errors on the child IOs.
2072 *   2. Return, setting an error code if too few child VDevs were written
2073 *      to reconstruct the data later.  Note that partial writes are
2074 *      considered successful if they can be reconstructed at all.
2075 * - For read operations:
2076 *   1. Check for errors on the child IOs.
2077 *   2. If data errors occurred:
2078 *      a. Try to reassemble the data from the parity available.
2079 *      b. If we haven't yet read the parity drives, read them now.
2080 *      c. If all parity drives have been read but the data still doesn't
2081 *         reassemble with a correct checksum, then try combinatorial
2082 *         reconstruction.
2083 *      d. If that doesn't work, return an error.
2084 *   3. If there were unexpected errors or this is a resilver operation,
2085 *      rewrite the vdevs that had errors.
2086 */
2087static void
2088vdev_raidz_io_done(zio_t *zio)
2089{
2090	vdev_t *vd = zio->io_vd;
2091	vdev_t *cvd;
2092	raidz_map_t *rm = zio->io_vsd;
2093	raidz_col_t *rc;
2094	int unexpected_errors = 0;
2095	int parity_errors = 0;
2096	int parity_untried = 0;
2097	int data_errors = 0;
2098	int total_errors = 0;
2099	int n, c;
2100	int tgts[VDEV_RAIDZ_MAXPARITY];
2101	int code;
2102
2103	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2104
2105	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2106	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2107
2108	for (c = 0; c < rm->rm_cols; c++) {
2109		rc = &rm->rm_col[c];
2110
2111		if (rc->rc_error) {
2112			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2113
2114			if (c < rm->rm_firstdatacol)
2115				parity_errors++;
2116			else
2117				data_errors++;
2118
2119			if (!rc->rc_skipped)
2120				unexpected_errors++;
2121
2122			total_errors++;
2123		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2124			parity_untried++;
2125		}
2126	}
2127
2128	if (zio->io_type == ZIO_TYPE_WRITE) {
2129		/*
2130		 * XXX -- for now, treat partial writes as a success.
2131		 * (If we couldn't write enough columns to reconstruct
2132		 * the data, the I/O failed.  Otherwise, good enough.)
2133		 *
2134		 * Now that we support write reallocation, it would be better
2135		 * to treat partial failure as real failure unless there are
2136		 * no non-degraded top-level vdevs left, and not update DTLs
2137		 * if we intend to reallocate.
2138		 */
2139		/* XXPOLICY */
2140		if (total_errors > rm->rm_firstdatacol)
2141			zio->io_error = vdev_raidz_worst_error(rm);
2142
2143		return;
2144	}
2145
2146	ASSERT(zio->io_type == ZIO_TYPE_READ);
2147	/*
2148	 * There are three potential phases for a read:
2149	 *	1. produce valid data from the columns read
2150	 *	2. read all disks and try again
2151	 *	3. perform combinatorial reconstruction
2152	 *
2153	 * Each phase is progressively both more expensive and less likely to
2154	 * occur. If we encounter more errors than we can repair or all phases
2155	 * fail, we have no choice but to return an error.
2156	 */
2157
2158	/*
2159	 * If the number of errors we saw was correctable -- less than or equal
2160	 * to the number of parity disks read -- attempt to produce data that
2161	 * has a valid checksum. Naturally, this case applies in the absence of
2162	 * any errors.
2163	 */
2164	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2165		if (data_errors == 0) {
2166			if (raidz_checksum_verify(zio) == 0) {
2167				/*
2168				 * If we read parity information (unnecessarily
2169				 * as it happens since no reconstruction was
2170				 * needed) regenerate and verify the parity.
2171				 * We also regenerate parity when resilvering
2172				 * so we can write it out to the failed device
2173				 * later.
2174				 */
2175				if (parity_errors + parity_untried <
2176				    rm->rm_firstdatacol ||
2177				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2178					n = raidz_parity_verify(zio, rm);
2179					unexpected_errors += n;
2180					ASSERT(parity_errors + n <=
2181					    rm->rm_firstdatacol);
2182				}
2183				goto done;
2184			}
2185		} else {
2186			/*
2187			 * We either attempt to read all the parity columns or
2188			 * none of them. If we didn't try to read parity, we
2189			 * wouldn't be here in the correctable case. There must
2190			 * also have been fewer parity errors than parity
2191			 * columns or, again, we wouldn't be in this code path.
2192			 */
2193			ASSERT(parity_untried == 0);
2194			ASSERT(parity_errors < rm->rm_firstdatacol);
2195
2196			/*
2197			 * Identify the data columns that reported an error.
2198			 */
2199			n = 0;
2200			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2201				rc = &rm->rm_col[c];
2202				if (rc->rc_error != 0) {
2203					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2204					tgts[n++] = c;
2205				}
2206			}
2207
2208			ASSERT(rm->rm_firstdatacol >= n);
2209
2210			code = vdev_raidz_reconstruct(rm, tgts, n);
2211
2212			if (raidz_checksum_verify(zio) == 0) {
2213				atomic_inc_64(&raidz_corrected[code]);
2214
2215				/*
2216				 * If we read more parity disks than were used
2217				 * for reconstruction, confirm that the other
2218				 * parity disks produced correct data. This
2219				 * routine is suboptimal in that it regenerates
2220				 * the parity that we already used in addition
2221				 * to the parity that we're attempting to
2222				 * verify, but this should be a relatively
2223				 * uncommon case, and can be optimized if it
2224				 * becomes a problem. Note that we regenerate
2225				 * parity when resilvering so we can write it
2226				 * out to failed devices later.
2227				 */
2228				if (parity_errors < rm->rm_firstdatacol - n ||
2229				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2230					n = raidz_parity_verify(zio, rm);
2231					unexpected_errors += n;
2232					ASSERT(parity_errors + n <=
2233					    rm->rm_firstdatacol);
2234				}
2235
2236				goto done;
2237			}
2238		}
2239	}
2240
2241	/*
2242	 * This isn't a typical situation -- either we got a read error or
2243	 * a child silently returned bad data. Read every block so we can
2244	 * try again with as much data and parity as we can track down. If
2245	 * we've already been through once before, all children will be marked
2246	 * as tried so we'll proceed to combinatorial reconstruction.
2247	 */
2248	unexpected_errors = 1;
2249	rm->rm_missingdata = 0;
2250	rm->rm_missingparity = 0;
2251
2252	for (c = 0; c < rm->rm_cols; c++) {
2253		if (rm->rm_col[c].rc_tried)
2254			continue;
2255
2256		zio_vdev_io_redone(zio);
2257		do {
2258			rc = &rm->rm_col[c];
2259			if (rc->rc_tried)
2260				continue;
2261			zio_nowait(zio_vdev_child_io(zio, NULL,
2262			    vd->vdev_child[rc->rc_devidx],
2263			    rc->rc_offset, rc->rc_data, rc->rc_size,
2264			    zio->io_type, zio->io_priority, 0,
2265			    vdev_raidz_child_done, rc));
2266		} while (++c < rm->rm_cols);
2267
2268		return;
2269	}
2270
2271	/*
2272	 * At this point we've attempted to reconstruct the data given the
2273	 * errors we detected, and we've attempted to read all columns. There
2274	 * must, therefore, be one or more additional problems -- silent errors
2275	 * resulting in invalid data rather than explicit I/O errors resulting
2276	 * in absent data. We check if there is enough additional data to
2277	 * possibly reconstruct the data and then perform combinatorial
2278	 * reconstruction over all possible combinations. If that fails,
2279	 * we're cooked.
2280	 */
2281	if (total_errors > rm->rm_firstdatacol) {
2282		zio->io_error = vdev_raidz_worst_error(rm);
2283
2284	} else if (total_errors < rm->rm_firstdatacol &&
2285	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2286		/*
2287		 * If we didn't use all the available parity for the
2288		 * combinatorial reconstruction, verify that the remaining
2289		 * parity is correct.
2290		 */
2291		if (code != (1 << rm->rm_firstdatacol) - 1)
2292			(void) raidz_parity_verify(zio, rm);
2293	} else {
2294		/*
2295		 * We're here because either:
2296		 *
2297		 *	total_errors == rm_first_datacol, or
2298		 *	vdev_raidz_combrec() failed
2299		 *
2300		 * In either case, there is enough bad data to prevent
2301		 * reconstruction.
2302		 *
2303		 * Start checksum ereports for all children which haven't
2304		 * failed, and the IO wasn't speculative.
2305		 */
2306		zio->io_error = SET_ERROR(ECKSUM);
2307
2308		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2309			for (c = 0; c < rm->rm_cols; c++) {
2310				rc = &rm->rm_col[c];
2311				if (rc->rc_error == 0) {
2312					zio_bad_cksum_t zbc;
2313					zbc.zbc_has_cksum = 0;
2314					zbc.zbc_injected =
2315					    rm->rm_ecksuminjected;
2316
2317					zfs_ereport_start_checksum(
2318					    zio->io_spa,
2319					    vd->vdev_child[rc->rc_devidx],
2320					    zio, rc->rc_offset, rc->rc_size,
2321					    (void *)(uintptr_t)c, &zbc);
2322				}
2323			}
2324		}
2325	}
2326
2327done:
2328	zio_checksum_verified(zio);
2329
2330	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2331	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2332		/*
2333		 * Use the good data we have in hand to repair damaged children.
2334		 */
2335		for (c = 0; c < rm->rm_cols; c++) {
2336			rc = &rm->rm_col[c];
2337			cvd = vd->vdev_child[rc->rc_devidx];
2338
2339			if (rc->rc_error == 0)
2340				continue;
2341
2342			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2343			    rc->rc_offset, rc->rc_data, rc->rc_size,
2344			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2345			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2346			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2347		}
2348	}
2349}
2350
2351static void
2352vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2353{
2354	if (faulted > vd->vdev_nparity)
2355		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2356		    VDEV_AUX_NO_REPLICAS);
2357	else if (degraded + faulted != 0)
2358		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2359	else
2360		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2361}
2362
/*
 * Operations vector for the RAID-Z vdev type. The initializer is
 * positional, so the entries must stay in the order in which vdev_ops_t
 * declares its members.
 *
 * NOTE(review): vdev_ops_t is declared outside this file; confirm which
 * operations the two NULL slots correspond to before changing them.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* operation not provided by RAID-Z */
	NULL,			/* operation not provided by RAID-Z */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2375