1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/spa.h>
31#include <sys/vdev_impl.h>
32#include <sys/vdev_disk.h>
33#include <sys/vdev_file.h>
34#include <sys/vdev_raidz.h>
35#include <sys/zio.h>
36#include <sys/zio_checksum.h>
37#include <sys/fs/zfs.h>
38#include <sys/fm/fs/zfs.h>
39
40/*
41 * Virtual device vector for RAID-Z.
42 *
43 * This vdev supports single, double, and triple parity. For single parity,
44 * we use a simple XOR of all the data columns. For double or triple parity,
45 * we use a special case of Reed-Solomon coding. This extends the
46 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
47 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
48 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
49 * former is also based. The latter is designed to provide higher performance
50 * for writes.
51 *
52 * Note that the Plank paper claimed to support arbitrary N+M, but was then
53 * amended six years later identifying a critical flaw that invalidates its
54 * claims. Nevertheless, the technique can be adapted to work for up to
55 * triple parity. For additional parity, the amendment "Note: Correction to
56 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
57 * is viable, but the additional complexity means that write performance will
58 * suffer.
59 *
60 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
62 * can be expressed with a single byte. Briefly, the operations on the
63 * field are defined as follows:
64 *
65 *   o addition (+) is represented by a bitwise XOR
66 *   o subtraction (-) is therefore identical to addition: A + B = A - B
67 *   o multiplication of A by 2 is defined by the following bitwise expression:
68 *
69 *	(A * 2)_7 = A_6
70 *	(A * 2)_6 = A_5
71 *	(A * 2)_5 = A_4
72 *	(A * 2)_4 = A_3 + A_7
73 *	(A * 2)_3 = A_2 + A_7
74 *	(A * 2)_2 = A_1 + A_7
75 *	(A * 2)_1 = A_0
76 *	(A * 2)_0 = A_7
77 *
78 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
79 * As an aside, this multiplication is derived from the error correcting
80 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
81 *
82 * Observe that any number in the field (except for 0) can be expressed as a
83 * power of 2 -- a generator for the field. We store a table of the powers of
84 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
85 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
86 * than field addition). The inverse of a field element A (A^-1) is therefore
87 * A ^ (255 - 1) = A^254.
88 *
89 * The up-to-three parity columns, P, Q, R over several data columns,
90 * D_0, ... D_n-1, can be expressed by field operations:
91 *
92 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
93 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
94 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
95 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
96 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
97 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
99 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
100 * independent coefficients. (There are no additional coefficients that have
101 * this property which is why the uncorrected Plank method breaks down.)
102 *
 * See the reconstruction code below for how P, Q and R can be used individually
104 * or in concert to recover missing data columns.
105 */
106
/*
 * Per-column state: one entry for each child-vdev I/O that makes up a
 * single logical RAID-Z I/O (both parity and data columns).
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
117
/*
 * Mapping of one logical I/O onto the columns of a RAID-Z vdev.  Parity
 * columns occupy rm_col[0 .. rm_firstdatacol - 1]; data columns follow.
 * Built by vdev_raidz_map_alloc(), freed by vdev_raidz_map_free().
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
134
/* Indexes of the parity columns at the front of rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Byte-wise multiplication by 2 and by 4 in GF(2^8); see the derivation
 * in the block comment at the top of this file (0x1d is the low byte of
 * the primitive polynomial).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
141
142/*
143 * We provide a mechanism to perform the field multiplication operation on a
144 * 64-bit value all at once rather than a byte at a time. This works by
145 * creating a mask from the top bit in each byte and using that to
146 * conditionally apply the XOR of 0x1d.
147 */
148#define	VDEV_RAIDZ_64MUL_2(x, mask) \
149{ \
150	(mask) = (x) & 0x8080808080808080ULL; \
151	(mask) = ((mask) << 1) - ((mask) >> 7); \
152	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
153	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
154}
155
156#define	VDEV_RAIDZ_64MUL_4(x, mask) \
157{ \
158	VDEV_RAIDZ_64MUL_2((x), mask); \
159	VDEV_RAIDZ_64MUL_2((x), mask); \
160}
161
/* Translate an offset so it lands past the front vdev labels. */
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
163
164/*
165 * Force reconstruction to use the general purpose method.
166 */
167int vdev_raidz_default_to_general;
168
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] = 2^i.
 * Note that entry 255 wraps back to 1 (the multiplicative group has
 * order 255), which vdev_raidz_exp2() relies on when reducing exponents.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above: vdev_raidz_log2[2^i] = i.
 * Entry 0 is meaningless (zero has no logarithm); callers must special-case
 * zero first, as vdev_raidz_exp2() does.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
239
/* Forward declaration: needed by vdev_raidz_cksum_finish() below. */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
241
242/*
243 * Multiply a given number by 2 raised to the given power.
244 */
245static uint8_t
246vdev_raidz_exp2(uint_t a, int exp)
247{
248	if (a == 0)
249		return (0);
250
251	ASSERT(exp >= 0);
252	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
253
254	exp += vdev_raidz_log2[a];
255	if (exp > 255)
256		exp -= 255;
257
258	return (vdev_raidz_pow2[exp]);
259}
260
261static void
262vdev_raidz_map_free(raidz_map_t *rm)
263{
264	int c;
265	size_t size;
266
267	for (c = 0; c < rm->rm_firstdatacol; c++) {
268		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
269
270		if (rm->rm_col[c].rc_gdata != NULL)
271			zio_buf_free(rm->rm_col[c].rc_gdata,
272			    rm->rm_col[c].rc_size);
273	}
274
275	size = 0;
276	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
277		size += rm->rm_col[c].rc_size;
278
279	if (rm->rm_datacopy != NULL)
280		zio_buf_free(rm->rm_datacopy, size);
281
282	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
283}
284
285static void
286vdev_raidz_map_free_vsd(zio_t *zio)
287{
288	raidz_map_t *rm = zio->io_vsd;
289
290	ASSERT0(rm->rm_freed);
291	rm->rm_freed = 1;
292
293	if (rm->rm_reports == 0)
294		vdev_raidz_map_free(rm);
295}
296
297/*ARGSUSED*/
298static void
299vdev_raidz_cksum_free(void *arg, size_t ignored)
300{
301	raidz_map_t *rm = arg;
302
303	ASSERT3U(rm->rm_reports, >, 0);
304
305	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
306		vdev_raidz_map_free(rm);
307}
308
/*
 * Finish a deferred checksum ereport for column c (zcr->zcr_cbinfo) of the
 * map: compare the data we actually read ("bad") against the known-good
 * block supplied by the caller.  For a parity column the good version is
 * regenerated (once per map) from the good data; for a data column the
 * good version is located inside good_data by offset.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		/* Nothing to compare against; finish without buffers. */
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/*
			 * Point the data columns back at rm_datacopy, the
			 * private copy made by vdev_raidz_cksum_report().
			 */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
381
382/*
383 * Invoked indirectly by zfs_ereport_start_checksum(), called
384 * below when our read operation fails completely.  The main point
385 * is to keep a copy of everything we read from disk, so that at
386 * vdev_raidz_cksum_finish() time we can compare it with the good data.
387 */
388static void
389vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
390{
391	size_t c = (size_t)(uintptr_t)arg;
392	caddr_t buf;
393
394	raidz_map_t *rm = zio->io_vsd;
395	size_t size;
396
397	/* set up the report and bump the refcount  */
398	zcr->zcr_cbdata = rm;
399	zcr->zcr_cbinfo = c;
400	zcr->zcr_finish = vdev_raidz_cksum_finish;
401	zcr->zcr_free = vdev_raidz_cksum_free;
402
403	rm->rm_reports++;
404	ASSERT3U(rm->rm_reports, >, 0);
405
406	if (rm->rm_datacopy != NULL)
407		return;
408
409	/*
410	 * It's the first time we're called for this raidz_map_t, so we need
411	 * to copy the data aside; there's no guarantee that our zio's buffer
412	 * won't be re-used for something else.
413	 *
414	 * Our parity data is already in separate buffers, so there's no need
415	 * to copy them.
416	 */
417
418	size = 0;
419	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
420		size += rm->rm_col[c].rc_size;
421
422	buf = rm->rm_datacopy = zio_buf_alloc(size);
423
424	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
425		raidz_col_t *col = &rm->rm_col[c];
426
427		bcopy(col->rc_data, buf, col->rc_size);
428		col->rc_data = buf;
429
430		buf += col->rc_size;
431	}
432	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
433}
434
/*
 * Vdev-specific-data callbacks for RAID-Z zios: free the map, and capture
 * read data for deferred checksum ereports.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
439
440/*
441 * Divides the IO evenly across all child vdevs; usually, dcols is
442 * the number of children in the target vdev.
443 */
444static raidz_map_t *
445vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
446    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
447{
448	raidz_map_t *rm;
449	/* The starting RAIDZ (parent) vdev sector of the block. */
450	uint64_t b = offset >> unit_shift;
451	/* The zio's size in units of the vdev's minimum sector size. */
452	uint64_t s = size >> unit_shift;
453	/* The first column for this stripe. */
454	uint64_t f = b % dcols;
455	/* The starting byte offset on each child vdev. */
456	uint64_t o = (b / dcols) << unit_shift;
457	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
458
459	/*
460	 * "Quotient": The number of data sectors for this stripe on all but
461	 * the "big column" child vdevs that also contain "remainder" data.
462	 */
463	q = s / (dcols - nparity);
464
465	/*
466	 * "Remainder": The number of partial stripe data sectors in this I/O.
467	 * This will add a sector to some, but not all, child vdevs.
468	 */
469	r = s - q * (dcols - nparity);
470
471	/* The number of "big columns" - those which contain remainder data. */
472	bc = (r == 0 ? 0 : r + nparity);
473
474	/*
475	 * The total number of data and parity sectors associated with
476	 * this I/O.
477	 */
478	tot = s + nparity * (q + (r == 0 ? 0 : 1));
479
480	/* acols: The columns that will be accessed. */
481	/* scols: The columns that will be accessed or skipped. */
482	if (q == 0) {
483		/* Our I/O request doesn't span all child vdevs. */
484		acols = bc;
485		scols = MIN(dcols, roundup(bc, nparity + 1));
486	} else {
487		acols = dcols;
488		scols = dcols;
489	}
490
491	ASSERT3U(acols, <=, scols);
492
493	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
494
495	rm->rm_cols = acols;
496	rm->rm_scols = scols;
497	rm->rm_bigcols = bc;
498	rm->rm_skipstart = bc;
499	rm->rm_missingdata = 0;
500	rm->rm_missingparity = 0;
501	rm->rm_firstdatacol = nparity;
502	rm->rm_datacopy = NULL;
503	rm->rm_reports = 0;
504	rm->rm_freed = 0;
505	rm->rm_ecksuminjected = 0;
506
507	asize = 0;
508
509	for (c = 0; c < scols; c++) {
510		col = f + c;
511		coff = o;
512		if (col >= dcols) {
513			col -= dcols;
514			coff += 1ULL << unit_shift;
515		}
516		rm->rm_col[c].rc_devidx = col;
517		rm->rm_col[c].rc_offset = coff;
518		rm->rm_col[c].rc_data = NULL;
519		rm->rm_col[c].rc_gdata = NULL;
520		rm->rm_col[c].rc_error = 0;
521		rm->rm_col[c].rc_tried = 0;
522		rm->rm_col[c].rc_skipped = 0;
523
524		if (c >= acols)
525			rm->rm_col[c].rc_size = 0;
526		else if (c < bc)
527			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
528		else
529			rm->rm_col[c].rc_size = q << unit_shift;
530
531		asize += rm->rm_col[c].rc_size;
532	}
533
534	ASSERT3U(asize, ==, tot << unit_shift);
535	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
536	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
537	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
538	ASSERT3U(rm->rm_nskip, <=, nparity);
539
540	for (c = 0; c < rm->rm_firstdatacol; c++)
541		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
542
543	rm->rm_col[c].rc_data = data;
544
545	for (c = c + 1; c < acols; c++)
546		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
547		    rm->rm_col[c - 1].rc_size;
548
549	/*
550	 * If all data stored spans all columns, there's a danger that parity
551	 * will always be on the same device and, since parity isn't read
552	 * during normal operation, that that device's I/O bandwidth won't be
553	 * used effectively. We therefore switch the parity every 1MB.
554	 *
555	 * ... at least that was, ostensibly, the theory. As a practical
556	 * matter unless we juggle the parity between all devices evenly, we
557	 * won't see any benefit. Further, occasional writes that aren't a
558	 * multiple of the LCM of the number of children and the minimum
559	 * stripe width are sufficient to avoid pessimal behavior.
560	 * Unfortunately, this decision created an implicit on-disk format
561	 * requirement that we need to support for all eternity, but only
562	 * for single-parity RAID-Z.
563	 *
564	 * If we intend to skip a sector in the zeroth column for padding
565	 * we must make sure to note this swap. We will never intend to
566	 * skip the first column since at least one data and one parity
567	 * column must appear in each row.
568	 */
569	ASSERT(rm->rm_cols >= 2);
570	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
571
572	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
573		devidx = rm->rm_col[0].rc_devidx;
574		o = rm->rm_col[0].rc_offset;
575		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
576		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
577		rm->rm_col[1].rc_devidx = devidx;
578		rm->rm_col[1].rc_offset = o;
579
580		if (rm->rm_skipstart == 0)
581			rm->rm_skipstart = 1;
582	}
583
584	return (rm);
585}
586
587static void
588vdev_raidz_generate_parity_p(raidz_map_t *rm)
589{
590	uint64_t *p, *src, pcount, ccount, i;
591	int c;
592
593	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
594
595	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
596		src = rm->rm_col[c].rc_data;
597		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
598		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
599
600		if (c == rm->rm_firstdatacol) {
601			ASSERT(ccount == pcount);
602			for (i = 0; i < ccount; i++, src++, p++) {
603				*p = *src;
604			}
605		} else {
606			ASSERT(ccount <= pcount);
607			for (i = 0; i < ccount; i++, src++, p++) {
608				*p ^= *src;
609			}
610		}
611	}
612}
613
614static void
615vdev_raidz_generate_parity_pq(raidz_map_t *rm)
616{
617	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
618	int c;
619
620	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
621	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
622	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
623
624	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
625		src = rm->rm_col[c].rc_data;
626		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
627		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
628
629		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
630
631		if (c == rm->rm_firstdatacol) {
632			ASSERT(ccnt == pcnt || ccnt == 0);
633			for (i = 0; i < ccnt; i++, src++, p++, q++) {
634				*p = *src;
635				*q = *src;
636			}
637			for (; i < pcnt; i++, src++, p++, q++) {
638				*p = 0;
639				*q = 0;
640			}
641		} else {
642			ASSERT(ccnt <= pcnt);
643
644			/*
645			 * Apply the algorithm described above by multiplying
646			 * the previous result and adding in the new value.
647			 */
648			for (i = 0; i < ccnt; i++, src++, p++, q++) {
649				*p ^= *src;
650
651				VDEV_RAIDZ_64MUL_2(*q, mask);
652				*q ^= *src;
653			}
654
655			/*
656			 * Treat short columns as though they are full of 0s.
657			 * Note that there's therefore nothing needed for P.
658			 */
659			for (; i < pcnt; i++, q++) {
660				VDEV_RAIDZ_64MUL_2(*q, mask);
661			}
662		}
663	}
664}
665
666static void
667vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
668{
669	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
670	int c;
671
672	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
673	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
674	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
675	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
676	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
677
678	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
679		src = rm->rm_col[c].rc_data;
680		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
681		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
682		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
683
684		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
685
686		if (c == rm->rm_firstdatacol) {
687			ASSERT(ccnt == pcnt || ccnt == 0);
688			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
689				*p = *src;
690				*q = *src;
691				*r = *src;
692			}
693			for (; i < pcnt; i++, src++, p++, q++, r++) {
694				*p = 0;
695				*q = 0;
696				*r = 0;
697			}
698		} else {
699			ASSERT(ccnt <= pcnt);
700
701			/*
702			 * Apply the algorithm described above by multiplying
703			 * the previous result and adding in the new value.
704			 */
705			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
706				*p ^= *src;
707
708				VDEV_RAIDZ_64MUL_2(*q, mask);
709				*q ^= *src;
710
711				VDEV_RAIDZ_64MUL_4(*r, mask);
712				*r ^= *src;
713			}
714
715			/*
716			 * Treat short columns as though they are full of 0s.
717			 * Note that there's therefore nothing needed for P.
718			 */
719			for (; i < pcnt; i++, q++, r++) {
720				VDEV_RAIDZ_64MUL_2(*q, mask);
721				VDEV_RAIDZ_64MUL_4(*r, mask);
722			}
723		}
724	}
725}
726
727/*
728 * Generate RAID parity in the first virtual columns according to the number of
729 * parity columns available.
730 */
731static void
732vdev_raidz_generate_parity(raidz_map_t *rm)
733{
734	switch (rm->rm_firstdatacol) {
735	case 1:
736		vdev_raidz_generate_parity_p(rm);
737		break;
738	case 2:
739		vdev_raidz_generate_parity_pq(rm);
740		break;
741	case 3:
742		vdev_raidz_generate_parity_pqr(rm);
743		break;
744	default:
745		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
746	}
747}
748
749static int
750vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
751{
752	uint64_t *dst, *src, xcount, ccount, count, i;
753	int x = tgts[0];
754	int c;
755
756	ASSERT(ntgts == 1);
757	ASSERT(x >= rm->rm_firstdatacol);
758	ASSERT(x < rm->rm_cols);
759
760	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
761	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
762	ASSERT(xcount > 0);
763
764	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
765	dst = rm->rm_col[x].rc_data;
766	for (i = 0; i < xcount; i++, dst++, src++) {
767		*dst = *src;
768	}
769
770	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
771		src = rm->rm_col[c].rc_data;
772		dst = rm->rm_col[x].rc_data;
773
774		if (c == x)
775			continue;
776
777		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
778		count = MIN(ccount, xcount);
779
780		for (i = 0; i < count; i++, dst++, src++) {
781			*dst ^= *src;
782		}
783	}
784
785	return (1 << VDEV_RAIDZ_P);
786}
787
/*
 * Reconstruct a single missing data column (tgts[0]) using the Q parity.
 * We recompute the Q syndrome over the surviving data columns (treating
 * the missing one as zero), XOR in the stored Q, and then divide out the
 * missing column's coefficient to recover its contents.  Returns a bitmask
 * identifying the parity columns used (Q only).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/*
	 * Accumulate the partial syndrome into the target column itself,
	 * using Horner's rule as in vdev_raidz_generate_parity_pq().
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * XOR in the stored Q, then multiply each byte by
	 * 2^(255 - (rm_cols - 1 - x)) -- the inverse of the missing
	 * column's coefficient 2^(rm_cols - 1 - x), since the
	 * multiplicative group has order 255.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
845
/*
 * Reconstruct two missing data columns (tgts[0] < tgts[1]) using both the
 * P and Q parity.  The stored parity is temporarily swapped aside so that
 * the normal parity generator can compute Pxy/Qxy (parity as if both
 * missing columns were zero); the two unknowns are then solved for
 * directly in GF(2^8).  Returns a bitmask identifying the parity columns
 * used (P and Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	/* Restore the true column sizes before solving. */
	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* Coefficients in the log domain; tmp is the log of the inverse. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
936
937/* BEGIN CSTYLED */
938/*
939 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
941 * the contents of the data and parity disks. This can be expressed with
942 * vectors for the original data (D) and the actual data (d) and parity (p)
943 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
944 *
945 *            __   __                     __     __
946 *            |     |         __     __   |  p_0  |
947 *            |  V  |         |  D_0  |   | p_m-1 |
948 *            |     |    x    |   :   | = |  d_0  |
949 *            |  I  |         | D_n-1 |   |   :   |
950 *            |     |         ~~     ~~   | d_n-1 |
951 *            ~~   ~~                     ~~     ~~
952 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
955 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
956 * computation as well as linear separability.
957 *
958 *      __               __               __     __
959 *      |   1   ..  1 1 1 |               |  p_0  |
960 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
961 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
962 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
963 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
964 *      |   :       : : : |   |   :   |   |  d_2  |
965 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
966 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
967 *      |   0   ..  0 0 1 |               | d_n-1 |
968 *      ~~               ~~               ~~     ~~
969 *
970 * Note that I, V, d, and p are known. To compute D, we must invert the
971 * matrix and use the known data and parity values to reconstruct the unknown
972 * data values. We begin by removing the rows in V|I and d|p that correspond
973 * to failed or missing columns; we then make V|I square (n x n) and d|p
974 * sized n by removing rows corresponding to unused parity from the bottom up
975 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
976 * using Gauss-Jordan elimination. In the example below we use m=3 parity
977 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
978 *           __                               __
979 *           |  1   1   1   1   1   1   1   1  |
980 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
981 *           |  19 205 116  29  64  16  4   1  |      / /
982 *           |  1   0   0   0   0   0   0   0  |     / /
983 *           |  0   1   0   0   0   0   0   0  | <--' /
984 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
985 *           |  0   0   0   1   0   0   0   0  |
986 *           |  0   0   0   0   1   0   0   0  |
987 *           |  0   0   0   0   0   1   0   0  |
988 *           |  0   0   0   0   0   0   1   0  |
989 *           |  0   0   0   0   0   0   0   1  |
990 *           ~~                               ~~
991 *           __                               __
992 *           |  1   1   1   1   1   1   1   1  |
993 *           |  19 205 116  29  64  16  4   1  |
994 *           |  1   0   0   0   0   0   0   0  |
995 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
996 *           |  0   0   0   0   1   0   0   0  |
997 *           |  0   0   0   0   0   1   0   0  |
998 *           |  0   0   0   0   0   0   1   0  |
999 *           |  0   0   0   0   0   0   0   1  |
1000 *           ~~                               ~~
1001 *
1002 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1003 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1004 * matrix is not singular.
1005 * __                                                                 __
1006 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1007 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1008 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1009 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1010 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1011 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1012 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1013 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1014 * ~~                                                                 ~~
1015 * __                                                                 __
1016 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1017 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1018 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1019 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1020 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1021 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1022 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1023 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1024 * ~~                                                                 ~~
1025 * __                                                                 __
1026 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1027 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1028 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1029 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1030 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1031 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1032 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1033 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1034 * ~~                                                                 ~~
1035 * __                                                                 __
1036 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1037 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1038 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1039 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1040 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1041 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1042 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1043 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1044 * ~~                                                                 ~~
1045 * __                                                                 __
1046 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1047 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1048 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1049 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1050 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1051 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1052 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1053 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1054 * ~~                                                                 ~~
1055 * __                                                                 __
1056 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1057 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1058 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1059 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1060 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1061 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1062 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1063 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1064 * ~~                                                                 ~~
1065 *                   __                               __
1066 *                   |  0   0   1   0   0   0   0   0  |
1067 *                   | 167 100  5   41 159 169 217 208 |
1068 *                   | 166 100  4   40 158 168 216 209 |
1069 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1070 *                   |  0   0   0   0   1   0   0   0  |
1071 *                   |  0   0   0   0   0   1   0   0  |
1072 *                   |  0   0   0   0   0   0   1   0  |
1073 *                   |  0   0   0   0   0   0   0   1  |
1074 *                   ~~                               ~~
1075 *
1076 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1077 * of the missing data.
1078 *
1079 * As is apparent from the example above, the only non-trivial rows in the
1080 * inverse matrix correspond to the data disks that we're trying to
1081 * reconstruct. Indeed, those are the only rows we need as the others would
1082 * only be useful for reconstructing data known or assumed to be valid. For
1083 * that reason, we only build the coefficients in the rows that correspond to
1084 * targeted columns.
1085 */
1086/* END CSTYLED */
1087
1088static void
1089vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1090    uint8_t **rows)
1091{
1092	int i, j;
1093	int pow;
1094
1095	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1096
1097	/*
1098	 * Fill in the missing rows of interest.
1099	 */
1100	for (i = 0; i < nmap; i++) {
1101		ASSERT3S(0, <=, map[i]);
1102		ASSERT3S(map[i], <=, 2);
1103
1104		pow = map[i] * n;
1105		if (pow > 255)
1106			pow -= 255;
1107		ASSERT(pow <= 255);
1108
1109		for (j = 0; j < n; j++) {
1110			pow -= map[i];
1111			if (pow < 0)
1112				pow += 255;
1113			rows[i][j] = vdev_raidz_pow2[pow];
1114		}
1115	}
1116}
1117
/*
 * Use Gauss-Jordan elimination to compute the rows of interest of the
 * inverse of the (V|I)' matrix (see the large block comment above). Only
 * the rows corresponding to missing data columns are computed, since the
 * rest of the inverse is trivial and unneeded for reconstruction.
 *
 *   n        - number of data columns in the map
 *   nmissing - number of missing data columns
 *   missing  - indices of the missing data columns (data-relative)
 *   rows     - in: parity coefficient rows; reduced in place to identity
 *   invrows  - out: the corresponding rows of the inverse matrix
 *   used     - the n surviving columns: parity columns first, then data
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence. In
	 * GF(2^8) subtraction is XOR, so moving a surviving data column's
	 * coefficient across to invrows accounts for its known value.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value. Multiplication uses the
		 * log/exp tables: the inverse of x is 2^(255 - log2(x)).
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this row's pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1211
/*
 * Compute the missing data: D = (V|I)'^-1 x (d|p)' (see the block comment
 * above). For each surviving column used[i], the byte at offset x
 * contributes invrows[j][i] * src[x] (GF(2^8) multiplication) into the
 * byte at offset x of missing column j; contributions from all surviving
 * columns are accumulated with XOR (GF(2^8) addition).
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Precompute the log of every inverse-matrix coefficient so the
	 * hot inner loop can multiply by simply adding logs mod 255.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	/* Carve the single allocation into nmissing rows of n entries. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate the contribution of each surviving column in turn. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		/* Locate the destination buffer for each missing column. */
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log2 of 0 is undefined; zero is special-cased. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter columns have no byte at offset x. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply by adding logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first surviving column initializes the
				 * destination; later ones XOR into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1287
/*
 * Reconstruct an arbitrary set of targeted columns using the general
 * matrix method described in the block comment above: choose which
 * surviving parity columns to use, build the relevant rows of the
 * dispersal matrix, invert them, and regenerate the missing data.
 * Returns a bitmask of the parity columns used (the reconstruction
 * "code").
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns. One non-targeted parity column is selected for each
	 * missing data column.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * A single allocation holds the coefficient rows, the inverse rows,
	 * and the used[] column list.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* used[] lists the surviving columns: parity first, then data. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * Note that i carries over from the loop above: the surviving data
	 * columns are appended to used[] after the parity columns, skipping
	 * over the missing ones.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1393
/*
 * Reconstruct the data for the given list of target columns (t, sorted
 * ascending) plus any columns already marked in error in the map.
 * Dispatches to the optimized P/Q/PQ routines when the needed parity is
 * valid (unless vdev_raidz_default_to_general forces the general path),
 * otherwise falls back to the general matrix method. Returns a bitmask
 * of the parity columns used for the reconstruction.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Walk every column, merging the caller's targets with any columns
	 * already in error, while counting down the valid parity and data
	 * columns from their respective totals.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] was filled in column order, so the bad parity columns come
	 * first; dt points at the bad data columns that follow them.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1472
1473static int
1474vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1475    uint64_t *ashift)
1476{
1477	vdev_t *cvd;
1478	uint64_t nparity = vd->vdev_nparity;
1479	int c;
1480	int lasterror = 0;
1481	int numerrors = 0;
1482
1483	ASSERT(nparity > 0);
1484
1485	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1486	    vd->vdev_children < nparity + 1) {
1487		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1488		return (SET_ERROR(EINVAL));
1489	}
1490
1491	vdev_open_children(vd);
1492
1493	for (c = 0; c < vd->vdev_children; c++) {
1494		cvd = vd->vdev_child[c];
1495
1496		if (cvd->vdev_open_error != 0) {
1497			lasterror = cvd->vdev_open_error;
1498			numerrors++;
1499			continue;
1500		}
1501
1502		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1503		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1504		*ashift = MAX(*ashift, cvd->vdev_ashift);
1505	}
1506
1507	*asize *= vd->vdev_children;
1508	*max_asize *= vd->vdev_children;
1509
1510	if (numerrors > nparity) {
1511		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1512		return (lasterror);
1513	}
1514
1515	return (0);
1516}
1517
1518static void
1519vdev_raidz_close(vdev_t *vd)
1520{
1521	int c;
1522
1523	for (c = 0; c < vd->vdev_children; c++)
1524		vdev_close(vd->vdev_child[c]);
1525}
1526
1527/*
1528 * Handle a read or write I/O to a RAID-Z dump device.
1529 *
1530 * The dump device is in a unique situation compared to other ZFS datasets:
1531 * writing to this device should be as simple and fast as possible.  In
1532 * addition, durability matters much less since the dump will be extracted
1533 * once the machine reboots.  For that reason, this function eschews parity for
1534 * performance and simplicity.  The dump device uses the checksum setting
1535 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1536 * dataset.
1537 *
1538 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1539 * 128 KB will not fill an entire block; in addition, they may not be properly
1540 * aligned.  In that case, this function uses the preallocated 128 KB block and
1541 * omits reading or writing any "empty" portions of that block, as opposed to
1542 * allocating a fresh appropriately-sized block.
1543 *
1544 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1545 *
1546 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1547 *
1548 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1549 * allocated which spans all five child vdevs.  8 KB of data would be written to
1550 * each of four vdevs, with the fifth containing the parity bits.
1551 *
1552 *       parity    data     data     data     data
1553 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1554 *         ^        ^        ^        ^        ^
1555 *         |        |        |        |        |
1556 *   8 KB parity    ------8 KB data blocks------
1557 *
1558 * However, when writing to the dump device, the behavior is different:
1559 *
1560 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1561 *
1562 * Unlike the normal RAID-Z case in which the block is allocated based on the
1563 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1564 * I/O size is less than 128 KB, only the actual portions of data are written.
1565 * In this example the data is written to the third data vdev since that vdev
1566 * contains the offset [64 KB, 96 KB).
1567 *
1568 *       parity    data     data     data     data
1569 *     |        |        |        |   XX   |        |
1570 *                                    ^
1571 *                                    |
1572 *                             32 KB data block
1573 *
1574 * As a result, an individual I/O may not span all child vdevs; moreover, a
1575 * small I/O may only operate on a single child vdev.
1576 *
1577 * Note that since there are no parity bits calculated or written, this format
1578 * remains the same no matter how many parity bits are used in a normal RAID-Z
1579 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1580 * would look like:
1581 *
1582 *       parity   parity   parity    data     data     data     data
1583 *     |        |        |        |        |        |   XX   |        |
1584 *                                                      ^
1585 *                                                      |
1586 *                                               32 KB data block
1587 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Walk the data columns only -- no parity is maintained for the
	 * dump device (see the block comment above). Note that rc is
	 * assigned at the top of the body, so the rc->rc_size read in the
	 * increment expression always refers to the just-processed column.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
1672
1673static uint64_t
1674vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1675{
1676	uint64_t asize;
1677	uint64_t ashift = vd->vdev_top->vdev_ashift;
1678	uint64_t cols = vd->vdev_children;
1679	uint64_t nparity = vd->vdev_nparity;
1680
1681	asize = ((psize - 1) >> ashift) + 1;
1682	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1683	asize = roundup(asize, nparity + 1) << ashift;
1684
1685	return (asize);
1686}
1687
1688static void
1689vdev_raidz_child_done(zio_t *zio)
1690{
1691	raidz_col_t *rc = zio->io_private;
1692
1693	rc->rc_error = zio->io_error;
1694	rc->rc_tried = 1;
1695	rc->rc_skipped = 0;
1696}
1697
1698/*
1699 * Start an IO operation on a RAIDZ VDev
1700 *
1701 * Outline:
1702 * - For write operations:
1703 *   1. Generate the parity data
1704 *   2. Create child zio write operations to each column's vdev, for both
1705 *      data and parity.
1706 *   3. If the column skips any sectors for padding, create optional dummy
1707 *      write zio children for those areas to improve aggregation continuity.
1708 * - For read operations:
1709 *   1. Create child zio read operations to each data column's vdev to read
1710 *      the range of data required for zio.
1711 *   2. If this is a scrub or resilver operation, or if any of the data
1712 *      vdevs have had errors, then create zio read operations to the parity
1713 *      columns' VDevs as well.
1714 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Lay the logical block out across the columns of the stripe. */
	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue one child write per column, parity included. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity. Note that c wraps around to column
		 * 0 once it walks past the last column in the map.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Child unreadable: mark it and skip the I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Child is missing the data for this txg. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read all data columns; also read parity when data is
		 * already known missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1806
1807
1808/*
1809 * Report a checksum error for a child of a RAID-Z device.
1810 */
1811static void
1812raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1813{
1814	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1815
1816	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1817		zio_bad_cksum_t zbc;
1818		raidz_map_t *rm = zio->io_vsd;
1819
1820		mutex_enter(&vd->vdev_stat_lock);
1821		vd->vdev_stat.vs_checksum_errors++;
1822		mutex_exit(&vd->vdev_stat_lock);
1823
1824		zbc.zbc_has_cksum = 0;
1825		zbc.zbc_injected = rm->rm_ecksuminjected;
1826
1827		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1828		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1829		    &zbc);
1830	}
1831}
1832
1833/*
1834 * We keep track of whether or not there were any injected errors, so that
1835 * any ereports we generate can note it.
1836 */
1837static int
1838raidz_checksum_verify(zio_t *zio)
1839{
1840	zio_bad_cksum_t zbc;
1841	raidz_map_t *rm = zio->io_vsd;
1842
1843	int ret = zio_checksum_error(zio, &zbc);
1844	if (ret != 0 && zbc.zbc_injected != 0)
1845		rm->rm_ecksuminjected = 1;
1846
1847	return (ret);
1848}
1849
1850/*
1851 * Generate the parity from the data columns. If we tried and were able to
1852 * read the parity without error, verify that the generated parity matches the
1853 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1855 */
1856static int
1857raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1858{
1859	void *orig[VDEV_RAIDZ_MAXPARITY];
1860	int c, ret = 0;
1861	raidz_col_t *rc;
1862
1863	blkptr_t *bp = zio->io_bp;
1864	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1865	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1866
1867	if (checksum == ZIO_CHECKSUM_NOPARITY)
1868		return (ret);
1869
1870	for (c = 0; c < rm->rm_firstdatacol; c++) {
1871		rc = &rm->rm_col[c];
1872		if (!rc->rc_tried || rc->rc_error != 0)
1873			continue;
1874		orig[c] = zio_buf_alloc(rc->rc_size);
1875		bcopy(rc->rc_data, orig[c], rc->rc_size);
1876	}
1877
1878	vdev_raidz_generate_parity(rm);
1879
1880	for (c = 0; c < rm->rm_firstdatacol; c++) {
1881		rc = &rm->rm_col[c];
1882		if (!rc->rc_tried || rc->rc_error != 0)
1883			continue;
1884		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1885			raidz_checksum_error(zio, rc, orig[c]);
1886			rc->rc_error = SET_ERROR(ECKSUM);
1887			ret++;
1888		}
1889		zio_buf_free(orig[c], rc->rc_size);
1890	}
1891
1892	return (ret);
1893}
1894
1895/*
1896 * Keep statistics on all the ways that we used parity to correct data.
1897 */
1898static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1899
1900static int
1901vdev_raidz_worst_error(raidz_map_t *rm)
1902{
1903	int error = 0;
1904
1905	for (int c = 0; c < rm->rm_cols; c++)
1906		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1907
1908	return (error);
1909}
1910
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * Returns the (non-zero) reconstruction code if some combination of targets
 * yields data with a valid checksum, or 0 if every combination fails.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];	/* saved copies of targeted columns */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];	/* slack on both sides: tgts[-1] and tgts[n] */
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Target n columns at a time, for increasing n. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			/* Skip columns that already have a known error. */
			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* Only the buffer for the newly added target is missing. */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every n-combination of error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * The targeted columns held bad data; report
				 * a checksum error for each one that was
				 * actually read from disk.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/* Advance to the next combination of targets. */
			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * The for loop above ran to completion, leaving n one past the last
	 * attempted combination size; decrement so the cleanup below frees
	 * exactly the buffers that were allocated.
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
2066
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* errors not due to skipped reads */
	int parity_errors = 0;
	int parity_untried = 0;	/* parity columns we never issued reads for */
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally child errors, split into parity vs. data columns. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * If any column remains untried, mark the zio for re-execution and
	 * issue reads for every untried column, then return and wait for
	 * this done routine to run again once those reads complete.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		/* More errors than parity can ever correct. */
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					/*
					 * The column index is passed through
					 * the opaque pointer argument.
					 */
					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2351
2352static void
2353vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2354{
2355	if (faulted > vd->vdev_nparity)
2356		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2357		    VDEV_AUX_NO_REPLICAS);
2358	else if (degraded + faulted != 0)
2359		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2360	else
2361		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2362}
2363
/*
 * Operations vector for the RAID-Z vdev type (positional initializer;
 * field order must match vdev_ops_t).
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* NOTE(review): presumably vdev_op_hold — confirm vs. vdev_ops_t */
	NULL,			/* NOTE(review): presumably vdev_op_rele — confirm vs. vdev_ops_t */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2376