/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on the Galois field GF(2^N). In our case
 * we choose N=8 so that all elements can be expressed with a single byte.
 * Briefly, the operations on the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). Since A^255 = 1 for any non-zero A, the inverse of a
 * field element A (A^-1) is A^(255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
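
/*
 * As an illustration of the table-driven arithmetic described above, the
 * following user-space sketch (illustrative only, not part of this driver)
 * builds the same power/log tables from the generator 2 and multiplies two
 * field elements with them; gf_pow2[] and gf_log2[] come out identical to
 * vdev_raidz_pow2[] and vdev_raidz_log2[] below:
 *
 *	static uint8_t gf_pow2[256], gf_log2[256];
 *
 *	static void
 *	gf_init(void)
 *	{
 *		uint8_t x = 1;
 *		int i;
 *
 *		for (i = 0; i < 255; i++) {
 *			gf_pow2[i] = x;
 *			gf_log2[x] = i;
 *			x = (x << 1) ^ ((x & 0x80) ? 0x1d : 0);
 *		}
 *		gf_pow2[255] = 1;
 *	}
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		int e;
 *
 *		if (a == 0 || b == 0)
 *			return (0);
 *		e = gf_log2[a] + gf_log2[b];
 *		if (e >= 255)
 *			e -= 255;
 *		return (gf_pow2[e]);
 *	}
 *
 * For example, gf_mul(a, gf_pow2[255 - gf_log2[a]]) == 1 for any non-zero
 * a, which is the A * A^254 = A^255 = 1 identity used for inverses above.
 */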

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
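
/*
 * One way to convince oneself that the packed macro above matches the
 * byte-at-a-time VDEV_RAIDZ_MUL_2() is a sketch along these lines
 * (illustrative only, not compiled into the driver):
 *
 *	uint64_t x = 0x112280ff01020408ULL, packed = x, mask;
 *	int i;
 *
 *	VDEV_RAIDZ_64MUL_2(packed, mask);
 *	for (i = 0; i < 8; i++) {
 *		uint8_t b = (x >> (i * 8)) & 0xff;
 *		ASSERT3U((packed >> (i * 8)) & 0xff, ==,
 *		    VDEV_RAIDZ_MUL_2(b) & 0xff);
 *	}
 *
 * The mask expression ((mask << 1) - (mask >> 7)) turns each byte whose top
 * bit is set into 0xff (0x100 - 0x01 = 0xff) and every other byte into
 * 0x00, so the conditional XOR of 0x1d is applied per byte with no branch.
 */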

#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
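
/*
 * Note that vdev_raidz_exp2() also serves as a general field multiplier:
 * because A * B == 2^(log_2(A) + log_2(B)), the product of two non-zero
 * elements can be computed as, for example (illustrative only):
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
 *	}
 *
 * The matrix reconstruction code below relies on the same identity but
 * keeps one operand in log form (see invlog[] in
 * vdev_raidz_matrix_reconstruct()) to save table lookups in its inner loop.
 */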

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

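/*
 * A raidz map is logically owned both by its zio (via io_vsd) and by any
 * checksum reports that still reference it; whichever reference is dropped
 * last -- the zio in vdev_raidz_map_free_vsd() above or the final report
 * in vdev_raidz_cksum_free() below -- performs the actual free.
 */
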
/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the I/O evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
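
/*
 * To make the geometry above concrete, two illustrative cases for a raidz1
 * vdev with dcols = 5 and nparity = 1 (all quantities in sectors):
 *
 *   o s = 8:  q = 8 / 4 = 2, r = 0, bc = 0, tot = 8 + 1 * 2 = 10.
 *     Every column (1 parity + 4 data) is q = 2 sectors long and no
 *     padding is required: rm_nskip = roundup(10, 2) - 10 = 0.
 *
 *   o s = 10: q = 10 / 4 = 2, r = 2, bc = r + nparity = 3, tot = 13.
 *     The parity column and the first two data columns get q + 1 = 3
 *     sectors, the remaining two get 2, and rm_nskip =
 *     roundup(13, 2) - 13 = 1 skipped padding sector is accounted for
 *     in rm_asize.
 */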

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy(dst, src, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

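	/*
	 * At this point dst holds the Q parity computed as though column x
	 * were zero, so Q ^ dst == 2^(rm_cols - 1 - x) * D_x.  Multiplying
	 * by 2^exp == 2^-(rm_cols - 1 - x), the inverse of x's coefficient,
	 * recovers D_x.
	 */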
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */
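
	/*
	 * To see where A and B come from, eliminate D_y by substituting
	 * D_y = (P + Pxy) + D_x into the expression for Q + Qxy:
	 *
	 *	Q + Qxy = 2^(ndevs - 1 - x) * (2^(x - y) + 1) * D_x +
	 *	    2^(ndevs - 1 - y) * (P + Pxy)
	 *
	 * Dividing through by the coefficient of D_x distributes the factor
	 * 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1 over both terms; the term
	 * multiplying (P + Pxy) simplifies to 2^(x - y) * (2^(x - y) + 1)^-1,
	 * giving exactly A and B.  The shared inverse (2^(x - y) + 1)^-1 is
	 * computed once, via 'tmp' below.
	 */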

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
 * computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */
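
/*
 * A minimal worked instance of the above (illustrative): with n = 2 data
 * columns, both missing, reconstructed from the P and Q rows, the pruned
 * matrix and its inverse over this field are
 *
 *	          __     __                  __           __
 *	(V|I)'  = |  1  1  |     (V|I)'^-1 = | 0xf4  0xf4  |
 *	          |  2  1  |                 | 0xf5  0xf4  |
 *	          ~~     ~~                  ~~           ~~
 *
 * since det = 1*1 + 2*1 = 3 and 3^-1 = 0xf4 in this field (3 * 0xf4 = 1,
 * and 2 * 0xf4 = 0xf5).  Multiplying (V|I)'^-1 by the column vector (p, q)
 * regenerates both data columns, which is what the byte-by-byte loop in
 * vdev_raidz_matrix_reconstruct() below does in the general case.
 */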

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

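	/*
	 * tgts[] is sorted and parity columns sort before data columns, so
	 * the first nbadparity entries of tgts[] name the bad parity columns
	 * and dt below points at the bad data columns, which are the targets
	 * the optimized reconstruction routines take.
	 */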
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible.  In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots.  For that reason, this function eschews parity for
 * performance and simplicity.  The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned.  In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs.  8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
1773 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, treat
	 * the on-disk format as if the only blocks are complete 128 KB blocks.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
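		/*
		 * For example (informal illustration; numbers not from the
		 * original source): if this column covers
		 * [coloffset, coloffset + rc_size) = [64 KB, 96 KB) and the
		 * caller asked for [start, end) = [80 KB, 112 KB), then
		 * colstart = 80 KB, colend = 96 KB, colsize = 16 KB, and
		 * colskip = 16 KB -- i.e., we touch only the second half of
		 * this column.
		 */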
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* _KERNEL */

	return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
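
/*
 * Worked example (informal; values chosen purely for illustration): with
 * 512-byte sectors (ashift = 9), five children, and single parity (4+1
 * RAID-Z1), a psize of 8 KB maps to 16 data sectors.  The parity term adds
 * 1 * ((16 + 5 - 1 - 1) / (5 - 1)) = 4 parity sectors, for 20 sectors in
 * total; roundup(20, nparity + 1 = 2) leaves 20, so the allocated size is
 * 20 << 9 = 10 KB.
 */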

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation contiguity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	void *buf;
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, buf, bad_data,
		    &zbc);
		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
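/*
 * The index is the reconstruction code returned by vdev_raidz_reconstruct():
 * a bitmask of the parity columns used.  For example (informal), a repair
 * that used only the P column increments raidz_corrected[1 << VDEV_RAIDZ_P],
 * while one that used both P and Q increments
 * raidz_corrected[(1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)].
 */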

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same whether
 * column 4 alone or columns 1 and 4 together are targeted as invalid, since
 * in both cases we'd use only the parity information in column 0.
 */
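/*
 * Sketch of the enumeration (informal; values for illustration only): with
 * double parity (rm_firstdatacol == 2), five columns, and no known errors,
 * the outer loop first tries each data column as the assumed-bad target
 * (n == 1; with no data errors the highest target is pushed into the data
 * columns), then every pair of columns that includes at least one data
 * column (n == 2).  The tgts[] array below is advanced like an odometer
 * over the columns that did not already report an error.
 */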
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition: the odometer code below may
	 * read tgts[current - 1] while current is still 0.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */
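	/*
	 * Informally, phase 1 corresponds to the correctable-errors block
	 * just below, phase 2 to the re-read loop that reissues child I/Os
	 * for untried columns, and phase 3 to vdev_raidz_combrec().
	 */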

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens, since no reconstruction was
				 * needed), regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
