/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

#ifdef ZFS_DEBUG
#include <sys/vdev_initialize.h>	/* vdev_xlate testing */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field with 2^N elements,
 * GF(2^N). In our case we choose N=8 for GF(2^8) so that all elements
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
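 *
 * For example, multiplying A = 0x80 by 2: the high bit is set, so the
 * shift (0x80 << 1) wraps to 0x00 and the conditional XOR with 0x1d
 * applies, giving 0x80 * 2 = 0x1d. Equivalently, 0x80 = 2^7, and
 * 2^8 = 0x1d = vdev_raidz_pow2[8] in the table below.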
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition
 * modulo 255 rather than field addition). The inverse of a field element A
 * (A^-1) is therefore A^(255 - 1) = A^254.
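 *
 * As a worked example of the table-based multiply: 0x1d * 0x02 =
 * 2^(log_2(0x1d) + log_2(0x02)) = 2^(8 + 1) = vdev_raidz_pow2[9] = 0x3a,
 * matching the shift rule above since the high bit of 0x1d is clear:
 * (0x1d << 1) = 0x3a.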
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
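 *
 * For example, with n = 3 data columns these reduce to:
 *
 *	P = D_0 + D_1 + D_2
 *	Q = 4 * D_0 + 2 * D_1 + D_2 = ((D_0 * 2) + D_1) * 2 + D_2
 *
 * so Q is generated with one multiply-by-2 and one XOR per data column.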
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
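
/*
 * As an illustrative aside (not part of the vdev implementation), the field
 * arithmetic above can be exercised in isolation with ordinary C. The sketch
 * below assumes the vdev_raidz_pow2[]/vdev_raidz_log2[] tables and
 * vdev_raidz_exp2() defined later in this file:
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)	// A * B = 2^(log_2(A) + log_2(B))
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
 *	}
 *
 *	static uint8_t
 *	gf_inv(uint8_t a)		// A^-1 = A^254 = 2^(255 - log_2(A))
 *	{
 *		return (vdev_raidz_pow2[255 - vdev_raidz_log2[a]]);
 *	}
 *
 * gf_mul(a, gf_inv(a)) == 1 for every nonzero a, which is easy to verify
 * exhaustively over the 255 nonzero field elements.
 */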

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
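
/*
 * To see why VDEV_RAIDZ_64MUL_2() works, consider a single byte lane of x
 * with its high bit set, say 0x80: the corresponding mask byte is 0x80, and
 * ((mask) << 1) - ((mask) >> 7) turns it into 0xff (the shifted-out bit
 * lands above the lane and the borrow from subtracting 0x01 fills the lane),
 * selecting that lane for the XOR with 0x1d. A lane with the high bit clear
 * yields a 0x00 mask byte and is left untouched. Meanwhile the AND with
 * 0xfe..fe keeps the per-byte left shift from spilling a bit into the
 * neighboring lane.
 */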

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs base 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
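 * For example, vdev_raidz_exp2(0x02, 7) = 2^(1 + 7) = vdev_raidz_pow2[8]
 * = 0x1d.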
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the I/O evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
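 *
 * As a worked example (illustrative numbers only): with dcols = 5,
 * nparity = 1, and unit_shift = 9 (512-byte sectors), a 4 KB zio
 * (s = 8 sectors) gives q = 8 / 4 = 2, r = 0, and bc = 0, so all five
 * columns are used (acols = scols = 5), the parity column and each of
 * the four data columns carry q = 2 sectors, and no skip sectors are
 * needed (rm_nskip = roundup(10, 2) - 10 = 0).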
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter, unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
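
	/*
	 * For example (illustrative only): with single parity, a map whose
	 * block offset has bit 20 set -- an "odd" 1 MB region of the vdev --
	 * swaps its parity column with the first data column below, so
	 * parity alternates between the first two devices of the row across
	 * consecutive 1 MB regions.
	 */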
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy(dst, src, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

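	/*
	 * Fold in Q and cancel column x's coefficient: 2^exp below is the
	 * multiplicative inverse of 2^(rm_cols - 1 - x), the Q coefficient
	 * on column x, so multiplying each byte by it leaves D_x.
	 */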
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(255 - (ndevs - 1 - x)) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
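	 *
	 * These coefficients follow from eliminating D_y: substituting
	 * D_y = (P + Pxy) + D_x into the expression for Q + Qxy gives
	 *
	 *	Q + Qxy = (2^(ndevs - 1 - x) + 2^(ndevs - 1 - y)) * D_x
	 *	    + 2^(ndevs - 1 - y) * (P + Pxy)
	 *
	 * and dividing through by the coefficient of D_x (recall that
	 * addition and subtraction are both XOR) yields A and B above.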
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
 * computation, and for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
1360	for (i = 0; i < nmissing; i++) {
1361		for (j = 0; j < n; j++) {
1362			if (j == missing[i]) {
1363				ASSERT3U(rows[i][j], ==, 1);
1364			} else {
1365				ASSERT0(rows[i][j]);
1366			}
1367		}
1368	}
1369}
1370
1371static void
1372vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1373    int *missing, uint8_t **invrows, const uint8_t *used)
1374{
1375	int i, j, x, cc, c;
1376	uint8_t *src;
1377	uint64_t ccount;
1378	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1379	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1380	uint8_t log = 0;
1381	uint8_t val;
1382	int ll;
1383	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1384	uint8_t *p, *pp;
1385	size_t psize;
1386
1387	psize = sizeof (invlog[0][0]) * n * nmissing;
1388	p = kmem_alloc(psize, KM_SLEEP);
1389
1390	for (pp = p, i = 0; i < nmissing; i++) {
1391		invlog[i] = pp;
1392		pp += n;
1393	}
1394
1395	for (i = 0; i < nmissing; i++) {
1396		for (j = 0; j < n; j++) {
1397			ASSERT3U(invrows[i][j], !=, 0);
1398			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1399		}
1400	}
1401
1402	for (i = 0; i < n; i++) {
1403		c = used[i];
1404		ASSERT3U(c, <, rm->rm_cols);
1405
1406		src = abd_to_buf(rm->rm_col[c].rc_abd);
1407		ccount = rm->rm_col[c].rc_size;
1408		for (j = 0; j < nmissing; j++) {
1409			cc = missing[j] + rm->rm_firstdatacol;
1410			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1411			ASSERT3U(cc, <, rm->rm_cols);
1412			ASSERT3U(cc, !=, c);
1413
1414			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1415			dcount[j] = rm->rm_col[cc].rc_size;
1416		}
1417
1418		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1419
1420		for (x = 0; x < ccount; x++, src++) {
1421			if (*src != 0)
1422				log = vdev_raidz_log2[*src];
1423
1424			for (cc = 0; cc < nmissing; cc++) {
1425				if (x >= dcount[cc])
1426					continue;
1427
1428				if (*src == 0) {
1429					val = 0;
1430				} else {
1431					if ((ll = log + invlog[cc][i]) >= 255)
1432						ll -= 255;
1433					val = vdev_raidz_pow2[ll];
1434				}
1435
1436				if (i == 0)
1437					dst[cc][x] = val;
1438				else
1439					dst[cc][x] ^= val;
1440			}
1441		}
1442	}
1443
1444	kmem_free(p, psize);
1445}
1446
1447static int
1448vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1449{
1450	int n, i, c, t, tt;
1451	int nmissing_rows;
1452	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1453	int parity_map[VDEV_RAIDZ_MAXPARITY];
1454
1455	uint8_t *p, *pp;
1456	size_t psize;
1457
1458	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1459	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1460	uint8_t *used;
1461
1462	abd_t **bufs = NULL;
1463
1464	int code = 0;
1465
1466	/*
1467	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1468	 * temporary linear ABDs.
1469	 */
1470	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1471		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1472
1473		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1474			raidz_col_t *col = &rm->rm_col[c];
1475
1476			bufs[c] = col->rc_abd;
1477			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1478			abd_copy(col->rc_abd, bufs[c], col->rc_size);
1479		}
1480	}
1481
1482	n = rm->rm_cols - rm->rm_firstdatacol;
1483
1484	/*
1485	 * Figure out which data columns are missing.
1486	 */
1487	nmissing_rows = 0;
1488	for (t = 0; t < ntgts; t++) {
1489		if (tgts[t] >= rm->rm_firstdatacol) {
1490			missing_rows[nmissing_rows++] =
1491			    tgts[t] - rm->rm_firstdatacol;
1492		}
1493	}
1494
1495	/*
1496	 * Figure out which parity columns to use to help generate the missing
1497	 * data columns.
1498	 */
1499	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1500		ASSERT(tt < ntgts);
1501		ASSERT(c < rm->rm_firstdatacol);
1502
1503		/*
1504		 * Skip any targeted parity columns.
1505		 */
1506		if (c == tgts[tt]) {
1507			tt++;
1508			continue;
1509		}
1510
1511		code |= 1 << c;
1512
1513		parity_map[i] = c;
1514		i++;
1515	}
1516
1517	ASSERT(code != 0);
1518	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1519
1520	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1521	    nmissing_rows * n + sizeof (used[0]) * n;
1522	p = kmem_alloc(psize, KM_SLEEP);
1523
1524	for (pp = p, i = 0; i < nmissing_rows; i++) {
1525		rows[i] = pp;
1526		pp += n;
1527		invrows[i] = pp;
1528		pp += n;
1529	}
1530	used = pp;
1531
1532	for (i = 0; i < nmissing_rows; i++) {
1533		used[i] = parity_map[i];
1534	}
1535
1536	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1537		if (tt < nmissing_rows &&
1538		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1539			tt++;
1540			continue;
1541		}
1542
1543		ASSERT3S(i, <, n);
1544		used[i] = c;
1545		i++;
1546	}
1547
1548	/*
1549	 * Initialize the interesting rows of the matrix.
1550	 */
1551	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1552
1553	/*
1554	 * Invert the matrix.
1555	 */
1556	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1557	    invrows, used);
1558
1559	/*
1560	 * Reconstruct the missing data using the generated matrix.
1561	 */
1562	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1563	    invrows, used);
1564
1565	kmem_free(p, psize);
1566
1567	/*
1568	 * copy back from temporary linear abds and free them
1569	 */
1570	if (bufs) {
1571		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1572			raidz_col_t *col = &rm->rm_col[c];
1573
1574			abd_copy(bufs[c], col->rc_abd, col->rc_size);
1575			abd_free(col->rc_abd);
1576			col->rc_abd = bufs[c];
1577		}
1578		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1579	}
1580
1581	return (code);
1582}
1583
1584static int
1585vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1586{
1587	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1588	int ntgts;
1589	int i, c;
1590	int code;
1591	int nbadparity, nbaddata;
1592	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1593
1594	/*
1595	 * The tgts list must already be sorted.
1596	 */
1597	for (i = 1; i < nt; i++) {
1598		ASSERT(t[i] > t[i - 1]);
1599	}
1600
1601	nbadparity = rm->rm_firstdatacol;
1602	nbaddata = rm->rm_cols - nbadparity;
1603	ntgts = 0;
1604	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1605		if (c < rm->rm_firstdatacol)
1606			parity_valid[c] = B_FALSE;
1607
1608		if (i < nt && c == t[i]) {
1609			tgts[ntgts++] = c;
1610			i++;
1611		} else if (rm->rm_col[c].rc_error != 0) {
1612			tgts[ntgts++] = c;
1613		} else if (c >= rm->rm_firstdatacol) {
1614			nbaddata--;
1615		} else {
1616			parity_valid[c] = B_TRUE;
1617			nbadparity--;
1618		}
1619	}
1620
1621	ASSERT(ntgts >= nt);
1622	ASSERT(nbaddata >= 0);
1623	ASSERT(nbaddata + nbadparity == ntgts);
1624
1625	dt = &tgts[nbadparity];
1626
1627	/*
1628	 * See if we can use any of our optimized reconstruction routines.
1629	 */
1630	if (!vdev_raidz_default_to_general) {
1631		switch (nbaddata) {
1632		case 1:
1633			if (parity_valid[VDEV_RAIDZ_P])
1634				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1635
1636			ASSERT(rm->rm_firstdatacol > 1);
1637
1638			if (parity_valid[VDEV_RAIDZ_Q])
1639				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1640
1641			ASSERT(rm->rm_firstdatacol > 2);
1642			break;
1643
1644		case 2:
1645			ASSERT(rm->rm_firstdatacol > 1);
1646
1647			if (parity_valid[VDEV_RAIDZ_P] &&
1648			    parity_valid[VDEV_RAIDZ_Q])
1649				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1650
1651			ASSERT(rm->rm_firstdatacol > 2);
1652
1653			break;
1654		}
1655	}
1656
1657	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1658	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1659	ASSERT(code > 0);
1660	return (code);
1661}
1662
1663static int
1664vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1665    uint64_t *ashift)
1666{
1667	vdev_t *cvd;
1668	uint64_t nparity = vd->vdev_nparity;
1669	int c;
1670	int lasterror = 0;
1671	int numerrors = 0;
1672
1673	ASSERT(nparity > 0);
1674
1675	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1676	    vd->vdev_children < nparity + 1) {
1677		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1678		return (SET_ERROR(EINVAL));
1679	}
1680
1681	vdev_open_children(vd);
1682
1683	for (c = 0; c < vd->vdev_children; c++) {
1684		cvd = vd->vdev_child[c];
1685
1686		if (cvd->vdev_open_error != 0) {
1687			lasterror = cvd->vdev_open_error;
1688			numerrors++;
1689			continue;
1690		}
1691
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible.  In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots.  For that reason, this function eschews parity for
 * performance and simplicity.  The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned.  In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs.  8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;
	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requested offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, treat
	 * the on-disk format as if every block is a complete 128 KB block.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;
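
		/*
		 * Worked example with hypothetical numbers: for the 32 KB
		 * write at offset 64 KB pictured above (origoffset == 0,
		 * five children, 32 KB data columns at coloffsets 0, 32 KB,
		 * 64 KB and 96 KB), only the column at coloffset == 64 KB
		 * survives the two tests above, giving colstart == 64 KB,
		 * colend == 96 KB, colsize == 32 KB and colskip == 0.
		 */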

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* _KERNEL */

	return (err);
}
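
/*
 * A worked check on the arithmetic below (our example, not from the original
 * source): a 32 KB psize on a 5-wide RAID-Z1 with ashift == 9 gives
 * asize == 64 data sectors, plus 1 * ceil(64 / 4) == 16 parity sectors;
 * roundup(80, 2) << 9 == 40 KB, matching the "at least 40 KB" figure in the
 * dump-device comment above.
 */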
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	range_seg_t logical_rs, physical_rs;
	logical_rs.rs_start = zio->io_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(zio->io_vd, zio->io_size);

	raidz_col_t *rc = &rm->rm_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs);
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size, but there might be an
	 * optional I/O at the end that is not accounted for
	 * in rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}

/*
 * Start an I/O operation on a RAID-Z vdev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation contiguity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' vdevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Verify physical to logical translation.
			 */
			vdev_raidz_io_verify(zio, rm, c);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
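		/*
		 * For instance (hypothetical map): with rm_scols == 5,
		 * rm_skipstart == 3 and rm_nskip == 3, the loop below issues
		 * one-sector optional writes to columns 3, 4 and then,
		 * wrapping at rm_scols, column 0.
		 */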
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	void *buf;
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, buf, bad_data,
		    &zbc);
		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
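/*
 * The index is the reconstruction code returned by vdev_raidz_reconstruct();
 * as best we can tell, that code is a bitmask of the parity columns consumed,
 * so (for example) raidz_corrected[1] counts repairs that used P alone and
 * raidz_corrected[3] those that used both P and Q.
 */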
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;
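
	/*
	 * A sketch of the enumeration that follows (hypothetical map): with
	 * five error-free columns and n == 2, the loops below walk tgts
	 * through (0,1), (0,2), (1,2), (0,3), (1,3), (2,3), ... (3,4) --
	 * an odometer over every n-subset -- skipping any column that
	 * already carries an error.  The tgts[-1] sentinel above and the
	 * tgts[n] sentinel set below bound that walk.
	 */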

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete an I/O operation on a RAID-Z vdev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child I/Os.
 *   2. Return, setting an error code if too few child vdevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child I/Os.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens, since no reconstruction was
				 * needed), regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
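		/*
		 * For example, on a hypothetical RAID-Z3 map
		 * (rm_firstdatacol == 3) the all-parity code would be
		 * (1 << 3) - 1 == 0x7; anything smaller means at least one
		 * parity column went unused and can still be verified here.
		 */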
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(in->rs_start % (1 << ashift));
	ASSERT0(in->rs_end % (1 << ashift));
	uint64_t b_start = in->rs_start >> ashift;
	uint64_t b_end = in->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;
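
	/*
	 * A worked example with hypothetical geometry: with width == 4 and
	 * tgt_col == 1, this child owns logical blocks 1, 5, 9, ... (row r
	 * holds logical block r * width + tgt_col).  For b_start == 6,
	 * start_row == ((6 - 1 - 1) / 4) + 1 == 2: the first row whose
	 * block on this child (logical block 9) lies at or past b_start.
	 */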

	res->rs_start = start_row << ashift;
	res->rs_end = end_row << ashift;

	ASSERT3U(res->rs_start, <=, in->rs_start);
	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	NULL,
	vdev_raidz_xlate,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
