/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick lookups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
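
/*
 * A sketch of general multiplication using the log/exp tables declared
 * below (the driver itself never needs it; gf_mul() is purely
 * illustrative and not part of this file):
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		int exp;
 *
 *		if (a == 0 || b == 0)
 *			return (0);
 *		exp = vdev_raidz_log2[a] + vdev_raidz_log2[b];
 *		if (exp > 255)
 *			exp -= 255;
 *		return (vdev_raidz_pow2[exp]);
 *	}
 *
 * The reduction is mod 255 rather than 256 because the multiplicative
 * group of the field has 255 elements. For example, gf_mul(0x02, 0x80)
 * = 2^(1 + 7) = 0x1d, which matches VDEV_RAIDZ_MUL_2(0x80).
 */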

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
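
/*
 * Worked example for a single lane x = 0x80: the mask byte computes to
 * (0x80 << 1) - (0x80 >> 7) = 0x100 - 0x01 = 0xff, so the lane becomes
 * ((0x80 << 1) & 0xfe) ^ (0xff & 0x1d) = 0x00 ^ 0x1d = 0x1d -- exactly
 * VDEV_RAIDZ_MUL_2(0x80), applied to all eight bytes in parallel. Lanes
 * without the top bit set get a zero mask and are simply shifted.
 */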

#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
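
/*
 * Note that vdev_raidz_exp2(a, 1) computes the same value as
 * VDEV_RAIDZ_MUL_2(a) for every byte a (both return 0 for a == 0); the
 * table walk simply trades the conditional XOR for two lookups and an
 * addition mod 255.
 */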

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			abd_free(rm->rm_col[c].rc_gdata);
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		abd_put(rm->rm_col[c].rc_abd);

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}
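
/*
 * A raidz map may outlive its zio: checksum reports created in
 * vdev_raidz_cksum_report() take references on it. The map is therefore
 * freed by whichever of vdev_raidz_map_free_vsd() and
 * vdev_raidz_cksum_free() drops the last reference, i.e. once rm_freed
 * has been set and rm_reports has returned to zero.
 */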

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x, offset;

	const abd_t *good = NULL;
	const abd_t *bad = rm->rm_col[c].rc_abd;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_abd =
				    rm->rm_col[x].rc_gdata =
				    abd_alloc_sametype(rm->rm_col[x].rc_abd,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			offset = 0;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);

				rm->rm_col[x].rc_abd =
				    abd_get_offset_size((abd_t *)good_data,
				    offset, rm->rm_col[x].rc_size);
				offset += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_abd = bad_parity[x];

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset_size(
				    rm->rm_abd_copy, offset,
				    rm->rm_col[x].rc_size);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0,
		    rm->rm_col[c].rc_size);
	} else {
		/* adjust good_data to point at the start of our column */
		offset = 0;
		for (x = rm->rm_firstdatacol; x < c; x++)
			offset += rm->rm_col[x].rc_size;

		good = abd_get_offset_size((abd_t *)good_data, offset,
		    rm->rm_col[c].rc_size);
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_put((abd_t *)good);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
		    col->rc_size);

		ASSERT3S(tmp->abd_size, >=, col->rc_size);
		ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
		abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));
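	/*
	 * For example, a 32 KB write (s = 64 sectors with 512-byte sectors)
	 * on a single-parity vdev with dcols = 5 yields q = 64 / 4 = 16,
	 * r = 0, bc = 0, and tot = 64 + 16 = 80: five columns of 16 sectors
	 * each, one of them parity. Were s = 66 instead, q = 16, r = 2, and
	 * bc = 3, so the first three columns (the parity column and the
	 * first two data columns) would hold q + 1 = 17 sectors each.
	 */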

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);

	rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0,
	    rm->rm_col[c].rc_size);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off,
		    rm->rm_col[c].rc_size);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}
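
/*
 * Note that when the generate functions below feed the data columns
 * through the callbacks above in column order, they are evaluating the
 * Horner form of Q and R from the comment at the top of this file: each
 * step multiplies the running Q (R) by 2 (4) and then XORs in the new
 * column, so 2^n-1 * D_0 + ... + 2^0 * D_n-1 is built up without ever
 * computing a large power explicitly.
 */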

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size);
			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);

			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, NULL };

			ASSERT(ccnt <= pcnt);

			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size);
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);

			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size);
	ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size);
	abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}
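
/*
 * Since P is a plain XOR sum, the routine above recovers the missing
 * column directly: D_x = P + D_0 + ... + D_x-1 + D_x+1 + ... + D_n-1.
 */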

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			if (dst != src) {
				ASSERT3S(dst->abd_size, >=, size);
				ASSERT3S(src->abd_size, >=, size);
				abd_copy_off(dst, src, 0, 0, size);
			}
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			if (src != dst)
				(void) abd_iterate_func2(dst, src, 0, 0, size,
				    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}
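
/*
 * The pre/post split in the routine above works because the pre-pass
 * leaves the sum over c != x of 2^(rm_cols - 1 - c) * D_c in the target
 * column (the missing column contributes its position in the Horner
 * evaluation but no data). XORing in Q then isolates
 * 2^(rm_cols - 1 - x) * D_x, and the post-pass multiplies each byte by
 * the inverse of that coefficient, 2^(255 - (rm_cols - 1 - x)) -- the
 * exp passed to vdev_raidz_reconst_q_post_func().
 */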

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
 * computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
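
/*
 * For instance, with n = 8 and map[i] = 1 (the Q row), the loop above
 * produces the exponents 7, 6, ... 0 and hence the row
 * { 128, 64, 32, 16, 8, 4, 2, 1 } from the example matrix above, while
 * map[i] = 0 yields the all-ones P row and map[i] = 2 yields the R row
 * { 19, 205, 116, 29, 64, 16, 4, 1 }.
 */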

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data remaining in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
			abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
			abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

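		/*
		 * Note the -1/+1 dance below: an incoming *asize of 0
		 * wraps to UINT64_MAX, so the first healthy child simply
		 * establishes the minimum and later children can only
		 * lower it.
		 */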
1708		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1709		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1710		*ashift = MAX(*ashift, cvd->vdev_ashift);
1711	}
1712
1713	*asize *= vd->vdev_children;
1714	*max_asize *= vd->vdev_children;
1715
1716	if (numerrors > nparity) {
1717		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1718		return (lasterror);
1719	}
1720
1721	return (0);
1722}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible.  In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots.  For that reason, this function eschews parity for
 * performance and simplicity.  The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned.  In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs.  8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* _KERNEL */

	return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
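
/*
 * Worked example for vdev_raidz_asize() above (hypothetical values): a
 * 4 KB logical block (psize == 4096) on a 5-wide raidz1 vdev with
 * 512-byte sectors (ashift == 9, cols == 5, nparity == 1):
 *
 *	asize = ((4096 - 1) >> 9) + 1			=  8 sectors
 *	asize += 1 * ((8 + 5 - 1 - 1) / (5 - 1))	= 10 sectors
 *	asize = roundup(10, 2) << 9			= 5120 bytes
 *
 * i.e. eight data sectors plus one parity sector for each row of four
 * data sectors, rounded up to a multiple of (nparity + 1) sectors, the
 * smallest unit a raidz allocation can occupy.
 */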

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	range_seg_t logical_rs, physical_rs;
	logical_rs.rs_start = zio->io_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(zio->io_vd, zio->io_size);

	raidz_col_t *rc = &rm->rm_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs);
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted in
	 * rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation contiguity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Verify physical to logical translation.
			 */
			vdev_raidz_io_verify(zio, rm, c);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd,
		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
		    rc->rc_abd, bad_data, &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		abd_free(orig[c]);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
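 *
 * The array is indexed by the "code" the reconstruction routines return,
 * which is a bitmask of the parity columns that were used: e.g. code 1
 * counts repairs made with P alone, code 2 with Q alone, and code 3 with
 * both P and Q.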
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
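 *
 * As an illustration (hypothetical four-column raidz2 map: parity columns
 * 0-1, data columns 2-3, no known-bad columns), the loops below try
 * ever-larger target sets, always including at least one data column:
 *
 *	n = 1:	{2}, {3}
 *	n = 2:	{0,2}, {1,2}, {0,3}, {1,3}, {2,3}
 *
 * The all-parity set {0,1} is never tried: regenerating parity alone
 * cannot change the data, so it could never satisfy the checksum.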
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
		    rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
				abd_copy_off(orig[i], rc->rc_abd, 0, 0,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
				abd_copy_off(rc->rc_abd, orig[i], 0, 0,
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++)
		abd_free(orig[i]);

	return (ret);
}

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    &zio->io_bookmark, zio,
					    rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty, which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
{
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
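
/*
 * Worked example (hypothetical values): for a 1 KB block (psize == 1024)
 * at offset 0x3000 on a 5-wide raidz1 vdev with ashift == 9, the function
 * above computes:
 *
 *	b = 0x3000 >> 9 = 24		(starting parent sector)
 *	s = ((1024 - 1) >> 9) + 1 = 2	(sectors of data)
 *	f = 24 % 5 = 4			(first column touched)
 *
 * Since s + nparity == 3 < dcols, only children (4 + c) % 5 for c in
 * [0, 3), i.e. children 4, 0, and 1, have their DTLs inspected.  A
 * full-width block (s + nparity >= dcols) short-circuits to B_TRUE.
 */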

static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	uint64_t width = raidvd->vdev_children;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(in->rs_start % (1 << ashift));
	ASSERT0(in->rs_end % (1 << ashift));
	uint64_t b_start = in->rs_start >> ashift;
	uint64_t b_end = in->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	res->rs_start = start_row << ashift;
	res->rs_end = end_row << ashift;

	ASSERT3U(res->rs_start, <=, in->rs_start);
	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
}
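
/*
 * Worked example (hypothetical values): translating the parent-vdev range
 * [0, 8) sectors for the child at tgt_col == 2 of a 5-wide raidz vdev
 * (width == 5; offsets are converted to sector numbers by ashift before
 * the row math above):
 *
 *	b_start = 0:	0 > 2 is false			=> start_row = 0
 *	b_end = 8:	((8 - 2 - 1) / 5) + 1		=> end_row = 2
 *
 * so the child's portion is rows [0, 2): this child holds parent sectors
 * 2 and 7, the ones at column 2 of rows 0 and 1.
 */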

vdev_ops_t vdev_raidz_ops = {
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};