1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
25 * Copyright (c) 2016 Gvozden Ne��kovi��. All rights reserved.
26 * Copyright 2019 Joyent, Inc.
27 * Copyright (c) 2014 Integros [integros.com]
28 */
29
30#include <sys/zfs_context.h>
31#include <sys/spa.h>
32#include <sys/vdev_impl.h>
33#include <sys/vdev_file.h>
34#include <sys/zio.h>
35#include <sys/zio_checksum.h>
36#include <sys/abd.h>
37#include <sys/fs/zfs.h>
38#include <sys/fm/fs/zfs.h>
39#include <sys/vdev_raidz.h>
40#include <sys/vdev_raidz_impl.h>
41
42#ifdef ZFS_DEBUG
43#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
44#endif
45
46/*
47 * Virtual device vector for RAID-Z.
48 *
49 * This vdev supports single, double, and triple parity. For single parity,
50 * we use a simple XOR of all the data columns. For double or triple parity,
51 * we use a special case of Reed-Solomon coding. This extends the
52 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
53 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
54 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
55 * former is also based. The latter is designed to provide higher performance
56 * for writes.
57 *
58 * Note that the Plank paper claimed to support arbitrary N+M, but was then
59 * amended six years later identifying a critical flaw that invalidates its
60 * claims. Nevertheless, the technique can be adapted to work for up to
61 * triple parity. For additional parity, the amendment "Note: Correction to
62 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
63 * is viable, but the additional complexity means that write performance will
64 * suffer.
65 *
66 * All of the methods above operate on a Galois field, defined over the
67 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
68 * can be expressed with a single byte. Briefly, the operations on the
69 * field are defined as follows:
70 *
71 *   o addition (+) is represented by a bitwise XOR
72 *   o subtraction (-) is therefore identical to addition: A + B = A - B
73 *   o multiplication of A by 2 is defined by the following bitwise expression:
74 *
75 *	(A * 2)_7 = A_6
76 *	(A * 2)_6 = A_5
77 *	(A * 2)_5 = A_4
78 *	(A * 2)_4 = A_3 + A_7
79 *	(A * 2)_3 = A_2 + A_7
80 *	(A * 2)_2 = A_1 + A_7
81 *	(A * 2)_1 = A_0
82 *	(A * 2)_0 = A_7
83 *
84 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
85 * As an aside, this multiplication is derived from the error correcting
86 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
87 *
88 * Observe that any number in the field (except for 0) can be expressed as a
89 * power of 2 -- a generator for the field. We store a table of the powers of
90 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
91 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
92 * than field addition). The inverse of a field element A (A^-1) is therefore
93 * A ^ (255 - 1) = A^254.
94 *
95 * The up-to-three parity columns, P, Q, R over several data columns,
96 * D_0, ... D_n-1, can be expressed by field operations:
97 *
98 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
99 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
100 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
101 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
102 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
103 *
104 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
105 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
106 * independent coefficients. (There are no additional coefficients that have
107 * this property which is why the uncorrected Plank method breaks down.)
108 *
109 * See the reconstruction code below for how P, Q and R can used individually
110 * or in concert to recover missing data columns.
111 */
112
113#define	VDEV_RAIDZ_P		0
114#define	VDEV_RAIDZ_Q		1
115#define	VDEV_RAIDZ_R		2
116
117#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
118#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
119
120/*
121 * We provide a mechanism to perform the field multiplication operation on a
122 * 64-bit value all at once rather than a byte at a time. This works by
123 * creating a mask from the top bit in each byte and using that to
124 * conditionally apply the XOR of 0x1d.
125 */
126#define	VDEV_RAIDZ_64MUL_2(x, mask) \
127{ \
128	(mask) = (x) & 0x8080808080808080ULL; \
129	(mask) = ((mask) << 1) - ((mask) >> 7); \
130	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
131	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
132}
133
134#define	VDEV_RAIDZ_64MUL_4(x, mask) \
135{ \
136	VDEV_RAIDZ_64MUL_2((x), mask); \
137	VDEV_RAIDZ_64MUL_2((x), mask); \
138}
139
140#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
141
142void
143vdev_raidz_map_free(raidz_map_t *rm)
144{
145	int c;
146
147	for (c = 0; c < rm->rm_firstdatacol; c++) {
148		abd_free(rm->rm_col[c].rc_abd);
149
150		if (rm->rm_col[c].rc_gdata != NULL)
151			abd_free(rm->rm_col[c].rc_gdata);
152	}
153
154	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
155		abd_put(rm->rm_col[c].rc_abd);
156
157	if (rm->rm_abd_copy != NULL)
158		abd_free(rm->rm_abd_copy);
159
160	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
161}
162
163static void
164vdev_raidz_map_free_vsd(zio_t *zio)
165{
166	raidz_map_t *rm = zio->io_vsd;
167
168	ASSERT0(rm->rm_freed);
169	rm->rm_freed = 1;
170
171	if (rm->rm_reports == 0)
172		vdev_raidz_map_free(rm);
173}
174
175/*ARGSUSED*/
176static void
177vdev_raidz_cksum_free(void *arg, size_t ignored)
178{
179	raidz_map_t *rm = arg;
180
181	ASSERT3U(rm->rm_reports, >, 0);
182
183	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
184		vdev_raidz_map_free(rm);
185}
186
187static void
188vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
189{
190	raidz_map_t *rm = zcr->zcr_cbdata;
191	const size_t c = zcr->zcr_cbinfo;
192	size_t x, offset;
193
194	const abd_t *good = NULL;
195	const abd_t *bad = rm->rm_col[c].rc_abd;
196
197	if (good_data == NULL) {
198		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
199		return;
200	}
201
202	if (c < rm->rm_firstdatacol) {
203		/*
204		 * The first time through, calculate the parity blocks for
205		 * the good data (this relies on the fact that the good
206		 * data never changes for a given logical ZIO)
207		 */
208		if (rm->rm_col[0].rc_gdata == NULL) {
209			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
210
211			/*
212			 * Set up the rm_col[]s to generate the parity for
213			 * good_data, first saving the parity bufs and
214			 * replacing them with buffers to hold the result.
215			 */
216			for (x = 0; x < rm->rm_firstdatacol; x++) {
217				bad_parity[x] = rm->rm_col[x].rc_abd;
218				rm->rm_col[x].rc_abd =
219				    rm->rm_col[x].rc_gdata =
220				    abd_alloc_sametype(rm->rm_col[x].rc_abd,
221				    rm->rm_col[x].rc_size);
222			}
223
224			/* fill in the data columns from good_data */
225			offset = 0;
226			for (; x < rm->rm_cols; x++) {
227				abd_put(rm->rm_col[x].rc_abd);
228
229				rm->rm_col[x].rc_abd =
230				    abd_get_offset_size((abd_t *)good_data,
231				    offset, rm->rm_col[x].rc_size);
232				offset += rm->rm_col[x].rc_size;
233			}
234
235			/*
236			 * Construct the parity from the good data.
237			 */
238			vdev_raidz_generate_parity(rm);
239
240			/* restore everything back to its original state */
241			for (x = 0; x < rm->rm_firstdatacol; x++)
242				rm->rm_col[x].rc_abd = bad_parity[x];
243
244			offset = 0;
245			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
246				abd_put(rm->rm_col[x].rc_abd);
247				rm->rm_col[x].rc_abd = abd_get_offset_size(
248				    rm->rm_abd_copy, offset,
249				    rm->rm_col[x].rc_size);
250				offset += rm->rm_col[x].rc_size;
251			}
252		}
253
254		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
255		good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0,
256		    rm->rm_col[c].rc_size);
257	} else {
258		/* adjust good_data to point at the start of our column */
259		offset = 0;
260		for (x = rm->rm_firstdatacol; x < c; x++)
261			offset += rm->rm_col[x].rc_size;
262
263		good = abd_get_offset_size((abd_t *)good_data, offset,
264		    rm->rm_col[c].rc_size);
265	}
266
267	/* we drop the ereport if it ends up that the data was good */
268	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
269	abd_put((abd_t *)good);
270}
271
272/*
273 * Invoked indirectly by zfs_ereport_start_checksum(), called
274 * below when our read operation fails completely.  The main point
275 * is to keep a copy of everything we read from disk, so that at
276 * vdev_raidz_cksum_finish() time we can compare it with the good data.
277 */
278static void
279vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
280{
281	size_t c = (size_t)(uintptr_t)arg;
282	size_t offset;
283
284	raidz_map_t *rm = zio->io_vsd;
285	size_t size;
286
287	/* set up the report and bump the refcount  */
288	zcr->zcr_cbdata = rm;
289	zcr->zcr_cbinfo = c;
290	zcr->zcr_finish = vdev_raidz_cksum_finish;
291	zcr->zcr_free = vdev_raidz_cksum_free;
292
293	rm->rm_reports++;
294	ASSERT3U(rm->rm_reports, >, 0);
295
296	if (rm->rm_abd_copy != NULL)
297		return;
298
299	/*
300	 * It's the first time we're called for this raidz_map_t, so we need
301	 * to copy the data aside; there's no guarantee that our zio's buffer
302	 * won't be re-used for something else.
303	 *
304	 * Our parity data is already in separate buffers, so there's no need
305	 * to copy them.
306	 */
307
308	size = 0;
309	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
310		size += rm->rm_col[c].rc_size;
311
312	rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE);
313
314	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
315		raidz_col_t *col = &rm->rm_col[c];
316		abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
317		    col->rc_size);
318
319		ASSERT3S(tmp->abd_size, >=, col->rc_size);
320		ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
321		abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size);
322		abd_put(col->rc_abd);
323		col->rc_abd = tmp;
324
325		offset += col->rc_size;
326	}
327	ASSERT3U(offset, ==, size);
328}
329
330static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
331	vdev_raidz_map_free_vsd,
332	vdev_raidz_cksum_report
333};
334
335/*
336 * Divides the IO evenly across all child vdevs; usually, dcols is
337 * the number of children in the target vdev.
338 */
339raidz_map_t *
340vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
341    uint64_t nparity)
342{
343	raidz_map_t *rm;
344	/* The starting RAIDZ (parent) vdev sector of the block. */
345	uint64_t b = zio->io_offset >> ashift;
346	/* The zio's size in units of the vdev's minimum sector size. */
347	uint64_t s = zio->io_size >> ashift;
348	/* The first column for this stripe. */
349	uint64_t f = b % dcols;
350	/* The starting byte offset on each child vdev. */
351	uint64_t o = (b / dcols) << ashift;
352	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
353	uint64_t off = 0;
354
355	/*
356	 * "Quotient": The number of data sectors for this stripe on all but
357	 * the "big column" child vdevs that also contain "remainder" data.
358	 */
359	q = s / (dcols - nparity);
360
361	/*
362	 * "Remainder": The number of partial stripe data sectors in this I/O.
363	 * This will add a sector to some, but not all, child vdevs.
364	 */
365	r = s - q * (dcols - nparity);
366
367	/* The number of "big columns" - those which contain remainder data. */
368	bc = (r == 0 ? 0 : r + nparity);
369
370	/*
371	 * The total number of data and parity sectors associated with
372	 * this I/O.
373	 */
374	tot = s + nparity * (q + (r == 0 ? 0 : 1));
375
376	/* acols: The columns that will be accessed. */
377	/* scols: The columns that will be accessed or skipped. */
378	if (q == 0) {
379		/* Our I/O request doesn't span all child vdevs. */
380		acols = bc;
381		scols = MIN(dcols, roundup(bc, nparity + 1));
382	} else {
383		acols = dcols;
384		scols = dcols;
385	}
386
387	ASSERT3U(acols, <=, scols);
388
389	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
390
391	rm->rm_cols = acols;
392	rm->rm_scols = scols;
393	rm->rm_bigcols = bc;
394	rm->rm_skipstart = bc;
395	rm->rm_missingdata = 0;
396	rm->rm_missingparity = 0;
397	rm->rm_firstdatacol = nparity;
398	rm->rm_abd_copy = NULL;
399	rm->rm_reports = 0;
400	rm->rm_freed = 0;
401	rm->rm_ecksuminjected = 0;
402
403	asize = 0;
404
405	for (c = 0; c < scols; c++) {
406		col = f + c;
407		coff = o;
408		if (col >= dcols) {
409			col -= dcols;
410			coff += 1ULL << ashift;
411		}
412		rm->rm_col[c].rc_devidx = col;
413		rm->rm_col[c].rc_offset = coff;
414		rm->rm_col[c].rc_abd = NULL;
415		rm->rm_col[c].rc_gdata = NULL;
416		rm->rm_col[c].rc_error = 0;
417		rm->rm_col[c].rc_tried = 0;
418		rm->rm_col[c].rc_skipped = 0;
419
420		if (c >= acols)
421			rm->rm_col[c].rc_size = 0;
422		else if (c < bc)
423			rm->rm_col[c].rc_size = (q + 1) << ashift;
424		else
425			rm->rm_col[c].rc_size = q << ashift;
426
427		asize += rm->rm_col[c].rc_size;
428	}
429
430	ASSERT3U(asize, ==, tot << ashift);
431	rm->rm_asize = roundup(asize, (nparity + 1) << ashift);
432	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
433	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift);
434	ASSERT3U(rm->rm_nskip, <=, nparity);
435
436	for (c = 0; c < rm->rm_firstdatacol; c++)
437		rm->rm_col[c].rc_abd =
438		    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);
439
440	rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
441	    rm->rm_col[c].rc_size);
442	off = rm->rm_col[c].rc_size;
443
444	for (c = c + 1; c < acols; c++) {
445		rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
446		    rm->rm_col[c].rc_size);
447		off += rm->rm_col[c].rc_size;
448	}
449
450	/*
451	 * If all data stored spans all columns, there's a danger that parity
452	 * will always be on the same device and, since parity isn't read
453	 * during normal operation, that device's I/O bandwidth won't be
454	 * used effectively. We therefore switch the parity every 1MB.
455	 *
456	 * ... at least that was, ostensibly, the theory. As a practical
457	 * matter unless we juggle the parity between all devices evenly, we
458	 * won't see any benefit. Further, occasional writes that aren't a
459	 * multiple of the LCM of the number of children and the minimum
460	 * stripe width are sufficient to avoid pessimal behavior.
461	 * Unfortunately, this decision created an implicit on-disk format
462	 * requirement that we need to support for all eternity, but only
463	 * for single-parity RAID-Z.
464	 *
465	 * If we intend to skip a sector in the zeroth column for padding
466	 * we must make sure to note this swap. We will never intend to
467	 * skip the first column since at least one data and one parity
468	 * column must appear in each row.
469	 */
470	ASSERT(rm->rm_cols >= 2);
471	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
472
473	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
474		devidx = rm->rm_col[0].rc_devidx;
475		o = rm->rm_col[0].rc_offset;
476		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
477		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
478		rm->rm_col[1].rc_devidx = devidx;
479		rm->rm_col[1].rc_offset = o;
480
481		if (rm->rm_skipstart == 0)
482			rm->rm_skipstart = 1;
483	}
484
485	/* init RAIDZ parity ops */
486	rm->rm_ops = vdev_raidz_math_get_ops();
487
488	return (rm);
489}
490
491struct pqr_struct {
492	uint64_t *p;
493	uint64_t *q;
494	uint64_t *r;
495};
496
497static int
498vdev_raidz_p_func(void *buf, size_t size, void *private)
499{
500	struct pqr_struct *pqr = private;
501	const uint64_t *src = buf;
502	int i, cnt = size / sizeof (src[0]);
503
504	ASSERT(pqr->p && !pqr->q && !pqr->r);
505
506	for (i = 0; i < cnt; i++, src++, pqr->p++)
507		*pqr->p ^= *src;
508
509	return (0);
510}
511
512static int
513vdev_raidz_pq_func(void *buf, size_t size, void *private)
514{
515	struct pqr_struct *pqr = private;
516	const uint64_t *src = buf;
517	uint64_t mask;
518	int i, cnt = size / sizeof (src[0]);
519
520	ASSERT(pqr->p && pqr->q && !pqr->r);
521
522	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
523		*pqr->p ^= *src;
524		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
525		*pqr->q ^= *src;
526	}
527
528	return (0);
529}
530
531static int
532vdev_raidz_pqr_func(void *buf, size_t size, void *private)
533{
534	struct pqr_struct *pqr = private;
535	const uint64_t *src = buf;
536	uint64_t mask;
537	int i, cnt = size / sizeof (src[0]);
538
539	ASSERT(pqr->p && pqr->q && pqr->r);
540
541	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
542		*pqr->p ^= *src;
543		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
544		*pqr->q ^= *src;
545		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
546		*pqr->r ^= *src;
547	}
548
549	return (0);
550}
551
552static void
553vdev_raidz_generate_parity_p(raidz_map_t *rm)
554{
555	uint64_t *p;
556	int c;
557	abd_t *src;
558
559	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
560		src = rm->rm_col[c].rc_abd;
561		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
562
563		if (c == rm->rm_firstdatacol) {
564			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
565		} else {
566			struct pqr_struct pqr = { p, NULL, NULL };
567			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
568			    vdev_raidz_p_func, &pqr);
569		}
570	}
571}
572
573static void
574vdev_raidz_generate_parity_pq(raidz_map_t *rm)
575{
576	uint64_t *p, *q, pcnt, ccnt, mask, i;
577	int c;
578	abd_t *src;
579
580	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
581	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
582	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
583
584	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
585		src = rm->rm_col[c].rc_abd;
586		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
587		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
588
589		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
590
591		if (c == rm->rm_firstdatacol) {
592			ASSERT(ccnt == pcnt || ccnt == 0);
593
594			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
595			(void) memcpy(q, p, rm->rm_col[c].rc_size);
596			for (i = ccnt; i < pcnt; i++) {
597				p[i] = 0;
598				q[i] = 0;
599			}
600		} else {
601			struct pqr_struct pqr = { p, q, NULL };
602
603			ASSERT(ccnt <= pcnt);
604
605			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
606			    vdev_raidz_pq_func, &pqr);
607
608			/*
609			 * Treat short columns as though they are full of 0s.
610			 * Note that there's therefore nothing needed for P.
611			 */
612			for (i = ccnt; i < pcnt; i++) {
613				VDEV_RAIDZ_64MUL_2(q[i], mask);
614			}
615		}
616	}
617}
618
619static void
620vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
621{
622	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
623	int c;
624	abd_t *src;
625
626	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
627	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
628	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
629	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
630	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
631
632	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
633		src = rm->rm_col[c].rc_abd;
634		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
635		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
636		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
637
638		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
639
640		if (c == rm->rm_firstdatacol) {
641			ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size);
642			ASSERT(ccnt == pcnt || ccnt == 0);
643			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
644			(void) memcpy(q, p, rm->rm_col[c].rc_size);
645			(void) memcpy(r, p, rm->rm_col[c].rc_size);
646
647			for (i = ccnt; i < pcnt; i++) {
648				p[i] = 0;
649				q[i] = 0;
650				r[i] = 0;
651			}
652		} else {
653			struct pqr_struct pqr = { p, q, r };
654
655			ASSERT(ccnt <= pcnt);
656			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
657			    vdev_raidz_pqr_func, &pqr);
658
659			/*
660			 * Treat short columns as though they are full of 0s.
661			 * Note that there's therefore nothing needed for P.
662			 */
663			for (i = ccnt; i < pcnt; i++) {
664				VDEV_RAIDZ_64MUL_2(q[i], mask);
665				VDEV_RAIDZ_64MUL_4(r[i], mask);
666			}
667		}
668	}
669}
670
671/*
672 * Generate RAID parity in the first virtual columns according to the number of
673 * parity columns available.
674 */
675void
676vdev_raidz_generate_parity(raidz_map_t *rm)
677{
678	/* Generate using the new math implementation */
679	if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL)
680		return;
681
682	switch (rm->rm_firstdatacol) {
683	case 1:
684		vdev_raidz_generate_parity_p(rm);
685		break;
686	case 2:
687		vdev_raidz_generate_parity_pq(rm);
688		break;
689	case 3:
690		vdev_raidz_generate_parity_pqr(rm);
691		break;
692	default:
693		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
694	}
695}
696
697/* ARGSUSED */
698static int
699vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
700{
701	uint64_t *dst = dbuf;
702	uint64_t *src = sbuf;
703	int cnt = size / sizeof (src[0]);
704
705	for (int i = 0; i < cnt; i++) {
706		dst[i] ^= src[i];
707	}
708
709	return (0);
710}
711
712/* ARGSUSED */
713static int
714vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
715    void *private)
716{
717	uint64_t *dst = dbuf;
718	uint64_t *src = sbuf;
719	uint64_t mask;
720	int cnt = size / sizeof (dst[0]);
721
722	for (int i = 0; i < cnt; i++, dst++, src++) {
723		VDEV_RAIDZ_64MUL_2(*dst, mask);
724		*dst ^= *src;
725	}
726
727	return (0);
728}
729
730/* ARGSUSED */
731static int
732vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
733{
734	uint64_t *dst = buf;
735	uint64_t mask;
736	int cnt = size / sizeof (dst[0]);
737
738	for (int i = 0; i < cnt; i++, dst++) {
739		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
740		VDEV_RAIDZ_64MUL_2(*dst, mask);
741	}
742
743	return (0);
744}
745
746struct reconst_q_struct {
747	uint64_t *q;
748	int exp;
749};
750
751static int
752vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
753{
754	struct reconst_q_struct *rq = private;
755	uint64_t *dst = buf;
756	int cnt = size / sizeof (dst[0]);
757
758	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
759
760		*dst ^= *rq->q;
761		int j;
762		uint8_t *b;
763		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
764			*b = vdev_raidz_exp2(*b, rq->exp);
765		}
766	}
767
768	return (0);
769}
770
771struct reconst_pq_struct {
772	uint8_t *p;
773	uint8_t *q;
774	uint8_t *pxy;
775	uint8_t *qxy;
776	int aexp;
777	int bexp;
778};
779
780static int
781vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
782{
783	struct reconst_pq_struct *rpq = private;
784	uint8_t *xd = xbuf;
785	uint8_t *yd = ybuf;
786
787	for (int i = 0; i < size;
788	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
789		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
790		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
791		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
792	}
793
794	return (0);
795}
796
797static int
798vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
799{
800	struct reconst_pq_struct *rpq = private;
801	uint8_t *xd = xbuf;
802
803	for (int i = 0; i < size;
804	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
805		/* same operation as vdev_raidz_reconst_pq_func() on xd */
806		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
807		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
808	}
809
810	return (0);
811}
812
813static int
814vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
815{
816	int x = tgts[0];
817	int c;
818	abd_t *dst, *src;
819
820	ASSERT(ntgts == 1);
821	ASSERT(x >= rm->rm_firstdatacol);
822	ASSERT(x < rm->rm_cols);
823
824	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
825	ASSERT(rm->rm_col[x].rc_size > 0);
826
827	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
828	dst = rm->rm_col[x].rc_abd;
829
830	ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size);
831	ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size);
832	abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size);
833
834	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
835		uint64_t size = MIN(rm->rm_col[x].rc_size,
836		    rm->rm_col[c].rc_size);
837
838		src = rm->rm_col[c].rc_abd;
839		dst = rm->rm_col[x].rc_abd;
840
841		if (c == x)
842			continue;
843
844		(void) abd_iterate_func2(dst, src, 0, 0, size,
845		    vdev_raidz_reconst_p_func, NULL);
846	}
847
848	return (1 << VDEV_RAIDZ_P);
849}
850
851static int
852vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
853{
854	int x = tgts[0];
855	int c, exp;
856	abd_t *dst, *src;
857
858	ASSERT(ntgts == 1);
859
860	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
861
862	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
863		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
864		    rm->rm_col[c].rc_size);
865
866		src = rm->rm_col[c].rc_abd;
867		dst = rm->rm_col[x].rc_abd;
868
869		if (c == rm->rm_firstdatacol) {
870			if (dst != src) {
871				ASSERT3S(dst->abd_size, >=, size);
872				ASSERT3S(src->abd_size, >=, size);
873				abd_copy_off(dst, src, 0, 0, size);
874			}
875			if (rm->rm_col[x].rc_size > size)
876				abd_zero_off(dst, size,
877				    rm->rm_col[x].rc_size - size);
878		} else {
879			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
880			if (src != dst)
881				(void) abd_iterate_func2(dst, src, 0, 0, size,
882				    vdev_raidz_reconst_q_pre_func, NULL);
883			(void) abd_iterate_func(dst,
884			    size, rm->rm_col[x].rc_size - size,
885			    vdev_raidz_reconst_q_pre_tail_func, NULL);
886		}
887	}
888
889	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
890	dst = rm->rm_col[x].rc_abd;
891	exp = 255 - (rm->rm_cols - 1 - x);
892
893	struct reconst_q_struct rq = { abd_to_buf(src), exp };
894	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
895	    vdev_raidz_reconst_q_post_func, &rq);
896
897	return (1 << VDEV_RAIDZ_Q);
898}
899
900static int
901vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
902{
903	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
904	abd_t *pdata, *qdata;
905	uint64_t xsize, ysize;
906	int x = tgts[0];
907	int y = tgts[1];
908	abd_t *xd, *yd;
909
910	ASSERT(ntgts == 2);
911	ASSERT(x < y);
912	ASSERT(x >= rm->rm_firstdatacol);
913	ASSERT(y < rm->rm_cols);
914
915	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
916
917	/*
918	 * Move the parity data aside -- we're going to compute parity as
919	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
920	 * reuse the parity generation mechanism without trashing the actual
921	 * parity so we make those columns appear to be full of zeros by
922	 * setting their lengths to zero.
923	 */
924	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
925	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
926	xsize = rm->rm_col[x].rc_size;
927	ysize = rm->rm_col[y].rc_size;
928
929	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
930	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
931	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
932	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
933	rm->rm_col[x].rc_size = 0;
934	rm->rm_col[y].rc_size = 0;
935
936	vdev_raidz_generate_parity_pq(rm);
937
938	rm->rm_col[x].rc_size = xsize;
939	rm->rm_col[y].rc_size = ysize;
940
941	p = abd_to_buf(pdata);
942	q = abd_to_buf(qdata);
943	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
944	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
945	xd = rm->rm_col[x].rc_abd;
946	yd = rm->rm_col[y].rc_abd;
947
948	/*
949	 * We now have:
950	 *	Pxy = P + D_x + D_y
951	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
952	 *
953	 * We can then solve for D_x:
954	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
955	 * where
956	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
957	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
958	 *
959	 * With D_x in hand, we can easily solve for D_y:
960	 *	D_y = P + Pxy + D_x
961	 */
962
963	a = vdev_raidz_pow2[255 + x - y];
964	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
965	tmp = 255 - vdev_raidz_log2[a ^ 1];
966
967	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
968	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
969
970	ASSERT3U(xsize, >=, ysize);
971	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
972	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
973	    vdev_raidz_reconst_pq_func, &rpq);
974	(void) abd_iterate_func(xd, ysize, xsize - ysize,
975	    vdev_raidz_reconst_pq_tail_func, &rpq);
976
977	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
978	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
979
980	/*
981	 * Restore the saved parity data.
982	 */
983	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
984	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
985
986	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
987}
988
989/* BEGIN CSTYLED */
990/*
991 * In the general case of reconstruction, we must solve the system of linear
992 * equations defined by the coeffecients used to generate parity as well as
993 * the contents of the data and parity disks. This can be expressed with
994 * vectors for the original data (D) and the actual data (d) and parity (p)
995 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
996 *
997 *            __   __                     __     __
998 *            |     |         __     __   |  p_0  |
999 *            |  V  |         |  D_0  |   | p_m-1 |
1000 *            |     |    x    |   :   | = |  d_0  |
1001 *            |  I  |         | D_n-1 |   |   :   |
1002 *            |     |         ~~     ~~   | d_n-1 |
1003 *            ~~   ~~                     ~~     ~~
1004 *
1005 * I is simply a square identity matrix of size n, and V is a vandermonde
1006 * matrix defined by the coeffecients we chose for the various parity columns
1007 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1008 * computation as well as linear separability.
1009 *
1010 *      __               __               __     __
1011 *      |   1   ..  1 1 1 |               |  p_0  |
1012 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1013 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1014 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1015 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1016 *      |   :       : : : |   |   :   |   |  d_2  |
1017 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1018 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1019 *      |   0   ..  0 0 1 |               | d_n-1 |
1020 *      ~~               ~~               ~~     ~~
1021 *
1022 * Note that I, V, d, and p are known. To compute D, we must invert the
1023 * matrix and use the known data and parity values to reconstruct the unknown
1024 * data values. We begin by removing the rows in V|I and d|p that correspond
1025 * to failed or missing columns; we then make V|I square (n x n) and d|p
1026 * sized n by removing rows corresponding to unused parity from the bottom up
1027 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1028 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1029 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1030 *           __                               __
1031 *           |  1   1   1   1   1   1   1   1  |
1032 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1033 *           |  19 205 116  29  64  16  4   1  |      / /
1034 *           |  1   0   0   0   0   0   0   0  |     / /
1035 *           |  0   1   0   0   0   0   0   0  | <--' /
1036 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1037 *           |  0   0   0   1   0   0   0   0  |
1038 *           |  0   0   0   0   1   0   0   0  |
1039 *           |  0   0   0   0   0   1   0   0  |
1040 *           |  0   0   0   0   0   0   1   0  |
1041 *           |  0   0   0   0   0   0   0   1  |
1042 *           ~~                               ~~
1043 *           __                               __
1044 *           |  1   1   1   1   1   1   1   1  |
1045 *           | 128  64  32  16  8   4   2   1  |
1046 *           |  19 205 116  29  64  16  4   1  |
1047 *           |  1   0   0   0   0   0   0   0  |
1048 *           |  0   1   0   0   0   0   0   0  |
1049 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1050 *           |  0   0   0   1   0   0   0   0  |
1051 *           |  0   0   0   0   1   0   0   0  |
1052 *           |  0   0   0   0   0   1   0   0  |
1053 *           |  0   0   0   0   0   0   1   0  |
1054 *           |  0   0   0   0   0   0   0   1  |
1055 *           ~~                               ~~
1056 *
1057 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1058 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1059 * matrix is not singular.
1060 * __                                                                 __
1061 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1062 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1063 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1064 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1065 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1066 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1067 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1068 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1069 * ~~                                                                 ~~
1070 * __                                                                 __
1071 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1072 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1073 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1074 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1075 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1076 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1077 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1078 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1079 * ~~                                                                 ~~
1080 * __                                                                 __
1081 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1082 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1083 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1084 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1085 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1086 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1087 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1088 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1089 * ~~                                                                 ~~
1090 * __                                                                 __
1091 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1092 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1093 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1094 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1095 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1096 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1097 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1098 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1099 * ~~                                                                 ~~
1100 * __                                                                 __
1101 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1102 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1103 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1104 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1105 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1106 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1107 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1108 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1109 * ~~                                                                 ~~
1110 * __                                                                 __
1111 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1112 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1113 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1114 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1115 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1116 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1117 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1118 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1119 * ~~                                                                 ~~
1120 *                   __                               __
1121 *                   |  0   0   1   0   0   0   0   0  |
1122 *                   | 167 100  5   41 159 169 217 208 |
1123 *                   | 166 100  4   40 158 168 216 209 |
1124 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1125 *                   |  0   0   0   0   1   0   0   0  |
1126 *                   |  0   0   0   0   0   1   0   0  |
1127 *                   |  0   0   0   0   0   0   1   0  |
1128 *                   |  0   0   0   0   0   0   0   1  |
1129 *                   ~~                               ~~
1130 *
1131 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1132 * of the missing data.
1133 *
1134 * As is apparent from the example above, the only non-trivial rows in the
1135 * inverse matrix correspond to the data disks that we're trying to
1136 * reconstruct. Indeed, those are the only rows we need as the others would
1137 * only be useful for reconstructing data known or assumed to be valid. For
1138 * that reason, we only build the coefficients in the rows that correspond to
1139 * targeted columns.
1140 */
1141/* END CSTYLED */
1142
1143static void
1144vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1145    uint8_t **rows)
1146{
1147	int i, j;
1148	int pow;
1149
1150	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1151
1152	/*
1153	 * Fill in the missing rows of interest.
1154	 */
1155	for (i = 0; i < nmap; i++) {
1156		ASSERT3S(0, <=, map[i]);
1157		ASSERT3S(map[i], <=, 2);
1158
1159		pow = map[i] * n;
1160		if (pow > 255)
1161			pow -= 255;
1162		ASSERT(pow <= 255);
1163
1164		for (j = 0; j < n; j++) {
1165			pow -= map[i];
1166			if (pow < 0)
1167				pow += 255;
1168			rows[i][j] = vdev_raidz_pow2[pow];
1169		}
1170	}
1171}
1172
1173static void
1174vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1175    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1176{
1177	int i, j, ii, jj;
1178	uint8_t log;
1179
1180	/*
1181	 * Assert that the first nmissing entries from the array of used
1182	 * columns correspond to parity columns and that subsequent entries
1183	 * correspond to data columns.
1184	 */
1185	for (i = 0; i < nmissing; i++) {
1186		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1187	}
1188	for (; i < n; i++) {
1189		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1190	}
1191
1192	/*
1193	 * First initialize the storage where we'll compute the inverse rows.
1194	 */
1195	for (i = 0; i < nmissing; i++) {
1196		for (j = 0; j < n; j++) {
1197			invrows[i][j] = (i == j) ? 1 : 0;
1198		}
1199	}
1200
1201	/*
1202	 * Subtract all trivial rows from the rows of consequence.
1203	 */
1204	for (i = 0; i < nmissing; i++) {
1205		for (j = nmissing; j < n; j++) {
1206			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1207			jj = used[j] - rm->rm_firstdatacol;
1208			ASSERT3S(jj, <, n);
1209			invrows[i][j] = rows[i][jj];
1210			rows[i][jj] = 0;
1211		}
1212	}
1213
1214	/*
1215	 * For each of the rows of interest, we must normalize it and subtract
1216	 * a multiple of it from the other rows.
1217	 */
1218	for (i = 0; i < nmissing; i++) {
1219		for (j = 0; j < missing[i]; j++) {
1220			ASSERT0(rows[i][j]);
1221		}
1222		ASSERT3U(rows[i][missing[i]], !=, 0);
1223
1224		/*
1225		 * Compute the inverse of the first element and multiply each
1226		 * element in the row by that value.
1227		 */
1228		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1229
1230		for (j = 0; j < n; j++) {
1231			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1232			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1233		}
1234
1235		for (ii = 0; ii < nmissing; ii++) {
1236			if (i == ii)
1237				continue;
1238
1239			ASSERT3U(rows[ii][missing[i]], !=, 0);
1240
1241			log = vdev_raidz_log2[rows[ii][missing[i]]];
1242
1243			for (j = 0; j < n; j++) {
1244				rows[ii][j] ^=
1245				    vdev_raidz_exp2(rows[i][j], log);
1246				invrows[ii][j] ^=
1247				    vdev_raidz_exp2(invrows[i][j], log);
1248			}
1249		}
1250	}
1251
1252	/*
1253	 * Verify that the data that is left in the rows are properly part of
1254	 * an identity matrix.
1255	 */
1256	for (i = 0; i < nmissing; i++) {
1257		for (j = 0; j < n; j++) {
1258			if (j == missing[i]) {
1259				ASSERT3U(rows[i][j], ==, 1);
1260			} else {
1261				ASSERT0(rows[i][j]);
1262			}
1263		}
1264	}
1265}
1266
1267static void
1268vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1269    int *missing, uint8_t **invrows, const uint8_t *used)
1270{
1271	int i, j, x, cc, c;
1272	uint8_t *src;
1273	uint64_t ccount;
1274	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1275	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1276	uint8_t log = 0;
1277	uint8_t val;
1278	int ll;
1279	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1280	uint8_t *p, *pp;
1281	size_t psize;
1282
1283	psize = sizeof (invlog[0][0]) * n * nmissing;
1284	p = kmem_alloc(psize, KM_SLEEP);
1285
1286	for (pp = p, i = 0; i < nmissing; i++) {
1287		invlog[i] = pp;
1288		pp += n;
1289	}
1290
1291	for (i = 0; i < nmissing; i++) {
1292		for (j = 0; j < n; j++) {
1293			ASSERT3U(invrows[i][j], !=, 0);
1294			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1295		}
1296	}
1297
1298	for (i = 0; i < n; i++) {
1299		c = used[i];
1300		ASSERT3U(c, <, rm->rm_cols);
1301
1302		src = abd_to_buf(rm->rm_col[c].rc_abd);
1303		ccount = rm->rm_col[c].rc_size;
1304		for (j = 0; j < nmissing; j++) {
1305			cc = missing[j] + rm->rm_firstdatacol;
1306			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1307			ASSERT3U(cc, <, rm->rm_cols);
1308			ASSERT3U(cc, !=, c);
1309
1310			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1311			dcount[j] = rm->rm_col[cc].rc_size;
1312		}
1313
1314		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1315
1316		for (x = 0; x < ccount; x++, src++) {
1317			if (*src != 0)
1318				log = vdev_raidz_log2[*src];
1319
1320			for (cc = 0; cc < nmissing; cc++) {
1321				if (x >= dcount[cc])
1322					continue;
1323
1324				if (*src == 0) {
1325					val = 0;
1326				} else {
1327					if ((ll = log + invlog[cc][i]) >= 255)
1328						ll -= 255;
1329					val = vdev_raidz_pow2[ll];
1330				}
1331
1332				if (i == 0)
1333					dst[cc][x] = val;
1334				else
1335					dst[cc][x] ^= val;
1336			}
1337		}
1338	}
1339
1340	kmem_free(p, psize);
1341}
1342
1343static int
1344vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1345{
1346	int n, i, c, t, tt;
1347	int nmissing_rows;
1348	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1349	int parity_map[VDEV_RAIDZ_MAXPARITY];
1350
1351	uint8_t *p, *pp;
1352	size_t psize;
1353
1354	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1355	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1356	uint8_t *used;
1357
1358	abd_t **bufs = NULL;
1359
1360	int code = 0;
1361
1362	/*
1363	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1364	 * temporary linear ABDs.
1365	 */
1366	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1367		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1368
1369		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1370			raidz_col_t *col = &rm->rm_col[c];
1371
1372			bufs[c] = col->rc_abd;
1373			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1374			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
1375			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
1376			abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size);
1377		}
1378	}
1379
1380	n = rm->rm_cols - rm->rm_firstdatacol;
1381
1382	/*
1383	 * Figure out which data columns are missing.
1384	 */
1385	nmissing_rows = 0;
1386	for (t = 0; t < ntgts; t++) {
1387		if (tgts[t] >= rm->rm_firstdatacol) {
1388			missing_rows[nmissing_rows++] =
1389			    tgts[t] - rm->rm_firstdatacol;
1390		}
1391	}
1392
1393	/*
1394	 * Figure out which parity columns to use to help generate the missing
1395	 * data columns.
1396	 */
1397	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1398		ASSERT(tt < ntgts);
1399		ASSERT(c < rm->rm_firstdatacol);
1400
1401		/*
1402		 * Skip any targeted parity columns.
1403		 */
1404		if (c == tgts[tt]) {
1405			tt++;
1406			continue;
1407		}
1408
1409		code |= 1 << c;
1410
1411		parity_map[i] = c;
1412		i++;
1413	}
1414
1415	ASSERT(code != 0);
1416	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1417
1418	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1419	    nmissing_rows * n + sizeof (used[0]) * n;
1420	p = kmem_alloc(psize, KM_SLEEP);
1421
1422	for (pp = p, i = 0; i < nmissing_rows; i++) {
1423		rows[i] = pp;
1424		pp += n;
1425		invrows[i] = pp;
1426		pp += n;
1427	}
1428	used = pp;
1429
1430	for (i = 0; i < nmissing_rows; i++) {
1431		used[i] = parity_map[i];
1432	}
1433
1434	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1435		if (tt < nmissing_rows &&
1436		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1437			tt++;
1438			continue;
1439		}
1440
1441		ASSERT3S(i, <, n);
1442		used[i] = c;
1443		i++;
1444	}
1445
1446	/*
1447	 * Initialize the interesting rows of the matrix.
1448	 */
1449	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1450
1451	/*
1452	 * Invert the matrix.
1453	 */
1454	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1455	    invrows, used);
1456
1457	/*
1458	 * Reconstruct the missing data using the generated matrix.
1459	 */
1460	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1461	    invrows, used);
1462
1463	kmem_free(p, psize);
1464
1465	/*
1466	 * copy back from temporary linear abds and free them
1467	 */
1468	if (bufs) {
1469		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1470			raidz_col_t *col = &rm->rm_col[c];
1471
1472			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
1473			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
1474			abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size);
1475			abd_free(col->rc_abd);
1476			col->rc_abd = bufs[c];
1477		}
1478		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1479	}
1480
1481	return (code);
1482}
1483
1484int
1485vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
1486{
1487	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1488	int ntgts;
1489	int i, c, ret;
1490	int code;
1491	int nbadparity, nbaddata;
1492	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1493
1494	/*
1495	 * The tgts list must already be sorted.
1496	 */
1497	for (i = 1; i < nt; i++) {
1498		ASSERT(t[i] > t[i - 1]);
1499	}
1500
1501	nbadparity = rm->rm_firstdatacol;
1502	nbaddata = rm->rm_cols - nbadparity;
1503	ntgts = 0;
1504	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1505		if (c < rm->rm_firstdatacol)
1506			parity_valid[c] = B_FALSE;
1507
1508		if (i < nt && c == t[i]) {
1509			tgts[ntgts++] = c;
1510			i++;
1511		} else if (rm->rm_col[c].rc_error != 0) {
1512			tgts[ntgts++] = c;
1513		} else if (c >= rm->rm_firstdatacol) {
1514			nbaddata--;
1515		} else {
1516			parity_valid[c] = B_TRUE;
1517			nbadparity--;
1518		}
1519	}
1520
1521	ASSERT(ntgts >= nt);
1522	ASSERT(nbaddata >= 0);
1523	ASSERT(nbaddata + nbadparity == ntgts);
1524
1525	dt = &tgts[nbadparity];
1526
1527	/* Reconstruct using the new math implementation */
1528	ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
1529	if (ret != RAIDZ_ORIGINAL_IMPL)
1530		return (ret);
1531
1532	/*
1533	 * See if we can use any of our optimized reconstruction routines.
1534	 */
1535	switch (nbaddata) {
1536	case 1:
1537		if (parity_valid[VDEV_RAIDZ_P])
1538			return (vdev_raidz_reconstruct_p(rm, dt, 1));
1539
1540		ASSERT(rm->rm_firstdatacol > 1);
1541
1542		if (parity_valid[VDEV_RAIDZ_Q])
1543			return (vdev_raidz_reconstruct_q(rm, dt, 1));
1544
1545		ASSERT(rm->rm_firstdatacol > 2);
1546		break;
1547
1548	case 2:
1549		ASSERT(rm->rm_firstdatacol > 1);
1550
1551		if (parity_valid[VDEV_RAIDZ_P] &&
1552		    parity_valid[VDEV_RAIDZ_Q])
1553			return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1554
1555		ASSERT(rm->rm_firstdatacol > 2);
1556
1557		break;
1558	}
1559
1560	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1561	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1562	ASSERT(code > 0);
1563	return (code);
1564}
1565
1566static int
1567vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1568    uint64_t *ashift)
1569{
1570	vdev_t *cvd;
1571	uint64_t nparity = vd->vdev_nparity;
1572	int c;
1573	int lasterror = 0;
1574	int numerrors = 0;
1575
1576	ASSERT(nparity > 0);
1577
1578	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1579	    vd->vdev_children < nparity + 1) {
1580		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1581		return (SET_ERROR(EINVAL));
1582	}
1583
1584	vdev_open_children(vd);
1585
1586	for (c = 0; c < vd->vdev_children; c++) {
1587		cvd = vd->vdev_child[c];
1588
1589		if (cvd->vdev_open_error != 0) {
1590			lasterror = cvd->vdev_open_error;
1591			numerrors++;
1592			continue;
1593		}
1594
1595		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1596		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1597		*ashift = MAX(*ashift, cvd->vdev_ashift);
1598	}
1599
1600	*asize *= vd->vdev_children;
1601	*max_asize *= vd->vdev_children;
1602
1603	if (numerrors > nparity) {
1604		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1605		return (lasterror);
1606	}
1607
1608	return (0);
1609}
1610
1611static void
1612vdev_raidz_close(vdev_t *vd)
1613{
1614	int c;
1615
1616	for (c = 0; c < vd->vdev_children; c++)
1617		vdev_close(vd->vdev_child[c]);
1618}
1619
1620/*
1621 * Handle a read or write I/O to a RAID-Z dump device.
1622 *
1623 * The dump device is in a unique situation compared to other ZFS datasets:
1624 * writing to this device should be as simple and fast as possible.  In
1625 * addition, durability matters much less since the dump will be extracted
1626 * once the machine reboots.  For that reason, this function eschews parity for
1627 * performance and simplicity.  The dump device uses the checksum setting
1628 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1629 * dataset.
1630 *
1631 * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1632 * 128 KB will not fill an entire block; in addition, they may not be properly
1633 * aligned.  In that case, this function uses the preallocated 128 KB block and
1634 * omits reading or writing any "empty" portions of that block, as opposed to
1635 * allocating a fresh appropriately-sized block.
1636 *
1637 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1638 *
1639 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1640 *
1641 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1642 * allocated which spans all five child vdevs.  8 KB of data would be written to
1643 * each of four vdevs, with the fifth containing the parity bits.
1644 *
1645 *       parity    data     data     data     data
1646 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1647 *         ^        ^        ^        ^        ^
1648 *         |        |        |        |        |
1649 *   8 KB parity    ------8 KB data blocks------
1650 *
1651 * However, when writing to the dump device, the behavior is different:
1652 *
1653 *     vdev_raidz_dumpio(data, size: 32 KB, offset: 64 KB)
1654 *
1655 * Unlike the normal RAID-Z case in which the block is allocated based on the
1656 * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1657 * I/O size is less than 128 KB, only the actual portions of data are written.
1658 * In this example the data is written to the third data vdev since that vdev
1659 * contains the offset [64 KB, 96 KB).
1660 *
1661 *       parity    data     data     data     data
1662 *     |        |        |        |   XX   |        |
1663 *                                    ^
1664 *                                    |
1665 *                             32 KB data block
1666 *
1667 * As a result, an individual I/O may not span all child vdevs; moreover, a
1668 * small I/O may only operate on a single child vdev.
1669 *
1670 * Note that since there are no parity bits calculated or written, this format
1671 * remains the same no matter how many parity bits are used in a normal RAID-Z
1672 * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1673 * would look like:
1674 *
1675 *       parity   parity   parity    data     data     data     data
1676 *     |        |        |        |        |        |   XX   |        |
1677 *                                                      ^
1678 *                                                      |
1679 *                                               32 KB data block
1680 */
1681static int
1682vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size,
1683    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1684{
1685	vdev_t *tvd = vd->vdev_top;
1686	vdev_t *cvd;
1687	raidz_map_t *rm;
1688	raidz_col_t *rc;
1689	int c, err = 0;
1690
1691	uint64_t start, end, colstart, colend;
1692	uint64_t coloffset, colsize, colskip;
1693
1694#ifdef	_KERNEL
1695
1696	/*
1697	 * Don't write past the end of the block
1698	 */
1699	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1700
1701	start = offset;
1702	end = start + size;
1703
1704	/*
1705	 * Allocate a RAID-Z map for this block.  Note that this block starts
1706	 * from the "original" offset, this is, the offset of the extent which
1707	 * contains the requisite offset of the data being read or written.
1708	 *
1709	 * Even if this I/O operation doesn't span the full block size, let's
1710	 * treat the on-disk format as if the only blocks are the complete 128
1711	 * KB size.
1712	 */
1713
1714	/* First, fake a zio for vdev_raidz_map_alloc. */
1715	zio_t *zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
1716	zio->io_offset = origoffset;
1717	zio->io_size = SPA_OLD_MAXBLOCKSIZE;
1718	zio->io_abd = abd_get_from_buf(data - (offset - origoffset),
1719	    SPA_OLD_MAXBLOCKSIZE);
1720
1721	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1722	    vd->vdev_nparity);
1723
1724	coloffset = origoffset;
1725
1726	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1727	    c++, coloffset += rc->rc_size) {
1728		rc = &rm->rm_col[c];
1729		cvd = vd->vdev_child[rc->rc_devidx];
1730
1731		if (cvd->vdev_ops->vdev_op_dumpio == NULL) {
1732			err = EINVAL;
1733			break;
1734		}
1735
1736		/*
1737		 * Find the start and end of this column in the RAID-Z map,
1738		 * keeping in mind that the stated size and offset of the
1739		 * operation may not fill the entire column for this vdev.
1740		 *
1741		 * If any portion of the data spans this column, issue the
1742		 * appropriate operation to the vdev.
1743		 */
1744		if (coloffset + rc->rc_size <= start)
1745			continue;
1746		if (coloffset >= end)
1747			continue;
1748
1749		colstart = MAX(coloffset, start);
1750		colend = MIN(end, coloffset + rc->rc_size);
1751		colsize = colend - colstart;
1752		colskip = colstart - coloffset;
1753
1754		VERIFY3U(colsize, <=, rc->rc_size);
1755		VERIFY3U(colskip, <=, rc->rc_size);
1756
1757		/*
1758		 * Note that the child vdev will have a vdev label at the start
1759		 * of its range of offsets, hence the need for
1760		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1761		 * example of why this calculation is needed.
1762		 */
1763		if ((err = cvd->vdev_ops->vdev_op_dumpio(cvd,
1764		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1765		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 0,
1766		    doread, isdump)) != 0)
1767			break;
1768	}
1769
1770	vdev_raidz_map_free(rm);
1771	abd_put(zio->io_abd);
1772	kmem_free(zio, sizeof (zio_t));
1773
1774#endif	/* KERNEL */
1775
1776	return (err);
1777}
1778
1779static uint64_t
1780vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1781{
1782	uint64_t asize;
1783	uint64_t ashift = vd->vdev_top->vdev_ashift;
1784	uint64_t cols = vd->vdev_children;
1785	uint64_t nparity = vd->vdev_nparity;
1786
1787	asize = ((psize - 1) >> ashift) + 1;
1788	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1789	asize = roundup(asize, nparity + 1) << ashift;
1790
1791	return (asize);
1792}
1793
1794static void
1795vdev_raidz_child_done(zio_t *zio)
1796{
1797	raidz_col_t *rc = zio->io_private;
1798
1799	rc->rc_error = zio->io_error;
1800	rc->rc_tried = 1;
1801	rc->rc_skipped = 0;
1802}
1803
1804static void
1805vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1806{
1807#ifdef ZFS_DEBUG
1808	vdev_t *vd = zio->io_vd;
1809	vdev_t *tvd = vd->vdev_top;
1810
1811	range_seg64_t logical_rs, physical_rs;
1812	logical_rs.rs_start = zio->io_offset;
1813	logical_rs.rs_end = logical_rs.rs_start +
1814	    vdev_raidz_asize(zio->io_vd, zio->io_size);
1815
1816	raidz_col_t *rc = &rm->rm_col[col];
1817	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1818
1819	vdev_xlate(cvd, &logical_rs, &physical_rs);
1820	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1821	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1822	/*
1823	 * It would be nice to assert that rs_end is equal
1824	 * to rc_offset + rc_size but there might be an
1825	 * optional I/O at the end that is not accounted in
1826	 * rc_size.
1827	 */
1828	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1829		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1830		    rc->rc_size + (1 << tvd->vdev_ashift));
1831	} else {
1832		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1833	}
1834#endif
1835}
1836
1837/*
1838 * Start an IO operation on a RAIDZ VDev
1839 *
1840 * Outline:
1841 * - For write operations:
1842 *   1. Generate the parity data
1843 *   2. Create child zio write operations to each column's vdev, for both
1844 *      data and parity.
1845 *   3. If the column skips any sectors for padding, create optional dummy
1846 *      write zio children for those areas to improve aggregation continuity.
1847 * - For read operations:
1848 *   1. Create child zio read operations to each data column's vdev to read
1849 *      the range of data required for zio.
1850 *   2. If this is a scrub or resilver operation, or if any of the data
1851 *      vdevs have had errors, then create zio read operations to the parity
1852 *      columns' VDevs as well.
1853 */
1854static void
1855vdev_raidz_io_start(zio_t *zio)
1856{
1857	vdev_t *vd = zio->io_vd;
1858	vdev_t *tvd = vd->vdev_top;
1859	vdev_t *cvd;
1860	raidz_map_t *rm;
1861	raidz_col_t *rc;
1862	int c, i;
1863
1864	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1865	    vd->vdev_nparity);
1866
1867	zio->io_vsd = rm;
1868	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1869
1870	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1871
1872	if (zio->io_type == ZIO_TYPE_WRITE) {
1873		vdev_raidz_generate_parity(rm);
1874
1875		for (c = 0; c < rm->rm_cols; c++) {
1876			rc = &rm->rm_col[c];
1877			cvd = vd->vdev_child[rc->rc_devidx];
1878
1879			/*
1880			 * Verify physical to logical translation.
1881			 */
1882			vdev_raidz_io_verify(zio, rm, c);
1883
1884			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1885			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1886			    zio->io_type, zio->io_priority, 0,
1887			    vdev_raidz_child_done, rc));
1888		}
1889
1890		/*
1891		 * Generate optional I/Os for any skipped sectors to improve
1892		 * aggregation contiguity.
1893		 */
1894		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1895			ASSERT(c <= rm->rm_scols);
1896			if (c == rm->rm_scols)
1897				c = 0;
1898			rc = &rm->rm_col[c];
1899			cvd = vd->vdev_child[rc->rc_devidx];
1900			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1901			    rc->rc_offset + rc->rc_size, NULL,
1902			    1 << tvd->vdev_ashift,
1903			    zio->io_type, zio->io_priority,
1904			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1905		}
1906
1907		zio_execute(zio);
1908		return;
1909	}
1910
1911	ASSERT(zio->io_type == ZIO_TYPE_READ);
1912
1913	/*
1914	 * Iterate over the columns in reverse order so that we hit the parity
1915	 * last -- any errors along the way will force us to read the parity.
1916	 */
1917	for (c = rm->rm_cols - 1; c >= 0; c--) {
1918		rc = &rm->rm_col[c];
1919		cvd = vd->vdev_child[rc->rc_devidx];
1920		if (!vdev_readable(cvd)) {
1921			if (c >= rm->rm_firstdatacol)
1922				rm->rm_missingdata++;
1923			else
1924				rm->rm_missingparity++;
1925			rc->rc_error = SET_ERROR(ENXIO);
1926			rc->rc_tried = 1;	/* don't even try */
1927			rc->rc_skipped = 1;
1928			continue;
1929		}
1930		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1931			if (c >= rm->rm_firstdatacol)
1932				rm->rm_missingdata++;
1933			else
1934				rm->rm_missingparity++;
1935			rc->rc_error = SET_ERROR(ESTALE);
1936			rc->rc_skipped = 1;
1937			continue;
1938		}
1939		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1940		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1941			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1942			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1943			    zio->io_type, zio->io_priority, 0,
1944			    vdev_raidz_child_done, rc));
1945		}
1946	}
1947
1948	zio_execute(zio);
1949}
1950
1951
1952/*
1953 * Report a checksum error for a child of a RAID-Z device.
1954 */
1955static void
1956raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
1957{
1958	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1959
1960	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1961		zio_bad_cksum_t zbc;
1962		raidz_map_t *rm = zio->io_vsd;
1963
1964		mutex_enter(&vd->vdev_stat_lock);
1965		vd->vdev_stat.vs_checksum_errors++;
1966		mutex_exit(&vd->vdev_stat_lock);
1967
1968		zbc.zbc_has_cksum = 0;
1969		zbc.zbc_injected = rm->rm_ecksuminjected;
1970
1971		zfs_ereport_post_checksum(zio->io_spa, vd,
1972		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
1973		    rc->rc_abd, bad_data, &zbc);
1974	}
1975}
1976
1977/*
1978 * We keep track of whether or not there were any injected errors, so that
1979 * any ereports we generate can note it.
1980 */
1981static int
1982raidz_checksum_verify(zio_t *zio)
1983{
1984	zio_bad_cksum_t zbc;
1985	raidz_map_t *rm = zio->io_vsd;
1986
1987	int ret = zio_checksum_error(zio, &zbc);
1988	if (ret != 0 && zbc.zbc_injected != 0)
1989		rm->rm_ecksuminjected = 1;
1990
1991	return (ret);
1992}
1993
1994/*
1995 * Generate the parity from the data columns. If we tried and were able to
1996 * read the parity without error, verify that the generated parity matches the
1997 * data we read. If it doesn't, we fire off a checksum error. Return the
1998 * number such failures.
1999 */
2000static int
2001raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2002{
2003	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2004	int c, ret = 0;
2005	raidz_col_t *rc;
2006
2007	blkptr_t *bp = zio->io_bp;
2008	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2009	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2010
2011	if (checksum == ZIO_CHECKSUM_NOPARITY)
2012		return (ret);
2013
2014	for (c = 0; c < rm->rm_firstdatacol; c++) {
2015		rc = &rm->rm_col[c];
2016		if (!rc->rc_tried || rc->rc_error != 0)
2017			continue;
2018		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
2019		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
2020	}
2021
2022	vdev_raidz_generate_parity(rm);
2023
2024	for (c = 0; c < rm->rm_firstdatacol; c++) {
2025		rc = &rm->rm_col[c];
2026		if (!rc->rc_tried || rc->rc_error != 0)
2027			continue;
2028		if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) {
2029			raidz_checksum_error(zio, rc, orig[c]);
2030			rc->rc_error = SET_ERROR(ECKSUM);
2031			ret++;
2032		}
2033		abd_free(orig[c]);
2034	}
2035
2036	return (ret);
2037}
2038
2039static int
2040vdev_raidz_worst_error(raidz_map_t *rm)
2041{
2042	int error = 0;
2043
2044	for (int c = 0; c < rm->rm_cols; c++)
2045		error = zio_worst_error(error, rm->rm_col[c].rc_error);
2046
2047	return (error);
2048}
2049
2050/*
2051 * Iterate over all combinations of bad data and attempt a reconstruction.
2052 * Note that the algorithm below is non-optimal because it doesn't take into
2053 * account how reconstruction is actually performed. For example, with
2054 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2055 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2056 * cases we'd only use parity information in column 0.
2057 */
2058static int
2059vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2060{
2061	raidz_map_t *rm = zio->io_vsd;
2062	raidz_col_t *rc;
2063	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2064	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2065	int *tgts = &tstore[1];
2066	int current, next, i, c, n;
2067	int code, ret = 0;
2068
2069	ASSERT(total_errors < rm->rm_firstdatacol);
2070
2071	/*
2072	 * This simplifies one edge condition.
2073	 */
2074	tgts[-1] = -1;
2075
2076	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2077		/*
2078		 * Initialize the targets array by finding the first n columns
2079		 * that contain no error.
2080		 *
2081		 * If there were no data errors, we need to ensure that we're
2082		 * always explicitly attempting to reconstruct at least one
2083		 * data column. To do this, we simply push the highest target
2084		 * up into the data columns.
2085		 */
2086		for (c = 0, i = 0; i < n; i++) {
2087			if (i == n - 1 && data_errors == 0 &&
2088			    c < rm->rm_firstdatacol) {
2089				c = rm->rm_firstdatacol;
2090			}
2091
2092			while (rm->rm_col[c].rc_error != 0) {
2093				c++;
2094				ASSERT3S(c, <, rm->rm_cols);
2095			}
2096
2097			tgts[i] = c++;
2098		}
2099
2100		/*
2101		 * Setting tgts[n] simplifies the other edge condition.
2102		 */
2103		tgts[n] = rm->rm_cols;
2104
2105		/*
2106		 * These buffers were allocated in previous iterations.
2107		 */
2108		for (i = 0; i < n - 1; i++) {
2109			ASSERT(orig[i] != NULL);
2110		}
2111
2112		orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
2113		    rm->rm_col[0].rc_size);
2114
2115		current = 0;
2116		next = tgts[current];
2117
2118		while (current != n) {
2119			tgts[current] = next;
2120			current = 0;
2121
2122			/*
2123			 * Save off the original data that we're going to
2124			 * attempt to reconstruct.
2125			 */
2126			for (i = 0; i < n; i++) {
2127				ASSERT(orig[i] != NULL);
2128				c = tgts[i];
2129				ASSERT3S(c, >=, 0);
2130				ASSERT3S(c, <, rm->rm_cols);
2131				rc = &rm->rm_col[c];
2132				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
2133				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
2134				abd_copy_off(orig[i], rc->rc_abd, 0, 0,
2135				    rc->rc_size);
2136			}
2137
2138			/*
2139			 * Attempt a reconstruction and exit the outer loop on
2140			 * success.
2141			 */
2142			code = vdev_raidz_reconstruct(rm, tgts, n);
2143			if (raidz_checksum_verify(zio) == 0) {
2144
2145				for (i = 0; i < n; i++) {
2146					c = tgts[i];
2147					rc = &rm->rm_col[c];
2148					ASSERT(rc->rc_error == 0);
2149					if (rc->rc_tried)
2150						raidz_checksum_error(zio, rc,
2151						    orig[i]);
2152					rc->rc_error = SET_ERROR(ECKSUM);
2153				}
2154
2155				ret = code;
2156				goto done;
2157			}
2158
2159			/*
2160			 * Restore the original data.
2161			 */
2162			for (i = 0; i < n; i++) {
2163				c = tgts[i];
2164				rc = &rm->rm_col[c];
2165				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
2166				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
2167				abd_copy_off(rc->rc_abd, orig[i], 0, 0,
2168				    rc->rc_size);
2169			}
2170
2171			do {
2172				/*
2173				 * Find the next valid column after the current
2174				 * position..
2175				 */
2176				for (next = tgts[current] + 1;
2177				    next < rm->rm_cols &&
2178				    rm->rm_col[next].rc_error != 0; next++)
2179					continue;
2180
2181				ASSERT(next <= tgts[current + 1]);
2182
2183				/*
2184				 * If that spot is available, we're done here.
2185				 */
2186				if (next != tgts[current + 1])
2187					break;
2188
2189				/*
2190				 * Otherwise, find the next valid column after
2191				 * the previous position.
2192				 */
2193				for (c = tgts[current - 1] + 1;
2194				    rm->rm_col[c].rc_error != 0; c++)
2195					continue;
2196
2197				tgts[current] = c;
2198				current++;
2199
2200			} while (current != n);
2201		}
2202	}
2203	n--;
2204done:
2205	for (i = 0; i < n; i++)
2206		abd_free(orig[i]);
2207
2208	return (ret);
2209}
2210
2211/*
2212 * Complete an IO operation on a RAIDZ VDev
2213 *
2214 * Outline:
2215 * - For write operations:
2216 *   1. Check for errors on the child IOs.
2217 *   2. Return, setting an error code if too few child VDevs were written
2218 *      to reconstruct the data later.  Note that partial writes are
2219 *      considered successful if they can be reconstructed at all.
2220 * - For read operations:
2221 *   1. Check for errors on the child IOs.
2222 *   2. If data errors occurred:
2223 *      a. Try to reassemble the data from the parity available.
2224 *      b. If we haven't yet read the parity drives, read them now.
2225 *      c. If all parity drives have been read but the data still doesn't
2226 *         reassemble with a correct checksum, then try combinatorial
2227 *         reconstruction.
2228 *      d. If that doesn't work, return an error.
2229 *   3. If there were unexpected errors or this is a resilver operation,
2230 *      rewrite the vdevs that had errors.
2231 */
2232static void
2233vdev_raidz_io_done(zio_t *zio)
2234{
2235	vdev_t *vd = zio->io_vd;
2236	vdev_t *cvd;
2237	raidz_map_t *rm = zio->io_vsd;
2238	raidz_col_t *rc;
2239	int unexpected_errors = 0;
2240	int parity_errors = 0;
2241	int parity_untried = 0;
2242	int data_errors = 0;
2243	int total_errors = 0;
2244	int n, c;
2245	int tgts[VDEV_RAIDZ_MAXPARITY];
2246	int code;
2247
2248	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2249
2250	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2251	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2252
2253	for (c = 0; c < rm->rm_cols; c++) {
2254		rc = &rm->rm_col[c];
2255
2256		if (rc->rc_error) {
2257			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2258
2259			if (c < rm->rm_firstdatacol)
2260				parity_errors++;
2261			else
2262				data_errors++;
2263
2264			if (!rc->rc_skipped)
2265				unexpected_errors++;
2266
2267			total_errors++;
2268		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2269			parity_untried++;
2270		}
2271	}
2272
2273	if (zio->io_type == ZIO_TYPE_WRITE) {
2274		/*
2275		 * XXX -- for now, treat partial writes as a success.
2276		 * (If we couldn't write enough columns to reconstruct
2277		 * the data, the I/O failed.  Otherwise, good enough.)
2278		 *
2279		 * Now that we support write reallocation, it would be better
2280		 * to treat partial failure as real failure unless there are
2281		 * no non-degraded top-level vdevs left, and not update DTLs
2282		 * if we intend to reallocate.
2283		 */
2284		/* XXPOLICY */
2285		if (total_errors > rm->rm_firstdatacol)
2286			zio->io_error = vdev_raidz_worst_error(rm);
2287
2288		return;
2289	}
2290
2291	ASSERT(zio->io_type == ZIO_TYPE_READ);
2292	/*
2293	 * There are three potential phases for a read:
2294	 *	1. produce valid data from the columns read
2295	 *	2. read all disks and try again
2296	 *	3. perform combinatorial reconstruction
2297	 *
2298	 * Each phase is progressively both more expensive and less likely to
2299	 * occur. If we encounter more errors than we can repair or all phases
2300	 * fail, we have no choice but to return an error.
2301	 */
2302
2303	/*
2304	 * If the number of errors we saw was correctable -- less than or equal
2305	 * to the number of parity disks read -- attempt to produce data that
2306	 * has a valid checksum. Naturally, this case applies in the absence of
2307	 * any errors.
2308	 */
2309	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2310		if (data_errors == 0) {
2311			if (raidz_checksum_verify(zio) == 0) {
2312				/*
2313				 * If we read parity information (unnecessarily
2314				 * as it happens since no reconstruction was
2315				 * needed) regenerate and verify the parity.
2316				 * We also regenerate parity when resilvering
2317				 * so we can write it out to the failed device
2318				 * later.
2319				 */
2320				if (parity_errors + parity_untried <
2321				    rm->rm_firstdatacol ||
2322				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2323					n = raidz_parity_verify(zio, rm);
2324					unexpected_errors += n;
2325					ASSERT(parity_errors + n <=
2326					    rm->rm_firstdatacol);
2327				}
2328				goto done;
2329			}
2330		} else {
2331			/*
2332			 * We either attempt to read all the parity columns or
2333			 * none of them. If we didn't try to read parity, we
2334			 * wouldn't be here in the correctable case. There must
2335			 * also have been fewer parity errors than parity
2336			 * columns or, again, we wouldn't be in this code path.
2337			 */
2338			ASSERT(parity_untried == 0);
2339			ASSERT(parity_errors < rm->rm_firstdatacol);
2340
2341			/*
2342			 * Identify the data columns that reported an error.
2343			 */
2344			n = 0;
2345			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2346				rc = &rm->rm_col[c];
2347				if (rc->rc_error != 0) {
2348					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2349					tgts[n++] = c;
2350				}
2351			}
2352
2353			ASSERT(rm->rm_firstdatacol >= n);
2354
2355			code = vdev_raidz_reconstruct(rm, tgts, n);
2356
2357			if (raidz_checksum_verify(zio) == 0) {
2358				/*
2359				 * If we read more parity disks than were used
2360				 * for reconstruction, confirm that the other
2361				 * parity disks produced correct data. This
2362				 * routine is suboptimal in that it regenerates
2363				 * the parity that we already used in addition
2364				 * to the parity that we're attempting to
2365				 * verify, but this should be a relatively
2366				 * uncommon case, and can be optimized if it
2367				 * becomes a problem. Note that we regenerate
2368				 * parity when resilvering so we can write it
2369				 * out to failed devices later.
2370				 */
2371				if (parity_errors < rm->rm_firstdatacol - n ||
2372				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2373					n = raidz_parity_verify(zio, rm);
2374					unexpected_errors += n;
2375					ASSERT(parity_errors + n <=
2376					    rm->rm_firstdatacol);
2377				}
2378
2379				goto done;
2380			}
2381		}
2382	}
2383
2384	/*
2385	 * This isn't a typical situation -- either we got a read error or
2386	 * a child silently returned bad data. Read every block so we can
2387	 * try again with as much data and parity as we can track down. If
2388	 * we've already been through once before, all children will be marked
2389	 * as tried so we'll proceed to combinatorial reconstruction.
2390	 */
2391	unexpected_errors = 1;
2392	rm->rm_missingdata = 0;
2393	rm->rm_missingparity = 0;
2394
2395	for (c = 0; c < rm->rm_cols; c++) {
2396		if (rm->rm_col[c].rc_tried)
2397			continue;
2398
2399		zio_vdev_io_redone(zio);
2400		do {
2401			rc = &rm->rm_col[c];
2402			if (rc->rc_tried)
2403				continue;
2404			zio_nowait(zio_vdev_child_io(zio, NULL,
2405			    vd->vdev_child[rc->rc_devidx],
2406			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2407			    zio->io_type, zio->io_priority, 0,
2408			    vdev_raidz_child_done, rc));
2409		} while (++c < rm->rm_cols);
2410
2411		return;
2412	}
2413
2414	/*
2415	 * At this point we've attempted to reconstruct the data given the
2416	 * errors we detected, and we've attempted to read all columns. There
2417	 * must, therefore, be one or more additional problems -- silent errors
2418	 * resulting in invalid data rather than explicit I/O errors resulting
2419	 * in absent data. We check if there is enough additional data to
2420	 * possibly reconstruct the data and then perform combinatorial
2421	 * reconstruction over all possible combinations. If that fails,
2422	 * we're cooked.
2423	 */
2424	if (total_errors > rm->rm_firstdatacol) {
2425		zio->io_error = vdev_raidz_worst_error(rm);
2426
2427	} else if (total_errors < rm->rm_firstdatacol &&
2428	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2429		/*
2430		 * If we didn't use all the available parity for the
2431		 * combinatorial reconstruction, verify that the remaining
2432		 * parity is correct.
2433		 */
2434		if (code != (1 << rm->rm_firstdatacol) - 1)
2435			(void) raidz_parity_verify(zio, rm);
2436	} else {
2437		/*
2438		 * We're here because either:
2439		 *
2440		 *	total_errors == rm_first_datacol, or
2441		 *	vdev_raidz_combrec() failed
2442		 *
2443		 * In either case, there is enough bad data to prevent
2444		 * reconstruction.
2445		 *
2446		 * Start checksum ereports for all children which haven't
2447		 * failed, and the IO wasn't speculative.
2448		 */
2449		zio->io_error = SET_ERROR(ECKSUM);
2450
2451		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2452			for (c = 0; c < rm->rm_cols; c++) {
2453				rc = &rm->rm_col[c];
2454				if (rc->rc_error == 0) {
2455					zio_bad_cksum_t zbc;
2456					zbc.zbc_has_cksum = 0;
2457					zbc.zbc_injected =
2458					    rm->rm_ecksuminjected;
2459
2460					zfs_ereport_start_checksum(
2461					    zio->io_spa,
2462					    vd->vdev_child[rc->rc_devidx],
2463					    &zio->io_bookmark, zio,
2464					    rc->rc_offset, rc->rc_size,
2465					    (void *)(uintptr_t)c, &zbc);
2466				}
2467			}
2468		}
2469	}
2470
2471done:
2472	zio_checksum_verified(zio);
2473
2474	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2475	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2476		/*
2477		 * Use the good data we have in hand to repair damaged children.
2478		 */
2479		for (c = 0; c < rm->rm_cols; c++) {
2480			rc = &rm->rm_col[c];
2481			cvd = vd->vdev_child[rc->rc_devidx];
2482
2483			if (rc->rc_error == 0)
2484				continue;
2485
2486			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2487			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2488			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2489			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2490			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2491		}
2492	}
2493}
2494
2495static void
2496vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2497{
2498	if (faulted > vd->vdev_nparity)
2499		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2500		    VDEV_AUX_NO_REPLICAS);
2501	else if (degraded + faulted != 0)
2502		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2503	else
2504		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2505}
2506
2507/*
2508 * Determine if any portion of the provided block resides on a child vdev
2509 * with a dirty DTL and therefore needs to be resilvered.  The function
2510 * assumes that at least one DTL is dirty which implies that full stripe
2511 * width blocks must be resilvered.
2512 */
2513static boolean_t
2514vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2515{
2516	uint64_t dcols = vd->vdev_children;
2517	uint64_t nparity = vd->vdev_nparity;
2518	uint64_t ashift = vd->vdev_top->vdev_ashift;
2519	/* The starting RAIDZ (parent) vdev sector of the block. */
2520	uint64_t b = offset >> ashift;
2521	/* The zio's size in units of the vdev's minimum sector size. */
2522	uint64_t s = ((psize - 1) >> ashift) + 1;
2523	/* The first column for this stripe. */
2524	uint64_t f = b % dcols;
2525
2526	if (s + nparity >= dcols)
2527		return (B_TRUE);
2528
2529	for (uint64_t c = 0; c < s + nparity; c++) {
2530		uint64_t devidx = (f + c) % dcols;
2531		vdev_t *cvd = vd->vdev_child[devidx];
2532
2533		/*
2534		 * dsl_scan_need_resilver() already checked vd with
2535		 * vdev_dtl_contains(). So here just check cvd with
2536		 * vdev_dtl_empty(), cheaper and a good approximation.
2537		 */
2538		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2539			return (B_TRUE);
2540	}
2541
2542	return (B_FALSE);
2543}
2544
2545static void
2546vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
2547{
2548	vdev_t *raidvd = cvd->vdev_parent;
2549	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2550
2551	uint64_t width = raidvd->vdev_children;
2552	uint64_t tgt_col = cvd->vdev_id;
2553	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2554
2555	/* make sure the offsets are block-aligned */
2556	ASSERT0(in->rs_start % (1 << ashift));
2557	ASSERT0(in->rs_end % (1 << ashift));
2558	uint64_t b_start = in->rs_start >> ashift;
2559	uint64_t b_end = in->rs_end >> ashift;
2560
2561	uint64_t start_row = 0;
2562	if (b_start > tgt_col) /* avoid underflow */
2563		start_row = ((b_start - tgt_col - 1) / width) + 1;
2564
2565	uint64_t end_row = 0;
2566	if (b_end > tgt_col)
2567		end_row = ((b_end - tgt_col - 1) / width) + 1;
2568
2569	res->rs_start = start_row << ashift;
2570	res->rs_end = end_row << ashift;
2571
2572	ASSERT3U(res->rs_start, <=, in->rs_start);
2573	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
2574}
2575
2576vdev_ops_t vdev_raidz_ops = {
2577	.vdev_op_open = vdev_raidz_open,
2578	.vdev_op_close = vdev_raidz_close,
2579	.vdev_op_asize = vdev_raidz_asize,
2580	.vdev_op_io_start = vdev_raidz_io_start,
2581	.vdev_op_io_done = vdev_raidz_io_done,
2582	.vdev_op_state_change = vdev_raidz_state_change,
2583	.vdev_op_need_resilver = vdev_raidz_need_resilver,
2584	.vdev_op_hold = NULL,
2585	.vdev_op_rele = NULL,
2586	.vdev_op_remap = NULL,
2587	.vdev_op_xlate = vdev_raidz_xlate,
2588	.vdev_op_dumpio = vdev_raidz_dumpio,
2589	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2590	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
2591};
2592