1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
27 #include <sys/zio.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
30 #include <sys/vdev_raidz.h>
31 #include <sys/vdev_raidz_impl.h>
32 #include <sys/simd.h>
33 
#ifndef isspace
/*
 * Fallback whitespace test for environments that do not already provide
 * isspace().  Matches space, tab, newline, vertical tab, form feed and
 * carriage return.  The argument is evaluated more than once, so it must
 * not have side effects.
 */
#define	isspace(c)	((c) == ' ' || (c) == '\t' || (c) == '\n' || \
			(c) == '\v' || (c) == '\f' || (c) == '\r')
#endif
38 
39 extern boolean_t raidz_will_scalar_work(void);
40 
41 /* Opaque implementation with NULL methods to represent original methods */
42 static const raidz_impl_ops_t vdev_raidz_original_impl = {
43 	.name = "original",
44 	.is_supported = raidz_will_scalar_work,
45 };
46 
47 /* RAIDZ parity op that contain the fastest methods */
48 static raidz_impl_ops_t vdev_raidz_fastest_impl = {
49 	.name = "fastest"
50 };
51 
52 /* All compiled in implementations */
53 const raidz_impl_ops_t *raidz_all_maths[] = {
54 	&vdev_raidz_original_impl,
55 	&vdev_raidz_scalar_impl,
56 #if defined(__amd64)
57 	&vdev_raidz_sse2_impl,
58 	&vdev_raidz_ssse3_impl,
59 	&vdev_raidz_avx2_impl,
60 #endif
61 };
62 
63 /* Indicate that benchmark has been completed */
64 static boolean_t raidz_math_initialized = B_FALSE;
65 
66 /* Select raidz implementation */
67 #define	IMPL_FASTEST	(UINT32_MAX)
68 #define	IMPL_CYCLE	(UINT32_MAX - 1)
69 #define	IMPL_ORIGINAL	(0)
70 #define	IMPL_SCALAR	(1)
71 
72 #define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))
73 
74 static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
75 static uint32_t user_sel_impl = IMPL_FASTEST;
76 
77 /* Hold all supported implementations */
78 static size_t raidz_supp_impl_cnt = 0;
79 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
80 
#ifdef _KERNEL
/*
 * kstats values for supported implementations
 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
 *
 * The array has one slot more than there are compiled-in implementations:
 * benchmark_raidz_impl() uses the entry at index raidz_supp_impl_cnt to
 * record, per method, the index of the fastest implementation.
 *
 * PORTING NOTE:
 * On illumos this is not a kstat. OpenZFS uses their home-grown kstat code
 * which implements a free-form kstat using additional functionality that does
 * not exist in illumos. Because there are no software consumers of this
 * information, we omit a kstat API. If an administrator needs to see this
 * data for some reason, they can use mdb.
 *
 * The format of the kstat data on OpenZFS would be a "header" that looks like
 * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name"
 * arrays, starting with the parity function "implementation" name):
 *     impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr
 * This is followed by a row for each parity function implementation, showing
 * the "speed" values calculated for that implementation for each of the
 * parity generation and reconstruction functions in the "raidz_all_maths"
 * array.
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];

#endif /* _KERNEL */
105 
106 /*
107  * Returns the RAIDZ operations for raidz_map() parity calculations.   When
108  * a SIMD implementation is not allowed in the current context, then fallback
109  * to the fastest generic implementation.
110  */
111 const raidz_impl_ops_t *
vdev_raidz_math_get_ops(void)112 vdev_raidz_math_get_ops(void)
113 {
114 	if (!kfpu_allowed())
115 		return (&vdev_raidz_scalar_impl);
116 
117 	raidz_impl_ops_t *ops = NULL;
118 	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
119 
120 	switch (impl) {
121 	case IMPL_FASTEST:
122 		ASSERT(raidz_math_initialized);
123 		ops = &vdev_raidz_fastest_impl;
124 		break;
125 	case IMPL_CYCLE:
126 		/* Cycle through all supported implementations */
127 		ASSERT(raidz_math_initialized);
128 		ASSERT3U(raidz_supp_impl_cnt, >, 0);
129 		static size_t cycle_impl_idx = 0;
130 		size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
131 		ops = raidz_supp_impl[idx];
132 		break;
133 	case IMPL_ORIGINAL:
134 		ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
135 		break;
136 	case IMPL_SCALAR:
137 		ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
138 		break;
139 	default:
140 		ASSERT3U(impl, <, raidz_supp_impl_cnt);
141 		ASSERT3U(raidz_supp_impl_cnt, >, 0);
142 		if (impl < ARRAY_SIZE(raidz_all_maths))
143 			ops = raidz_supp_impl[impl];
144 		break;
145 	}
146 
147 	ASSERT3P(ops, !=, NULL);
148 
149 	return (ops);
150 }
151 
152 /*
153  * Select parity generation method for raidz_map
154  */
155 int
vdev_raidz_math_generate(raidz_map_t * rm)156 vdev_raidz_math_generate(raidz_map_t *rm)
157 {
158 	raidz_gen_f gen_parity = NULL;
159 
160 	switch (raidz_parity(rm)) {
161 		case 1:
162 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
163 			break;
164 		case 2:
165 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
166 			break;
167 		case 3:
168 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
169 			break;
170 		default:
171 			gen_parity = NULL;
172 			cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
173 			    (uint_t)raidz_parity(rm));
174 			break;
175 	}
176 
177 	/* if method is NULL execute the original implementation */
178 	if (gen_parity == NULL)
179 		return (RAIDZ_ORIGINAL_IMPL);
180 
181 	gen_parity(rm);
182 
183 	return (0);
184 }
185 
186 static raidz_rec_f
reconstruct_fun_p_sel(raidz_map_t * rm,const int * parity_valid,const int nbaddata)187 reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
188     const int nbaddata)
189 {
190 	if (nbaddata == 1 && parity_valid[CODE_P]) {
191 		return (rm->rm_ops->rec[RAIDZ_REC_P]);
192 	}
193 	return ((raidz_rec_f) NULL);
194 }
195 
196 static raidz_rec_f
reconstruct_fun_pq_sel(raidz_map_t * rm,const int * parity_valid,const int nbaddata)197 reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
198     const int nbaddata)
199 {
200 	if (nbaddata == 1) {
201 		if (parity_valid[CODE_P]) {
202 			return (rm->rm_ops->rec[RAIDZ_REC_P]);
203 		} else if (parity_valid[CODE_Q]) {
204 			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
205 		}
206 	} else if (nbaddata == 2 &&
207 	    parity_valid[CODE_P] && parity_valid[CODE_Q]) {
208 		return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
209 	}
210 	return ((raidz_rec_f) NULL);
211 }
212 
213 static raidz_rec_f
reconstruct_fun_pqr_sel(raidz_map_t * rm,const int * parity_valid,const int nbaddata)214 reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
215     const int nbaddata)
216 {
217 	if (nbaddata == 1) {
218 		if (parity_valid[CODE_P]) {
219 			return (rm->rm_ops->rec[RAIDZ_REC_P]);
220 		} else if (parity_valid[CODE_Q]) {
221 			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
222 		} else if (parity_valid[CODE_R]) {
223 			return (rm->rm_ops->rec[RAIDZ_REC_R]);
224 		}
225 	} else if (nbaddata == 2) {
226 		if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
227 			return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
228 		} else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
229 			return (rm->rm_ops->rec[RAIDZ_REC_PR]);
230 		} else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
231 			return (rm->rm_ops->rec[RAIDZ_REC_QR]);
232 		}
233 	} else if (nbaddata == 3 &&
234 	    parity_valid[CODE_P] && parity_valid[CODE_Q] &&
235 	    parity_valid[CODE_R]) {
236 		return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
237 	}
238 	return ((raidz_rec_f) NULL);
239 }
240 
241 /*
242  * Select data reconstruction method for raidz_map
243  * @parity_valid - Parity validity flag
244  * @dt           - Failed data index array
245  * @nbaddata     - Number of failed data columns
246  */
247 int
vdev_raidz_math_reconstruct(raidz_map_t * rm,const int * parity_valid,const int * dt,const int nbaddata)248 vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
249     const int *dt, const int nbaddata)
250 {
251 	raidz_rec_f rec_fn = NULL;
252 
253 	switch (raidz_parity(rm)) {
254 	case PARITY_P:
255 		rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
256 		break;
257 	case PARITY_PQ:
258 		rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
259 		break;
260 	case PARITY_PQR:
261 		rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
262 		break;
263 	default:
264 		cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
265 		    (uint_t)raidz_parity(rm));
266 		break;
267 	}
268 
269 	if (rec_fn == NULL)
270 		return (RAIDZ_ORIGINAL_IMPL);
271 	else
272 		return (rec_fn(rm, dt));
273 }
274 
/* Names of the parity generation methods */
const char *raidz_gen_name[] = {
	"gen_p",
	"gen_pq",
	"gen_pqr"
};

/* Names of the data reconstruction methods */
const char *raidz_rec_name[] = {
	"rec_p",
	"rec_q",
	"rec_r",
	"rec_pq",
	"rec_pr",
	"rec_qr",
	"rec_pqr"
};
282 
283 #if defined(_KERNEL)
284 
285 #define	BENCH_D_COLS	(8ULL)
286 #define	BENCH_COLS	(BENCH_D_COLS + PARITY_PQR)
287 #define	BENCH_ZIO_SIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)	/* 128 kiB */
288 #define	BENCH_NS	MSEC2NSEC(1)			/* 1ms */
289 
290 typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
291 
292 static void
benchmark_gen_impl(raidz_map_t * rm,const int fn)293 benchmark_gen_impl(raidz_map_t *rm, const int fn)
294 {
295 	(void) fn;
296 	vdev_raidz_generate_parity(rm);
297 }
298 
299 static void
benchmark_rec_impl(raidz_map_t * rm,const int fn)300 benchmark_rec_impl(raidz_map_t *rm, const int fn)
301 {
302 	static const int rec_tgt[7][3] = {
303 		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
304 		{0, 2, 3},	/* rec_q:   bad PR & D[0]	*/
305 		{0, 1, 3},	/* rec_r:   bad PQ & D[0]	*/
306 		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1]	*/
307 		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1]	*/
308 		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1]	*/
309 		{3, 4, 5}	/* rec_pqr: bad    & D[0][1][2] */
310 	};
311 
312 	vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
313 }
314 
315 /*
316  * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
317  * is performed by setting the rm_ops pointer and calling the top level
318  * generate/reconstruct methods of bench_rm.
319  */
320 static void
benchmark_raidz_impl(raidz_map_t * bench_rm,const int fn,benchmark_fn bench_fn)321 benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
322 {
323 	uint64_t run_cnt, speed, best_speed = 0;
324 	hrtime_t t_start, t_diff;
325 	raidz_impl_ops_t *curr_impl;
326 	raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
327 	int impl, i;
328 
329 	for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
330 		/* set an implementation to benchmark */
331 		curr_impl = raidz_supp_impl[impl];
332 		bench_rm->rm_ops = curr_impl;
333 
334 		run_cnt = 0;
335 		t_start = gethrtime();
336 
337 		do {
338 			for (i = 0; i < 5; i++, run_cnt++)
339 				bench_fn(bench_rm, fn);
340 
341 			t_diff = gethrtime() - t_start;
342 		} while (t_diff < BENCH_NS);
343 
344 		speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
345 		speed /= (t_diff * BENCH_COLS);
346 
347 		if (bench_fn == benchmark_gen_impl)
348 			raidz_impl_kstats[impl].gen[fn] = speed;
349 		else
350 			raidz_impl_kstats[impl].rec[fn] = speed;
351 
352 		/* Update fastest implementation method */
353 		if (speed > best_speed) {
354 			best_speed = speed;
355 
356 			if (bench_fn == benchmark_gen_impl) {
357 				fstat->gen[fn] = impl;
358 				vdev_raidz_fastest_impl.gen[fn] =
359 				    curr_impl->gen[fn];
360 			} else {
361 				fstat->rec[fn] = impl;
362 				vdev_raidz_fastest_impl.rec[fn] =
363 				    curr_impl->rec[fn];
364 			}
365 		}
366 	}
367 }
368 #endif
369 
370 /*
371  * Initialize and benchmark all supported implementations.
372  */
373 static void
benchmark_raidz(void)374 benchmark_raidz(void)
375 {
376 	raidz_impl_ops_t *curr_impl;
377 	int i, c;
378 
379 	/* Move supported impl into raidz_supp_impl */
380 	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
381 		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
382 
383 		if (curr_impl->init)
384 			curr_impl->init();
385 
386 		if (curr_impl->is_supported())
387 			raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
388 	}
389 	membar_producer();		/* complete raidz_supp_impl[] init */
390 	raidz_supp_impl_cnt = c;	/* number of supported impl */
391 
392 #if defined(_KERNEL)
393 	zio_t *bench_zio = NULL;
394 	raidz_map_t *bench_rm = NULL;
395 	uint64_t bench_parity;
396 
397 	/* Fake a zio and run the benchmark on a warmed up buffer */
398 	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
399 	bench_zio->io_offset = 0;
400 	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
401 	bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
402 	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
403 
404 	/* Benchmark parity generation methods */
405 	for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
406 		bench_parity = fn + 1;
407 		/* New raidz_map is needed for each generate_p/q/r */
408 		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
409 		    BENCH_D_COLS + bench_parity, bench_parity);
410 
411 		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
412 
413 		vdev_raidz_map_free(bench_rm);
414 	}
415 
416 	/* Benchmark data reconstruction methods */
417 	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
418 	    BENCH_COLS, PARITY_PQR);
419 
420 	for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
421 		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
422 
423 	vdev_raidz_map_free(bench_rm);
424 
425 	/* cleanup the bench zio */
426 	abd_free(bench_zio->io_abd);
427 	kmem_free(bench_zio, sizeof (zio_t));
428 #else
429 	/*
430 	 * Skip the benchmark in user space to avoid impacting libzpool
431 	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
432 	 * is assumed to be the fastest and used by default.
433 	 */
434 	memcpy(&vdev_raidz_fastest_impl,
435 	    raidz_supp_impl[raidz_supp_impl_cnt - 1],
436 	    sizeof (vdev_raidz_fastest_impl));
437 	strcpy(vdev_raidz_fastest_impl.name, "fastest");
438 #endif /* _KERNEL */
439 }
440 
441 void
vdev_raidz_math_init(void)442 vdev_raidz_math_init(void)
443 {
444 	/* Determine the fastest available implementation. */
445 	benchmark_raidz();
446 
447 	/* Finish initialization */
448 	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
449 	raidz_math_initialized = B_TRUE;
450 }
451 
452 void
vdev_raidz_math_fini(void)453 vdev_raidz_math_fini(void)
454 {
455 	raidz_impl_ops_t const *curr_impl;
456 
457 	for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
458 		curr_impl = raidz_all_maths[i];
459 		if (curr_impl->fini)
460 			curr_impl->fini();
461 	}
462 }
463 
464 static const struct {
465 	char *name;
466 	uint32_t sel;
467 } math_impl_opts[] = {
468 		{ "cycle",	IMPL_CYCLE },
469 		{ "fastest",	IMPL_FASTEST },
470 		{ "original",	IMPL_ORIGINAL },
471 		{ "scalar",	IMPL_SCALAR }
472 };
473 
474 /*
475  * Function sets desired raidz implementation.
476  *
477  * If we are called before init(), user preference will be saved in
478  * user_sel_impl, and applied in later init() call. This occurs when module
479  * parameter is specified on module load. Otherwise, directly update
480  * zfs_vdev_raidz_impl.
481  *
482  * @val		Name of raidz implementation to use
483  * @param	Unused.
484  */
485 int
vdev_raidz_impl_set(const char * val)486 vdev_raidz_impl_set(const char *val)
487 {
488 	int err = EINVAL;
489 	char req_name[RAIDZ_IMPL_NAME_MAX];
490 	uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
491 	size_t i;
492 
493 	/* sanitize input */
494 	i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
495 	if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
496 		return (err);
497 
498 	strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
499 	while (i > 0 && !!isspace(req_name[i-1]))
500 		i--;
501 	req_name[i] = '\0';
502 
503 	/* Check mandatory options */
504 	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
505 		if (strcmp(req_name, math_impl_opts[i].name) == 0) {
506 			impl = math_impl_opts[i].sel;
507 			err = 0;
508 			break;
509 		}
510 	}
511 
512 	/* check all supported impl if init() was already called */
513 	if (err != 0 && raidz_math_initialized) {
514 		/* check all supported implementations */
515 		for (i = 0; i < raidz_supp_impl_cnt; i++) {
516 			if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
517 				impl = i;
518 				err = 0;
519 				break;
520 			}
521 		}
522 	}
523 
524 	if (err == 0) {
525 		if (raidz_math_initialized)
526 			atomic_swap_32(&zfs_vdev_raidz_impl, impl);
527 		else
528 			atomic_swap_32(&user_sel_impl, impl);
529 	}
530 
531 	return (err);
532 }
533 
534 #if defined(_KERNEL) && defined(__linux__)
535 
536 static int
zfs_vdev_raidz_impl_set(const char * val,zfs_kernel_param_t * kp)537 zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
538 {
539 	return (vdev_raidz_impl_set(val));
540 }
541 
542 static int
zfs_vdev_raidz_impl_get(char * buffer,zfs_kernel_param_t * kp)543 zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
544 {
545 	int i, cnt = 0;
546 	char *fmt;
547 	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
548 
549 	ASSERT(raidz_math_initialized);
550 
551 	/* list mandatory options */
552 	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
553 		fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
554 		cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
555 	}
556 
557 	/* list all supported implementations */
558 	for (i = 0; i < raidz_supp_impl_cnt; i++) {
559 		fmt = (i == impl) ? "[%s] " : "%s ";
560 		cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
561 	}
562 
563 	return (cnt);
564 }
565 
566 module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
567     zfs_vdev_raidz_impl_get, NULL, 0644);
568 MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
569 #endif
570