1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
23  */
24 
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
27 #include <sys/zio.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
30 #include <sys/vdev_raidz.h>
31 #include <sys/vdev_raidz_impl.h>
32 #include <sys/simd.h>
33 
/*
 * Fallback isspace() for contexts where <ctype.h> is unavailable
 * (e.g. kernel builds); matches the classic ASCII whitespace set.
 */
#ifndef isspace
#define	isspace(c)	((c) == ' ' || (c) == '\t' || (c) == '\n' || \
			(c) == '\r' || (c) == '\f' || (c) == '\013')
#endif

/*
 * Reports whether the scalar implementation can run here.
 * NOTE(review): presumably defined in the scalar math source file —
 * confirm against the build.
 */
extern boolean_t raidz_will_scalar_work(void);
40 
41 /* Opaque implementation with NULL methods to represent original methods */
/*
 * Opaque implementation with NULL methods to represent original methods.
 * All gen/rec slots stay NULL, so vdev_raidz_math_generate()/_reconstruct()
 * fall back to returning RAIDZ_ORIGINAL_IMPL for every operation.
 */
static const raidz_impl_ops_t vdev_raidz_original_impl = {
	.name = "original",
	.is_supported = raidz_will_scalar_work,
};
46 
47 /* RAIDZ parity op that contain the fastest methods */
/*
 * RAIDZ parity op that contains the fastest methods.  Each gen/rec slot is
 * filled in at init time (benchmark winner in kernel builds, the last
 * supported implementation in user-space builds), so this must stay
 * writable (not const).
 */
static raidz_impl_ops_t vdev_raidz_fastest_impl = {
	.name = "fastest"
};
51 
52 /* All compiled in implementations */
53 const raidz_impl_ops_t *raidz_all_maths[] = {
54 	&vdev_raidz_original_impl,
55 	&vdev_raidz_scalar_impl,
56 };
57 
/* Indicate that benchmark has been completed (set by vdev_raidz_math_init) */
static boolean_t raidz_math_initialized = B_FALSE;

/*
 * Select raidz implementation.  The two meta-selections live at the top of
 * the uint32_t range so any smaller value can double as a direct index into
 * raidz_supp_impl[] (see the default case in vdev_raidz_math_get_ops()).
 */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_ORIGINAL	(0)
#define	IMPL_SCALAR	(1)

/* Force a fresh load of the selection variable on every read */
#define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))

/* Active selection; scalar until init() applies the user's choice */
static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
/* Selection requested before init() ran (e.g. module parameter) */
static uint32_t user_sel_impl = IMPL_FASTEST;

/* Hold all supported implementations (populated by benchmark_raidz()) */
static size_t raidz_supp_impl_cnt = 0;
static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
75 
76 #if defined(_KERNEL)
77 /*
78  * kstats values for supported implementations
79  * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
80  *
81  * PORTING NOTE:
82  * On illumos this is not a kstat. OpenZFS uses their home-grown kstat code
83  * which implements a free-form kstat using additional functionality that does
84  * not exist in illumos. Because there are no software consumers of this
85  * information, we omit a kstat API. If an administrator needs to see this
86  * data for some reason, they can use mdb.
87  *
88  * The format of the kstat data on OpenZFS would be a "header" that looks like
89  * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name"
90  * arrays, starting with the parity function "implementation" name):
91  *     impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr
92  * This is followed by a row for each parity function implementation, showing
93  * the "speed" values calculated for that implementation for each of the
94  * parity generation and reconstruction functions in the "raidz_all_maths"
95  * array.
96  */
97 static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
98 
99 #endif
100 
101 /*
102  * Returns the RAIDZ operations for raidz_map() parity calculations.   When
103  * a SIMD implementation is not allowed in the current context, then fallback
104  * to the fastest generic implementation.
105  */
106 const raidz_impl_ops_t *
107 vdev_raidz_math_get_ops(void)
108 {
109 	/*
110 	 * illumos porting note:
111 	 * The following check from OpenZFS is disabled since we don't have
112 	 * this compiled in yet and we need to be able to change the
113 	 * implementation for the user-level test suite.
114 	 *
115 	 * if (!kfpu_allowed())
116 	 *	return (&vdev_raidz_scalar_impl);
117 	 */
118 
119 	raidz_impl_ops_t *ops = NULL;
120 	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
121 
122 	switch (impl) {
123 	case IMPL_FASTEST:
124 		ASSERT(raidz_math_initialized);
125 		ops = &vdev_raidz_fastest_impl;
126 		break;
127 	case IMPL_CYCLE:
128 		/* Cycle through all supported implementations */
129 		ASSERT(raidz_math_initialized);
130 		ASSERT3U(raidz_supp_impl_cnt, >, 0);
131 		static size_t cycle_impl_idx = 0;
132 		size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
133 		ops = raidz_supp_impl[idx];
134 		break;
135 	case IMPL_ORIGINAL:
136 		ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
137 		break;
138 	case IMPL_SCALAR:
139 		ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
140 		break;
141 	default:
142 		ASSERT3U(impl, <, raidz_supp_impl_cnt);
143 		ASSERT3U(raidz_supp_impl_cnt, >, 0);
144 		if (impl < ARRAY_SIZE(raidz_all_maths))
145 			ops = raidz_supp_impl[impl];
146 		break;
147 	}
148 
149 	ASSERT3P(ops, !=, NULL);
150 
151 	return (ops);
152 }
153 
154 /*
155  * Select parity generation method for raidz_map
156  */
157 int
158 vdev_raidz_math_generate(raidz_map_t *rm)
159 {
160 	raidz_gen_f gen_parity = NULL;
161 
162 	switch (raidz_parity(rm)) {
163 		case 1:
164 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
165 			break;
166 		case 2:
167 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
168 			break;
169 		case 3:
170 			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
171 			break;
172 		default:
173 			gen_parity = NULL;
174 			cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
175 			    (uint_t)raidz_parity(rm));
176 			break;
177 	}
178 
179 	/* if method is NULL execute the original implementation */
180 	if (gen_parity == NULL)
181 		return (RAIDZ_ORIGINAL_IMPL);
182 
183 	gen_parity(rm);
184 
185 	return (0);
186 }
187 
188 static raidz_rec_f
189 reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
190     const int nbaddata)
191 {
192 	if (nbaddata == 1 && parity_valid[CODE_P]) {
193 		return (rm->rm_ops->rec[RAIDZ_REC_P]);
194 	}
195 	return ((raidz_rec_f) NULL);
196 }
197 
198 static raidz_rec_f
199 reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
200     const int nbaddata)
201 {
202 	if (nbaddata == 1) {
203 		if (parity_valid[CODE_P]) {
204 			return (rm->rm_ops->rec[RAIDZ_REC_P]);
205 		} else if (parity_valid[CODE_Q]) {
206 			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
207 		}
208 	} else if (nbaddata == 2 &&
209 	    parity_valid[CODE_P] && parity_valid[CODE_Q]) {
210 		return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
211 	}
212 	return ((raidz_rec_f) NULL);
213 }
214 
215 static raidz_rec_f
216 reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
217     const int nbaddata)
218 {
219 	if (nbaddata == 1) {
220 		if (parity_valid[CODE_P]) {
221 			return (rm->rm_ops->rec[RAIDZ_REC_P]);
222 		} else if (parity_valid[CODE_Q]) {
223 			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
224 		} else if (parity_valid[CODE_R]) {
225 			return (rm->rm_ops->rec[RAIDZ_REC_R]);
226 		}
227 	} else if (nbaddata == 2) {
228 		if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
229 			return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
230 		} else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
231 			return (rm->rm_ops->rec[RAIDZ_REC_PR]);
232 		} else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
233 			return (rm->rm_ops->rec[RAIDZ_REC_QR]);
234 		}
235 	} else if (nbaddata == 3 &&
236 	    parity_valid[CODE_P] && parity_valid[CODE_Q] &&
237 	    parity_valid[CODE_R]) {
238 		return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
239 	}
240 	return ((raidz_rec_f) NULL);
241 }
242 
243 /*
244  * Select data reconstruction method for raidz_map
245  * @parity_valid - Parity validity flag
246  * @dt           - Failed data index array
247  * @nbaddata     - Number of failed data columns
248  */
249 int
250 vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
251     const int *dt, const int nbaddata)
252 {
253 	raidz_rec_f rec_fn = NULL;
254 
255 	switch (raidz_parity(rm)) {
256 	case PARITY_P:
257 		rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
258 		break;
259 	case PARITY_PQ:
260 		rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
261 		break;
262 	case PARITY_PQR:
263 		rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
264 		break;
265 	default:
266 		cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
267 		    (uint_t)raidz_parity(rm));
268 		break;
269 	}
270 
271 	if (rec_fn == NULL)
272 		return (RAIDZ_ORIGINAL_IMPL);
273 	else
274 		return (rec_fn(rm, dt));
275 }
276 
/* Method names indexed by RAIDZ_GEN_* (see kstat header comment above) */
const char *raidz_gen_name[] = {
	"gen_p", "gen_pq", "gen_pqr"
};
/* Method names indexed by RAIDZ_REC_* */
const char *raidz_rec_name[] = {
	"rec_p", "rec_q", "rec_r",
	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
284 
285 #if defined(_KERNEL)
286 
#define	BENCH_D_COLS	(8ULL)		/* data columns in the bench map */
#define	BENCH_COLS	(BENCH_D_COLS + PARITY_PQR)	/* + 3 parity cols */
#define	BENCH_ZIO_SIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)	/* 128 kiB */
#define	BENCH_NS	MSEC2NSEC(25)			/* 25ms */

/* Common signature for benchmark_gen_impl() and benchmark_rec_impl() */
typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
293 
/*
 * Benchmark driver for parity generation; @fn is unused because the
 * parity level is already encoded in the raidz map itself.
 */
static void
benchmark_gen_impl(raidz_map_t *rm, const int fn)
{
	(void) fn;
	vdev_raidz_generate_parity(rm);
}
300 
301 static void
302 benchmark_rec_impl(raidz_map_t *rm, const int fn)
303 {
304 	static const int rec_tgt[7][3] = {
305 		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
306 		{0, 2, 3},	/* rec_q:   bad PR & D[0]	*/
307 		{0, 1, 3},	/* rec_r:   bad PQ & D[0]	*/
308 		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1]	*/
309 		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1]	*/
310 		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1]	*/
311 		{3, 4, 5}	/* rec_pqr: bad    & D[0][1][2] */
312 	};
313 
314 	vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
315 }
316 
317 /*
318  * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
319  * is performed by setting the rm_ops pointer and calling the top level
320  * generate/reconstruct methods of bench_rm.
321  */
322 static void
323 benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
324 {
325 	uint64_t run_cnt, speed, best_speed = 0;
326 	hrtime_t t_start, t_diff;
327 	raidz_impl_ops_t *curr_impl;
328 	raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
329 	int impl, i;
330 
331 	for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
332 		/* set an implementation to benchmark */
333 		curr_impl = raidz_supp_impl[impl];
334 		bench_rm->rm_ops = curr_impl;
335 
336 		run_cnt = 0;
337 		t_start = gethrtime();
338 
339 		do {
340 			for (i = 0; i < 25; i++, run_cnt++)
341 				bench_fn(bench_rm, fn);
342 
343 			t_diff = gethrtime() - t_start;
344 		} while (t_diff < BENCH_NS);
345 
346 		speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
347 		speed /= (t_diff * BENCH_COLS);
348 
349 		if (bench_fn == benchmark_gen_impl)
350 			raidz_impl_kstats[impl].gen[fn] = speed;
351 		else
352 			raidz_impl_kstats[impl].rec[fn] = speed;
353 
354 		/* Update fastest implementation method */
355 		if (speed > best_speed) {
356 			best_speed = speed;
357 
358 			if (bench_fn == benchmark_gen_impl) {
359 				fstat->gen[fn] = impl;
360 				vdev_raidz_fastest_impl.gen[fn] =
361 				    curr_impl->gen[fn];
362 			} else {
363 				fstat->rec[fn] = impl;
364 				vdev_raidz_fastest_impl.rec[fn] =
365 				    curr_impl->rec[fn];
366 			}
367 		}
368 	}
369 }
370 #endif
371 
372 /*
373  * Initialize and benchmark all supported implementations.
374  */
/*
 * Initialize and benchmark all supported implementations.
 *
 * First pass: run each implementation's init() hook and collect those
 * whose is_supported() reports true into raidz_supp_impl[].  In kernel
 * builds every gen/rec method is then benchmarked to populate
 * vdev_raidz_fastest_impl; user-space builds skip the benchmark.
 */
static void
benchmark_raidz(void)
{
	raidz_impl_ops_t *curr_impl;
	int i, c;

	/* Move supported impl into raidz_supp_impl */
	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];

		/* init() is optional; call it before probing support */
		if (curr_impl->init)
			curr_impl->init();

		if (curr_impl->is_supported())
			raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
	}
	membar_producer();		/* complete raidz_supp_impl[] init */
	raidz_supp_impl_cnt = c;	/* number of supported impl */

#if defined(_KERNEL)
	zio_t *bench_zio = NULL;
	raidz_map_t *bench_rm = NULL;
	uint64_t bench_parity;

	/* Fake a zio and run the benchmark on a warmed up buffer */
	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
	bench_zio->io_offset = 0;
	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
	bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);

	/* Benchmark parity generation methods */
	for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
		/* RAIDZ_GEN_P/PQ/PQR correspond to parity levels 1/2/3 */
		bench_parity = fn + 1;
		/* New raidz_map is needed for each generate_p/q/r */
		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
		    BENCH_D_COLS + bench_parity, bench_parity);

		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);

		vdev_raidz_map_free(bench_rm);
	}

	/* Benchmark data reconstruction methods */
	/* A single triple-parity map can exercise every rec_* variant */
	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
	    BENCH_COLS, PARITY_PQR);

	for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);

	vdev_raidz_map_free(bench_rm);

	/* cleanup the bench zio */
	abd_free(bench_zio->io_abd);
	kmem_free(bench_zio, sizeof (zio_t));
#else
	/*
	 * Skip the benchmark in user space to avoid impacting libzpool
	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
	 * is assumed to be the fastest and used by default.
	 */
	memcpy(&vdev_raidz_fastest_impl,
	    raidz_supp_impl[raidz_supp_impl_cnt - 1],
	    sizeof (vdev_raidz_fastest_impl));
	/* NOTE(review): assumes .name is an in-struct array — confirm */
	strcpy(vdev_raidz_fastest_impl.name, "fastest");
#endif /* _KERNEL */
}
442 
/*
 * One-time module initialization: probe/benchmark the implementations,
 * then atomically publish the selection the user may have requested
 * before init ran (stored in user_sel_impl).
 */
void
vdev_raidz_math_init(void)
{
	/* Determine the fastest available implementation. */
	benchmark_raidz();

	/* Finish initialization */
	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
	raidz_math_initialized = B_TRUE;
}
453 
454 void
455 vdev_raidz_math_fini(void)
456 {
457 	raidz_impl_ops_t const *curr_impl;
458 
459 	for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
460 		curr_impl = raidz_all_maths[i];
461 		if (curr_impl->fini)
462 			curr_impl->fini();
463 	}
464 }
465 
466 static const struct {
467 	char *name;
468 	uint32_t sel;
469 } math_impl_opts[] = {
470 		{ "cycle",	IMPL_CYCLE },
471 		{ "fastest",	IMPL_FASTEST },
472 		{ "original",	IMPL_ORIGINAL },
473 		{ "scalar",	IMPL_SCALAR }
474 };
475 
476 /*
477  * Function sets desired raidz implementation.
478  *
479  * If we are called before init(), user preference will be saved in
480  * user_sel_impl, and applied in later init() call. This occurs when module
481  * parameter is specified on module load. Otherwise, directly update
482  * zfs_vdev_raidz_impl.
483  *
484  * @val		Name of raidz implementation to use
485  * @param	Unused.
486  */
487 int
488 vdev_raidz_impl_set(const char *val)
489 {
490 	int err = -EINVAL;
491 	char req_name[RAIDZ_IMPL_NAME_MAX];
492 	uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
493 	size_t i;
494 
495 	/* sanitize input */
496 	i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
497 	if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
498 		return (err);
499 
500 	strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
501 	while (i > 0 && !!isspace(req_name[i-1]))
502 		i--;
503 	req_name[i] = '\0';
504 
505 	/* Check mandatory options */
506 	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
507 		if (strcmp(req_name, math_impl_opts[i].name) == 0) {
508 			impl = math_impl_opts[i].sel;
509 			err = 0;
510 			break;
511 		}
512 	}
513 
514 	/* check all supported impl if init() was already called */
515 	if (err != 0 && raidz_math_initialized) {
516 		/* check all supported implementations */
517 		for (i = 0; i < raidz_supp_impl_cnt; i++) {
518 			if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
519 				impl = i;
520 				err = 0;
521 				break;
522 			}
523 		}
524 	}
525 
526 	if (err == 0) {
527 		if (raidz_math_initialized)
528 			atomic_swap_32(&zfs_vdev_raidz_impl, impl);
529 		else
530 			atomic_swap_32(&user_sel_impl, impl);
531 	}
532 
533 	return (err);
534 }
535 
536 #if defined(_KERNEL) && defined(__linux__)
537 
/* Linux module-parameter "set" hook; @kp is unused. */
static int
zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
{
	return (vdev_raidz_impl_set(val));
}
543 
/*
 * Linux module-parameter "get" hook: list every selectable name in
 * @buffer, with the active selection bracketed.  Returns the number of
 * characters written.  @kp is unused.
 */
static int
zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
{
	int i, cnt = 0;
	char *fmt;
	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);

	ASSERT(raidz_math_initialized);

	/*
	 * list mandatory options: the "- 2" skips the trailing "original"
	 * and "scalar" entries, whose names reappear in raidz_supp_impl[]
	 * below.
	 */
	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
		fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
		cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
	}

	/*
	 * list all supported implementations; a direct-index selection
	 * (default case in vdev_raidz_math_get_ops()) matches via i == impl
	 */
	for (i = 0; i < raidz_supp_impl_cnt; i++) {
		fmt = (i == impl) ? "[%s] " : "%s ";
		cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
	}

	return (cnt);
}
567 
568 module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
569     zfs_vdev_raidz_impl_get, NULL, 0644);
570 MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
571 #endif
572