1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
23 */
24
25#include <sys/zfs_context.h>
26#include <sys/types.h>
27#include <sys/zio.h>
28#include <sys/debug.h>
29#include <sys/zfs_debug.h>
30#include <sys/vdev_raidz.h>
31#include <sys/vdev_raidz_impl.h>
32#include <sys/simd.h>
33
/*
 * Fallback whitespace test, used only when no isspace macro is already
 * defined.  Matches space, tab, newline, carriage return, form feed and
 * vertical tab.  The argument is evaluated several times, so it must not
 * have side effects.
 */
#if !defined(isspace)
#define	isspace(c)	((c) == ' ' || (c) == '\t' || (c) == '\n' || \
			(c) == '\r' || (c) == '\f' || (c) == '\v')
#endif
38
39extern boolean_t raidz_will_scalar_work(void);
40
41/* Opaque implementation with NULL methods to represent original methods */
42static const raidz_impl_ops_t vdev_raidz_original_impl = {
43	.name = "original",
44	.is_supported = raidz_will_scalar_work,
45};
46
47/* RAIDZ parity op that contain the fastest methods */
48static raidz_impl_ops_t vdev_raidz_fastest_impl = {
49	.name = "fastest"
50};
51
52/* All compiled in implementations */
53const raidz_impl_ops_t *raidz_all_maths[] = {
54	&vdev_raidz_original_impl,
55	&vdev_raidz_scalar_impl,
56#if defined(__amd64)
57	&vdev_raidz_sse2_impl,
58#endif
59#if defined(__amd64)
60	&vdev_raidz_ssse3_impl,
61#endif
62#if defined(__amd64)
63	&vdev_raidz_avx2_impl,
64#endif
65};
66
67/* Indicate that benchmark has been completed */
68static boolean_t raidz_math_initialized = B_FALSE;
69
70/* Select raidz implementation */
71#define	IMPL_FASTEST	(UINT32_MAX)
72#define	IMPL_CYCLE	(UINT32_MAX - 1)
73#define	IMPL_ORIGINAL	(0)
74#define	IMPL_SCALAR	(1)
75
76#define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))
77
78static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
79static uint32_t user_sel_impl = IMPL_FASTEST;
80
81/* Hold all supported implementations */
82static size_t raidz_supp_impl_cnt = 0;
83static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
84
#ifdef _KERNEL
/*
 * Benchmark results for the supported implementations.
 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s].
 *
 * The array has one slot more than there are implementations: the entry at
 * index raidz_supp_impl_cnt does not hold speeds but, for every method, the
 * index of the implementation that benchmarked fastest.
 *
 * PORTING NOTE:
 * On illumos this is not a kstat.  OpenZFS uses its home-grown kstat code,
 * which implements a free-form kstat using additional functionality that
 * does not exist in illumos.  Because there are no software consumers of
 * this information, we omit a kstat API.  If an administrator needs to see
 * this data for some reason, they can use mdb.
 *
 * On OpenZFS the kstat data would start with a "header" like the one below
 * (the implementation name, then one column for each entry of the
 * "raidz_gen_name" and "raidz_rec_name" arrays):
 *     impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr
 * It is followed by one row per parity function implementation in the
 * "raidz_all_maths" array, showing the "speed" values calculated for that
 * implementation for each parity generation and reconstruction function.
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];

#endif
109
110/*
111 * Returns the RAIDZ operations for raidz_map() parity calculations.   When
112 * a SIMD implementation is not allowed in the current context, then fallback
113 * to the fastest generic implementation.
114 */
115const raidz_impl_ops_t *
116vdev_raidz_math_get_ops(void)
117{
118	if (!kfpu_allowed())
119		return (&vdev_raidz_scalar_impl);
120
121	raidz_impl_ops_t *ops = NULL;
122	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
123
124	switch (impl) {
125	case IMPL_FASTEST:
126		ASSERT(raidz_math_initialized);
127		ops = &vdev_raidz_fastest_impl;
128		break;
129	case IMPL_CYCLE:
130		/* Cycle through all supported implementations */
131		ASSERT(raidz_math_initialized);
132		ASSERT3U(raidz_supp_impl_cnt, >, 0);
133		static size_t cycle_impl_idx = 0;
134		size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
135		ops = raidz_supp_impl[idx];
136		break;
137	case IMPL_ORIGINAL:
138		ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
139		break;
140	case IMPL_SCALAR:
141		ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
142		break;
143	default:
144		ASSERT3U(impl, <, raidz_supp_impl_cnt);
145		ASSERT3U(raidz_supp_impl_cnt, >, 0);
146		if (impl < ARRAY_SIZE(raidz_all_maths))
147			ops = raidz_supp_impl[impl];
148		break;
149	}
150
151	ASSERT3P(ops, !=, NULL);
152
153	return (ops);
154}
155
156/*
157 * Select parity generation method for raidz_map
158 */
159int
160vdev_raidz_math_generate(raidz_map_t *rm)
161{
162	raidz_gen_f gen_parity = NULL;
163
164	switch (raidz_parity(rm)) {
165		case 1:
166			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
167			break;
168		case 2:
169			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
170			break;
171		case 3:
172			gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
173			break;
174		default:
175			gen_parity = NULL;
176			cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
177			    (uint_t)raidz_parity(rm));
178			break;
179	}
180
181	/* if method is NULL execute the original implementation */
182	if (gen_parity == NULL)
183		return (RAIDZ_ORIGINAL_IMPL);
184
185	gen_parity(rm);
186
187	return (0);
188}
189
190static raidz_rec_f
191reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
192    const int nbaddata)
193{
194	if (nbaddata == 1 && parity_valid[CODE_P]) {
195		return (rm->rm_ops->rec[RAIDZ_REC_P]);
196	}
197	return ((raidz_rec_f) NULL);
198}
199
200static raidz_rec_f
201reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
202    const int nbaddata)
203{
204	if (nbaddata == 1) {
205		if (parity_valid[CODE_P]) {
206			return (rm->rm_ops->rec[RAIDZ_REC_P]);
207		} else if (parity_valid[CODE_Q]) {
208			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
209		}
210	} else if (nbaddata == 2 &&
211	    parity_valid[CODE_P] && parity_valid[CODE_Q]) {
212		return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
213	}
214	return ((raidz_rec_f) NULL);
215}
216
217static raidz_rec_f
218reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
219    const int nbaddata)
220{
221	if (nbaddata == 1) {
222		if (parity_valid[CODE_P]) {
223			return (rm->rm_ops->rec[RAIDZ_REC_P]);
224		} else if (parity_valid[CODE_Q]) {
225			return (rm->rm_ops->rec[RAIDZ_REC_Q]);
226		} else if (parity_valid[CODE_R]) {
227			return (rm->rm_ops->rec[RAIDZ_REC_R]);
228		}
229	} else if (nbaddata == 2) {
230		if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
231			return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
232		} else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
233			return (rm->rm_ops->rec[RAIDZ_REC_PR]);
234		} else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
235			return (rm->rm_ops->rec[RAIDZ_REC_QR]);
236		}
237	} else if (nbaddata == 3 &&
238	    parity_valid[CODE_P] && parity_valid[CODE_Q] &&
239	    parity_valid[CODE_R]) {
240		return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
241	}
242	return ((raidz_rec_f) NULL);
243}
244
245/*
246 * Select data reconstruction method for raidz_map
247 * @parity_valid - Parity validity flag
248 * @dt           - Failed data index array
249 * @nbaddata     - Number of failed data columns
250 */
251int
252vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
253    const int *dt, const int nbaddata)
254{
255	raidz_rec_f rec_fn = NULL;
256
257	switch (raidz_parity(rm)) {
258	case PARITY_P:
259		rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
260		break;
261	case PARITY_PQ:
262		rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
263		break;
264	case PARITY_PQR:
265		rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
266		break;
267	default:
268		cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
269		    (uint_t)raidz_parity(rm));
270		break;
271	}
272
273	if (rec_fn == NULL)
274		return (RAIDZ_ORIGINAL_IMPL);
275	else
276		return (rec_fn(rm, dt));
277}
278
/* Names of the parity generation methods, one per gen[] slot */
const char *raidz_gen_name[] = {
	"gen_p",
	"gen_pq",
	"gen_pqr"
};

/* Names of the data reconstruction methods, one per rec[] slot */
const char *raidz_rec_name[] = {
	"rec_p",
	"rec_q",
	"rec_r",
	"rec_pq",
	"rec_pr",
	"rec_qr",
	"rec_pqr"
};
286
287#if defined(_KERNEL)
288
289#define	BENCH_D_COLS	(8ULL)
290#define	BENCH_COLS	(BENCH_D_COLS + PARITY_PQR)
291#define	BENCH_ZIO_SIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)	/* 128 kiB */
292#define	BENCH_NS	MSEC2NSEC(25)			/* 25ms */
293
294typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
295
296static void
297benchmark_gen_impl(raidz_map_t *rm, const int fn)
298{
299	(void) fn;
300	vdev_raidz_generate_parity(rm);
301}
302
303static void
304benchmark_rec_impl(raidz_map_t *rm, const int fn)
305{
306	static const int rec_tgt[7][3] = {
307		{1, 2, 3},	/* rec_p:   bad QR & D[0]	*/
308		{0, 2, 3},	/* rec_q:   bad PR & D[0]	*/
309		{0, 1, 3},	/* rec_r:   bad PQ & D[0]	*/
310		{2, 3, 4},	/* rec_pq:  bad R  & D[0][1]	*/
311		{1, 3, 4},	/* rec_pr:  bad Q  & D[0][1]	*/
312		{0, 3, 4},	/* rec_qr:  bad P  & D[0][1]	*/
313		{3, 4, 5}	/* rec_pqr: bad    & D[0][1][2] */
314	};
315
316	vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
317}
318
319/*
320 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
321 * is performed by setting the rm_ops pointer and calling the top level
322 * generate/reconstruct methods of bench_rm.
323 */
324static void
325benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
326{
327	uint64_t run_cnt, speed, best_speed = 0;
328	hrtime_t t_start, t_diff;
329	raidz_impl_ops_t *curr_impl;
330	raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
331	int impl, i;
332
333	for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
334		/* set an implementation to benchmark */
335		curr_impl = raidz_supp_impl[impl];
336		bench_rm->rm_ops = curr_impl;
337
338		run_cnt = 0;
339		t_start = gethrtime();
340
341		do {
342			for (i = 0; i < 25; i++, run_cnt++)
343				bench_fn(bench_rm, fn);
344
345			t_diff = gethrtime() - t_start;
346		} while (t_diff < BENCH_NS);
347
348		speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
349		speed /= (t_diff * BENCH_COLS);
350
351		if (bench_fn == benchmark_gen_impl)
352			raidz_impl_kstats[impl].gen[fn] = speed;
353		else
354			raidz_impl_kstats[impl].rec[fn] = speed;
355
356		/* Update fastest implementation method */
357		if (speed > best_speed) {
358			best_speed = speed;
359
360			if (bench_fn == benchmark_gen_impl) {
361				fstat->gen[fn] = impl;
362				vdev_raidz_fastest_impl.gen[fn] =
363				    curr_impl->gen[fn];
364			} else {
365				fstat->rec[fn] = impl;
366				vdev_raidz_fastest_impl.rec[fn] =
367				    curr_impl->rec[fn];
368			}
369		}
370	}
371}
372#endif
373
374/*
375 * Initialize and benchmark all supported implementations.
376 */
377static void
378benchmark_raidz(void)
379{
380	raidz_impl_ops_t *curr_impl;
381	int i, c;
382
383	/* Move supported impl into raidz_supp_impl */
384	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
385		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
386
387		if (curr_impl->init)
388			curr_impl->init();
389
390		if (curr_impl->is_supported())
391			raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
392	}
393	membar_producer();		/* complete raidz_supp_impl[] init */
394	raidz_supp_impl_cnt = c;	/* number of supported impl */
395
396#if defined(_KERNEL)
397	zio_t *bench_zio = NULL;
398	raidz_map_t *bench_rm = NULL;
399	uint64_t bench_parity;
400
401	/* Fake a zio and run the benchmark on a warmed up buffer */
402	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
403	bench_zio->io_offset = 0;
404	bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
405	bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
406	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
407
408	/* Benchmark parity generation methods */
409	for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
410		bench_parity = fn + 1;
411		/* New raidz_map is needed for each generate_p/q/r */
412		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
413		    BENCH_D_COLS + bench_parity, bench_parity);
414
415		benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
416
417		vdev_raidz_map_free(bench_rm);
418	}
419
420	/* Benchmark data reconstruction methods */
421	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
422	    BENCH_COLS, PARITY_PQR);
423
424	for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
425		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
426
427	vdev_raidz_map_free(bench_rm);
428
429	/* cleanup the bench zio */
430	abd_free(bench_zio->io_abd);
431	kmem_free(bench_zio, sizeof (zio_t));
432#else
433	/*
434	 * Skip the benchmark in user space to avoid impacting libzpool
435	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
436	 * is assumed to be the fastest and used by default.
437	 */
438	memcpy(&vdev_raidz_fastest_impl,
439	    raidz_supp_impl[raidz_supp_impl_cnt - 1],
440	    sizeof (vdev_raidz_fastest_impl));
441	strcpy(vdev_raidz_fastest_impl.name, "fastest");
442#endif /* _KERNEL */
443}
444
445void
446vdev_raidz_math_init(void)
447{
448	/* Determine the fastest available implementation. */
449	benchmark_raidz();
450
451	/* Finish initialization */
452	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
453	raidz_math_initialized = B_TRUE;
454}
455
456void
457vdev_raidz_math_fini(void)
458{
459	raidz_impl_ops_t const *curr_impl;
460
461	for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
462		curr_impl = raidz_all_maths[i];
463		if (curr_impl->fini)
464			curr_impl->fini();
465	}
466}
467
468static const struct {
469	char *name;
470	uint32_t sel;
471} math_impl_opts[] = {
472		{ "cycle",	IMPL_CYCLE },
473		{ "fastest",	IMPL_FASTEST },
474		{ "original",	IMPL_ORIGINAL },
475		{ "scalar",	IMPL_SCALAR }
476};
477
478/*
479 * Function sets desired raidz implementation.
480 *
481 * If we are called before init(), user preference will be saved in
482 * user_sel_impl, and applied in later init() call. This occurs when module
483 * parameter is specified on module load. Otherwise, directly update
484 * zfs_vdev_raidz_impl.
485 *
486 * @val		Name of raidz implementation to use
487 * @param	Unused.
488 */
489int
490vdev_raidz_impl_set(const char *val)
491{
492	int err = -EINVAL;
493	char req_name[RAIDZ_IMPL_NAME_MAX];
494	uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
495	size_t i;
496
497	/* sanitize input */
498	i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
499	if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
500		return (err);
501
502	strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
503	while (i > 0 && !!isspace(req_name[i-1]))
504		i--;
505	req_name[i] = '\0';
506
507	/* Check mandatory options */
508	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
509		if (strcmp(req_name, math_impl_opts[i].name) == 0) {
510			impl = math_impl_opts[i].sel;
511			err = 0;
512			break;
513		}
514	}
515
516	/* check all supported impl if init() was already called */
517	if (err != 0 && raidz_math_initialized) {
518		/* check all supported implementations */
519		for (i = 0; i < raidz_supp_impl_cnt; i++) {
520			if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
521				impl = i;
522				err = 0;
523				break;
524			}
525		}
526	}
527
528	if (err == 0) {
529		if (raidz_math_initialized)
530			atomic_swap_32(&zfs_vdev_raidz_impl, impl);
531		else
532			atomic_swap_32(&user_sel_impl, impl);
533	}
534
535	return (err);
536}
537
538#if defined(_KERNEL) && defined(__linux__)
539
540static int
541zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
542{
543	return (vdev_raidz_impl_set(val));
544}
545
546static int
547zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
548{
549	int i, cnt = 0;
550	char *fmt;
551	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
552
553	ASSERT(raidz_math_initialized);
554
555	/* list mandatory options */
556	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
557		fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
558		cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
559	}
560
561	/* list all supported implementations */
562	for (i = 0; i < raidz_supp_impl_cnt; i++) {
563		fmt = (i == impl) ? "[%s] " : "%s ";
564		cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
565	}
566
567	return (cnt);
568}
569
570module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
571    zfs_vdev_raidz_impl_get, NULL, 0644);
572MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
573#endif
574