1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
23 */
24
25 #include <sys/zfs_context.h>
26 #include <sys/types.h>
27 #include <sys/zio.h>
28 #include <sys/debug.h>
29 #include <sys/zfs_debug.h>
30 #include <sys/vdev_raidz.h>
31 #include <sys/vdev_raidz_impl.h>
32 #include <sys/simd.h>
33
/*
 * Fallback isspace() used when no other definition is in scope.  It accepts
 * space, tab, newline, carriage return, form feed and vertical tab.  The
 * argument is expanded several times, so it must not have side effects.
 */
#ifndef	isspace
#define	isspace(c)	\
	((c) == ' ' || (c) == '\t' || (c) == '\n' || \
	(c) == '\r' || (c) == '\f' || (c) == '\v')
#endif
38
39 extern boolean_t raidz_will_scalar_work(void);
40
41 /* Opaque implementation with NULL methods to represent original methods */
42 static const raidz_impl_ops_t vdev_raidz_original_impl = {
43 .name = "original",
44 .is_supported = raidz_will_scalar_work,
45 };
46
47 /* RAIDZ parity op that contain the fastest methods */
48 static raidz_impl_ops_t vdev_raidz_fastest_impl = {
49 .name = "fastest"
50 };
51
52 /* All compiled in implementations */
53 const raidz_impl_ops_t *raidz_all_maths[] = {
54 &vdev_raidz_original_impl,
55 &vdev_raidz_scalar_impl,
56 #if defined(__amd64)
57 &vdev_raidz_sse2_impl,
58 #endif
59 #if defined(__amd64)
60 &vdev_raidz_ssse3_impl,
61 #endif
62 #if defined(__amd64)
63 &vdev_raidz_avx2_impl,
64 #endif
65 };
66
/* Set to B_TRUE by vdev_raidz_math_init() once the benchmark has completed */
static boolean_t raidz_math_initialized = B_FALSE;

/*
 * Implementation selectors.  vdev_raidz_math_get_ops() handles these four
 * values explicitly; any other selector value is treated there as an index
 * into raidz_supp_impl[].
 */
#define	IMPL_FASTEST	(UINT32_MAX)
#define	IMPL_CYCLE	(UINT32_MAX - 1)
#define	IMPL_ORIGINAL	(0)
#define	IMPL_SCALAR	(1)

/* Read a selector variable through a volatile-qualified lvalue */
#define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))

/* Active selector; stays IMPL_SCALAR until vdev_raidz_math_init() swaps it */
static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
/* Selector requested before init; installed by vdev_raidz_math_init() */
static uint32_t user_sel_impl = IMPL_FASTEST;

/*
 * Supported implementations: raidz_supp_impl[] and its element count are
 * filled in by benchmark_raidz().
 */
static size_t raidz_supp_impl_cnt = 0;
static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
84
85 #if defined(_KERNEL)
/*
 * kstats values for supported implementations
 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
 *
 * PORTING NOTE:
 * On illumos this is not a kstat. OpenZFS uses its own home-grown kstat code
 * which implements a free-form kstat using additional functionality that does
 * not exist in illumos. Because there are no software consumers of this
 * information, we omit a kstat API. If an administrator needs to see this
 * data for some reason, they can use mdb.
 *
 * The format of the kstat data on OpenZFS would be a "header" that looks like
 * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name"
 * arrays, starting with the parity function "implementation" name):
 * impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr
 * This is followed by a row for each parity function implementation, showing
 * the "speed" values calculated for that implementation for each of the
 * parity generation and reconstruction functions in the "raidz_all_maths"
 * array.
 *
 * Array layout, as written by benchmark_raidz_impl(): entries
 * [0, raidz_supp_impl_cnt) hold the measured speed of each supported
 * implementation, and the entry at index raidz_supp_impl_cnt records, per
 * method, the index of the fastest implementation.  The "+ 1" in the array
 * size leaves room for that extra entry.
 */
static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
107
108 #endif
109
110 /*
111 * Returns the RAIDZ operations for raidz_map() parity calculations. When
112 * a SIMD implementation is not allowed in the current context, then fallback
113 * to the fastest generic implementation.
114 */
115 const raidz_impl_ops_t *
vdev_raidz_math_get_ops(void)116 vdev_raidz_math_get_ops(void)
117 {
118 if (!kfpu_allowed())
119 return (&vdev_raidz_scalar_impl);
120
121 raidz_impl_ops_t *ops = NULL;
122 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
123
124 switch (impl) {
125 case IMPL_FASTEST:
126 ASSERT(raidz_math_initialized);
127 ops = &vdev_raidz_fastest_impl;
128 break;
129 case IMPL_CYCLE:
130 /* Cycle through all supported implementations */
131 ASSERT(raidz_math_initialized);
132 ASSERT3U(raidz_supp_impl_cnt, >, 0);
133 static size_t cycle_impl_idx = 0;
134 size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
135 ops = raidz_supp_impl[idx];
136 break;
137 case IMPL_ORIGINAL:
138 ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
139 break;
140 case IMPL_SCALAR:
141 ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
142 break;
143 default:
144 ASSERT3U(impl, <, raidz_supp_impl_cnt);
145 ASSERT3U(raidz_supp_impl_cnt, >, 0);
146 if (impl < ARRAY_SIZE(raidz_all_maths))
147 ops = raidz_supp_impl[impl];
148 break;
149 }
150
151 ASSERT3P(ops, !=, NULL);
152
153 return (ops);
154 }
155
156 /*
157 * Select parity generation method for raidz_map
158 */
159 int
vdev_raidz_math_generate(raidz_map_t * rm)160 vdev_raidz_math_generate(raidz_map_t *rm)
161 {
162 raidz_gen_f gen_parity = NULL;
163
164 switch (raidz_parity(rm)) {
165 case 1:
166 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
167 break;
168 case 2:
169 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
170 break;
171 case 3:
172 gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
173 break;
174 default:
175 gen_parity = NULL;
176 cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
177 (uint_t)raidz_parity(rm));
178 break;
179 }
180
181 /* if method is NULL execute the original implementation */
182 if (gen_parity == NULL)
183 return (RAIDZ_ORIGINAL_IMPL);
184
185 gen_parity(rm);
186
187 return (0);
188 }
189
190 static raidz_rec_f
reconstruct_fun_p_sel(raidz_map_t * rm,const int * parity_valid,const int nbaddata)191 reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
192 const int nbaddata)
193 {
194 if (nbaddata == 1 && parity_valid[CODE_P]) {
195 return (rm->rm_ops->rec[RAIDZ_REC_P]);
196 }
197 return ((raidz_rec_f) NULL);
198 }
199
200 static raidz_rec_f
reconstruct_fun_pq_sel(raidz_map_t * rm,const int * parity_valid,const int nbaddata)201 reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
202 const int nbaddata)
203 {
204 if (nbaddata == 1) {
205 if (parity_valid[CODE_P]) {
206 return (rm->rm_ops->rec[RAIDZ_REC_P]);
207 } else if (parity_valid[CODE_Q]) {
208 return (rm->rm_ops->rec[RAIDZ_REC_Q]);
209 }
210 } else if (nbaddata == 2 &&
211 parity_valid[CODE_P] && parity_valid[CODE_Q]) {
212 return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
213 }
214 return ((raidz_rec_f) NULL);
215 }
216
217 static raidz_rec_f
reconstruct_fun_pqr_sel(raidz_map_t * rm,const int * parity_valid,const int nbaddata)218 reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
219 const int nbaddata)
220 {
221 if (nbaddata == 1) {
222 if (parity_valid[CODE_P]) {
223 return (rm->rm_ops->rec[RAIDZ_REC_P]);
224 } else if (parity_valid[CODE_Q]) {
225 return (rm->rm_ops->rec[RAIDZ_REC_Q]);
226 } else if (parity_valid[CODE_R]) {
227 return (rm->rm_ops->rec[RAIDZ_REC_R]);
228 }
229 } else if (nbaddata == 2) {
230 if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
231 return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
232 } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
233 return (rm->rm_ops->rec[RAIDZ_REC_PR]);
234 } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
235 return (rm->rm_ops->rec[RAIDZ_REC_QR]);
236 }
237 } else if (nbaddata == 3 &&
238 parity_valid[CODE_P] && parity_valid[CODE_Q] &&
239 parity_valid[CODE_R]) {
240 return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
241 }
242 return ((raidz_rec_f) NULL);
243 }
244
245 /*
246 * Select data reconstruction method for raidz_map
247 * @parity_valid - Parity validity flag
248 * @dt - Failed data index array
249 * @nbaddata - Number of failed data columns
250 */
251 int
vdev_raidz_math_reconstruct(raidz_map_t * rm,const int * parity_valid,const int * dt,const int nbaddata)252 vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
253 const int *dt, const int nbaddata)
254 {
255 raidz_rec_f rec_fn = NULL;
256
257 switch (raidz_parity(rm)) {
258 case PARITY_P:
259 rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
260 break;
261 case PARITY_PQ:
262 rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
263 break;
264 case PARITY_PQR:
265 rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
266 break;
267 default:
268 cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
269 (uint_t)raidz_parity(rm));
270 break;
271 }
272
273 if (rec_fn == NULL)
274 return (RAIDZ_ORIGINAL_IMPL);
275 else
276 return (rec_fn(rm, dt));
277 }
278
/* Display names of the parity generation methods, in P, PQ, PQR order. */
const char *raidz_gen_name[] = {
	"gen_p",
	"gen_pq",
	"gen_pqr"
};

/*
 * Display names of the reconstruction methods: the three single-parity
 * variants first, then the three two-parity variants, then PQR.
 */
const char *raidz_rec_name[] = {
	"rec_p",
	"rec_q",
	"rec_r",
	"rec_pq",
	"rec_pr",
	"rec_qr",
	"rec_pqr"
};
286
287 #if defined(_KERNEL)
288
/*
 * Benchmark geometry and duration.  BENCH_D_COLS is the number of data
 * columns, BENCH_COLS adds PARITY_PQR to that, BENCH_ZIO_SIZE is the size of
 * the fake zio's buffer, and BENCH_NS is the minimum time spent timing each
 * implementation/method pair.
 */
#define	BENCH_D_COLS	(8ULL)
#define	BENCH_COLS	(BENCH_D_COLS + PARITY_PQR)
#define	BENCH_ZIO_SIZE	(1ULL << SPA_OLD_MAXBLOCKSHIFT)	/* 128 kiB */
#define	BENCH_NS	MSEC2NSEC(25)			/* 25ms */

/* Benchmark body: run method number fn once against the given map */
typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
295
296 static void
benchmark_gen_impl(raidz_map_t * rm,const int fn)297 benchmark_gen_impl(raidz_map_t *rm, const int fn)
298 {
299 (void) fn;
300 vdev_raidz_generate_parity(rm);
301 }
302
303 static void
benchmark_rec_impl(raidz_map_t * rm,const int fn)304 benchmark_rec_impl(raidz_map_t *rm, const int fn)
305 {
306 static const int rec_tgt[7][3] = {
307 {1, 2, 3}, /* rec_p: bad QR & D[0] */
308 {0, 2, 3}, /* rec_q: bad PR & D[0] */
309 {0, 1, 3}, /* rec_r: bad PQ & D[0] */
310 {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
311 {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
312 {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
313 {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
314 };
315
316 vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
317 }
318
319 /*
320 * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
321 * is performed by setting the rm_ops pointer and calling the top level
322 * generate/reconstruct methods of bench_rm.
323 */
324 static void
benchmark_raidz_impl(raidz_map_t * bench_rm,const int fn,benchmark_fn bench_fn)325 benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
326 {
327 uint64_t run_cnt, speed, best_speed = 0;
328 hrtime_t t_start, t_diff;
329 raidz_impl_ops_t *curr_impl;
330 raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
331 int impl, i;
332
333 for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
334 /* set an implementation to benchmark */
335 curr_impl = raidz_supp_impl[impl];
336 bench_rm->rm_ops = curr_impl;
337
338 run_cnt = 0;
339 t_start = gethrtime();
340
341 do {
342 for (i = 0; i < 25; i++, run_cnt++)
343 bench_fn(bench_rm, fn);
344
345 t_diff = gethrtime() - t_start;
346 } while (t_diff < BENCH_NS);
347
348 speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
349 speed /= (t_diff * BENCH_COLS);
350
351 if (bench_fn == benchmark_gen_impl)
352 raidz_impl_kstats[impl].gen[fn] = speed;
353 else
354 raidz_impl_kstats[impl].rec[fn] = speed;
355
356 /* Update fastest implementation method */
357 if (speed > best_speed) {
358 best_speed = speed;
359
360 if (bench_fn == benchmark_gen_impl) {
361 fstat->gen[fn] = impl;
362 vdev_raidz_fastest_impl.gen[fn] =
363 curr_impl->gen[fn];
364 } else {
365 fstat->rec[fn] = impl;
366 vdev_raidz_fastest_impl.rec[fn] =
367 curr_impl->rec[fn];
368 }
369 }
370 }
371 }
372 #endif
373
374 /*
375 * Initialize and benchmark all supported implementations.
376 */
377 static void
benchmark_raidz(void)378 benchmark_raidz(void)
379 {
380 raidz_impl_ops_t *curr_impl;
381 int i, c;
382
383 /* Move supported impl into raidz_supp_impl */
384 for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
385 curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
386
387 if (curr_impl->init)
388 curr_impl->init();
389
390 if (curr_impl->is_supported())
391 raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
392 }
393 membar_producer(); /* complete raidz_supp_impl[] init */
394 raidz_supp_impl_cnt = c; /* number of supported impl */
395
396 #if defined(_KERNEL)
397 zio_t *bench_zio = NULL;
398 raidz_map_t *bench_rm = NULL;
399 uint64_t bench_parity;
400
401 /* Fake a zio and run the benchmark on a warmed up buffer */
402 bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
403 bench_zio->io_offset = 0;
404 bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
405 bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
406 memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
407
408 /* Benchmark parity generation methods */
409 for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
410 bench_parity = fn + 1;
411 /* New raidz_map is needed for each generate_p/q/r */
412 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
413 BENCH_D_COLS + bench_parity, bench_parity);
414
415 benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
416
417 vdev_raidz_map_free(bench_rm);
418 }
419
420 /* Benchmark data reconstruction methods */
421 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
422 BENCH_COLS, PARITY_PQR);
423
424 for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
425 benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
426
427 vdev_raidz_map_free(bench_rm);
428
429 /* cleanup the bench zio */
430 abd_free(bench_zio->io_abd);
431 kmem_free(bench_zio, sizeof (zio_t));
432 #else
433 /*
434 * Skip the benchmark in user space to avoid impacting libzpool
435 * consumers (zdb, zhack, zinject, ztest). The last implementation
436 * is assumed to be the fastest and used by default.
437 */
438 memcpy(&vdev_raidz_fastest_impl,
439 raidz_supp_impl[raidz_supp_impl_cnt - 1],
440 sizeof (vdev_raidz_fastest_impl));
441 strcpy(vdev_raidz_fastest_impl.name, "fastest");
442 #endif /* _KERNEL */
443 }
444
445 void
vdev_raidz_math_init(void)446 vdev_raidz_math_init(void)
447 {
448 /* Determine the fastest available implementation. */
449 benchmark_raidz();
450
451 /* Finish initialization */
452 atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
453 raidz_math_initialized = B_TRUE;
454 }
455
456 void
vdev_raidz_math_fini(void)457 vdev_raidz_math_fini(void)
458 {
459 raidz_impl_ops_t const *curr_impl;
460
461 for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
462 curr_impl = raidz_all_maths[i];
463 if (curr_impl->fini)
464 curr_impl->fini();
465 }
466 }
467
468 static const struct {
469 char *name;
470 uint32_t sel;
471 } math_impl_opts[] = {
472 { "cycle", IMPL_CYCLE },
473 { "fastest", IMPL_FASTEST },
474 { "original", IMPL_ORIGINAL },
475 { "scalar", IMPL_SCALAR }
476 };
477
478 /*
479 * Function sets desired raidz implementation.
480 *
481 * If we are called before init(), user preference will be saved in
482 * user_sel_impl, and applied in later init() call. This occurs when module
483 * parameter is specified on module load. Otherwise, directly update
484 * zfs_vdev_raidz_impl.
485 *
486 * @val Name of raidz implementation to use
487 * @param Unused.
488 */
489 int
vdev_raidz_impl_set(const char * val)490 vdev_raidz_impl_set(const char *val)
491 {
492 int err = -EINVAL;
493 char req_name[RAIDZ_IMPL_NAME_MAX];
494 uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
495 size_t i;
496
497 /* sanitize input */
498 i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
499 if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
500 return (err);
501
502 strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
503 while (i > 0 && !!isspace(req_name[i-1]))
504 i--;
505 req_name[i] = '\0';
506
507 /* Check mandatory options */
508 for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
509 if (strcmp(req_name, math_impl_opts[i].name) == 0) {
510 impl = math_impl_opts[i].sel;
511 err = 0;
512 break;
513 }
514 }
515
516 /* check all supported impl if init() was already called */
517 if (err != 0 && raidz_math_initialized) {
518 /* check all supported implementations */
519 for (i = 0; i < raidz_supp_impl_cnt; i++) {
520 if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
521 impl = i;
522 err = 0;
523 break;
524 }
525 }
526 }
527
528 if (err == 0) {
529 if (raidz_math_initialized)
530 atomic_swap_32(&zfs_vdev_raidz_impl, impl);
531 else
532 atomic_swap_32(&user_sel_impl, impl);
533 }
534
535 return (err);
536 }
537
538 #if defined(_KERNEL) && defined(__linux__)
539
540 static int
zfs_vdev_raidz_impl_set(const char * val,zfs_kernel_param_t * kp)541 zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
542 {
543 return (vdev_raidz_impl_set(val));
544 }
545
546 static int
zfs_vdev_raidz_impl_get(char * buffer,zfs_kernel_param_t * kp)547 zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
548 {
549 int i, cnt = 0;
550 char *fmt;
551 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
552
553 ASSERT(raidz_math_initialized);
554
555 /* list mandatory options */
556 for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
557 fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
558 cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
559 }
560
561 /* list all supported implementations */
562 for (i = 0; i < raidz_supp_impl_cnt; i++) {
563 fmt = (i == impl) ? "[%s] " : "%s ";
564 cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
565 }
566
567 return (cnt);
568 }
569
/*
 * Expose zfs_vdev_raidz_impl as a module parameter backed by the set/get
 * handlers defined in this file, and attach its description string.
 */
module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
    zfs_vdev_raidz_impl_get, NULL, 0644);
MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
573 #endif
574