xref: /illumos-gate/usr/src/common/zfs/zfs_fletcher.c (revision 0886dcad)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
227e322df5SJonathan Adams  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
2545818ee1SMatthew Ahrens /*
2645818ee1SMatthew Ahrens  * Copyright 2013 Saso Kiselkov. All rights reserved.
27770499e1SDan Kimmel  * Copyright (c) 2016 by Delphix. All rights reserved.
28*0886dcadSAndy Fiddaman  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
29*0886dcadSAndy Fiddaman  * Copyright 2024 Oxide Computer Company
3045818ee1SMatthew Ahrens  */
31fa9e4066Sahrens 
327e322df5SJonathan Adams /*
337e322df5SJonathan Adams  * Fletcher Checksums
347e322df5SJonathan Adams  * ------------------
357e322df5SJonathan Adams  *
367e322df5SJonathan Adams  * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
377e322df5SJonathan Adams  * recurrence relations:
387e322df5SJonathan Adams  *
397e322df5SJonathan Adams  *	a  = a    + f
407e322df5SJonathan Adams  *	 i    i-1    i-1
417e322df5SJonathan Adams  *
427e322df5SJonathan Adams  *	b  = b    + a
437e322df5SJonathan Adams  *	 i    i-1    i
447e322df5SJonathan Adams  *
457e322df5SJonathan Adams  *	c  = c    + b		(fletcher-4 only)
467e322df5SJonathan Adams  *	 i    i-1    i
477e322df5SJonathan Adams  *
487e322df5SJonathan Adams  *	d  = d    + c		(fletcher-4 only)
497e322df5SJonathan Adams  *	 i    i-1    i
507e322df5SJonathan Adams  *
517e322df5SJonathan Adams  * Where
527e322df5SJonathan Adams  *	a_0 = b_0 = c_0 = d_0 = 0
537e322df5SJonathan Adams  * and
547e322df5SJonathan Adams  *	f_0 .. f_(n-1) are the input data.
557e322df5SJonathan Adams  *
567e322df5SJonathan Adams  * Using standard techniques, these translate into the following series:
577e322df5SJonathan Adams  *
587e322df5SJonathan Adams  *	     __n_			     __n_
597e322df5SJonathan Adams  *	     \   |			     \   |
607e322df5SJonathan Adams  *	a  =  >     f			b  =  >     i * f
617e322df5SJonathan Adams  *	 n   /___|   n - i		 n   /___|	 n - i
627e322df5SJonathan Adams  *	     i = 1			     i = 1
637e322df5SJonathan Adams  *
647e322df5SJonathan Adams  *
657e322df5SJonathan Adams  *	     __n_			     __n_
667e322df5SJonathan Adams  *	     \   |  i*(i+1)		     \   |  i*(i+1)*(i+2)
677e322df5SJonathan Adams  *	c  =  >     ------- f		d  =  >     ------------- f
687e322df5SJonathan Adams  *	 n   /___|     2     n - i	 n   /___|	  6	   n - i
697e322df5SJonathan Adams  *	     i = 1			     i = 1
707e322df5SJonathan Adams  *
717e322df5SJonathan Adams  * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
727e322df5SJonathan Adams  * Since the additions are done mod (2^64), errors in the high bits may not
737e322df5SJonathan Adams  * be noticed.  For this reason, fletcher-2 is deprecated.
747e322df5SJonathan Adams  *
757e322df5SJonathan Adams  * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
767e322df5SJonathan Adams  * A conservative estimate of how big the buffer can get before we overflow
777e322df5SJonathan Adams  * can be estimated using f_i = 0xffffffff for all i:
787e322df5SJonathan Adams  *
797e322df5SJonathan Adams  * % bc
807e322df5SJonathan Adams  *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
817e322df5SJonathan Adams  * 2264
827e322df5SJonathan Adams  *  quit
837e322df5SJonathan Adams  * %
847e322df5SJonathan Adams  *
857e322df5SJonathan Adams  * So blocks of up to 2k will not overflow.  Our largest block size is
867e322df5SJonathan Adams  * 128k, which has 32k 4-byte words, so we can compute the largest possible
877e322df5SJonathan Adams  * accumulators, then divide by 2^64 to figure the max amount of overflow:
887e322df5SJonathan Adams  *
897e322df5SJonathan Adams  * % bc
907e322df5SJonathan Adams  *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
917e322df5SJonathan Adams  *  a/2^64;b/2^64;c/2^64;d/2^64
927e322df5SJonathan Adams  * 0
937e322df5SJonathan Adams  * 0
947e322df5SJonathan Adams  * 1365
957e322df5SJonathan Adams  * 11186858
967e322df5SJonathan Adams  *  quit
977e322df5SJonathan Adams  * %
987e322df5SJonathan Adams  *
997e322df5SJonathan Adams  * So a and b cannot overflow.  To make sure each bit of input has some
1007e322df5SJonathan Adams  * effect on the contents of c and d, we can look at what the factors of
1017e322df5SJonathan Adams  * the coefficients in the equations for c_n and d_n are.  The number of 2s
1027e322df5SJonathan Adams  * in the factors determines the lowest set bit in the multiplier.  Running
1037e322df5SJonathan Adams  * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
1047e322df5SJonathan Adams  * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
1057e322df5SJonathan Adams  * the 64-bit accumulators, every bit of every f_i affects every accumulator,
1067e322df5SJonathan Adams  * even for 128k blocks.
1077e322df5SJonathan Adams  *
1087e322df5SJonathan Adams  * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
1097e322df5SJonathan Adams  * we could do our calculations mod (2^32 - 1) by adding in the carries
1107e322df5SJonathan Adams  * periodically, and store the number of carries in the top 32-bits.
1117e322df5SJonathan Adams  *
1127e322df5SJonathan Adams  * --------------------
1137e322df5SJonathan Adams  * Checksum Performance
1147e322df5SJonathan Adams  * --------------------
1157e322df5SJonathan Adams  *
1167e322df5SJonathan Adams  * There are two interesting components to checksum performance: cached and
1177e322df5SJonathan Adams  * uncached performance.  With cached data, fletcher-2 is about four times
1187e322df5SJonathan Adams  * faster than fletcher-4.  With uncached data, the performance difference is
1197e322df5SJonathan Adams  * negligible, since the cost of a cache fill dominates the processing time.
1207e322df5SJonathan Adams  * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
1217e322df5SJonathan Adams  * efficient pass over the data.
1227e322df5SJonathan Adams  *
1237e322df5SJonathan Adams  * In normal operation, the data which is being checksummed is in a buffer
1247e322df5SJonathan Adams  * which has been filled either by:
1257e322df5SJonathan Adams  *
1267e322df5SJonathan Adams  *	1. a compression step, which will be mostly cached, or
1277e322df5SJonathan Adams  *	2. a bcopy() or copyin(), which will be uncached (because the
1287e322df5SJonathan Adams  *	   copy is cache-bypassing).
1297e322df5SJonathan Adams  *
1307e322df5SJonathan Adams  * For both cached and uncached data, both fletcher checksums are much faster
1317e322df5SJonathan Adams  * than sha-256, and slower than 'off', which doesn't touch the data at all.
1327e322df5SJonathan Adams  */
133fa9e4066Sahrens 
134fa9e4066Sahrens #include <sys/types.h>
135fa9e4066Sahrens #include <sys/sysmacros.h>
136fa9e4066Sahrens #include <sys/byteorder.h>
137*0886dcadSAndy Fiddaman #include <sys/simd.h>
138fa9e4066Sahrens #include <sys/spa.h>
139*0886dcadSAndy Fiddaman #include <sys/zio_checksum.h>
140*0886dcadSAndy Fiddaman #include <sys/zfs_context.h>
141770499e1SDan Kimmel #include <zfs_fletcher.h>
142fa9e4066Sahrens 
143*0886dcadSAndy Fiddaman #define	FLETCHER_MIN_SIMD_SIZE	64
144*0886dcadSAndy Fiddaman 
145*0886dcadSAndy Fiddaman #ifdef _KERNEL
146*0886dcadSAndy Fiddaman 
147*0886dcadSAndy Fiddaman #include <sys/atomic.h>
148*0886dcadSAndy Fiddaman #include <sys/disp.h>
149*0886dcadSAndy Fiddaman #define	KPREEMPT_DISABLE	kpreempt_disable()
150*0886dcadSAndy Fiddaman #define	KPREEMPT_ENABLE		kpreempt_enable()
151*0886dcadSAndy Fiddaman #define	MEMBAR_PRODUCER		membar_producer()
152*0886dcadSAndy Fiddaman 
153*0886dcadSAndy Fiddaman #else	/* _KERNEL */
154*0886dcadSAndy Fiddaman 
155*0886dcadSAndy Fiddaman #include <atomic.h>
156*0886dcadSAndy Fiddaman #include <string.h>
157*0886dcadSAndy Fiddaman #ifndef SET_ERROR
158*0886dcadSAndy Fiddaman #define	SET_ERROR(err) (err)
159*0886dcadSAndy Fiddaman #endif
160*0886dcadSAndy Fiddaman #define	KPREEMPT_DISABLE
161*0886dcadSAndy Fiddaman #define	KPREEMPT_ENABLE
162*0886dcadSAndy Fiddaman #define	MEMBAR_PRODUCER
163*0886dcadSAndy Fiddaman 
164*0886dcadSAndy Fiddaman #endif	/* _KERNEL */
165*0886dcadSAndy Fiddaman 
166*0886dcadSAndy Fiddaman static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
167*0886dcadSAndy Fiddaman static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
168*0886dcadSAndy Fiddaman static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
169*0886dcadSAndy Fiddaman     const void *buf, size_t size);
170*0886dcadSAndy Fiddaman static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
171*0886dcadSAndy Fiddaman     const void *buf, size_t size);
172*0886dcadSAndy Fiddaman static boolean_t fletcher_4_scalar_valid(void);
173*0886dcadSAndy Fiddaman 
174*0886dcadSAndy Fiddaman static const fletcher_4_ops_t fletcher_4_scalar_ops = {
175*0886dcadSAndy Fiddaman 	.init_native = fletcher_4_scalar_init,
176*0886dcadSAndy Fiddaman 	.fini_native = fletcher_4_scalar_fini,
177*0886dcadSAndy Fiddaman 	.compute_native = fletcher_4_scalar_native,
178*0886dcadSAndy Fiddaman 	.init_byteswap = fletcher_4_scalar_init,
179*0886dcadSAndy Fiddaman 	.fini_byteswap = fletcher_4_scalar_fini,
180*0886dcadSAndy Fiddaman 	.compute_byteswap = fletcher_4_scalar_byteswap,
181*0886dcadSAndy Fiddaman 	.valid = fletcher_4_scalar_valid,
182*0886dcadSAndy Fiddaman 	.uses_fpu_native = B_FALSE,
183*0886dcadSAndy Fiddaman 	.uses_fpu_byteswap = B_FALSE,
184*0886dcadSAndy Fiddaman 	.name = "scalar"
185*0886dcadSAndy Fiddaman };
186*0886dcadSAndy Fiddaman 
187*0886dcadSAndy Fiddaman static fletcher_4_ops_t fletcher_4_fastest_impl = {
188*0886dcadSAndy Fiddaman 	.name = "fastest",
189*0886dcadSAndy Fiddaman 	.valid = fletcher_4_scalar_valid
190*0886dcadSAndy Fiddaman };
191*0886dcadSAndy Fiddaman 
192*0886dcadSAndy Fiddaman static const fletcher_4_ops_t *fletcher_4_impls[] = {
193*0886dcadSAndy Fiddaman 	&fletcher_4_scalar_ops,
194*0886dcadSAndy Fiddaman 	&fletcher_4_superscalar_ops,
195*0886dcadSAndy Fiddaman 	&fletcher_4_superscalar4_ops,
196*0886dcadSAndy Fiddaman #ifdef __amd64
197*0886dcadSAndy Fiddaman 	&fletcher_4_sse2_ops,
198*0886dcadSAndy Fiddaman 	&fletcher_4_ssse3_ops,
199*0886dcadSAndy Fiddaman 	&fletcher_4_avx2_ops,
200*0886dcadSAndy Fiddaman 	&fletcher_4_avx512f_ops,
201*0886dcadSAndy Fiddaman 	&fletcher_4_avx512bw_ops,
202*0886dcadSAndy Fiddaman #endif
203*0886dcadSAndy Fiddaman };
204*0886dcadSAndy Fiddaman 
205*0886dcadSAndy Fiddaman /* Hold all supported implementations */
206*0886dcadSAndy Fiddaman static uint32_t fletcher_4_supp_impls_cnt = 0;
207*0886dcadSAndy Fiddaman static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
208*0886dcadSAndy Fiddaman 
209*0886dcadSAndy Fiddaman /* Select fletcher4 implementation */
210*0886dcadSAndy Fiddaman #define	IMPL_FASTEST		(UINT32_MAX)
211*0886dcadSAndy Fiddaman #define	IMPL_CYCLE		(UINT32_MAX - 1)
212*0886dcadSAndy Fiddaman #define	IMPL_SCALAR		(0)
213*0886dcadSAndy Fiddaman #define	IMPL_SUPERSCALAR	(1)
214*0886dcadSAndy Fiddaman #define	IMPL_SUPERSCALAR4	(2)
215*0886dcadSAndy Fiddaman 
216*0886dcadSAndy Fiddaman static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
217*0886dcadSAndy Fiddaman 
218*0886dcadSAndy Fiddaman #define	IMPL_READ(i)	(*(volatile uint32_t *) &(i))
219*0886dcadSAndy Fiddaman 
220*0886dcadSAndy Fiddaman static struct fletcher_4_impl_selector {
221*0886dcadSAndy Fiddaman 	const char	*fis_name;
222*0886dcadSAndy Fiddaman 	uint32_t	fis_sel;
223*0886dcadSAndy Fiddaman } fletcher_4_impl_selectors[] = {
224*0886dcadSAndy Fiddaman 	{ "cycle",	IMPL_CYCLE },
225*0886dcadSAndy Fiddaman 	{ "fastest",	IMPL_FASTEST },
226*0886dcadSAndy Fiddaman 	{ "scalar",	IMPL_SCALAR }
227*0886dcadSAndy Fiddaman };
228*0886dcadSAndy Fiddaman 
229*0886dcadSAndy Fiddaman #if defined(_KERNEL)
230*0886dcadSAndy Fiddaman static kstat_t *fletcher_4_kstat;
231*0886dcadSAndy Fiddaman static kstat_named_t fletcher_4_kstat_data[ARRAY_SIZE(fletcher_4_impls) * 2];
232*0886dcadSAndy Fiddaman 
233*0886dcadSAndy Fiddaman static struct fletcher_4_bench {
234*0886dcadSAndy Fiddaman 	uint64_t native;
235*0886dcadSAndy Fiddaman 	uint64_t byteswap;
236*0886dcadSAndy Fiddaman } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
237*0886dcadSAndy Fiddaman #endif
238*0886dcadSAndy Fiddaman 
239*0886dcadSAndy Fiddaman /* Indicate that benchmark has been completed */
240*0886dcadSAndy Fiddaman static boolean_t fletcher_4_initialized = B_FALSE;
241*0886dcadSAndy Fiddaman 
242fa9e4066Sahrens void
fletcher_init(zio_cksum_t * zcp)243770499e1SDan Kimmel fletcher_init(zio_cksum_t *zcp)
24466094686SDan Kimmel {
245770499e1SDan Kimmel 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
246770499e1SDan Kimmel }
247770499e1SDan Kimmel 
248770499e1SDan Kimmel int
fletcher_2_incremental_native(void * buf,size_t size,void * data)249770499e1SDan Kimmel fletcher_2_incremental_native(void *buf, size_t size, void *data)
250770499e1SDan Kimmel {
251770499e1SDan Kimmel 	zio_cksum_t *zcp = data;
252770499e1SDan Kimmel 
253fa9e4066Sahrens 	const uint64_t *ip = buf;
254fa9e4066Sahrens 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
255fa9e4066Sahrens 	uint64_t a0, b0, a1, b1;
256fa9e4066Sahrens 
257770499e1SDan Kimmel 	a0 = zcp->zc_word[0];
258770499e1SDan Kimmel 	a1 = zcp->zc_word[1];
259770499e1SDan Kimmel 	b0 = zcp->zc_word[2];
260770499e1SDan Kimmel 	b1 = zcp->zc_word[3];
261770499e1SDan Kimmel 
262770499e1SDan Kimmel 	for (; ip < ipend; ip += 2) {
263fa9e4066Sahrens 		a0 += ip[0];
264fa9e4066Sahrens 		a1 += ip[1];
265fa9e4066Sahrens 		b0 += a0;
266fa9e4066Sahrens 		b1 += a1;
267fa9e4066Sahrens 	}
268fa9e4066Sahrens 
269fa9e4066Sahrens 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
270770499e1SDan Kimmel 	return (0);
271fa9e4066Sahrens }
272fa9e4066Sahrens 
273fa9e4066Sahrens void
fletcher_2_native(const void * buf,size_t size,const void * ctx_template __unused,zio_cksum_t * zcp)274770499e1SDan Kimmel fletcher_2_native(const void *buf, size_t size,
275*0886dcadSAndy Fiddaman     const void *ctx_template __unused, zio_cksum_t *zcp)
276fa9e4066Sahrens {
277770499e1SDan Kimmel 	fletcher_init(zcp);
278770499e1SDan Kimmel 	(void) fletcher_2_incremental_native((void *) buf, size, zcp);
279770499e1SDan Kimmel }
280770499e1SDan Kimmel 
281770499e1SDan Kimmel int
fletcher_2_incremental_byteswap(void * buf,size_t size,void * data)282770499e1SDan Kimmel fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
283770499e1SDan Kimmel {
284770499e1SDan Kimmel 	zio_cksum_t *zcp = data;
285770499e1SDan Kimmel 
286fa9e4066Sahrens 	const uint64_t *ip = buf;
287fa9e4066Sahrens 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
288fa9e4066Sahrens 	uint64_t a0, b0, a1, b1;
289fa9e4066Sahrens 
290770499e1SDan Kimmel 	a0 = zcp->zc_word[0];
291770499e1SDan Kimmel 	a1 = zcp->zc_word[1];
292770499e1SDan Kimmel 	b0 = zcp->zc_word[2];
293770499e1SDan Kimmel 	b1 = zcp->zc_word[3];
294770499e1SDan Kimmel 
295770499e1SDan Kimmel 	for (; ip < ipend; ip += 2) {
296fa9e4066Sahrens 		a0 += BSWAP_64(ip[0]);
297fa9e4066Sahrens 		a1 += BSWAP_64(ip[1]);
298fa9e4066Sahrens 		b0 += a0;
299fa9e4066Sahrens 		b1 += a1;
300fa9e4066Sahrens 	}
301fa9e4066Sahrens 
302fa9e4066Sahrens 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
303770499e1SDan Kimmel 	return (0);
304fa9e4066Sahrens }
305fa9e4066Sahrens 
306fa9e4066Sahrens void
fletcher_2_byteswap(const void * buf,size_t size,const void * ctx_template __unused,zio_cksum_t * zcp)307770499e1SDan Kimmel fletcher_2_byteswap(const void *buf, size_t size,
308*0886dcadSAndy Fiddaman     const void *ctx_template __unused, zio_cksum_t *zcp)
309fa9e4066Sahrens {
310770499e1SDan Kimmel 	fletcher_init(zcp);
311770499e1SDan Kimmel 	(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
312fa9e4066Sahrens }
313fa9e4066Sahrens 
314*0886dcadSAndy Fiddaman static void
fletcher_4_scalar_init(fletcher_4_ctx_t * ctx)315*0886dcadSAndy Fiddaman fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
316fa9e4066Sahrens {
317*0886dcadSAndy Fiddaman 	ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
318*0886dcadSAndy Fiddaman }
319*0886dcadSAndy Fiddaman 
320*0886dcadSAndy Fiddaman static void
fletcher_4_scalar_fini(fletcher_4_ctx_t * ctx,zio_cksum_t * zcp)321*0886dcadSAndy Fiddaman fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
322*0886dcadSAndy Fiddaman {
323*0886dcadSAndy Fiddaman 	memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
324*0886dcadSAndy Fiddaman }
325fa9e4066Sahrens 
326*0886dcadSAndy Fiddaman static void
fletcher_4_scalar_native(fletcher_4_ctx_t * ctx,const void * buf,size_t size)327*0886dcadSAndy Fiddaman fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
328*0886dcadSAndy Fiddaman {
329ea8dc4b6Seschrock 	const uint32_t *ip = buf;
330ea8dc4b6Seschrock 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
331ea8dc4b6Seschrock 	uint64_t a, b, c, d;
332ea8dc4b6Seschrock 
333*0886dcadSAndy Fiddaman 	a = ctx->scalar.zc_word[0];
334*0886dcadSAndy Fiddaman 	b = ctx->scalar.zc_word[1];
335*0886dcadSAndy Fiddaman 	c = ctx->scalar.zc_word[2];
336*0886dcadSAndy Fiddaman 	d = ctx->scalar.zc_word[3];
337ea8dc4b6Seschrock 
338ea8dc4b6Seschrock 	for (; ip < ipend; ip++) {
339ea8dc4b6Seschrock 		a += ip[0];
340ea8dc4b6Seschrock 		b += a;
341ea8dc4b6Seschrock 		c += b;
342ea8dc4b6Seschrock 		d += c;
343ea8dc4b6Seschrock 	}
344ea8dc4b6Seschrock 
345*0886dcadSAndy Fiddaman 	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
346ea8dc4b6Seschrock }
347ea8dc4b6Seschrock 
348*0886dcadSAndy Fiddaman static void
fletcher_4_scalar_byteswap(fletcher_4_ctx_t * ctx,const void * buf,size_t size)349*0886dcadSAndy Fiddaman fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
350ea8dc4b6Seschrock {
351ea8dc4b6Seschrock 	const uint32_t *ip = buf;
352ea8dc4b6Seschrock 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
353ea8dc4b6Seschrock 	uint64_t a, b, c, d;
354ea8dc4b6Seschrock 
355*0886dcadSAndy Fiddaman 	a = ctx->scalar.zc_word[0];
356*0886dcadSAndy Fiddaman 	b = ctx->scalar.zc_word[1];
357*0886dcadSAndy Fiddaman 	c = ctx->scalar.zc_word[2];
358*0886dcadSAndy Fiddaman 	d = ctx->scalar.zc_word[3];
359ea8dc4b6Seschrock 
360ea8dc4b6Seschrock 	for (; ip < ipend; ip++) {
361ea8dc4b6Seschrock 		a += BSWAP_32(ip[0]);
362ea8dc4b6Seschrock 		b += a;
363ea8dc4b6Seschrock 		c += b;
364ea8dc4b6Seschrock 		d += c;
365ea8dc4b6Seschrock 	}
366ea8dc4b6Seschrock 
367*0886dcadSAndy Fiddaman 	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
368*0886dcadSAndy Fiddaman }
369*0886dcadSAndy Fiddaman 
370*0886dcadSAndy Fiddaman static boolean_t
fletcher_4_scalar_valid(void)371*0886dcadSAndy Fiddaman fletcher_4_scalar_valid(void)
372*0886dcadSAndy Fiddaman {
373*0886dcadSAndy Fiddaman 	return (B_TRUE);
374*0886dcadSAndy Fiddaman }
375*0886dcadSAndy Fiddaman 
376*0886dcadSAndy Fiddaman int
fletcher_4_impl_set(const char * val)377*0886dcadSAndy Fiddaman fletcher_4_impl_set(const char *val)
378*0886dcadSAndy Fiddaman {
379*0886dcadSAndy Fiddaman 	int err = EINVAL;
380*0886dcadSAndy Fiddaman 	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
381*0886dcadSAndy Fiddaman 	size_t i;
382*0886dcadSAndy Fiddaman 
383*0886dcadSAndy Fiddaman 	/* check mandatory implementations */
384*0886dcadSAndy Fiddaman 	for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
385*0886dcadSAndy Fiddaman 		const char *name = fletcher_4_impl_selectors[i].fis_name;
386*0886dcadSAndy Fiddaman 
387*0886dcadSAndy Fiddaman 		if (strcmp(val, name) == 0) {
388*0886dcadSAndy Fiddaman 			impl = fletcher_4_impl_selectors[i].fis_sel;
389*0886dcadSAndy Fiddaman 			err = 0;
390*0886dcadSAndy Fiddaman 			break;
391*0886dcadSAndy Fiddaman 		}
392*0886dcadSAndy Fiddaman 	}
393*0886dcadSAndy Fiddaman 
394*0886dcadSAndy Fiddaman 	if (err != 0 && fletcher_4_initialized) {
395*0886dcadSAndy Fiddaman 		/* check all supported implementations */
396*0886dcadSAndy Fiddaman 		for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
397*0886dcadSAndy Fiddaman 			const char *name = fletcher_4_supp_impls[i]->name;
398*0886dcadSAndy Fiddaman 
399*0886dcadSAndy Fiddaman 			if (strcmp(val, name) == 0) {
400*0886dcadSAndy Fiddaman 				impl = i;
401*0886dcadSAndy Fiddaman 				err = 0;
402*0886dcadSAndy Fiddaman 				break;
403*0886dcadSAndy Fiddaman 			}
404*0886dcadSAndy Fiddaman 		}
405*0886dcadSAndy Fiddaman 	}
406*0886dcadSAndy Fiddaman 
407*0886dcadSAndy Fiddaman 	if (err == 0) {
408*0886dcadSAndy Fiddaman 		atomic_swap_32(&fletcher_4_impl_chosen, impl);
409*0886dcadSAndy Fiddaman 		MEMBAR_PRODUCER;
410*0886dcadSAndy Fiddaman 	}
411*0886dcadSAndy Fiddaman 
412*0886dcadSAndy Fiddaman 	return (SET_ERROR(err));
413*0886dcadSAndy Fiddaman }
414*0886dcadSAndy Fiddaman 
415*0886dcadSAndy Fiddaman /*
416*0886dcadSAndy Fiddaman  * Returns the Fletcher 4 operations for checksums. When a SIMD
417*0886dcadSAndy Fiddaman  * implementation is not allowed in the current context, then fallback
418*0886dcadSAndy Fiddaman  * to the fastest generic implementation.
419*0886dcadSAndy Fiddaman  */
420*0886dcadSAndy Fiddaman static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)421*0886dcadSAndy Fiddaman fletcher_4_impl_get(void)
422*0886dcadSAndy Fiddaman {
423*0886dcadSAndy Fiddaman 	if (!kfpu_allowed())
424*0886dcadSAndy Fiddaman 		return (&fletcher_4_superscalar4_ops);
425*0886dcadSAndy Fiddaman 
426*0886dcadSAndy Fiddaman 	const fletcher_4_ops_t *ops = NULL;
427*0886dcadSAndy Fiddaman 	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
428*0886dcadSAndy Fiddaman 
429*0886dcadSAndy Fiddaman 	switch (impl) {
430*0886dcadSAndy Fiddaman 	case IMPL_FASTEST:
431*0886dcadSAndy Fiddaman 		ASSERT(fletcher_4_initialized);
432*0886dcadSAndy Fiddaman 		ops = &fletcher_4_fastest_impl;
433*0886dcadSAndy Fiddaman 		break;
434*0886dcadSAndy Fiddaman 	case IMPL_CYCLE:
435*0886dcadSAndy Fiddaman 		/* Cycle through supported implementations */
436*0886dcadSAndy Fiddaman 		ASSERT(fletcher_4_initialized);
437*0886dcadSAndy Fiddaman 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
438*0886dcadSAndy Fiddaman 
439*0886dcadSAndy Fiddaman 		static uint32_t cycle_count = 0;
440*0886dcadSAndy Fiddaman 		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
441*0886dcadSAndy Fiddaman 
442*0886dcadSAndy Fiddaman 		ops = fletcher_4_supp_impls[idx];
443*0886dcadSAndy Fiddaman 		break;
444*0886dcadSAndy Fiddaman 	default:
445*0886dcadSAndy Fiddaman 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
446*0886dcadSAndy Fiddaman 		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
447*0886dcadSAndy Fiddaman 
448*0886dcadSAndy Fiddaman 		ops = fletcher_4_supp_impls[impl];
449*0886dcadSAndy Fiddaman 		break;
450*0886dcadSAndy Fiddaman 	}
451*0886dcadSAndy Fiddaman 
452*0886dcadSAndy Fiddaman 	ASSERT3P(ops, !=, NULL);
453*0886dcadSAndy Fiddaman 
454*0886dcadSAndy Fiddaman 	return (ops);
455*0886dcadSAndy Fiddaman }
456*0886dcadSAndy Fiddaman 
457*0886dcadSAndy Fiddaman static inline void
fletcher_4_native_impl(const void * buf,size_t size,zio_cksum_t * zcp)458*0886dcadSAndy Fiddaman fletcher_4_native_impl(const void *buf, size_t size, zio_cksum_t *zcp)
459*0886dcadSAndy Fiddaman {
460*0886dcadSAndy Fiddaman 	fletcher_4_ctx_t ctx;
461*0886dcadSAndy Fiddaman 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
462*0886dcadSAndy Fiddaman 
463*0886dcadSAndy Fiddaman 	if (ops->uses_fpu_native)
464*0886dcadSAndy Fiddaman 		kfpu_begin();
465*0886dcadSAndy Fiddaman 	ops->init_native(&ctx);
466*0886dcadSAndy Fiddaman 	ops->compute_native(&ctx, buf, size);
467*0886dcadSAndy Fiddaman 	ops->fini_native(&ctx, zcp);
468*0886dcadSAndy Fiddaman 	if (ops->uses_fpu_native)
469*0886dcadSAndy Fiddaman 		kfpu_end();
470*0886dcadSAndy Fiddaman }
471*0886dcadSAndy Fiddaman 
472*0886dcadSAndy Fiddaman void
fletcher_4_native(const void * buf,size_t size,const void * ctx_template __unused,zio_cksum_t * zcp)473*0886dcadSAndy Fiddaman fletcher_4_native(const void *buf, size_t size,
474*0886dcadSAndy Fiddaman     const void *ctx_template __unused, zio_cksum_t *zcp)
475*0886dcadSAndy Fiddaman {
476*0886dcadSAndy Fiddaman 	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
477*0886dcadSAndy Fiddaman 
478*0886dcadSAndy Fiddaman 	ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
479*0886dcadSAndy Fiddaman 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
480*0886dcadSAndy Fiddaman 
481*0886dcadSAndy Fiddaman 	if (size == 0 || p2size == 0) {
482*0886dcadSAndy Fiddaman 		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
483*0886dcadSAndy Fiddaman 
484*0886dcadSAndy Fiddaman 		if (size > 0) {
485*0886dcadSAndy Fiddaman 			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
486*0886dcadSAndy Fiddaman 			    buf, size);
487*0886dcadSAndy Fiddaman 		}
488*0886dcadSAndy Fiddaman 	} else {
489*0886dcadSAndy Fiddaman 		fletcher_4_native_impl(buf, p2size, zcp);
490*0886dcadSAndy Fiddaman 
491*0886dcadSAndy Fiddaman 		if (p2size < size) {
492*0886dcadSAndy Fiddaman 			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
493*0886dcadSAndy Fiddaman 			    (char *)buf + p2size, size - p2size);
494*0886dcadSAndy Fiddaman 		}
495*0886dcadSAndy Fiddaman 	}
496*0886dcadSAndy Fiddaman }
497*0886dcadSAndy Fiddaman 
498*0886dcadSAndy Fiddaman void
fletcher_4_native_varsize(const void * buf,size_t size,zio_cksum_t * zcp)499*0886dcadSAndy Fiddaman fletcher_4_native_varsize(const void *buf, size_t size, zio_cksum_t *zcp)
500*0886dcadSAndy Fiddaman {
501*0886dcadSAndy Fiddaman 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
502*0886dcadSAndy Fiddaman 	fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
503*0886dcadSAndy Fiddaman }
504*0886dcadSAndy Fiddaman 
505*0886dcadSAndy Fiddaman static inline void
fletcher_4_byteswap_impl(const void * buf,size_t size,zio_cksum_t * zcp)506*0886dcadSAndy Fiddaman fletcher_4_byteswap_impl(const void *buf, size_t size, zio_cksum_t *zcp)
507*0886dcadSAndy Fiddaman {
508*0886dcadSAndy Fiddaman 	fletcher_4_ctx_t ctx;
509*0886dcadSAndy Fiddaman 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
510*0886dcadSAndy Fiddaman 
511*0886dcadSAndy Fiddaman 	if (ops->uses_fpu_byteswap)
512*0886dcadSAndy Fiddaman 		kfpu_begin();
513*0886dcadSAndy Fiddaman 	ops->init_byteswap(&ctx);
514*0886dcadSAndy Fiddaman 	ops->compute_byteswap(&ctx, buf, size);
515*0886dcadSAndy Fiddaman 	ops->fini_byteswap(&ctx, zcp);
516*0886dcadSAndy Fiddaman 	if (ops->uses_fpu_byteswap)
517*0886dcadSAndy Fiddaman 		kfpu_end();
518770499e1SDan Kimmel }
519770499e1SDan Kimmel 
520770499e1SDan Kimmel void
fletcher_4_byteswap(const void * buf,size_t size,const void * ctx_template __unused,zio_cksum_t * zcp)521770499e1SDan Kimmel fletcher_4_byteswap(const void *buf, size_t size,
522*0886dcadSAndy Fiddaman     const void *ctx_template __unused, zio_cksum_t *zcp)
523770499e1SDan Kimmel {
524*0886dcadSAndy Fiddaman 	const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
525*0886dcadSAndy Fiddaman 
526*0886dcadSAndy Fiddaman 	ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
527*0886dcadSAndy Fiddaman 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
528*0886dcadSAndy Fiddaman 
529*0886dcadSAndy Fiddaman 	if (size == 0 || p2size == 0) {
530*0886dcadSAndy Fiddaman 		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
531*0886dcadSAndy Fiddaman 
532*0886dcadSAndy Fiddaman 		if (size > 0) {
533*0886dcadSAndy Fiddaman 			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
534*0886dcadSAndy Fiddaman 			    buf, size);
535*0886dcadSAndy Fiddaman 		}
536*0886dcadSAndy Fiddaman 	} else {
537*0886dcadSAndy Fiddaman 		fletcher_4_byteswap_impl(buf, p2size, zcp);
538*0886dcadSAndy Fiddaman 
539*0886dcadSAndy Fiddaman 		if (p2size < size) {
540*0886dcadSAndy Fiddaman 			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
541*0886dcadSAndy Fiddaman 			    (char *)buf + p2size, size - p2size);
542*0886dcadSAndy Fiddaman 		}
543*0886dcadSAndy Fiddaman 	}
544ea8dc4b6Seschrock }
545*0886dcadSAndy Fiddaman 
546*0886dcadSAndy Fiddaman /* Incremental Fletcher 4 */
547*0886dcadSAndy Fiddaman 
548*0886dcadSAndy Fiddaman #define	ZFS_FLETCHER_4_INC_MAX_SIZE	(8ULL << 20)
549*0886dcadSAndy Fiddaman 
550*0886dcadSAndy Fiddaman static inline void
fletcher_4_incremental_combine(zio_cksum_t * zcp,const size_t size,const zio_cksum_t * nzcp)551*0886dcadSAndy Fiddaman fletcher_4_incremental_combine(zio_cksum_t *zcp, const size_t size,
552*0886dcadSAndy Fiddaman     const zio_cksum_t *nzcp)
553*0886dcadSAndy Fiddaman {
554*0886dcadSAndy Fiddaman 	const uint64_t c1 = size / sizeof (uint32_t);
555*0886dcadSAndy Fiddaman 	const uint64_t c2 = c1 * (c1 + 1) / 2;
556*0886dcadSAndy Fiddaman 	const uint64_t c3 = c2 * (c1 + 2) / 3;
557*0886dcadSAndy Fiddaman 
558*0886dcadSAndy Fiddaman 	/*
559*0886dcadSAndy Fiddaman 	 * Value of 'c3' overflows on buffer sizes close to 16MiB. For that
560*0886dcadSAndy Fiddaman 	 * reason we split incremental fletcher4 computation of large buffers
561*0886dcadSAndy Fiddaman 	 * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.
562*0886dcadSAndy Fiddaman 	 */
563*0886dcadSAndy Fiddaman 	ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
564*0886dcadSAndy Fiddaman 
565*0886dcadSAndy Fiddaman 	zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
566*0886dcadSAndy Fiddaman 	    c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
567*0886dcadSAndy Fiddaman 	zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
568*0886dcadSAndy Fiddaman 	    c2 * zcp->zc_word[0];
569*0886dcadSAndy Fiddaman 	zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
570*0886dcadSAndy Fiddaman 	zcp->zc_word[0] += nzcp->zc_word[0];
571*0886dcadSAndy Fiddaman }
572*0886dcadSAndy Fiddaman 
573*0886dcadSAndy Fiddaman static inline void
fletcher_4_incremental_impl(boolean_t native,const void * buf,size_t size,zio_cksum_t * zcp)574*0886dcadSAndy Fiddaman fletcher_4_incremental_impl(boolean_t native, const void *buf, size_t size,
575*0886dcadSAndy Fiddaman     zio_cksum_t *zcp)
576*0886dcadSAndy Fiddaman {
577*0886dcadSAndy Fiddaman 	while (size > 0) {
578*0886dcadSAndy Fiddaman 		zio_cksum_t nzc;
579*0886dcadSAndy Fiddaman 		uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
580*0886dcadSAndy Fiddaman 
581*0886dcadSAndy Fiddaman 		if (native)
582*0886dcadSAndy Fiddaman 			fletcher_4_native(buf, len, NULL, &nzc);
583*0886dcadSAndy Fiddaman 		else
584*0886dcadSAndy Fiddaman 			fletcher_4_byteswap(buf, len, NULL, &nzc);
585*0886dcadSAndy Fiddaman 
586*0886dcadSAndy Fiddaman 		fletcher_4_incremental_combine(zcp, len, &nzc);
587*0886dcadSAndy Fiddaman 
588*0886dcadSAndy Fiddaman 		size -= len;
589*0886dcadSAndy Fiddaman 		buf += len;
590*0886dcadSAndy Fiddaman 	}
591*0886dcadSAndy Fiddaman }
592*0886dcadSAndy Fiddaman 
593*0886dcadSAndy Fiddaman int
fletcher_4_incremental_native(void * buf,size_t size,void * data)594*0886dcadSAndy Fiddaman fletcher_4_incremental_native(void *buf, size_t size, void *data)
595*0886dcadSAndy Fiddaman {
596*0886dcadSAndy Fiddaman 	zio_cksum_t *zcp = data;
597*0886dcadSAndy Fiddaman 
598*0886dcadSAndy Fiddaman 	/* Use scalar impl to directly update cksum of small blocks */
599*0886dcadSAndy Fiddaman 	if (size < SPA_MINBLOCKSIZE)
600*0886dcadSAndy Fiddaman 		fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
601*0886dcadSAndy Fiddaman 	else
602*0886dcadSAndy Fiddaman 		fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
603*0886dcadSAndy Fiddaman 	return (0);
604*0886dcadSAndy Fiddaman }
605*0886dcadSAndy Fiddaman 
606*0886dcadSAndy Fiddaman int
fletcher_4_incremental_byteswap(void * buf,size_t size,void * data)607*0886dcadSAndy Fiddaman fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
608*0886dcadSAndy Fiddaman {
609*0886dcadSAndy Fiddaman 	zio_cksum_t *zcp = data;
610*0886dcadSAndy Fiddaman 
611*0886dcadSAndy Fiddaman 	/* Use scalar impl to directly update cksum of small blocks */
612*0886dcadSAndy Fiddaman 	if (size < SPA_MINBLOCKSIZE)
613*0886dcadSAndy Fiddaman 		fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
614*0886dcadSAndy Fiddaman 	else
615*0886dcadSAndy Fiddaman 		fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
616*0886dcadSAndy Fiddaman 	return (0);
617*0886dcadSAndy Fiddaman }
618*0886dcadSAndy Fiddaman 
/*
 * Copy the init, fini and compute hooks and the uses_fpu flag for one
 * byte order from the implementation `src' points to into
 * fletcher_4_fastest_impl.  `type' is pasted onto the member names
 * (e.g. native, byteswap).
 *
 * The body is wrapped in do { } while (0) so that the macro followed by
 * a semicolon forms exactly one statement, which keeps it safe inside an
 * unbraced if/else.  `src' is parenthesized so that any pointer-valued
 * expression can be passed; note that it is evaluated once per member.
 */
#define	FLETCHER_4_FASTEST_FN_COPY(type, src)				  \
do {									  \
	fletcher_4_fastest_impl.init_ ## type = (src)->init_ ## type;	  \
	fletcher_4_fastest_impl.fini_ ## type = (src)->fini_ ## type;	  \
	fletcher_4_fastest_impl.compute_ ## type = (src)->compute_ ## type; \
	fletcher_4_fastest_impl.uses_fpu_ ## type = (src)->uses_fpu_ ## type; \
} while (0)
626*0886dcadSAndy Fiddaman 
627*0886dcadSAndy Fiddaman #define	FLETCHER_4_BENCH_NS	(MSEC2NSEC(1))		/* 1ms */
628*0886dcadSAndy Fiddaman 
629*0886dcadSAndy Fiddaman typedef void fletcher_checksum_func_t(const void *, size_t, const void *,
630*0886dcadSAndy Fiddaman     zio_cksum_t *);
631*0886dcadSAndy Fiddaman 
#if defined(_KERNEL)
/*
 * Benchmark every supported implementation for one byte order.
 *
 * Each implementation is selected in turn and run over `data'
 * (`data_size' bytes) in batches of 32 passes until at least
 * FLETCHER_4_BENCH_NS has elapsed.  The measured bandwidth in bytes per
 * second is stored in that implementation's fletcher_4_stat_data[] entry.
 * The index of the fastest one is stored in the entry just past the
 * supported implementations, and its hooks for this byte order are copied
 * into fletcher_4_fastest_impl.  The previously chosen implementation is
 * restored before returning.
 */
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, size_t data_size)
{
	struct fletcher_4_bench *winner =
	    &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
	uint32_t saved_sel = IMPL_READ(fletcher_4_impl_chosen);
	fletcher_checksum_func_t *cksum_fn;
	uint64_t best_bw = 0;
	zio_cksum_t scratch;
	uint32_t idx;

	if (native)
		cksum_fn = fletcher_4_native;
	else
		cksum_fn = fletcher_4_byteswap;

	for (idx = 0; idx < fletcher_4_supp_impls_cnt; idx++) {
		struct fletcher_4_bench *cur = &fletcher_4_stat_data[idx];
		uint64_t passes = 0;
		uint64_t elapsed_ns, bw;
		hrtime_t t0;
		uint32_t rep;

		/* Make this implementation the active one while timing it */
		fletcher_4_impl_chosen = idx;

		KPREEMPT_DISABLE;
		t0 = gethrtime();
		do {
			for (rep = 0; rep < 32; rep++) {
				cksum_fn(data, data_size, NULL, &scratch);
				passes++;
			}

			elapsed_ns = gethrtime() - t0;
		} while (elapsed_ns < FLETCHER_4_BENCH_NS);
		KPREEMPT_ENABLE;

		/* bytes processed, scaled to bytes per second */
		bw = data_size * passes * NANOSEC;
		bw /= elapsed_ns;

		if (native)
			cur->native = bw;
		else
			cur->byteswap = bw;

		if (bw <= best_bw)
			continue;

		/* New leader: remember its index and adopt its hooks */
		best_bw = bw;
		if (native) {
			winner->native = idx;
			FLETCHER_4_FASTEST_FN_COPY(native,
			    fletcher_4_supp_impls[idx]);
		} else {
			winner->byteswap = idx;
			FLETCHER_4_FASTEST_FN_COPY(byteswap,
			    fletcher_4_supp_impls[idx]);
		}
	}

	/* Put back the selection that was active on entry */
	atomic_swap_32(&fletcher_4_impl_chosen, saved_sel);
}
#endif /* _KERNEL */
690*0886dcadSAndy Fiddaman 
691*0886dcadSAndy Fiddaman /*
692*0886dcadSAndy Fiddaman  * Initialize and benchmark all supported implementations.
693*0886dcadSAndy Fiddaman  */
694*0886dcadSAndy Fiddaman static void
fletcher_4_benchmark(void)695*0886dcadSAndy Fiddaman fletcher_4_benchmark(void)
696*0886dcadSAndy Fiddaman {
697*0886dcadSAndy Fiddaman 	fletcher_4_ops_t *curr_impl;
698*0886dcadSAndy Fiddaman 	int i, c;
699*0886dcadSAndy Fiddaman 
700*0886dcadSAndy Fiddaman 	/* Move supported implementations into fletcher_4_supp_impls */
701*0886dcadSAndy Fiddaman 	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
702*0886dcadSAndy Fiddaman 		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
703*0886dcadSAndy Fiddaman 
704*0886dcadSAndy Fiddaman 		if (curr_impl->valid && curr_impl->valid())
705*0886dcadSAndy Fiddaman 			fletcher_4_supp_impls[c++] = curr_impl;
706*0886dcadSAndy Fiddaman 	}
707*0886dcadSAndy Fiddaman 	MEMBAR_PRODUCER;	/* complete fletcher_4_supp_impls[] init */
708*0886dcadSAndy Fiddaman 	fletcher_4_supp_impls_cnt = c;	/* number of supported impl */
709*0886dcadSAndy Fiddaman 
710*0886dcadSAndy Fiddaman #if defined(_KERNEL)
711*0886dcadSAndy Fiddaman 	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
712*0886dcadSAndy Fiddaman 	char *databuf = kmem_alloc(data_size, KM_SLEEP);
713*0886dcadSAndy Fiddaman 
714*0886dcadSAndy Fiddaman 	for (i = 0; i < data_size / sizeof (uint64_t); i++)
715*0886dcadSAndy Fiddaman 		((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
716*0886dcadSAndy Fiddaman 
717*0886dcadSAndy Fiddaman 	fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
718*0886dcadSAndy Fiddaman 	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
719*0886dcadSAndy Fiddaman 
720*0886dcadSAndy Fiddaman 	kmem_free(databuf, data_size);
721*0886dcadSAndy Fiddaman #else
722*0886dcadSAndy Fiddaman 	/*
723*0886dcadSAndy Fiddaman 	 * Skip the benchmark in user space to avoid impacting libzpool
724*0886dcadSAndy Fiddaman 	 * consumers (zdb, zhack, zinject, ztest). The last implementation
725*0886dcadSAndy Fiddaman 	 * is assumed to be the fastest and used by default.
726*0886dcadSAndy Fiddaman 	 */
727*0886dcadSAndy Fiddaman 	memcpy(&fletcher_4_fastest_impl,
728*0886dcadSAndy Fiddaman 	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
729*0886dcadSAndy Fiddaman 	    sizeof (fletcher_4_fastest_impl));
730*0886dcadSAndy Fiddaman 	fletcher_4_fastest_impl.name = "fastest";
731*0886dcadSAndy Fiddaman #endif /* _KERNEL */
732*0886dcadSAndy Fiddaman }
733*0886dcadSAndy Fiddaman 
734*0886dcadSAndy Fiddaman void
fletcher_4_init(void)735*0886dcadSAndy Fiddaman fletcher_4_init(void)
736*0886dcadSAndy Fiddaman {
737*0886dcadSAndy Fiddaman 	/* Determine the fastest available implementation. */
738*0886dcadSAndy Fiddaman 	fletcher_4_benchmark();
739*0886dcadSAndy Fiddaman 
740*0886dcadSAndy Fiddaman #if defined(_KERNEL)
741*0886dcadSAndy Fiddaman 	/* install kstats for all implementations */
742*0886dcadSAndy Fiddaman 	for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; i++) {
743*0886dcadSAndy Fiddaman 		struct fletcher_4_bench *stat = &fletcher_4_stat_data[i];
744*0886dcadSAndy Fiddaman 		const fletcher_4_ops_t *ops = fletcher_4_supp_impls[i];
745*0886dcadSAndy Fiddaman 		kstat_named_t *kstat_native = &fletcher_4_kstat_data[i * 2];
746*0886dcadSAndy Fiddaman 		kstat_named_t *kstat_byteswap =
747*0886dcadSAndy Fiddaman 		    &fletcher_4_kstat_data[i * 2 + 1];
748*0886dcadSAndy Fiddaman 
749*0886dcadSAndy Fiddaman 		(void) snprintf(kstat_native->name,
750*0886dcadSAndy Fiddaman 		    sizeof (kstat_native->name), "%s_native", ops->name);
751*0886dcadSAndy Fiddaman 		kstat_native->data_type = KSTAT_DATA_UINT64;
752*0886dcadSAndy Fiddaman 		kstat_native->value.ui64 = stat->native;
753*0886dcadSAndy Fiddaman 
754*0886dcadSAndy Fiddaman 		(void) snprintf(kstat_byteswap->name,
755*0886dcadSAndy Fiddaman 		    sizeof (kstat_byteswap->name), "%s_byteswap", ops->name);
756*0886dcadSAndy Fiddaman 		kstat_byteswap->data_type = KSTAT_DATA_UINT64;
757*0886dcadSAndy Fiddaman 		kstat_byteswap->value.ui64 = stat->byteswap;
758*0886dcadSAndy Fiddaman 	}
759*0886dcadSAndy Fiddaman 
760*0886dcadSAndy Fiddaman 	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
761*0886dcadSAndy Fiddaman 	    KSTAT_TYPE_NAMED, ARRAY_SIZE(fletcher_4_supp_impls) * 2,
762*0886dcadSAndy Fiddaman 	    KSTAT_FLAG_VIRTUAL);
763*0886dcadSAndy Fiddaman 
764*0886dcadSAndy Fiddaman 	if (fletcher_4_kstat != NULL) {
765*0886dcadSAndy Fiddaman 		fletcher_4_kstat->ks_data = fletcher_4_kstat_data;
766*0886dcadSAndy Fiddaman 		kstat_install(fletcher_4_kstat);
767*0886dcadSAndy Fiddaman 	}
768*0886dcadSAndy Fiddaman #endif
769*0886dcadSAndy Fiddaman 
770*0886dcadSAndy Fiddaman 	/* Finish initialization */
771*0886dcadSAndy Fiddaman 	fletcher_4_initialized = B_TRUE;
772*0886dcadSAndy Fiddaman }
773*0886dcadSAndy Fiddaman 
/*
 * Tear down what fletcher_4_init() set up: in the kernel, delete the
 * benchmark kstat if one was created.  A no-op in user space.
 */
void
fletcher_4_fini(void)
{
#if defined(_KERNEL)
	if (fletcher_4_kstat == NULL)
		return;

	kstat_delete(fletcher_4_kstat);
	fletcher_4_kstat = NULL;
#endif
}
784*0886dcadSAndy Fiddaman 
785*0886dcadSAndy Fiddaman /* ABD adapters */
786*0886dcadSAndy Fiddaman 
787*0886dcadSAndy Fiddaman static void
abd_fletcher_4_init(zio_abd_checksum_data_t * cdp)788*0886dcadSAndy Fiddaman abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
789*0886dcadSAndy Fiddaman {
790*0886dcadSAndy Fiddaman 	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
791*0886dcadSAndy Fiddaman 	cdp->acd_private = (void *) ops;
792*0886dcadSAndy Fiddaman 
793*0886dcadSAndy Fiddaman 	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
794*0886dcadSAndy Fiddaman 		if (ops->uses_fpu_native)
795*0886dcadSAndy Fiddaman 			kfpu_begin();
796*0886dcadSAndy Fiddaman 		ops->init_native(cdp->acd_ctx);
797*0886dcadSAndy Fiddaman 	} else {
798*0886dcadSAndy Fiddaman 		if (ops->uses_fpu_byteswap)
799*0886dcadSAndy Fiddaman 			kfpu_begin();
800*0886dcadSAndy Fiddaman 		ops->init_byteswap(cdp->acd_ctx);
801*0886dcadSAndy Fiddaman 	}
802*0886dcadSAndy Fiddaman }
803*0886dcadSAndy Fiddaman 
804*0886dcadSAndy Fiddaman static void
abd_fletcher_4_fini(zio_abd_checksum_data_t * cdp)805*0886dcadSAndy Fiddaman abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
806*0886dcadSAndy Fiddaman {
807*0886dcadSAndy Fiddaman 	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
808*0886dcadSAndy Fiddaman 
809*0886dcadSAndy Fiddaman 	ASSERT(ops);
810*0886dcadSAndy Fiddaman 
811*0886dcadSAndy Fiddaman 	if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
812*0886dcadSAndy Fiddaman 		ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
813*0886dcadSAndy Fiddaman 		if (ops->uses_fpu_native)
814*0886dcadSAndy Fiddaman 			kfpu_end();
815*0886dcadSAndy Fiddaman 	} else {
816*0886dcadSAndy Fiddaman 		ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
817*0886dcadSAndy Fiddaman 		if (ops->uses_fpu_byteswap)
818*0886dcadSAndy Fiddaman 			kfpu_end();
819*0886dcadSAndy Fiddaman 	}
820*0886dcadSAndy Fiddaman }
821*0886dcadSAndy Fiddaman 
822*0886dcadSAndy Fiddaman static void
abd_fletcher_4_simd2scalar(boolean_t native,void * data,size_t size,zio_abd_checksum_data_t * cdp)823*0886dcadSAndy Fiddaman abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
824*0886dcadSAndy Fiddaman     zio_abd_checksum_data_t *cdp)
825*0886dcadSAndy Fiddaman {
826*0886dcadSAndy Fiddaman 	zio_cksum_t *zcp = cdp->acd_zcp;
827*0886dcadSAndy Fiddaman 
828*0886dcadSAndy Fiddaman 	ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
829*0886dcadSAndy Fiddaman 
830*0886dcadSAndy Fiddaman 	abd_fletcher_4_fini(cdp);
831*0886dcadSAndy Fiddaman 	cdp->acd_private = (void *)&fletcher_4_scalar_ops;
832*0886dcadSAndy Fiddaman 
833*0886dcadSAndy Fiddaman 	if (native)
834*0886dcadSAndy Fiddaman 		fletcher_4_incremental_native(data, size, zcp);
835*0886dcadSAndy Fiddaman 	else
836*0886dcadSAndy Fiddaman 		fletcher_4_incremental_byteswap(data, size, zcp);
837*0886dcadSAndy Fiddaman }
838*0886dcadSAndy Fiddaman 
839*0886dcadSAndy Fiddaman static int
abd_fletcher_4_iter(void * data,size_t size,void * private)840*0886dcadSAndy Fiddaman abd_fletcher_4_iter(void *data, size_t size, void *private)
841*0886dcadSAndy Fiddaman {
842*0886dcadSAndy Fiddaman 	zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
843*0886dcadSAndy Fiddaman 	fletcher_4_ctx_t *ctx = cdp->acd_ctx;
844*0886dcadSAndy Fiddaman 	fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
845*0886dcadSAndy Fiddaman 	boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
846*0886dcadSAndy Fiddaman 	uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
847*0886dcadSAndy Fiddaman 
848*0886dcadSAndy Fiddaman 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
849*0886dcadSAndy Fiddaman 
850*0886dcadSAndy Fiddaman 	if (asize > 0) {
851*0886dcadSAndy Fiddaman 		if (native)
852*0886dcadSAndy Fiddaman 			ops->compute_native(ctx, data, asize);
853*0886dcadSAndy Fiddaman 		else
854*0886dcadSAndy Fiddaman 			ops->compute_byteswap(ctx, data, asize);
855*0886dcadSAndy Fiddaman 
856*0886dcadSAndy Fiddaman 		size -= asize;
857*0886dcadSAndy Fiddaman 		data = (char *)data + asize;
858*0886dcadSAndy Fiddaman 	}
859*0886dcadSAndy Fiddaman 
860*0886dcadSAndy Fiddaman 	if (size > 0) {
861*0886dcadSAndy Fiddaman 		ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
862*0886dcadSAndy Fiddaman 		/* At this point we have to switch to scalar impl */
863*0886dcadSAndy Fiddaman 		abd_fletcher_4_simd2scalar(native, data, size, cdp);
864*0886dcadSAndy Fiddaman 	}
865*0886dcadSAndy Fiddaman 
866*0886dcadSAndy Fiddaman 	return (0);
867*0886dcadSAndy Fiddaman }
868*0886dcadSAndy Fiddaman 
869*0886dcadSAndy Fiddaman zio_abd_checksum_func_t fletcher_4_abd_ops = {
870*0886dcadSAndy Fiddaman 	.acf_init = abd_fletcher_4_init,
871*0886dcadSAndy Fiddaman 	.acf_fini = abd_fletcher_4_fini,
872*0886dcadSAndy Fiddaman 	.acf_iter = abd_fletcher_4_iter
873*0886dcadSAndy Fiddaman };
874