/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 */

/*
 * ARC buffer data (ABD).
 *
 * ABDs are an abstract data structure for the ARC which can use two
 * different ways of storing the underlying data:
 *
 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
 *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
 *
 *         +-------------------+
 *         | ABD (linear)      |
 *         |   abd_flags = ... |
 *         |   abd_size = ...  |     +--------------------------------+
 *         |   abd_buf ------------->| raw buffer of size abd_size    |
 *         +-------------------+     +--------------------------------+
 *              no abd_chunks
 *
 * (b) Scattered buffer. In this case, the data in the ABD is split into
 *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
 *     to the chunks recorded in an array at the end of the ABD structure.
 *
 *         +-------------------+
 *         | ABD (scattered)   |
 *         |   abd_flags = ... |
 *         |   abd_size = ...  |
 *         |   abd_offset = 0  |                           +-----------+
 *         |   abd_chunks[0] ----------------------------->| chunk 0   |
 *         |   abd_chunks[1] ---------------------+        +-----------+
 *         |   ...             |                  |        +-----------+
 *         |   abd_chunks[N-1] ---------+         +------->| chunk 1   |
 *         +-------------------+        |                  +-----------+
 *                                      |                      ...
 *                                      |                  +-----------+
 *                                      +----------------->| chunk N-1 |
 *                                                         +-----------+
 *
 * Using a large proportion of scattered ABDs decreases ARC fragmentation:
 * when we are at the limit of allocatable space, equal-size chunks allow us
 * to quickly reclaim enough space for a new large allocation (assuming it is
 * also scattered).
 *
 * In addition to directly allocating a linear or scattered ABD, it is also
 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
 * within an existing ABD. In linear buffers this is simple (set abd_buf of
 * the new ABD to the starting point within the original raw buffer), but
 * scattered ABDs are a little more complex. The new ABD makes a copy of the
 * relevant abd_chunks pointers (but not the underlying data). However, to
 * provide arbitrary rather than only chunk-aligned starting offsets, it also
 * tracks an abd_offset field which represents the starting point of the data
 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
 * creating an offset ABD marks the original ABD as the offset's parent, and the
 * original ABD's abd_children refcount is incremented. This data allows us to
 * ensure the root ABD isn't deleted before its children.
 *
 * Most consumers should never need to know what type of ABD they're using --
 * the ABD public API ensures that it's possible to transparently switch from
 * using a linear ABD to a scattered one when doing so would be beneficial.
 *
 * If you need to use the data within an ABD directly, and you know it's
 * linear (because you allocated it), you can use abd_to_buf() to access the
 * underlying raw buffer. Otherwise, you should use one of the abd_borrow_buf*
 * functions, which will allocate a raw buffer if necessary. Use the
 * abd_return_buf* functions to return any raw buffers that are no longer
 * needed when you're done using them.
 *
 * There are a variety of ABD APIs that implement basic buffer operations:
 * compare, copy, read, write, and fill with zeroes. If you need a custom
 * function which progressively accesses the whole ABD, use the abd_iterate_*
 * functions.
 */

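/*
 * Illustrative sketch (hypothetical consumer code, not part of this file):
 * allocate an ABD without caring about its layout, fill it from an existing
 * buffer, and later read it back.  "src", "dst" and "len" are assumed to be
 * provided by the caller.
 *
 *	abd_t *abd = abd_alloc(len, B_FALSE);
 *	abd_copy_from_buf(abd, src, len);
 *	...
 *	abd_copy_to_buf(dst, abd, len);
 *	abd_free(abd);
 *
 * Consumers that need the raw memory itself (e.g. to hand to code that does
 * not understand ABDs) should use abd_borrow_buf_copy()/abd_return_buf()
 * rather than assuming the ABD is linear.
 */
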
#include <sys/abd.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",				KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",			KSTAT_DATA_UINT64 },
};

#define	ABDSTAT(stat)		(abd_stats.stat.value.ui64)
#define	ABDSTAT_INCR(stat, val) \
	atomic_add_64(&abd_stats.stat.value.ui64, (val))
#define	ABDSTAT_BUMP(stat)	ABDSTAT_INCR(stat, 1)
#define	ABDSTAT_BUMPDOWN(stat)	ABDSTAT_INCR(stat, -1)

/*
 * It is possible to make all future ABDs be linear by setting this to B_FALSE.
 * Otherwise, ABDs are allocated scattered by default unless the caller uses
 * abd_alloc_linear().
 */
boolean_t zfs_abd_scatter_enabled = B_TRUE;

/*
 * zfs_abd_scatter_min_size is the minimum allocation size at which scatter
 * ABDs are used.  Smaller allocations use linear ABDs, which are backed by
 * zio_[data_]buf_alloc().
 *
 * Scatter ABDs use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. a 2KB scatter allocation wastes
 * half of each page).  Using linear ABDs for small allocations means that
 * they will be put on slabs which contain many allocations.  This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * SELinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for anything larger (1.5KB and up).
 */
int zfs_abd_scatter_min_size = 512 * 3;
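
/*
 * A concrete example of the trade-off above, using the default 4KB chunk
 * size: a 512B scatter allocation would consume a whole 4KB chunk and waste
 * 7/8 of it.  The same 512B allocated linear shares a slab with ~15 other
 * buffers, wasting nothing up front, but a single live buffer can keep the
 * entire slab (and its underlying pages) from being reclaimed.
 */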

/*
 * The size of the chunks ABD allocates. Because the sizes allocated from the
 * kmem_cache can't change, this tunable can only be modified at boot. Changing
 * it at runtime would cause ABD iteration to work incorrectly for ABDs which
 * were allocated with the old size, so a safeguard has been put in place which
 * will cause the machine to panic if you change it and try to access the data
 * within a scattered ABD.
 */
size_t zfs_abd_chunk_size = 4096;

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;

extern inline boolean_t abd_is_linear(abd_t *abd);
extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
extern inline void abd_copy_to_buf(void *buf, abd_t *abd, size_t size);
extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
extern inline void abd_zero(abd_t *abd, size_t size);

static void *
abd_alloc_chunk()
{
	void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
	ASSERT3P(c, !=, NULL);
	return (c);
}

static void
abd_free_chunk(void *c)
{
	kmem_cache_free(abd_chunk_cache, c);
}

void
abd_init(void)
{
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif

	/*
	 * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
	 * so that no allocator metadata is stored with the buffers.
	 */
	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
	    NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		abd_ksp->ks_data = &abd_stats;
		kstat_install(abd_ksp);
	}
}

void
abd_fini(void)
{
	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	kmem_cache_destroy(abd_chunk_cache);
	abd_chunk_cache = NULL;
}

static inline size_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
}
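
/*
 * For example, with the default 4KB zfs_abd_chunk_size,
 * abd_chunkcnt_for_bytes(4096) is 1, abd_chunkcnt_for_bytes(4097) is 2, and
 * abd_chunkcnt_for_bytes(12288) is 3.
 */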

static inline size_t
abd_scatter_chunkcnt(abd_t *abd)
{
	ASSERT(!abd_is_linear(abd));
	return (abd_chunkcnt_for_bytes(
	    abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
}

static inline void
abd_verify(abd_t *abd)
{
	ASSERT3U(abd->abd_size, >, 0);
	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {
		ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
	} else {
		ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
		    zfs_abd_chunk_size);
		size_t n = abd_scatter_chunkcnt(abd);
		for (int i = 0; i < n; i++) {
			ASSERT3P(
			    abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
		}
	}
}

static inline abd_t *
abd_alloc_struct(size_t chunkcnt)
{
	size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
	abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, size);

	return (abd);
}

static inline void
abd_free_struct(abd_t *abd)
{
	size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
	int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
	kmem_free(abd, size);
	ABDSTAT_INCR(abdstat_struct_size, -size);
}

/*
 * Allocate an ABD, along with its own underlying data buffers. Use this if you
 * don't care whether the ABD is linear or not.
 */
abd_t *
abd_alloc(size_t size, boolean_t is_metadata)
{
	/* see the comment above zfs_abd_scatter_min_size */
	if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size)
		return (abd_alloc_linear(size, is_metadata));

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	size_t n = abd_chunkcnt_for_bytes(size);
	abd_t *abd = abd_alloc_struct(n);

	abd->abd_flags = ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	abd->abd_u.abd_scatter.abd_offset = 0;
	abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;

	for (int i = 0; i < n; i++) {
		void *c = abd_alloc_chunk();
		ASSERT3P(c, !=, NULL);
		abd->abd_u.abd_scatter.abd_chunks[i] = c;
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, size);
	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
	    n * zfs_abd_chunk_size - size);

	return (abd);
}

static void
abd_free_scatter(abd_t *abd)
{
	size_t n = abd_scatter_chunkcnt(abd);
	for (int i = 0; i < n; i++) {
		abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
	}

	zfs_refcount_destroy(&abd->abd_children);
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
	    abd->abd_size - n * zfs_abd_chunk_size);

	abd_free_struct(abd);
}

/*
 * Allocate an ABD that must be linear, along with its own underlying data
 * buffer. Only use this when it would be very annoying to write your ABD
 * consumer with a scattered ABD.
 */
abd_t *
abd_alloc_linear(size_t size, boolean_t is_metadata)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	if (is_metadata) {
		abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
	} else {
		abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
	}

	ABDSTAT_BUMP(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, size);

	return (abd);
}

static void
abd_free_linear(abd_t *abd)
{
	if (abd->abd_flags & ABD_FLAG_META) {
		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
	} else {
		zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
	}

	zfs_refcount_destroy(&abd->abd_children);
	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);

	abd_free_struct(abd);
}

/*
 * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
 * abd_alloc_linear().
 */
void
abd_free(abd_t *abd)
{
	abd_verify(abd);
	ASSERT3P(abd->abd_parent, ==, NULL);
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd))
		abd_free_linear(abd);
	else
		abd_free_scatter(abd);
}

/*
 * Allocate an ABD of the same format (same metadata flag, same scatterize
 * setting) as another ABD.
 */
abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
	if (abd_is_linear(sabd)) {
		return (abd_alloc_linear(size, is_metadata));
	} else {
		return (abd_alloc(size, is_metadata));
	}
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, then we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * Currently this is linear ABDs; however, if ldi_strategy() can ever issue
 * I/Os using a scatter/gather list, we should switch to that and replace this
 * call with vanilla abd_alloc().
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc_linear(size, is_metadata));
}

/*
 * Allocate a new ABD to point to offset off of sabd. It shares the underlying
 * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
 * any derived ABDs exist.
 */
/* ARGSUSED */
static inline abd_t *
abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
{
	abd_t *abd;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	if (abd_is_linear(sabd)) {
		abd = abd_alloc_struct(0);

		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags = ABD_FLAG_LINEAR;

		abd->abd_u.abd_linear.abd_buf =
		    (char *)sabd->abd_u.abd_linear.abd_buf + off;
	} else {
		size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
		size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
		    (new_offset / zfs_abd_chunk_size);

		abd = abd_alloc_struct(chunkcnt);

		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags = 0;

		abd->abd_u.abd_scatter.abd_offset =
		    new_offset % zfs_abd_chunk_size;
		abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;

		/* Copy the scatterlist starting at the correct offset */
		(void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
		    &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
		    zfs_abd_chunk_size],
		    chunkcnt * sizeof (void *));
	}

	abd->abd_size = sabd->abd_size - off;
	abd->abd_parent = sabd;
	zfs_refcount_create(&abd->abd_children);
	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);

	return (abd);
}
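
/*
 * Worked example for the scatter case above, assuming the default 4KB chunk
 * size: taking an offset of 6000 bytes into a scatter ABD whose own
 * abd_offset is 512 gives new_offset = 6512.  The child ABD therefore skips
 * the parent's first chunk (6512 / 4096 = 1), copies the remaining chunk
 * pointers, and starts at abd_offset 6512 % 4096 = 2416 within its first
 * chunk.
 */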

abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
	size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;

	VERIFY3U(size, >, 0);

	return (abd_get_offset_impl(sabd, off, size));
}

abd_t *
abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
{
	ASSERT3U(off + size, <=, sabd->abd_size);

	return (abd_get_offset_impl(sabd, off, size));
}


/*
 * Allocate a linear ABD structure for buf. You must free this with abd_put()
 * since the resulting ABD doesn't own its own buffer.
 */
abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*
	 * Even if this buf is filesystem metadata, we only track that if we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd->abd_flags = ABD_FLAG_LINEAR;
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	abd->abd_u.abd_linear.abd_buf = buf;

	return (abd);
}

/*
 * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
 * free the underlying scatterlist or buffer.
 */
void
abd_put(abd_t *abd)
{
	abd_verify(abd);
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));

	if (abd->abd_parent != NULL) {
		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
		    abd->abd_size, abd);
	}

	zfs_refcount_destroy(&abd->abd_children);
	abd_free_struct(abd);
}

/*
 * Get the raw buffer associated with a linear ABD.
 */
void *
abd_to_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	abd_verify(abd);
	return (abd->abd_u.abd_linear.abd_buf);
}

/*
 * Borrow a raw buffer from an ABD without copying the contents of the ABD
 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
 * whose contents are undefined. To copy over the existing data in the ABD, use
 * abd_borrow_buf_copy() instead.
 */
void *
abd_borrow_buf(abd_t *abd, size_t n)
{
	void *buf;
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
	if (abd_is_linear(abd)) {
		buf = abd_to_buf(abd);
	} else {
		buf = zio_buf_alloc(n);
	}
	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);

	return (buf);
}

void *
abd_borrow_buf_copy(abd_t *abd, size_t n)
{
	void *buf = abd_borrow_buf(abd, n);
	if (!abd_is_linear(abd)) {
		abd_copy_to_buf(buf, abd, n);
	}
	return (buf);
}

/*
 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
 * not change the contents of the ABD and will ASSERT that you didn't modify
 * the buffer since it was borrowed. If you want any changes you made to buf to
 * be copied back to abd, use abd_return_buf_copy() instead.
 */
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
{
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
	if (abd_is_linear(abd)) {
		ASSERT3P(buf, ==, abd_to_buf(abd));
	} else {
		ASSERT0(abd_cmp_buf(abd, buf, n));
		zio_buf_free(buf, n);
	}
	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
}

void
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
{
	if (!abd_is_linear(abd)) {
		abd_copy_from_buf(abd, buf, n);
	}
	abd_return_buf(abd, buf, n);
}
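
/*
 * Illustrative sketch: borrowing a raw buffer in order to overwrite an ABD's
 * contents in place.  Since the old contents are not needed, abd_borrow_buf()
 * is used (nothing is copied in), and abd_return_buf_copy() copies the new
 * data back out to the ABD.  fill_with_new_data() and "size" are
 * hypothetical.
 *
 *	void *buf = abd_borrow_buf(abd, size);
 *	fill_with_new_data(buf, size);
 *	abd_return_buf_copy(abd, buf, size);
 */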

/*
 * Give this ABD ownership of the buffer that it's storing. Can only be used on
 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
 * with abd_alloc_linear() which subsequently released ownership of their buf
 * with abd_release_ownership_of_buf().
 */
void
abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
	abd_verify(abd);

	abd->abd_flags |= ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}

	ABDSTAT_BUMP(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
}

void
abd_release_ownership_of_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
	abd_verify(abd);

	abd->abd_flags &= ~ABD_FLAG_OWNER;
	/* Disable this flag since we no longer own the data buffer */
	abd->abd_flags &= ~ABD_FLAG_META;

	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
}
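
/*
 * Illustrative sketch of the ownership transfer above: wrap an existing raw
 * buffer, hand responsibility for it to the ABD, and free everything through
 * the ABD interface.  "buf" and "size" are assumed to come from the caller,
 * and the buffer must have been allocated with zio_[data_]buf_alloc() for
 * abd_free() to release it correctly.
 *
 *	abd_t *abd = abd_get_from_buf(buf, size);
 *	abd_take_ownership_of_buf(abd, B_FALSE);
 *	...
 *	abd_free(abd);
 */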

struct abd_iter {
	abd_t		*iter_abd;	/* ABD being iterated through */
	size_t		iter_pos;	/* position (relative to abd_offset) */
	void		*iter_mapaddr;	/* addr corresponding to iter_pos */
	size_t		iter_mapsize;	/* length of data valid at mapaddr */
};

static inline size_t
abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
{
	ASSERT(!abd_is_linear(aiter->iter_abd));
	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
	    aiter->iter_pos) % zfs_abd_chunk_size);
}

static inline size_t
abd_iter_scatter_chunk_index(struct abd_iter *aiter)
{
	ASSERT(!abd_is_linear(aiter->iter_abd));
	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
	    aiter->iter_pos) / zfs_abd_chunk_size);
}

/*
 * Initialize the abd_iter.
 */
static void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_pos = 0;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the iterator is already exhausted,
 * in which case this does nothing.
 */
static void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	aiter->iter_pos += amount;
}

/*
 * Map the current chunk into aiter. This can be safely called when the
 * iterator is already exhausted, in which case this does nothing.
 */
static void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* Panic if someone has changed zfs_abd_chunk_size */
	IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
	    aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);

	/* There's nothing left to iterate over, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		offset = aiter->iter_pos;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
	} else {
		size_t index = abd_iter_scatter_chunk_index(aiter);
		offset = abd_iter_scatter_chunk_offset(aiter);
		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);
		paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
	}
	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the
 * iterator is already exhausted, in which case this does nothing.
 */
static void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
    abd_iter_func_t *func, void *private)
{
	int ret = 0;
	struct abd_iter aiter;

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	while (size > 0) {
		abd_iter_map(&aiter);

		size_t len = MIN(aiter.iter_mapsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_mapaddr, len, private);

		abd_iter_unmap(&aiter);

		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (ret);
}

struct buf_arg {
	void *arg_buf;
};

static int
abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(ba_ptr->arg_buf, buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy abd to buf. (off is the offset in abd.)
 */
void
abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
	    &ba_ptr);
}

static int
abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
{
	int ret;
	struct buf_arg *ba_ptr = private;

	ret = memcmp(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (ret);
}

/*
 * Compare the contents of abd to buf. (off is the offset in abd.)
 */
int
abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
}

static int
abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy from buf to abd. (off is the offset in abd.)
 */
void
abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
	    &ba_ptr);
}

/*ARGSUSED*/
static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) memset(buf, 0, size);
	return (0);
}

/*
 * Zero out the abd from a particular offset to the end.
 */
void
abd_zero_off(abd_t *abd, size_t off, size_t size)
{
	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
}

/*
 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
 * equal-sized chunks (passed to func as raw buffers). func may be called many
 * times during this iteration.
 */
int
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
    size_t size, abd_iter_func2_t *func, void *private)
{
	int ret = 0;
	struct abd_iter daiter, saiter;

	abd_verify(dabd);
	abd_verify(sabd);

	ASSERT3U(doff + size, <=, dabd->abd_size);
	ASSERT3U(soff + size, <=, sabd->abd_size);

	abd_iter_init(&daiter, dabd);
	abd_iter_init(&saiter, sabd);
	abd_iter_advance(&daiter, doff);
	abd_iter_advance(&saiter, soff);

	while (size > 0) {
		abd_iter_map(&daiter);
		abd_iter_map(&saiter);

		size_t dlen = MIN(daiter.iter_mapsize, size);
		size_t slen = MIN(saiter.iter_mapsize, size);
		size_t len = MIN(dlen, slen);
		ASSERT(dlen > 0 || slen > 0);

		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
		    private);

		abd_iter_unmap(&saiter);
		abd_iter_unmap(&daiter);

		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&daiter, len);
		abd_iter_advance(&saiter, len);
	}

	return (ret);
}

/*ARGSUSED*/
static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) memcpy(dbuf, sbuf, size);
	return (0);
}

/*
 * Copy from sabd to dabd starting from soff and doff.
 */
void
abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
{
	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
	    abd_copy_off_cb, NULL);
}

/*ARGSUSED*/
static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	return (memcmp(bufa, bufb, size));
}

/*
 * Compares the first size bytes of two ABDs.
 */
int
abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
{
	return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
}

/*
 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
 *
 * @cabds          parity ABDs, must have equal size
 * @dabd           data ABD. Can be NULL (in this case @dsize = 0)
 * @func_raidz_gen should be implemented so that it behaves the same whether
 *                 it is handed linear or scattered buffers
 */
void
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
    ssize_t csize, ssize_t dsize, const unsigned parity,
    void (*func_raidz_gen)(void **, const void *, size_t, size_t))
{
	int i;
	ssize_t len, dlen;
	struct abd_iter caiters[3];
	struct abd_iter daiter = {0};
	void *caddrs[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++)
		abd_iter_init(&caiters[i], cabds[i]);

	if (dabd)
		abd_iter_init(&daiter, dabd);

	ASSERT3S(dsize, >=, 0);

#ifdef _KERNEL
	kpreempt_disable();
#endif
	while (csize > 0) {
		len = csize;

		if (dabd && dsize > 0)
			abd_iter_map(&daiter);

		for (i = 0; i < parity; i++) {
			abd_iter_map(&caiters[i]);
			caddrs[i] = caiters[i].iter_mapaddr;
		}

		switch (parity) {
			case 3:
				len = MIN(caiters[2].iter_mapsize, len);
				/* falls through */
			case 2:
				len = MIN(caiters[1].iter_mapsize, len);
				/* falls through */
			case 1:
				len = MIN(caiters[0].iter_mapsize, len);
		}

		/* must be progressive */
		ASSERT3S(len, >, 0);

		if (dabd && dsize > 0) {
			/* this needs precise iter.length */
			len = MIN(daiter.iter_mapsize, len);
			len = MIN(dsize, len);
			dlen = len;
		} else
			dlen = 0;

		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&caiters[i]);
			abd_iter_advance(&caiters[i], len);
		}

		if (dabd && dsize > 0) {
			abd_iter_unmap(&daiter);
			abd_iter_advance(&daiter, dlen);
			dsize -= dlen;
		}

		csize -= len;

		ASSERT3S(dsize, >=, 0);
		ASSERT3S(csize, >=, 0);
	}
#ifdef _KERNEL
	kpreempt_enable();
#endif
}
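
/*
 * Illustrative sketch of a func_raidz_gen callback matching the signature
 * above: a single-parity XOR generator that folds dsize bytes of the data
 * column into the first (and only) parity column, assuming the parity column
 * was zeroed before the first call.  The real raidz generation functions live
 * elsewhere and are typically vectorized.
 *
 *	static void
 *	xor_gen(void **parity, const void *data, size_t psize, size_t dsize)
 *	{
 *		uint8_t *p = parity[0];
 *		const uint8_t *d = data;
 *
 *		for (size_t i = 0; i < dsize; i++)
 *			p[i] ^= d[i];
 *	}
 */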

/*
 * Iterate over code ABDs and data reconstruction target ABDs and call
 * @func_raidz_rec. Function maps at most 6 pages atomically.
 *
 * @cabds           parity ABDs, must have equal size
 * @tabds           rec target ABDs, at most 3
 * @tsize           size of data target columns
 * @func_raidz_rec  expects syndrome data in target columns. Function
 *                  reconstructs data and overwrites target columns.
 */
void
abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
    ssize_t tsize, const unsigned parity,
    void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
    const unsigned *mul),
    const unsigned *mul)
{
	int i;
	ssize_t len;
	struct abd_iter citers[3];
	struct abd_iter xiters[3];
	void *caddrs[3], *xaddrs[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++) {
		abd_iter_init(&citers[i], cabds[i]);
		abd_iter_init(&xiters[i], tabds[i]);
	}

#ifdef _KERNEL
	kpreempt_disable();
#endif
	while (tsize > 0) {

		for (i = 0; i < parity; i++) {
			abd_iter_map(&citers[i]);
			abd_iter_map(&xiters[i]);
			caddrs[i] = citers[i].iter_mapaddr;
			xaddrs[i] = xiters[i].iter_mapaddr;
		}

		len = tsize;
		switch (parity) {
			case 3:
				len = MIN(xiters[2].iter_mapsize, len);
				len = MIN(citers[2].iter_mapsize, len);
				/* falls through */
			case 2:
				len = MIN(xiters[1].iter_mapsize, len);
				len = MIN(citers[1].iter_mapsize, len);
				/* falls through */
			case 1:
				len = MIN(xiters[0].iter_mapsize, len);
				len = MIN(citers[0].iter_mapsize, len);
		}
		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_rec(xaddrs, len, caddrs, mul);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&xiters[i]);
			abd_iter_unmap(&citers[i]);
			abd_iter_advance(&xiters[i], len);
			abd_iter_advance(&citers[i], len);
		}

		tsize -= len;
		ASSERT3S(tsize, >=, 0);
	}
#ifdef _KERNEL
	kpreempt_enable();
#endif
}
