/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2017 Nexenta Inc.  All rights reserved.
 * Copyright 2020 Joyent, Inc.
 */

/* needed when building libzpool */
#ifndef	_KERNEL
#include <sys/zfs_context.h>
#endif

#include <sys/sunddi.h>
#include <sys/dkio.h>
#include <sys/dkioc_free_util.h>
#include <sys/sysmacros.h>
#include <sys/file.h>
#include <sys/sdt.h>

static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *,
    uint64_t max_off);
static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *,
    uint64_t, dfl_iter_fn_t, void *, int);
static int process_range(dkioc_free_list_t *, uint64_t, uint64_t,
    dfl_iter_fn_t, void *, int);

/*
 * Copy-in convenience function for variable-length dkioc_free_list_t
 * structures. The pointer to be copied from is in `arg' (may be a pointer
 * to userspace). A new buffer is allocated and a pointer to it is placed
 * in `out'. `ddi_flags' indicates whether the pointer is from user or
 * kernel space (FKIOCTL) and `kmflags' are the flags passed to
 * kmem_alloc() when allocating the new structure.
 * Returns 0 on success, or an errno on failure.
 */
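/*
 * Example (illustrative sketch only): a hypothetical driver's ioctl(9E)
 * handler for DKIOCFREE might use dfl_copyin() and dfl_free() roughly as
 * follows. The helper mydrv_submit_free() is made up for this example and
 * stands in for whatever the driver does with the list:
 *
 *	dkioc_free_list_t *dfl;
 *	int rv;
 *
 *	rv = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP);
 *	if (rv != 0)
 *		return (rv);
 *	rv = mydrv_submit_free(dev, dfl);
 *	dfl_free(dfl);
 *	return (rv);
 */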
int
dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags)
{
	dkioc_free_list_t *dfl;

	if (ddi_flags & FKIOCTL) {
		dkioc_free_list_t *dfl_in = arg;

		if (dfl_in->dfl_num_exts == 0 ||
		    dfl_in->dfl_num_exts > DFL_COPYIN_MAX_EXTS)
			return (SET_ERROR(EINVAL));
		dfl = kmem_alloc(DFL_SZ(dfl_in->dfl_num_exts), kmflags);
		if (dfl == NULL)
			return (SET_ERROR(ENOMEM));
		bcopy(dfl_in, dfl, DFL_SZ(dfl_in->dfl_num_exts));
	} else {
		uint64_t num_exts;

		if (ddi_copyin(((uint8_t *)arg) + offsetof(dkioc_free_list_t,
		    dfl_num_exts), &num_exts, sizeof (num_exts),
		    ddi_flags) != 0)
			return (SET_ERROR(EFAULT));
		if (num_exts == 0 || num_exts > DFL_COPYIN_MAX_EXTS)
			return (SET_ERROR(EINVAL));
		dfl = kmem_alloc(DFL_SZ(num_exts), kmflags);
		if (dfl == NULL)
			return (SET_ERROR(ENOMEM));
		if (ddi_copyin(arg, dfl, DFL_SZ(num_exts), ddi_flags) != 0 ||
		    dfl->dfl_num_exts != num_exts) {
			kmem_free(dfl, DFL_SZ(num_exts));
			return (SET_ERROR(EFAULT));
		}
	}

	*out = dfl;
	return (0);
}

/* Frees a variable-length dkioc_free_list_t structure. */
void
dfl_free(dkioc_free_list_t *dfl)
{
	kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
}

/*
 * Convenience function to resize and segment the array of extents in
 * a DKIOCFREE request as required by a driver.
 *
 * Some devices that implement DKIOCFREE (e.g. vioblk) have limits
 * on either the number of extents that can be submitted in a single request,
 * or the total number of blocks that can be submitted in a single request.
 * In addition, devices may have alignment requirements on the starting
 * address stricter than the device block size.
 *
 * Since there is currently no mechanism for callers of DKIOCFREE to discover
 * such restrictions, instead of rejecting any requests that do not conform to
 * some undiscoverable (to the caller) set of requirements, a driver can use
 * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as
 * required to conform to its requirements.
 *
 * The original request is passed as 'dfl' and the alignment requirements
 * are passed in 'dfi'. Additionally, the maximum offset of the device allowed
 * (in bytes) is passed as max_off -- this allows a driver with
 * multiple instances of different sizes but similar requirements (e.g.
 * a partitioned blkdev device) to not construct a separate dkioc_free_info_t
 * struct for each device.
 *
 * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of
 * arg passed to it as needed. If the extents in the dkioc_free_list_t passed
 * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will
 * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter()
 * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t
 * instances and populate them with the adjusted extents that do conform to the
 * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t
 * passed to it when this occurs. The net result is that 'func' can always
 * assume it will be called with a dkioc_free_list_t with extents that
 * comply with the requirements in 'dfi'. 'func' is also responsible for
 * freeing the dkioc_free_list_t passed to it (likely via a completion
 * callback).
 *
 * Combined with the behavior described above, dfl_iter() can be viewed as
 * consuming the dkioc_free_list_t passed to it. Either it will pass it along
 * to 'func' (and let 'func' handle freeing it), or it will free it and
 * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still
 * letting 'func' handle freeing the new instances). This way neither the
 * dfl_iter() caller nor the driver needs to worry about treating
 * conforming and non-conforming requests differently.
 *
 * Unfortunately, the DKIOCFREE ioctl provides no method for communicating
 * any notion of partial completion -- either it returns success (0) or
 * an error. It's not clear if such a notion would even be possible while
 * supporting multiple types of devices (NVMe, SCSI, etc.) with the same
 * interface. As such, there's little benefit to providing more detailed error
 * semantics beyond what DKIOCFREE can handle.
 *
 * Due to this, a somewhat simplistic approach is taken to error handling. The
 * original list of extents is first checked to make sure they all appear
 * valid -- that is, they do not start or extend beyond the end of the device.
 * Any request that contains such extents is always rejected in its entirety.
 * It is possible that after applying any needed adjustments to the original
 * list of extents, the result is not acceptable to the driver. For example,
 * a device with a 512 byte block size that tries to free the range 513-1023
 * (bytes) cannot be processed. Such extents will be silently ignored. If the
 * original request consists of nothing but such extents, dfl_iter() will
 * never call 'func' and will merely return 0.
 */
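/*
 * Example (illustrative sketch only): a hypothetical driver with 512 byte
 * blocks, a 4 KiB alignment requirement on starting offsets, and a limit of
 * 64 extents per request might drive dfl_iter() roughly as follows. The
 * names mydrv_free_cb(), mydrv_state_t and mydrv_dev_size() are made up for
 * this example:
 *
 *	static int
 *	mydrv_free_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
 *	{
 *		mydrv_state_t *sp = arg;
 *
 *		(queue the trim described by dfl; whoever completes it
 *		calls dfl_free(dfl))
 *		return (0);
 *	}
 *
 *	dkioc_free_info_t dfi = {
 *		.dfi_bshift = 9,
 *		.dfi_align = 4096,
 *		.dfi_max_ext = 64,
 *	};
 *
 *	return (dfl_iter(dfl, &dfi, mydrv_dev_size(sp), mydrv_free_cb,
 *	    sp, KM_SLEEP));
 */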
int
dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	dkioc_free_list_ext_t *ext;
	uint64_t n_bytes, n_segs, start_idx, i;
	uint_t bsize;
	int r = 0;
	boolean_t need_copy = B_FALSE;

	/*
	 * Make sure the block size derived from dfi_bshift is at least 512
	 * (1U << DEV_BSHIFT) bytes and no larger than 2^30. The lower bound
	 * is to prevent any problems with other parts of the system that
	 * might assume a minimum block size of 512, and the upper bound is
	 * just to prevent overflow when creating the block size from
	 * dfi_bshift (though it seems unlikely we'll have _block_ sizes near
	 * a GiB any time soon).
	 */
	if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) {
		r = SET_ERROR(EINVAL);
		goto done;
	}
	bsize = 1U << dfi->dfi_bshift;

	/* Max bytes must be a multiple of the block size */
	if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Start offset alignment must also be a multiple of the block size */
	if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Max bytes in an extent must be a multiple of the block size */
	if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * It makes no sense to allow a single extent to be larger than the
	 * total allowed for an entire request.
	 */
	if (dfi->dfi_max_ext_bytes > 0 &&
	    dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * In the first pass, align everything as needed and make sure all
	 * the extents look valid.
	 */
	if ((r = adjust_exts(dfl, dfi, max_off)) != 0) {
		goto done;
	}

	/*
	 * Go through and split things up as needed. The general idea is to
	 * split along the original extent boundaries when needed. We only
	 * split an extent from the original request into multiple extents
	 * if the original extent is by itself too big for the device to
	 * process in a single request.
	 */
	start_idx = 0;
	n_bytes = n_segs = 0;
	ext = dfl->dfl_exts;
	for (i = 0; i < dfl->dfl_num_exts; i++, ext++) {
		uint64_t start = dfl->dfl_offset + ext->dfle_start;
		uint64_t len = ext->dfle_length;

		if (len == 0) {
			/*
			 * If we encounter a zero length extent, we're going
			 * to create a new copy of dfl no matter what --
			 * the size of dfl is determined by dfl_num_exts so
			 * we cannot do things like shift the contents and
			 * reduce dfl_num_exts to get a contiguous array
			 * of non-zero length extents.
			 */
			need_copy = B_TRUE;
			continue;
		}

		if (dfi->dfi_max_ext_bytes > 0 &&
		    len > dfi->dfi_max_ext_bytes) {
			/*
			 * An extent that's too large. Dispatch what we've
			 * accumulated, and then split this extent into
			 * smaller ones the device can accept.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			if ((r = split_extent(dfl, dfi, i, func, arg,
			    kmflag)) != 0) {
				goto done;
			}
			start_idx = i + 1;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		if (dfi->dfi_max_bytes > 0 &&
		    n_bytes + len > dfi->dfi_max_bytes) {
			/*
			 * This extent would put us over the limit for total
			 * bytes that can be trimmed in one request.
			 * Dispatch what we've accumulated. Then deal
			 * with this extent.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			if (len < dfi->dfi_max_bytes) {
				/*
				 * After dispatching what we've accumulated,
				 * this extent can fit in a new request.
				 * Just add it to the accumulated list of
				 * extents and move on.
				 */
				start_idx = i;
				n_segs = 1;
				n_bytes = len;
				continue;
			}

			/*
			 * Even after starting a new request, this extent
			 * is too big. Split it until it fits.
			 */
			if ((r = split_extent(dfl, dfi, i, func, arg,
			    kmflag)) != 0) {
				goto done;
			}

			start_idx = i + 1;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) {
			/*
			 * This extent would put us over the limit on the
			 * number of extents the device can accept in a
			 * single request. Dispatch what we've accumulated
			 * so far, then start a new request that begins
			 * with this extent.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			start_idx = i;
			n_segs = 1;
			n_bytes = len;
			continue;
		}

		n_segs++;
		n_bytes += len;
	}

	/*
	 * If a copy wasn't required, and we haven't processed a subset of
	 * the extents already, we can just use the original request.
	 */
	if (!need_copy && start_idx == 0) {
		return (func(dfl, arg, kmflag));
	}

	r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag);

done:
	dfl_free(dfl);
	return (r);
}

/*
 * Adjust the start and length of each extent in dfl so that it conforms to
 * the requirements in dfi. It also verifies that no extent extends beyond
 * the end of the device (given by max_off).
 *
 * Returns 0 on success, or an error value.
 */
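/*
 * For example (illustrative numbers only): with dfi_align == 4096 and
 * dfi_bshift == 9 (512 byte blocks), an extent covering bytes [1000, 9000)
 * is narrowed to [4096, 8704) -- the start is rounded up to the 4 KiB
 * alignment and the end is rounded down to a block boundary. An extent
 * shorter than a block may shrink to length zero; such extents are dropped
 * later by process_range().
 */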
static int
adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
    uint64_t max_off)
{
	dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	/*
	 * These must be uint64_t to prevent the P2 macros from truncating
	 * the result.
	 */
	const uint64_t align = dfi->dfi_align;
	const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift;

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) {
		/*
		 * Since there are no known requirements on the value of
		 * dfl_offset, it's possible (though odd) to have a scenario
		 * where dfl_offset == 1, and dfle_start == 511 (resulting
		 * in an actual start offset of 512). As such, we always
		 * apply the offset and find the resulting starting offset
		 * and length (in bytes) first, then apply any rounding
		 * and alignment.
		 */
		uint64_t start = exts->dfle_start + dfl->dfl_offset;
		uint64_t end = start + exts->dfle_length;

		/*
		 * Make sure after applying dfl->dfl_offset and any alignment
		 * adjustments that the results don't overflow.
		 */
		if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) {
			return (SET_ERROR(EOVERFLOW));
		}

		if (end < start) {
			return (SET_ERROR(EOVERFLOW));
		}

		/*
		 * Make sure we don't extend past the end of the device
		 */
		if (end > max_off) {
			return (SET_ERROR(EINVAL));
		}

		start = P2ROUNDUP(start, align);
		end = P2ALIGN(end, bsize);

		/*
		 * Remove the offset so that when it's later applied again,
		 * the correct start value is obtained.
		 */
		exts->dfle_start = start - dfl->dfl_offset;

		/*
		 * If the original length was less than the block size
		 * of the device, we can end up with end < start. If that
		 * happens we just set the length to zero.
		 */
		exts->dfle_length = (end < start) ? 0 : end - start;
	}

	return (0);
}

/*
 * Take a subset of extents from dfl (starting at start_idx, with n entries)
 * and create a new dkioc_free_list_t, passing that to func.
 */
static int
process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	dkioc_free_list_t *new_dfl = NULL;
	dkioc_free_list_ext_t *new_exts = NULL;
	dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx;
	size_t actual_n = n;

	if (n == 0) {
		return (0);
	}

	/*
	 * Ignore any zero length extents. No known devices attach any
	 * semantic meaning to such extents; they are most likely just the
	 * result of narrowing the range of an extent to fit the device
	 * alignment requirements. It is possible the original caller
	 * submitted a zero length extent, but we ignore those as well. Since
	 * we can't communicate partial results back to the caller anyway,
	 * it's unclear whether reporting to the caller of the DKIOCFREE
	 * request that one of potentially many extents was too small
	 * (without being able to identify which one) would be useful.
	 */
	for (uint64_t i = 0; i < n; i++) {
		if (exts[i].dfle_length == 0 && --actual_n == 0) {
			return (0);
		}
	}

	new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag);
	if (new_dfl == NULL) {
		return (SET_ERROR(ENOMEM));
	}

	new_dfl->dfl_flags = dfl->dfl_flags;
	new_dfl->dfl_num_exts = actual_n;
	new_dfl->dfl_offset = dfl->dfl_offset;
	new_exts = new_dfl->dfl_exts;

	for (uint64_t i = 0; i < n; i++) {
		if (exts[i].dfle_length == 0) {
			continue;
		}

		*new_exts++ = exts[i];
	}

	return (func(new_dfl, arg, kmflag));
}

/*
 * If dfi_max_ext_bytes is set, use it as the max segment length;
 * otherwise use dfi_max_bytes if set, otherwise fall back to UINT64_MAX.
 */
#define	MAX_SEGLEN(dfi) \
	(((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes :	\
	((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX)
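
/*
 * For example (illustrative values only): with dfi_max_ext_bytes == 0 and
 * dfi_max_bytes == 1048576, MAX_SEGLEN() evaluates to 1048576; if both are
 * zero, it evaluates to UINT64_MAX (i.e. no per-segment limit).
 */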

/*
 * Split the extent at idx into multiple lists (calling func for each one).
 */
static int
split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	ASSERT3U(idx, <, dfl->dfl_num_exts);

	const uint64_t		maxlen = MAX_SEGLEN(dfi);
	dkioc_free_list_ext_t	*ext = dfl->dfl_exts + idx;
	uint64_t		remain = ext->dfle_length;
	int			r;

	/*
	 * Break the extent into as many single-extent requests as needed.
	 * While it would be possible in some circumstances to combine the
	 * final chunk of the extent (after splitting) with the remaining
	 * extents in the original request, it's not clear there's much
	 * benefit from the added complexity. Such behavior could be added
	 * in the future if it's determined to be worthwhile.
	 */
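	/*
	 * For example (illustrative numbers only): with bsize == 512,
	 * dfi_align == 4096 and a maximum segment length of 66048 bytes,
	 * an aligned extent starting at byte 4096 is emitted as a 65536
	 * byte chunk (so that the next chunk starts on a 4096 byte boundary
	 * at 69632), followed by the remainder of the extent.
	 */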
	while (remain > 0) {
		uint64_t start = dfl->dfl_offset + ext->dfle_start;
		uint64_t len = remain;

		/*
		 * If we know we have at least one more segment left after
		 * the current iteration of this loop, split it so that
		 * the next segment starts on an aligned boundary.
		 */
		if (len > maxlen) {
			uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align);
			len = end - start;
		}

		ext->dfle_length = len;

		if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) {
			return (r);
		}

		ext->dfle_start += len;
		remain -= len;
	}

	return (0);
}