/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 */

/* needed when building libzpool */
#ifndef	_KERNEL
#include <sys/zfs_context.h>
#endif

#include <sys/sunddi.h>
#include <sys/dkio.h>
#include <sys/dkioc_free_util.h>
#include <sys/sysmacros.h>
#include <sys/file.h>
#include <sys/sdt.h>

static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *,
    uint64_t max_off);
static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *,
    uint64_t, dfl_iter_fn_t, void *, int);
static int process_range(dkioc_free_list_t *, uint64_t, uint64_t,
    dfl_iter_fn_t, void *, int);

/*
 * Copy-in convenience function for variable-length dkioc_free_list_t
 * structures. The pointer to be copied from is in `arg' (may be a pointer
 * to userspace). A new buffer is allocated and a pointer to it is placed
 * in `out'. `ddi_flags' indicates whether the pointer is from user-
 * or kernelspace (FKIOCTL) and `kmflags' are the flags passed to
 * kmem_alloc when allocating the new structure.
 * Returns 0 on success, or an errno on failure.
 */
int
dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags)
{
	dkioc_free_list_t *dfl;

	if (ddi_flags & FKIOCTL) {
		dkioc_free_list_t *dfl_in = arg;

		if (dfl_in->dfl_num_exts == 0 ||
		    dfl_in->dfl_num_exts > DFL_COPYIN_MAX_EXTS)
			return (SET_ERROR(EINVAL));
		dfl = kmem_alloc(DFL_SZ(dfl_in->dfl_num_exts), kmflags);
		if (dfl == NULL)
			return (SET_ERROR(ENOMEM));
		bcopy(dfl_in, dfl, DFL_SZ(dfl_in->dfl_num_exts));
	} else {
		uint64_t num_exts;

		if (ddi_copyin(((uint8_t *)arg) + offsetof(dkioc_free_list_t,
		    dfl_num_exts), &num_exts, sizeof (num_exts),
		    ddi_flags) != 0)
			return (SET_ERROR(EFAULT));
		if (num_exts == 0 || num_exts > DFL_COPYIN_MAX_EXTS)
			return (SET_ERROR(EINVAL));
		dfl = kmem_alloc(DFL_SZ(num_exts), kmflags);
		if (dfl == NULL)
			return (SET_ERROR(ENOMEM));
		if (ddi_copyin(arg, dfl, DFL_SZ(num_exts), ddi_flags) != 0 ||
		    dfl->dfl_num_exts != num_exts) {
			kmem_free(dfl, DFL_SZ(num_exts));
			return (SET_ERROR(EFAULT));
		}
	}

	*out = dfl;
	return (0);
}

/* Frees a variable-length dkioc_free_list_t structure. */
void
dfl_free(dkioc_free_list_t *dfl)
{
	kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
}
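
/*
 * Illustrative sketch (not part of the original interface documentation):
 * a driver's DKIOCFREE ioctl handler would typically pair dfl_copyin()
 * with dfl_free() roughly as follows. The names xd_ioctl_free, xd_state_t
 * and xd_trim are hypothetical and exist only for this example.
 *
 *	static int
 *	xd_ioctl_free(xd_state_t *xsp, intptr_t arg, int flag)
 *	{
 *		dkioc_free_list_t *dfl;
 *		int rv;
 *
 *		rv = dfl_copyin((void *)arg, &dfl, flag, KM_SLEEP);
 *		if (rv != 0)
 *			return (rv);
 *
 *		rv = xd_trim(xsp, dfl);
 *		dfl_free(dfl);
 *		return (rv);
 *	}
 */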

/*
 * Convenience function to resize and segment the array of extents in
 * a DKIOCFREE request as required by a driver.
 *
 * Some devices that implement DKIOCFREE (e.g. vioblk) have limits
 * on either the number of extents that can be submitted in a single request,
 * or the total number of blocks that can be submitted in a single request.
 * In addition, devices may have alignment requirements on the starting
 * address stricter than the device block size.
 *
 * Since there is currently no mechanism for callers of DKIOCFREE to discover
 * such restrictions, instead of rejecting any requests that do not conform to
 * some undiscoverable (to the caller) set of requirements, a driver can use
 * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as
 * required to conform to its requirements.
 *
 * The original request is passed as 'dfl' and the alignment requirements
 * are passed in 'dfi'. Additionally, the maximum offset of the device
 * (in bytes) is passed as max_off -- this allows a driver with
 * multiple instances of different sizes but similar requirements (e.g.
 * a partitioned blkdev device) to avoid constructing a separate
 * dkioc_free_info_t struct for each device.
 *
 * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of
 * arg passed to it as needed. If the extents in the dkioc_free_list_t passed
 * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will
 * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter()
 * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t
 * instances and populate them with the adjusted extents that do conform to the
 * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t
 * passed to it when this occurs. The net result is that 'func' can always
 * assume it will be called with a dkioc_free_list_t with extents that
 * comply with the requirements in 'dfi'. 'func' is also responsible for
 * freeing the dkioc_free_list_t passed to it (likely via a completion
 * callback).
 *
 * Combined with the behavior described above, dfl_iter() can be viewed as
 * consuming the dkioc_free_list_t passed to it. Either it will pass it along
 * to 'func' (and let 'func' handle freeing it), or it will free it and
 * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still
 * letting 'func' handle freeing the new instances). This way neither the
 * dfl_iter() caller nor the driver needs to worry about treating
 * conforming and non-conforming requests differently.
 *
 * Unfortunately, the DKIOCFREE ioctl provides no method for communicating
 * any notion of partial completion -- either it returns success (0) or
 * an error. It's not clear if such a notion would even be possible while
 * supporting multiple types of devices (NVMe, SCSI, etc.) with the same
 * interface. As such, there's little benefit to providing more detailed error
 * semantics beyond what DKIOCFREE can handle.
 *
 * Due to this, a somewhat simplistic approach is taken to error handling. The
 * original list of extents is first checked to make sure each extent appears
 * valid -- that is, it does not start or extend beyond the end of the device.
 * Any request that contains such extents is always rejected in its entirety.
 * It is possible that after applying any needed adjustments to the original
 * list of extents, the result is not acceptable to the driver. For example,
 * a request to free the byte range 513-1023 on a device with a 512 byte block
 * size cannot be processed. Such extents are silently ignored. If the
 * original request consists of nothing but such extents, dfl_iter() will
 * never call 'func' and will merely return 0.
 */
int
dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	dkioc_free_list_ext_t *ext;
	uint64_t n_bytes, n_segs, start_idx, i;
	uint_t bsize = 1U << dfi->dfi_bshift;
	int r = 0;
	boolean_t need_copy = B_FALSE;

	/*
	 * Make sure the block size derived from dfi_bshift is at least 512
	 * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is
	 * to prevent any problems with other parts of the system that might
	 * assume a minimum block size of 512, and the upper bound is just
	 * to prevent overflow when creating the block size from dfi_bshift
	 * (though it seems unlikely we'll have _block_ sizes near a GiB
	 * any time soon).
	 */
	if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Max bytes must be a multiple of the block size */
	if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Start offset alignment must also be a multiple of the block size */
	if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/* Max bytes in an extent must be a multiple of the block size */
	if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * It makes no sense to allow a single extent to be larger than the
	 * total allowed for an entire request.
	 */
	if (dfi->dfi_max_ext_bytes > 0 &&
	    dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) {
		r = SET_ERROR(EINVAL);
		goto done;
	}

	/*
	 * In the first pass, align everything as needed and make sure all
	 * the extents look valid.
	 */
	if ((r = adjust_exts(dfl, dfi, max_off)) != 0) {
		goto done;
	}

	/*
	 * Go through and split things up as needed. The general idea is to
	 * split along the original extent boundaries when needed. We only
	 * split an extent from the original request into multiple extents
	 * if the original extent is by itself too big for the device to
	 * process in a single request.
	 */
	start_idx = 0;
	n_bytes = n_segs = 0;
	ext = dfl->dfl_exts;
	for (i = 0; i < dfl->dfl_num_exts; i++, ext++) {
		uint64_t start = dfl->dfl_offset + ext->dfle_start;
		uint64_t len = ext->dfle_length;

		if (len == 0) {
			/*
			 * If we encounter a zero length extent, we're going
			 * to create a new copy of dfl no matter what --
			 * the size of dfl is determined by dfl_num_exts so
			 * we cannot do things like shift the contents and
			 * reduce dfl_num_exts to get a contiguous array
			 * of non-zero length extents.
			 */
			need_copy = B_TRUE;
			continue;
		}

		if (dfi->dfi_max_ext_bytes > 0 &&
		    len > dfi->dfi_max_ext_bytes) {
			/*
			 * An extent that's too large. Dispatch what we've
			 * accumulated, and then split this extent into
			 * smaller ones the device can accept.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			if ((r = split_extent(dfl, dfi, i, func, arg,
			    kmflag)) != 0) {
				goto done;
			}
			start_idx = i + 1;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		if (dfi->dfi_max_bytes > 0 &&
		    n_bytes + len > dfi->dfi_max_bytes) {
			/*
			 * This extent would put us over the limit for total
			 * bytes that can be trimmed in one request.
			 * Dispatch what we've accumulated. Then deal
			 * with this extent.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			if (len < dfi->dfi_max_bytes) {
				/*
				 * After dispatching what we've accumulated,
				 * this extent can fit in a new request.
				 * Just add it to the accumulated list of
				 * extents and move on.
				 */
				start_idx = i;
				n_segs = 1;
				n_bytes = len;
				continue;
			}

			/*
			 * Even after starting a new request, this extent
			 * is too big. Split it until it fits.
			 */
			if ((r = split_extent(dfl, dfi, i, func, arg,
			    kmflag)) != 0) {
				goto done;
			}

			start_idx = i + 1;
			n_segs = 0;
			n_bytes = 0;
			continue;
		}

		if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) {
			/*
			 * This extent will put us over the limit on the number
			 * of extents the device can accept. Dispatch what
			 * we've accumulated so far.
			 */
			if ((r = process_range(dfl, start_idx, i - start_idx,
			    func, arg, kmflag)) != 0) {
				goto done;
			}

			start_idx = i;
			n_segs = 1;
			n_bytes = len;
			continue;
		}

		n_segs++;
		n_bytes += len;
	}

	/*
	 * If a copy wasn't required, and we haven't processed a subset of
	 * the extents already, we can just use the original request.
	 */
	if (!need_copy && start_idx == 0) {
		return (func(dfl, arg, kmflag));
	}

	r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag);

done:
	dfl_free(dfl);
	return (r);
}
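
/*
 * Illustrative sketch (not part of the original interface documentation):
 * a driver whose hardware only accepts, for example, up to 8 extents and
 * 1 MiB of data per free request might use dfl_iter() roughly as follows.
 * The names xd_trim_cb, xd_issue_trim, xd_state_t and xs_size are
 * hypothetical and exist only for this example; the callback takes
 * ownership of (and must eventually free) each list it is handed.
 *
 *	static int
 *	xd_trim_cb(dkioc_free_list_t *dfl, void *arg, int kmflag)
 *	{
 *		xd_state_t *xsp = arg;
 *
 *		// The completion path is responsible for dfl_free(dfl).
 *		return (xd_issue_trim(xsp, dfl, kmflag));
 *	}
 *
 *	static int
 *	xd_free(xd_state_t *xsp, dkioc_free_list_t *dfl, int kmflag)
 *	{
 *		dkioc_free_info_t dfi = {
 *			.dfi_bshift = DEV_BSHIFT,
 *			.dfi_align = DEV_BSIZE,
 *			.dfi_max_ext = 8,
 *			.dfi_max_bytes = 1024 * 1024,
 *			.dfi_max_ext_bytes = 0,
 *		};
 *
 *		// dfl_iter() consumes dfl regardless of the outcome.
 *		return (dfl_iter(dfl, &dfi, xsp->xs_size, xd_trim_cb,
 *		    xsp, kmflag));
 *	}
 */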

/*
 * Adjust the start and length of each extent in dfl so that it conforms to
 * the requirements in dfi. It also verifies that no extent extends beyond
 * the end of the device (given by max_off).
 *
 * Returns 0 on success, or an error value.
 */
static int
adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi,
    uint64_t max_off)
{
	dkioc_free_list_ext_t *exts = dfl->dfl_exts;
	/*
	 * These must be uint64_t to prevent the P2 macros from truncating
	 * the result.
	 */
	const uint64_t align = dfi->dfi_align;
	const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift;

	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) {
		/*
		 * Since there are no known requirements on the value of
		 * dfl_offset, it's possible (though odd) to have a scenario
		 * where dfl_offset == 1, and dfle_start == 511 (resulting
		 * in an actual start offset of 512). As such, we always
		 * apply the offset and find the resulting starting offset
		 * and length (in bytes) first, then apply any rounding
		 * and alignment.
		 */
		uint64_t start = exts->dfle_start + dfl->dfl_offset;
		uint64_t end = start + exts->dfle_length;

		/*
		 * Make sure after applying dfl->dfl_offset and any alignment
		 * adjustments that the results don't overflow.
		 */
		if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) {
			return (SET_ERROR(EOVERFLOW));
		}

		if (end < start) {
			return (SET_ERROR(EOVERFLOW));
		}

		/*
		 * Make sure we don't extend past the end of the device
		 */
		if (end > max_off) {
			return (SET_ERROR(EINVAL));
		}

		start = P2ROUNDUP(start, align);
		end = P2ALIGN(end, bsize);

		/*
		 * Remove the offset so that when it's later applied again,
		 * the correct start value is obtained.
		 */
		exts->dfle_start = start - dfl->dfl_offset;

		/*
		 * If the original length was less than the block size
		 * of the device, we can end up with end < start. If that
		 * happens we just set the length to zero.
		 */
		exts->dfle_length = (end < start) ? 0 : end - start;
	}

	return (0);
}
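
/*
 * Illustrative example (values chosen for this comment, not taken from any
 * caller): with dfl_offset == 0, dfi_align == 512 and a 512-byte block size,
 * an extent covering bytes 513-1023 is narrowed as
 *
 *	start = P2ROUNDUP(513, 512)  = 1024
 *	end   = P2ALIGN(1024, 512)   = 1024
 *
 * leaving a zero-length extent, which process_range() later drops. An extent
 * covering bytes 512-1535 is already aligned and passes through unchanged.
 */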

/*
 * Take a subset of extents from dfl (starting at start_idx, with n entries)
 * and create a new dkioc_free_list_t, passing that to func.
 */
static int
process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	dkioc_free_list_t *new_dfl = NULL;
	dkioc_free_list_ext_t *new_exts = NULL;
	dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx;
	size_t actual_n = n;
	int r = 0;

	if (n == 0) {
		return (0);
	}

	/*
	 * Ignore any zero length extents. No known devices attach any
	 * semantic meaning to such extents, which are likely just the result
	 * of narrowing the range of an extent to fit the device alignment
	 * requirements. It is possible the original caller submitted a
	 * zero length extent, but we ignore those as well. Since we can't
	 * communicate partial results back to the caller anyway, it's
	 * unclear whether reporting to the caller of the DKIOCFREE request
	 * that one of potentially many extents was too small (without being
	 * able to identify which one) would be useful.
	 */
	for (uint64_t i = 0; i < n; i++) {
		if (exts[i].dfle_length == 0 && --actual_n == 0) {
			return (0);
		}
	}

	new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag);
	if (new_dfl == NULL) {
		return (SET_ERROR(ENOMEM));
	}

	new_dfl->dfl_flags = dfl->dfl_flags;
	new_dfl->dfl_num_exts = actual_n;
	new_dfl->dfl_offset = dfl->dfl_offset;
	new_exts = new_dfl->dfl_exts;

	for (uint64_t i = 0; i < n; i++) {
		if (exts[i].dfle_length == 0) {
			continue;
		}

		*new_exts++ = exts[i];
	}

	return (func(new_dfl, arg, kmflag));
}

/*
 * If dfi_max_ext_bytes is set, use it as the max segment length;
 * otherwise use dfi_max_bytes if set, otherwise fall back to UINT64_MAX.
 */
#define	MAX_SEGLEN(dfi) \
	(((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes :	\
	((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX)

/*
 * Split the extent at idx into multiple lists (calling func for each one).
 */
static int
split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx,
    dfl_iter_fn_t func, void *arg, int kmflag)
{
	ASSERT3U(idx, <, dfl->dfl_num_exts);

	const uint64_t		maxlen = MAX_SEGLEN(dfi);
	dkioc_free_list_ext_t	*ext = dfl->dfl_exts + idx;
	uint64_t		remain = ext->dfle_length;
	int			r;

	/*
	 * Break the extent into as many single requests as needed. While it
	 * would be possible in some circumstances to combine the final chunk
	 * of the extent (after splitting) with the remaining extents in the
	 * original request, it's not clear there's much benefit from the
	 * added complexity. Such behavior could be added in the future if
	 * it's determined to be worthwhile.
	 */
	while (remain > 0) {
		uint64_t start = dfl->dfl_offset + ext->dfle_start;
		uint64_t len = remain;

		/*
		 * If we know we have at least one more segment left after
		 * the current iteration of this loop, split it so that
		 * the next segment starts on an aligned boundary.
		 */
		if (len > maxlen) {
			uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align);
			len = end - start;
		}

		ext->dfle_length = len;

		if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) {
			return (r);
		}

		ext->dfle_start += len;
		remain -= len;
	}

	return (0);
}
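
/*
 * Illustrative example (values chosen for this comment, not taken from any
 * caller): with dfi_align == 512 and MAX_SEGLEN(dfi) == 1 MiB, an extent of
 * 2.5 MiB starting at byte offset 4096 is emitted as three single-extent
 * lists:
 *
 *	[4096, 4096 + 1 MiB)		1 MiB
 *	[4096 + 1 MiB, 4096 + 2 MiB)	1 MiB
 *	[4096 + 2 MiB, 4096 + 2.5 MiB)	0.5 MiB
 *
 * Each split point is rounded down to a dfi->dfi_align boundary by
 * P2ALIGN(start + maxlen, dfi->dfi_align), so every intermediate chunk ends
 * on an aligned offset.
 */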