/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright 2021 Tintri by DDN, Inc. All rights reserved. * Copyright 2020 Joyent, Inc. */ /* needed when building libzpool */ #ifndef _KERNEL #include #endif #include #include #include #include #include #include static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *, uint64_t len_blk); static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *, uint64_t, dfl_iter_fn_t, void *, int); static int process_range(dkioc_free_list_t *, uint64_t, uint64_t, dfl_iter_fn_t, void *, int); /* * Copy-in convenience function for variable-length dkioc_free_list_t * structures. The pointer to be copied from is in `arg' (may be a pointer * to userspace). A new buffer is allocated and a pointer to it is placed * in `out'. `ddi_flags' indicates whether the pointer is from user- * or kernelspace (FKIOCTL) and `kmflags' are the flags passed to * kmem_zalloc when allocating the new structure. * Returns 0 on success, or an errno on failure. */ int dfl_copyin(void *arg, dkioc_free_list_t **out, int ddi_flags, int kmflags) { dkioc_free_list_t *dfl; if (ddi_flags & FKIOCTL) { dkioc_free_list_t *dfl_in = arg; if (dfl_in->dfl_num_exts == 0 || dfl_in->dfl_num_exts > DFL_COPYIN_MAX_EXTS) return (SET_ERROR(EINVAL)); dfl = kmem_alloc(DFL_SZ(dfl_in->dfl_num_exts), kmflags); if (dfl == NULL) return (SET_ERROR(ENOMEM)); bcopy(dfl_in, dfl, DFL_SZ(dfl_in->dfl_num_exts)); } else { uint64_t num_exts; if (ddi_copyin(((uint8_t *)arg) + offsetof(dkioc_free_list_t, dfl_num_exts), &num_exts, sizeof (num_exts), ddi_flags) != 0) return (SET_ERROR(EFAULT)); if (num_exts == 0 || num_exts > DFL_COPYIN_MAX_EXTS) return (SET_ERROR(EINVAL)); dfl = kmem_alloc(DFL_SZ(num_exts), kmflags); if (dfl == NULL) return (SET_ERROR(ENOMEM)); if (ddi_copyin(arg, dfl, DFL_SZ(num_exts), ddi_flags) != 0 || dfl->dfl_num_exts != num_exts) { kmem_free(dfl, DFL_SZ(num_exts)); return (SET_ERROR(EFAULT)); } } *out = dfl; return (0); } /* Frees a variable-length dkioc_free_list_t structure. */ void dfl_free(dkioc_free_list_t *dfl) { kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); } /* * Convenience function to resize and segment the array of extents in * a DKIOCFREE request as required by a driver. * * Some devices that implement DKIOCFREE (e.g. vioblk) have limits * on either the number of extents that can be submitted in a single request, * or the total number of blocks that can be submitted in a single request. * In addition, devices may have alignment requirements on the starting * address stricter than the device block size. * * Since there is currently no mechanism for callers of DKIOCFREE to discover * such restrictions, instead of rejecting any requests that do not conform to * some undiscoverable (to the caller) set of requirements, a driver can use * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as * required to conform to its requirements. * * The original request is passed as 'dfl' and the alignment requirements * are passed in 'dfi'. Additionally the maximum offset of the device allowed * in bytes) is passed as max_off -- this allows a driver with * multiple instances of different sizes but similar requirements (e.g. * a partitioned blkdev device) to not construct a separate dkioc_free_info_t * struct for each device. * * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of * arg passed to it as needed. If the extents in the dkioc_free_list_t passed * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter() * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t * instances and populate them with the adjusted extents that do conform to the * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t * passed to it when this occurs. The net result is that 'func' can always * assume it will be called with a dkioc_free_list_t with extents that * comply with the requirements in 'dfi'. 'func' is also responsible for * freeing the dkioc_free_list_t passed to it (likely via a completion * callback). * * Combined with the behavior described above, dfl_iter() can be viewed as * consuming the dkioc_free_list_t passed to it. Either it will pass it along * to 'func' (and let 'func' handle freeing it), or it will free it and * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still * letting 'func' handle freeing the new instances). This way neither the * dfl_iter() caller nor nor the driver need to worry about treating * conforming and non-conforming requests differently. * * Unfortunately, the DKIOCFREE ioctl provides no method for communicating * any notion of partial completion -- either it returns success (0) or * an error. It's not clear if such a notion would even be possible while * supporting multiple types of devices (NVMe, SCSI, etc.) with the same * interface. As such, there's little benefit to providing more detailed error * semantics beyond what DKIOCFREE can handle. * * Due to this, a somewhat simplistic approach is taken to error handling. The * original list of extents is first checked to make sure they all appear * valid -- that is they do not start or extend beyond the end of the device. * Any request that contains such extents is always rejected in it's entirety. * It is possible after applying any needed adjustments to the original list * of extents that the result is not acceptable to the driver. For example, * a device with a 512 byte block size that tries to free the range 513-1023 * (bytes) would not be able to be processed. Such extents will be silently * ignored. If the original request consists of nothing but such requests, * dfl_iter() will never call 'func' and will merely return 0. */ int dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off, dfl_iter_fn_t func, void *arg, int kmflag) { dkioc_free_list_ext_t *ext; uint64_t n_bytes, n_segs, start_idx, i; uint_t bsize = 1U << dfi->dfi_bshift; int r = 0; boolean_t need_copy = B_FALSE; /* * Make sure the block size derived from dfi_bshift is at least 512 * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is * to prevent any problems with other parts of the system that might * assume a minimum block size of 512, and the upper bound is just * to prevent overflow when creating the block size from dfi_bshift * (though it seems unlikely we'll have _block_ sizes near a GiB * any time soon). */ if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) { r = SET_ERROR(EINVAL); goto done; } /* Max bytes must be a multiple of the block size */ if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) { r = SET_ERROR(EINVAL); goto done; } /* Start offset alignment must also be a multiple of the block size */ if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) { r = SET_ERROR(EINVAL); goto done; } /* Max bytes in an extent must be a multiple of the block size */ if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) { r = SET_ERROR(EINVAL); goto done; } /* * It makes no sense to allow a single extent to be larger than the * total allowed for an entire request. */ if (dfi->dfi_max_ext_bytes > 0 && dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) { r = SET_ERROR(EINVAL); goto done; } /* * The first pass, align everything as needed and make sure all the * extents look valid. */ if ((r = adjust_exts(dfl, dfi, max_off)) != 0) { goto done; } /* * Go through and split things up as needed. The general idea is to * split along the original extent boundaries when needed. We only * split an extent from the original request into multiple extents * if the original extent is by itself too big for the device to * process in a single request. */ start_idx = 0; n_bytes = n_segs = 0; ext = dfl->dfl_exts; for (i = 0; i < dfl->dfl_num_exts; i++, ext++) { uint64_t start = dfl->dfl_offset + ext->dfle_start; uint64_t len = ext->dfle_length; if (len == 0) { /* * If we encounter a zero length extent, we're going * to create a new copy of dfl no matter what -- * the size of dfl is determined by dfl_num_exts so * we cannot do things like shift the contents and * reduce dfl_num_exts to get a contiguous array * of non-zero length extents. */ need_copy = B_TRUE; continue; } if (dfi->dfi_max_ext_bytes > 0 && len > dfi->dfi_max_ext_bytes) { /* * An extent that's too large. Dispatch what we've * accumulated, and then split this extent into * smaller ones the device can accept. */ if ((r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag)) != 0) { goto done; } if ((r = split_extent(dfl, dfi, i, func, arg, kmflag)) != 0) { goto done; } start_idx = i + 1; n_segs = 0; n_bytes = 0; continue; } if (dfi->dfi_max_bytes > 0 && n_bytes + len > dfi->dfi_max_bytes) { /* * This extent would put us over the limit for total * bytes that can be trimmed in one request. * Dispatch what we've accumulated. Then deal * with this extent. */ if ((r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag)) != 0) { goto done; } if (len < dfi->dfi_max_bytes) { /* * After dispatching what we've accumulated, * this extent can fit in a new request * Just add it to the accumulated list of * extents and move on. */ start_idx = i; n_segs = 1; n_bytes = len; continue; } /* * Even after starting a new request, this extent * is too big. Split it until it fits. */ if ((r = split_extent(dfl, dfi, i, func, arg, kmflag)) != 0) { goto done; } start_idx = i + 1; n_segs = 0; n_bytes = 0; continue; } if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) { /* * This extent will put us over the limit on the number * of extents the device can accept. Dispatch what * we've accumulated so far. */ if ((r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag)) != 0) { goto done; } start_idx = i; n_segs = 1; n_bytes = len; continue; } n_segs++; n_bytes += len; } /* * If a copy wasn't required, and we haven't processed a subset of * the extents already, we can just use the original request. */ if (!need_copy && start_idx == 0) { return (func(dfl, arg, kmflag)); } r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag); done: dfl_free(dfl); return (r); } /* * Adjust the start and length of each extent in dfl so that it conforms to * the requirements in dfi. It also verifies that no extent extends beyond * the end of the device (given by len_blk). * * Returns 0 on success, or an error value. */ static int adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off) { dkioc_free_list_ext_t *exts = dfl->dfl_exts; /* * These must be uint64_t to prevent the P2 macros from truncating * the result. */ const uint64_t align = dfi->dfi_align; const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift; for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) { /* * Since there are no known requirements on the value of * dfl_offset, it's possible (though odd) to have a scenario * where dfl_offset == 1, and dfle_start == 511 (resulting * in an actual start offset of 512). As such, we always * apply the offset and find the resulting starting offset * and length (in bytes) first, then apply any rounding * and alignment. */ uint64_t start = exts->dfle_start + dfl->dfl_offset; uint64_t end = start + exts->dfle_length; /* * Make sure after applying dfl->dfl_offset and any alignment * adjustments that the results don't overflow. */ if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) { return (SET_ERROR(EOVERFLOW)); } if (end < start) { return (SET_ERROR(EOVERFLOW)); } /* * Make sure we don't extend past the end of the device */ if (end > max_off) { return (SET_ERROR(EINVAL)); } start = P2ROUNDUP(start, align); end = P2ALIGN(end, bsize); /* * Remove the offset so that when it's later applied again, * the correct start value is obtained. */ exts->dfle_start = start - dfl->dfl_offset; /* * If the original length was less than the block size * of the device, we can end up with end < start. If that * happens we just set the length to zero. */ exts->dfle_length = (end < start) ? 0 : end - start; } return (0); } /* * Take a subset of extents from dfl (starting at start_idx, with n entries) * and create a new dkioc_free_list_t, passing that to func. */ static int process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n, dfl_iter_fn_t func, void *arg, int kmflag) { dkioc_free_list_t *new_dfl = NULL; dkioc_free_list_ext_t *new_exts = NULL; dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx; size_t actual_n = n; int r = 0; if (n == 0) { return (0); } /* * Ignore any zero length extents. No known devices attach any * semantic meaning to such extents, and are likely just a result of * narrowing the range of the extent to fit the device alignment * requirements. It is possible the original caller submitted a * zero length extent, but we ignore those as well. Since we can't * communicate partial results back to the caller anyway, it's * unclear whether reporting that one of potentially many exents was * too small (without being able to identify which one) to the caller * of the DKIOCFREE request would be useful. */ for (uint64_t i = 0; i < n; i++) { if (exts[i].dfle_length == 0 && --actual_n == 0) { return (0); } } new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag); if (new_dfl == NULL) { return (SET_ERROR(ENOMEM)); } new_dfl->dfl_flags = dfl->dfl_flags; new_dfl->dfl_num_exts = actual_n; new_dfl->dfl_offset = dfl->dfl_offset; new_exts = new_dfl->dfl_exts; for (uint64_t i = 0; i < n; i++) { if (exts[i].dfle_length == 0) { continue; } *new_exts++ = exts[i]; } return (func(new_dfl, arg, kmflag)); } /* * If dfi_max_ext_bytes is set, use as the max segment length, * otherwise use dfi_max_bytes if set, otherwise fallback to UINT64_MAX */ #define MAX_SEGLEN(dfi) \ (((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes : \ ((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX) /* * Split the extent at idx into multiple lists (calling func for each one). */ static int split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx, dfl_iter_fn_t func, void *arg, int kmflag) { ASSERT3U(idx, <, dfl->dfl_num_exts); const uint64_t maxlen = MAX_SEGLEN(dfi); dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx; uint64_t remain = ext->dfle_length; int r; /* * Break the extent into as many single requests as needed. While it * would be possible in some circumstances to combine the final chunk * of the extent (after splitting) with the remaining extents in the * original request, it's not clear there's much benefit from the * added complexity. Such behavior could be added in the future if * it's determined to be worthwhile. */ while (remain > 0) { uint64_t start = dfl->dfl_offset + ext->dfle_start; uint64_t len = remain; /* * If we know we have at least one more segment left after * the current iteration of this loop, split it so that * the next segment starts on an aligned boundary. */ if (len > maxlen) { uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align); len = end - start; } ext->dfle_length = len; if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) { return (r); } ext->dfle_start += len; remain -= len; } return (0); }