1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5ea8dc4b6Seschrock * Common Development and Distribution License (the "License"). 6ea8dc4b6Seschrock * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22e14bb325SJeff Bonwick * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #include <sys/zfs_context.h> 27fa9e4066Sahrens #include <sys/spa.h> 28fa9e4066Sahrens #include <sys/vdev_impl.h> 29fa9e4066Sahrens #include <sys/zio.h> 30fa9e4066Sahrens #include <sys/avl.h> 31fa9e4066Sahrens 32614409b5Sahrens /* 33614409b5Sahrens * These tunables are for performance analysis. 34614409b5Sahrens */ 35614409b5Sahrens /* 36614409b5Sahrens * zfs_vdev_max_pending is the maximum number of i/os concurrently 37614409b5Sahrens * pending to each device. zfs_vdev_min_pending is the initial number 38614409b5Sahrens * of i/os pending to each device (before it starts ramping up to 39614409b5Sahrens * max_pending). 40614409b5Sahrens */ 41614409b5Sahrens int zfs_vdev_max_pending = 35; 42614409b5Sahrens int zfs_vdev_min_pending = 4; 43614409b5Sahrens 44614409b5Sahrens /* deadline = pri + (lbolt >> time_shift) */ 45614409b5Sahrens int zfs_vdev_time_shift = 6; 46614409b5Sahrens 47614409b5Sahrens /* exponential I/O issue ramp-up rate */ 48614409b5Sahrens int zfs_vdev_ramp_rate = 2; 49614409b5Sahrens 50614409b5Sahrens /* 51614409b5Sahrens * i/os will be aggregated into a single large i/o up to 52614409b5Sahrens * zfs_vdev_aggregation_limit bytes long. 53614409b5Sahrens */ 54614409b5Sahrens int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; 55614409b5Sahrens 56fa9e4066Sahrens /* 57fa9e4066Sahrens * Virtual device vector for disk I/O scheduling. 58fa9e4066Sahrens */ 59fa9e4066Sahrens int 60fa9e4066Sahrens vdev_queue_deadline_compare(const void *x1, const void *x2) 61fa9e4066Sahrens { 62fa9e4066Sahrens const zio_t *z1 = x1; 63fa9e4066Sahrens const zio_t *z2 = x2; 64fa9e4066Sahrens 65fa9e4066Sahrens if (z1->io_deadline < z2->io_deadline) 66fa9e4066Sahrens return (-1); 67fa9e4066Sahrens if (z1->io_deadline > z2->io_deadline) 68fa9e4066Sahrens return (1); 69fa9e4066Sahrens 70fa9e4066Sahrens if (z1->io_offset < z2->io_offset) 71fa9e4066Sahrens return (-1); 72fa9e4066Sahrens if (z1->io_offset > z2->io_offset) 73fa9e4066Sahrens return (1); 74fa9e4066Sahrens 75fa9e4066Sahrens if (z1 < z2) 76fa9e4066Sahrens return (-1); 77fa9e4066Sahrens if (z1 > z2) 78fa9e4066Sahrens return (1); 79fa9e4066Sahrens 80fa9e4066Sahrens return (0); 81fa9e4066Sahrens } 82fa9e4066Sahrens 83fa9e4066Sahrens int 84fa9e4066Sahrens vdev_queue_offset_compare(const void *x1, const void *x2) 85fa9e4066Sahrens { 86fa9e4066Sahrens const zio_t *z1 = x1; 87fa9e4066Sahrens const zio_t *z2 = x2; 88fa9e4066Sahrens 89fa9e4066Sahrens if (z1->io_offset < z2->io_offset) 90fa9e4066Sahrens return (-1); 91fa9e4066Sahrens if (z1->io_offset > z2->io_offset) 92fa9e4066Sahrens return (1); 93fa9e4066Sahrens 94fa9e4066Sahrens if (z1 < z2) 95fa9e4066Sahrens return (-1); 96fa9e4066Sahrens if (z1 > z2) 97fa9e4066Sahrens return (1); 98fa9e4066Sahrens 99fa9e4066Sahrens return (0); 100fa9e4066Sahrens } 101fa9e4066Sahrens 102fa9e4066Sahrens void 103fa9e4066Sahrens vdev_queue_init(vdev_t *vd) 104fa9e4066Sahrens { 105fa9e4066Sahrens vdev_queue_t *vq = &vd->vdev_queue; 106fa9e4066Sahrens 107fa9e4066Sahrens mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); 108fa9e4066Sahrens 109fa9e4066Sahrens avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, 110fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_deadline_node)); 111fa9e4066Sahrens 112fa9e4066Sahrens avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, 113fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 114fa9e4066Sahrens 115fa9e4066Sahrens avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, 116fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 117fa9e4066Sahrens 118fa9e4066Sahrens avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, 119fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 120fa9e4066Sahrens } 121fa9e4066Sahrens 122fa9e4066Sahrens void 123fa9e4066Sahrens vdev_queue_fini(vdev_t *vd) 124fa9e4066Sahrens { 125fa9e4066Sahrens vdev_queue_t *vq = &vd->vdev_queue; 126fa9e4066Sahrens 127fa9e4066Sahrens avl_destroy(&vq->vq_deadline_tree); 128fa9e4066Sahrens avl_destroy(&vq->vq_read_tree); 129fa9e4066Sahrens avl_destroy(&vq->vq_write_tree); 130fa9e4066Sahrens avl_destroy(&vq->vq_pending_tree); 131fa9e4066Sahrens 132fa9e4066Sahrens mutex_destroy(&vq->vq_lock); 133fa9e4066Sahrens } 134fa9e4066Sahrens 135ea8dc4b6Seschrock static void 136ea8dc4b6Seschrock vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) 137ea8dc4b6Seschrock { 138ea8dc4b6Seschrock avl_add(&vq->vq_deadline_tree, zio); 139ea8dc4b6Seschrock avl_add(zio->io_vdev_tree, zio); 140ea8dc4b6Seschrock } 141ea8dc4b6Seschrock 142ea8dc4b6Seschrock static void 143ea8dc4b6Seschrock vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) 144ea8dc4b6Seschrock { 145ea8dc4b6Seschrock avl_remove(&vq->vq_deadline_tree, zio); 146ea8dc4b6Seschrock avl_remove(zio->io_vdev_tree, zio); 147ea8dc4b6Seschrock } 148ea8dc4b6Seschrock 149fa9e4066Sahrens static void 150fa9e4066Sahrens vdev_queue_agg_io_done(zio_t *aio) 151fa9e4066Sahrens { 152fa9e4066Sahrens zio_t *dio; 153fa9e4066Sahrens uint64_t offset = 0; 154fa9e4066Sahrens 155fa9e4066Sahrens while ((dio = aio->io_delegate_list) != NULL) { 156fa9e4066Sahrens if (aio->io_type == ZIO_TYPE_READ) 157fa9e4066Sahrens bcopy((char *)aio->io_data + offset, dio->io_data, 158fa9e4066Sahrens dio->io_size); 159fa9e4066Sahrens offset += dio->io_size; 160fa9e4066Sahrens aio->io_delegate_list = dio->io_delegate_next; 161fa9e4066Sahrens dio->io_delegate_next = NULL; 162fa9e4066Sahrens dio->io_error = aio->io_error; 163e05725b1Sbonwick zio_execute(dio); 164fa9e4066Sahrens } 165fa9e4066Sahrens ASSERT3U(offset, ==, aio->io_size); 166fa9e4066Sahrens 167fa9e4066Sahrens zio_buf_free(aio->io_data, aio->io_size); 168fa9e4066Sahrens } 169fa9e4066Sahrens 170fa9e4066Sahrens #define IS_ADJACENT(io, nio) \ 171fa9e4066Sahrens ((io)->io_offset + (io)->io_size == (nio)->io_offset) 172fa9e4066Sahrens 173fa9e4066Sahrens static zio_t * 174e05725b1Sbonwick vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) 175fa9e4066Sahrens { 176fa9e4066Sahrens zio_t *fio, *lio, *aio, *dio; 177fa9e4066Sahrens avl_tree_t *tree; 178fa9e4066Sahrens uint64_t size; 179*8ad4d6ddSJeff Bonwick int flags; 180fa9e4066Sahrens 181fa9e4066Sahrens ASSERT(MUTEX_HELD(&vq->vq_lock)); 182fa9e4066Sahrens 183fa9e4066Sahrens if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || 184fa9e4066Sahrens avl_numnodes(&vq->vq_deadline_tree) == 0) 185fa9e4066Sahrens return (NULL); 186fa9e4066Sahrens 187fa9e4066Sahrens fio = lio = avl_first(&vq->vq_deadline_tree); 188fa9e4066Sahrens 189fa9e4066Sahrens tree = fio->io_vdev_tree; 190fa9e4066Sahrens size = fio->io_size; 191*8ad4d6ddSJeff Bonwick flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; 192*8ad4d6ddSJeff Bonwick 193*8ad4d6ddSJeff Bonwick if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { 194*8ad4d6ddSJeff Bonwick /* 195*8ad4d6ddSJeff Bonwick * We can aggregate I/Os that are adjacent and of the 196*8ad4d6ddSJeff Bonwick * same flavor, as expressed by the AGG_INHERIT flags. 197*8ad4d6ddSJeff Bonwick * The latter is necessary so that certain attributes 198*8ad4d6ddSJeff Bonwick * of the I/O, such as whether it's a normal I/O or a 199*8ad4d6ddSJeff Bonwick * scrub/resilver, can be preserved in the aggregate. 200*8ad4d6ddSJeff Bonwick */ 201*8ad4d6ddSJeff Bonwick while ((dio = AVL_PREV(tree, fio)) != NULL && 202*8ad4d6ddSJeff Bonwick IS_ADJACENT(dio, fio) && 203*8ad4d6ddSJeff Bonwick (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && 204*8ad4d6ddSJeff Bonwick size + dio->io_size <= zfs_vdev_aggregation_limit) { 205*8ad4d6ddSJeff Bonwick dio->io_delegate_next = fio; 206*8ad4d6ddSJeff Bonwick fio = dio; 207*8ad4d6ddSJeff Bonwick size += dio->io_size; 208*8ad4d6ddSJeff Bonwick } 209*8ad4d6ddSJeff Bonwick while ((dio = AVL_NEXT(tree, lio)) != NULL && 210*8ad4d6ddSJeff Bonwick IS_ADJACENT(lio, dio) && 211*8ad4d6ddSJeff Bonwick (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && 212*8ad4d6ddSJeff Bonwick size + dio->io_size <= zfs_vdev_aggregation_limit) { 213*8ad4d6ddSJeff Bonwick lio->io_delegate_next = dio; 214*8ad4d6ddSJeff Bonwick lio = dio; 215*8ad4d6ddSJeff Bonwick size += dio->io_size; 216*8ad4d6ddSJeff Bonwick } 217fa9e4066Sahrens } 218fa9e4066Sahrens 219fa9e4066Sahrens if (fio != lio) { 220fa9e4066Sahrens char *buf = zio_buf_alloc(size); 221fa9e4066Sahrens uint64_t offset = 0; 222fa9e4066Sahrens 223614409b5Sahrens ASSERT(size <= zfs_vdev_aggregation_limit); 224fa9e4066Sahrens 225e14bb325SJeff Bonwick aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, 226e14bb325SJeff Bonwick buf, size, fio->io_type, ZIO_PRIORITY_NOW, 227*8ad4d6ddSJeff Bonwick flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, 228fa9e4066Sahrens vdev_queue_agg_io_done, NULL); 229fa9e4066Sahrens 230fa9e4066Sahrens aio->io_delegate_list = fio; 231fa9e4066Sahrens 232fa9e4066Sahrens for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { 233fa9e4066Sahrens ASSERT(dio->io_type == aio->io_type); 234ea8dc4b6Seschrock ASSERT(dio->io_vdev_tree == tree); 235fa9e4066Sahrens if (dio->io_type == ZIO_TYPE_WRITE) 236fa9e4066Sahrens bcopy(dio->io_data, buf + offset, dio->io_size); 237fa9e4066Sahrens offset += dio->io_size; 238ea8dc4b6Seschrock vdev_queue_io_remove(vq, dio); 239fa9e4066Sahrens zio_vdev_io_bypass(dio); 240fa9e4066Sahrens } 241fa9e4066Sahrens 242fa9e4066Sahrens ASSERT(offset == size); 243fa9e4066Sahrens 244fa9e4066Sahrens avl_add(&vq->vq_pending_tree, aio); 245fa9e4066Sahrens 246fa9e4066Sahrens return (aio); 247fa9e4066Sahrens } 248fa9e4066Sahrens 249ea8dc4b6Seschrock ASSERT(fio->io_vdev_tree == tree); 250ea8dc4b6Seschrock vdev_queue_io_remove(vq, fio); 251fa9e4066Sahrens 252fa9e4066Sahrens avl_add(&vq->vq_pending_tree, fio); 253fa9e4066Sahrens 254fa9e4066Sahrens return (fio); 255fa9e4066Sahrens } 256fa9e4066Sahrens 257fa9e4066Sahrens zio_t * 258fa9e4066Sahrens vdev_queue_io(zio_t *zio) 259fa9e4066Sahrens { 260fa9e4066Sahrens vdev_queue_t *vq = &zio->io_vd->vdev_queue; 261fa9e4066Sahrens zio_t *nio; 262fa9e4066Sahrens 263fa9e4066Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 264fa9e4066Sahrens 265fa9e4066Sahrens if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) 266fa9e4066Sahrens return (zio); 267fa9e4066Sahrens 268fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; 269fa9e4066Sahrens 270fa9e4066Sahrens if (zio->io_type == ZIO_TYPE_READ) 271fa9e4066Sahrens zio->io_vdev_tree = &vq->vq_read_tree; 272fa9e4066Sahrens else 273fa9e4066Sahrens zio->io_vdev_tree = &vq->vq_write_tree; 274fa9e4066Sahrens 275fa9e4066Sahrens mutex_enter(&vq->vq_lock); 276fa9e4066Sahrens 277e14bb325SJeff Bonwick zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; 278fa9e4066Sahrens 279ea8dc4b6Seschrock vdev_queue_io_add(vq, zio); 280fa9e4066Sahrens 281e05725b1Sbonwick nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); 282fa9e4066Sahrens 283fa9e4066Sahrens mutex_exit(&vq->vq_lock); 284fa9e4066Sahrens 285e05725b1Sbonwick if (nio == NULL) 286e05725b1Sbonwick return (NULL); 287e05725b1Sbonwick 288e05725b1Sbonwick if (nio->io_done == vdev_queue_agg_io_done) { 289e05725b1Sbonwick zio_nowait(nio); 290e05725b1Sbonwick return (NULL); 291e05725b1Sbonwick } 292fa9e4066Sahrens 293e05725b1Sbonwick return (nio); 294fa9e4066Sahrens } 295fa9e4066Sahrens 296fa9e4066Sahrens void 297fa9e4066Sahrens vdev_queue_io_done(zio_t *zio) 298fa9e4066Sahrens { 299fa9e4066Sahrens vdev_queue_t *vq = &zio->io_vd->vdev_queue; 300fa9e4066Sahrens 301fa9e4066Sahrens mutex_enter(&vq->vq_lock); 302fa9e4066Sahrens 303fa9e4066Sahrens avl_remove(&vq->vq_pending_tree, zio); 304fa9e4066Sahrens 305e14bb325SJeff Bonwick for (int i = 0; i < zfs_vdev_ramp_rate; i++) { 306e14bb325SJeff Bonwick zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); 307fa9e4066Sahrens if (nio == NULL) 308fa9e4066Sahrens break; 309fa9e4066Sahrens mutex_exit(&vq->vq_lock); 310e05725b1Sbonwick if (nio->io_done == vdev_queue_agg_io_done) { 311e05725b1Sbonwick zio_nowait(nio); 312e05725b1Sbonwick } else { 313fa9e4066Sahrens zio_vdev_io_reissue(nio); 314e05725b1Sbonwick zio_execute(nio); 315e05725b1Sbonwick } 316fa9e4066Sahrens mutex_enter(&vq->vq_lock); 317fa9e4066Sahrens } 318fa9e4066Sahrens 319fa9e4066Sahrens mutex_exit(&vq->vq_lock); 320fa9e4066Sahrens } 321