1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5ea8dc4b6Seschrock * Common Development and Distribution License (the "License"). 6ea8dc4b6Seschrock * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22*a3f829aeSBill Moore * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #include <sys/zfs_context.h> 27fa9e4066Sahrens #include <sys/spa.h> 28fa9e4066Sahrens #include <sys/vdev_impl.h> 29fa9e4066Sahrens #include <sys/zio.h> 30fa9e4066Sahrens #include <sys/avl.h> 31fa9e4066Sahrens 32614409b5Sahrens /* 33614409b5Sahrens * These tunables are for performance analysis. 34614409b5Sahrens */ 35614409b5Sahrens /* 36614409b5Sahrens * zfs_vdev_max_pending is the maximum number of i/os concurrently 37614409b5Sahrens * pending to each device. zfs_vdev_min_pending is the initial number 38614409b5Sahrens * of i/os pending to each device (before it starts ramping up to 39614409b5Sahrens * max_pending). 40614409b5Sahrens */ 41614409b5Sahrens int zfs_vdev_max_pending = 35; 42614409b5Sahrens int zfs_vdev_min_pending = 4; 43614409b5Sahrens 44614409b5Sahrens /* deadline = pri + (lbolt >> time_shift) */ 45614409b5Sahrens int zfs_vdev_time_shift = 6; 46614409b5Sahrens 47614409b5Sahrens /* exponential I/O issue ramp-up rate */ 48614409b5Sahrens int zfs_vdev_ramp_rate = 2; 49614409b5Sahrens 50614409b5Sahrens /* 51614409b5Sahrens * i/os will be aggregated into a single large i/o up to 52614409b5Sahrens * zfs_vdev_aggregation_limit bytes long. 53614409b5Sahrens */ 54614409b5Sahrens int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; 55614409b5Sahrens 56fa9e4066Sahrens /* 57fa9e4066Sahrens * Virtual device vector for disk I/O scheduling. 58fa9e4066Sahrens */ 59fa9e4066Sahrens int 60fa9e4066Sahrens vdev_queue_deadline_compare(const void *x1, const void *x2) 61fa9e4066Sahrens { 62fa9e4066Sahrens const zio_t *z1 = x1; 63fa9e4066Sahrens const zio_t *z2 = x2; 64fa9e4066Sahrens 65fa9e4066Sahrens if (z1->io_deadline < z2->io_deadline) 66fa9e4066Sahrens return (-1); 67fa9e4066Sahrens if (z1->io_deadline > z2->io_deadline) 68fa9e4066Sahrens return (1); 69fa9e4066Sahrens 70fa9e4066Sahrens if (z1->io_offset < z2->io_offset) 71fa9e4066Sahrens return (-1); 72fa9e4066Sahrens if (z1->io_offset > z2->io_offset) 73fa9e4066Sahrens return (1); 74fa9e4066Sahrens 75fa9e4066Sahrens if (z1 < z2) 76fa9e4066Sahrens return (-1); 77fa9e4066Sahrens if (z1 > z2) 78fa9e4066Sahrens return (1); 79fa9e4066Sahrens 80fa9e4066Sahrens return (0); 81fa9e4066Sahrens } 82fa9e4066Sahrens 83fa9e4066Sahrens int 84fa9e4066Sahrens vdev_queue_offset_compare(const void *x1, const void *x2) 85fa9e4066Sahrens { 86fa9e4066Sahrens const zio_t *z1 = x1; 87fa9e4066Sahrens const zio_t *z2 = x2; 88fa9e4066Sahrens 89fa9e4066Sahrens if (z1->io_offset < z2->io_offset) 90fa9e4066Sahrens return (-1); 91fa9e4066Sahrens if (z1->io_offset > z2->io_offset) 92fa9e4066Sahrens return (1); 93fa9e4066Sahrens 94fa9e4066Sahrens if (z1 < z2) 95fa9e4066Sahrens return (-1); 96fa9e4066Sahrens if (z1 > z2) 97fa9e4066Sahrens return (1); 98fa9e4066Sahrens 99fa9e4066Sahrens return (0); 100fa9e4066Sahrens } 101fa9e4066Sahrens 102fa9e4066Sahrens void 103fa9e4066Sahrens vdev_queue_init(vdev_t *vd) 104fa9e4066Sahrens { 105fa9e4066Sahrens vdev_queue_t *vq = &vd->vdev_queue; 106fa9e4066Sahrens 107fa9e4066Sahrens mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); 108fa9e4066Sahrens 109fa9e4066Sahrens avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, 110fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_deadline_node)); 111fa9e4066Sahrens 112fa9e4066Sahrens avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, 113fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 114fa9e4066Sahrens 115fa9e4066Sahrens avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, 116fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 117fa9e4066Sahrens 118fa9e4066Sahrens avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, 119fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 120fa9e4066Sahrens } 121fa9e4066Sahrens 122fa9e4066Sahrens void 123fa9e4066Sahrens vdev_queue_fini(vdev_t *vd) 124fa9e4066Sahrens { 125fa9e4066Sahrens vdev_queue_t *vq = &vd->vdev_queue; 126fa9e4066Sahrens 127fa9e4066Sahrens avl_destroy(&vq->vq_deadline_tree); 128fa9e4066Sahrens avl_destroy(&vq->vq_read_tree); 129fa9e4066Sahrens avl_destroy(&vq->vq_write_tree); 130fa9e4066Sahrens avl_destroy(&vq->vq_pending_tree); 131fa9e4066Sahrens 132fa9e4066Sahrens mutex_destroy(&vq->vq_lock); 133fa9e4066Sahrens } 134fa9e4066Sahrens 135ea8dc4b6Seschrock static void 136ea8dc4b6Seschrock vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) 137ea8dc4b6Seschrock { 138ea8dc4b6Seschrock avl_add(&vq->vq_deadline_tree, zio); 139ea8dc4b6Seschrock avl_add(zio->io_vdev_tree, zio); 140ea8dc4b6Seschrock } 141ea8dc4b6Seschrock 142ea8dc4b6Seschrock static void 143ea8dc4b6Seschrock vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) 144ea8dc4b6Seschrock { 145ea8dc4b6Seschrock avl_remove(&vq->vq_deadline_tree, zio); 146ea8dc4b6Seschrock avl_remove(zio->io_vdev_tree, zio); 147ea8dc4b6Seschrock } 148ea8dc4b6Seschrock 149fa9e4066Sahrens static void 150fa9e4066Sahrens vdev_queue_agg_io_done(zio_t *aio) 151fa9e4066Sahrens { 152*a3f829aeSBill Moore zio_t *pio; 153fa9e4066Sahrens 154*a3f829aeSBill Moore while ((pio = zio_walk_parents(aio)) != NULL) 155fa9e4066Sahrens if (aio->io_type == ZIO_TYPE_READ) 156*a3f829aeSBill Moore bcopy((char *)aio->io_data + (pio->io_offset - 157*a3f829aeSBill Moore aio->io_offset), pio->io_data, pio->io_size); 158fa9e4066Sahrens 159fa9e4066Sahrens zio_buf_free(aio->io_data, aio->io_size); 160fa9e4066Sahrens } 161fa9e4066Sahrens 162fa9e4066Sahrens #define IS_ADJACENT(io, nio) \ 163fa9e4066Sahrens ((io)->io_offset + (io)->io_size == (nio)->io_offset) 164fa9e4066Sahrens 165fa9e4066Sahrens static zio_t * 166e05725b1Sbonwick vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) 167fa9e4066Sahrens { 168*a3f829aeSBill Moore zio_t *fio, *lio, *aio, *dio, *nio; 169*a3f829aeSBill Moore avl_tree_t *t; 170fa9e4066Sahrens uint64_t size; 1718ad4d6ddSJeff Bonwick int flags; 172fa9e4066Sahrens 173fa9e4066Sahrens ASSERT(MUTEX_HELD(&vq->vq_lock)); 174fa9e4066Sahrens 175fa9e4066Sahrens if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || 176fa9e4066Sahrens avl_numnodes(&vq->vq_deadline_tree) == 0) 177fa9e4066Sahrens return (NULL); 178fa9e4066Sahrens 179fa9e4066Sahrens fio = lio = avl_first(&vq->vq_deadline_tree); 180fa9e4066Sahrens 181*a3f829aeSBill Moore t = fio->io_vdev_tree; 182fa9e4066Sahrens size = fio->io_size; 1838ad4d6ddSJeff Bonwick flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; 1848ad4d6ddSJeff Bonwick 1858ad4d6ddSJeff Bonwick if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { 1868ad4d6ddSJeff Bonwick /* 1878ad4d6ddSJeff Bonwick * We can aggregate I/Os that are adjacent and of the 1888ad4d6ddSJeff Bonwick * same flavor, as expressed by the AGG_INHERIT flags. 1898ad4d6ddSJeff Bonwick * The latter is necessary so that certain attributes 1908ad4d6ddSJeff Bonwick * of the I/O, such as whether it's a normal I/O or a 1918ad4d6ddSJeff Bonwick * scrub/resilver, can be preserved in the aggregate. 1928ad4d6ddSJeff Bonwick */ 193*a3f829aeSBill Moore while ((dio = AVL_PREV(t, fio)) != NULL && 1948ad4d6ddSJeff Bonwick IS_ADJACENT(dio, fio) && 1958ad4d6ddSJeff Bonwick (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && 1968ad4d6ddSJeff Bonwick size + dio->io_size <= zfs_vdev_aggregation_limit) { 1978ad4d6ddSJeff Bonwick fio = dio; 1988ad4d6ddSJeff Bonwick size += dio->io_size; 1998ad4d6ddSJeff Bonwick } 200*a3f829aeSBill Moore while ((dio = AVL_NEXT(t, lio)) != NULL && 2018ad4d6ddSJeff Bonwick IS_ADJACENT(lio, dio) && 2028ad4d6ddSJeff Bonwick (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && 2038ad4d6ddSJeff Bonwick size + dio->io_size <= zfs_vdev_aggregation_limit) { 2048ad4d6ddSJeff Bonwick lio = dio; 2058ad4d6ddSJeff Bonwick size += dio->io_size; 2068ad4d6ddSJeff Bonwick } 207fa9e4066Sahrens } 208fa9e4066Sahrens 209fa9e4066Sahrens if (fio != lio) { 210614409b5Sahrens ASSERT(size <= zfs_vdev_aggregation_limit); 211fa9e4066Sahrens 212e14bb325SJeff Bonwick aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, 213*a3f829aeSBill Moore zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW, 2148ad4d6ddSJeff Bonwick flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, 215fa9e4066Sahrens vdev_queue_agg_io_done, NULL); 216fa9e4066Sahrens 217*a3f829aeSBill Moore /* We want to process lio, then stop */ 218*a3f829aeSBill Moore lio = AVL_NEXT(t, lio); 219*a3f829aeSBill Moore for (dio = fio; dio != lio; dio = nio) { 220fa9e4066Sahrens ASSERT(dio->io_type == aio->io_type); 221*a3f829aeSBill Moore ASSERT(dio->io_vdev_tree == t); 222*a3f829aeSBill Moore 223fa9e4066Sahrens if (dio->io_type == ZIO_TYPE_WRITE) 224*a3f829aeSBill Moore bcopy(dio->io_data, (char *)aio->io_data + 225*a3f829aeSBill Moore (dio->io_offset - aio->io_offset), 226*a3f829aeSBill Moore dio->io_size); 227*a3f829aeSBill Moore nio = AVL_NEXT(t, dio); 228*a3f829aeSBill Moore 229*a3f829aeSBill Moore zio_add_child(dio, aio); 230ea8dc4b6Seschrock vdev_queue_io_remove(vq, dio); 231fa9e4066Sahrens zio_vdev_io_bypass(dio); 232*a3f829aeSBill Moore zio_execute(dio); 233fa9e4066Sahrens } 234fa9e4066Sahrens 235fa9e4066Sahrens avl_add(&vq->vq_pending_tree, aio); 236fa9e4066Sahrens 237fa9e4066Sahrens return (aio); 238fa9e4066Sahrens } 239fa9e4066Sahrens 240*a3f829aeSBill Moore ASSERT(fio->io_vdev_tree == t); 241ea8dc4b6Seschrock vdev_queue_io_remove(vq, fio); 242fa9e4066Sahrens 243fa9e4066Sahrens avl_add(&vq->vq_pending_tree, fio); 244fa9e4066Sahrens 245fa9e4066Sahrens return (fio); 246fa9e4066Sahrens } 247fa9e4066Sahrens 248fa9e4066Sahrens zio_t * 249fa9e4066Sahrens vdev_queue_io(zio_t *zio) 250fa9e4066Sahrens { 251fa9e4066Sahrens vdev_queue_t *vq = &zio->io_vd->vdev_queue; 252fa9e4066Sahrens zio_t *nio; 253fa9e4066Sahrens 254fa9e4066Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 255fa9e4066Sahrens 256fa9e4066Sahrens if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) 257fa9e4066Sahrens return (zio); 258fa9e4066Sahrens 259fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; 260fa9e4066Sahrens 261fa9e4066Sahrens if (zio->io_type == ZIO_TYPE_READ) 262fa9e4066Sahrens zio->io_vdev_tree = &vq->vq_read_tree; 263fa9e4066Sahrens else 264fa9e4066Sahrens zio->io_vdev_tree = &vq->vq_write_tree; 265fa9e4066Sahrens 266fa9e4066Sahrens mutex_enter(&vq->vq_lock); 267fa9e4066Sahrens 268e14bb325SJeff Bonwick zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; 269fa9e4066Sahrens 270ea8dc4b6Seschrock vdev_queue_io_add(vq, zio); 271fa9e4066Sahrens 272e05725b1Sbonwick nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); 273fa9e4066Sahrens 274fa9e4066Sahrens mutex_exit(&vq->vq_lock); 275fa9e4066Sahrens 276e05725b1Sbonwick if (nio == NULL) 277e05725b1Sbonwick return (NULL); 278e05725b1Sbonwick 279e05725b1Sbonwick if (nio->io_done == vdev_queue_agg_io_done) { 280e05725b1Sbonwick zio_nowait(nio); 281e05725b1Sbonwick return (NULL); 282e05725b1Sbonwick } 283fa9e4066Sahrens 284e05725b1Sbonwick return (nio); 285fa9e4066Sahrens } 286fa9e4066Sahrens 287fa9e4066Sahrens void 288fa9e4066Sahrens vdev_queue_io_done(zio_t *zio) 289fa9e4066Sahrens { 290fa9e4066Sahrens vdev_queue_t *vq = &zio->io_vd->vdev_queue; 291fa9e4066Sahrens 292fa9e4066Sahrens mutex_enter(&vq->vq_lock); 293fa9e4066Sahrens 294fa9e4066Sahrens avl_remove(&vq->vq_pending_tree, zio); 295fa9e4066Sahrens 296e14bb325SJeff Bonwick for (int i = 0; i < zfs_vdev_ramp_rate; i++) { 297e14bb325SJeff Bonwick zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); 298fa9e4066Sahrens if (nio == NULL) 299fa9e4066Sahrens break; 300fa9e4066Sahrens mutex_exit(&vq->vq_lock); 301e05725b1Sbonwick if (nio->io_done == vdev_queue_agg_io_done) { 302e05725b1Sbonwick zio_nowait(nio); 303e05725b1Sbonwick } else { 304fa9e4066Sahrens zio_vdev_io_reissue(nio); 305e05725b1Sbonwick zio_execute(nio); 306e05725b1Sbonwick } 307fa9e4066Sahrens mutex_enter(&vq->vq_lock); 308fa9e4066Sahrens } 309fa9e4066Sahrens 310fa9e4066Sahrens mutex_exit(&vq->vq_lock); 311fa9e4066Sahrens } 312