1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5ea8dc4b6Seschrock * Common Development and Distribution License (the "License"). 6ea8dc4b6Seschrock * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 22ea8dc4b6Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens #include <sys/zfs_context.h> 29fa9e4066Sahrens #include <sys/spa.h> 30fa9e4066Sahrens #include <sys/vdev_impl.h> 31fa9e4066Sahrens #include <sys/zio.h> 32fa9e4066Sahrens #include <sys/avl.h> 33fa9e4066Sahrens 34*614409b5Sahrens /* 35*614409b5Sahrens * These tunables are for performance analysis. 36*614409b5Sahrens */ 37*614409b5Sahrens /* 38*614409b5Sahrens * zfs_vdev_max_pending is the maximum number of i/os concurrently 39*614409b5Sahrens * pending to each device. zfs_vdev_min_pending is the initial number 40*614409b5Sahrens * of i/os pending to each device (before it starts ramping up to 41*614409b5Sahrens * max_pending). 42*614409b5Sahrens */ 43*614409b5Sahrens int zfs_vdev_max_pending = 35; 44*614409b5Sahrens int zfs_vdev_min_pending = 4; 45*614409b5Sahrens 46*614409b5Sahrens /* maximum scrub/resilver I/O queue */ 47*614409b5Sahrens int zfs_scrub_limit = 70; 48*614409b5Sahrens 49*614409b5Sahrens /* deadline = pri + (lbolt >> time_shift) */ 50*614409b5Sahrens int zfs_vdev_time_shift = 6; 51*614409b5Sahrens 52*614409b5Sahrens /* exponential I/O issue ramp-up rate */ 53*614409b5Sahrens int zfs_vdev_ramp_rate = 2; 54*614409b5Sahrens 55*614409b5Sahrens /* 56*614409b5Sahrens * i/os will be aggregated into a single large i/o up to 57*614409b5Sahrens * zfs_vdev_aggregation_limit bytes long. 58*614409b5Sahrens */ 59*614409b5Sahrens int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; 60*614409b5Sahrens 61fa9e4066Sahrens /* 62fa9e4066Sahrens * Virtual device vector for disk I/O scheduling. 63fa9e4066Sahrens */ 64fa9e4066Sahrens int 65fa9e4066Sahrens vdev_queue_deadline_compare(const void *x1, const void *x2) 66fa9e4066Sahrens { 67fa9e4066Sahrens const zio_t *z1 = x1; 68fa9e4066Sahrens const zio_t *z2 = x2; 69fa9e4066Sahrens 70fa9e4066Sahrens if (z1->io_deadline < z2->io_deadline) 71fa9e4066Sahrens return (-1); 72fa9e4066Sahrens if (z1->io_deadline > z2->io_deadline) 73fa9e4066Sahrens return (1); 74fa9e4066Sahrens 75fa9e4066Sahrens if (z1->io_offset < z2->io_offset) 76fa9e4066Sahrens return (-1); 77fa9e4066Sahrens if (z1->io_offset > z2->io_offset) 78fa9e4066Sahrens return (1); 79fa9e4066Sahrens 80fa9e4066Sahrens if (z1 < z2) 81fa9e4066Sahrens return (-1); 82fa9e4066Sahrens if (z1 > z2) 83fa9e4066Sahrens return (1); 84fa9e4066Sahrens 85fa9e4066Sahrens return (0); 86fa9e4066Sahrens } 87fa9e4066Sahrens 88fa9e4066Sahrens int 89fa9e4066Sahrens vdev_queue_offset_compare(const void *x1, const void *x2) 90fa9e4066Sahrens { 91fa9e4066Sahrens const zio_t *z1 = x1; 92fa9e4066Sahrens const zio_t *z2 = x2; 93fa9e4066Sahrens 94fa9e4066Sahrens if (z1->io_offset < z2->io_offset) 95fa9e4066Sahrens return (-1); 96fa9e4066Sahrens if (z1->io_offset > z2->io_offset) 97fa9e4066Sahrens return (1); 98fa9e4066Sahrens 99fa9e4066Sahrens if (z1 < z2) 100fa9e4066Sahrens return (-1); 101fa9e4066Sahrens if (z1 > z2) 102fa9e4066Sahrens return (1); 103fa9e4066Sahrens 104fa9e4066Sahrens return (0); 105fa9e4066Sahrens } 106fa9e4066Sahrens 107fa9e4066Sahrens void 108fa9e4066Sahrens vdev_queue_init(vdev_t *vd) 109fa9e4066Sahrens { 110fa9e4066Sahrens vdev_queue_t *vq = &vd->vdev_queue; 111fa9e4066Sahrens 112fa9e4066Sahrens mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); 113fa9e4066Sahrens 114fa9e4066Sahrens avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, 115fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_deadline_node)); 116fa9e4066Sahrens 117fa9e4066Sahrens avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, 118fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 119fa9e4066Sahrens 120fa9e4066Sahrens avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, 121fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 122fa9e4066Sahrens 123fa9e4066Sahrens avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, 124fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 125fa9e4066Sahrens } 126fa9e4066Sahrens 127fa9e4066Sahrens void 128fa9e4066Sahrens vdev_queue_fini(vdev_t *vd) 129fa9e4066Sahrens { 130fa9e4066Sahrens vdev_queue_t *vq = &vd->vdev_queue; 131fa9e4066Sahrens 132ea8dc4b6Seschrock ASSERT(vq->vq_scrub_count == 0); 133ea8dc4b6Seschrock 134fa9e4066Sahrens avl_destroy(&vq->vq_deadline_tree); 135fa9e4066Sahrens avl_destroy(&vq->vq_read_tree); 136fa9e4066Sahrens avl_destroy(&vq->vq_write_tree); 137fa9e4066Sahrens avl_destroy(&vq->vq_pending_tree); 138fa9e4066Sahrens 139fa9e4066Sahrens mutex_destroy(&vq->vq_lock); 140fa9e4066Sahrens } 141fa9e4066Sahrens 142ea8dc4b6Seschrock static void 143ea8dc4b6Seschrock vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) 144ea8dc4b6Seschrock { 145ea8dc4b6Seschrock avl_add(&vq->vq_deadline_tree, zio); 146ea8dc4b6Seschrock avl_add(zio->io_vdev_tree, zio); 147ea8dc4b6Seschrock 148d80c45e0Sbonwick if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) && 149*614409b5Sahrens ++vq->vq_scrub_count >= zfs_scrub_limit) 150ea8dc4b6Seschrock spa_scrub_throttle(zio->io_spa, 1); 151ea8dc4b6Seschrock } 152ea8dc4b6Seschrock 153ea8dc4b6Seschrock static void 154ea8dc4b6Seschrock vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) 155ea8dc4b6Seschrock { 156d80c45e0Sbonwick if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) && 157*614409b5Sahrens vq->vq_scrub_count-- >= zfs_scrub_limit) 158ea8dc4b6Seschrock spa_scrub_throttle(zio->io_spa, -1); 159ea8dc4b6Seschrock 160ea8dc4b6Seschrock avl_remove(&vq->vq_deadline_tree, zio); 161ea8dc4b6Seschrock avl_remove(zio->io_vdev_tree, zio); 162ea8dc4b6Seschrock } 163ea8dc4b6Seschrock 164fa9e4066Sahrens static void 165fa9e4066Sahrens vdev_queue_agg_io_done(zio_t *aio) 166fa9e4066Sahrens { 167fa9e4066Sahrens zio_t *dio; 168fa9e4066Sahrens uint64_t offset = 0; 169fa9e4066Sahrens 170fa9e4066Sahrens while ((dio = aio->io_delegate_list) != NULL) { 171fa9e4066Sahrens if (aio->io_type == ZIO_TYPE_READ) 172fa9e4066Sahrens bcopy((char *)aio->io_data + offset, dio->io_data, 173fa9e4066Sahrens dio->io_size); 174fa9e4066Sahrens offset += dio->io_size; 175fa9e4066Sahrens aio->io_delegate_list = dio->io_delegate_next; 176fa9e4066Sahrens dio->io_delegate_next = NULL; 177fa9e4066Sahrens dio->io_error = aio->io_error; 178fa9e4066Sahrens zio_next_stage(dio); 179fa9e4066Sahrens } 180fa9e4066Sahrens ASSERT3U(offset, ==, aio->io_size); 181fa9e4066Sahrens 182fa9e4066Sahrens zio_buf_free(aio->io_data, aio->io_size); 183fa9e4066Sahrens } 184fa9e4066Sahrens 185fa9e4066Sahrens #define IS_ADJACENT(io, nio) \ 186fa9e4066Sahrens ((io)->io_offset + (io)->io_size == (nio)->io_offset) 187fa9e4066Sahrens 188fa9e4066Sahrens typedef void zio_issue_func_t(zio_t *); 189fa9e4066Sahrens 190fa9e4066Sahrens static zio_t * 191fa9e4066Sahrens vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, 192fa9e4066Sahrens zio_issue_func_t **funcp) 193fa9e4066Sahrens { 194fa9e4066Sahrens zio_t *fio, *lio, *aio, *dio; 195fa9e4066Sahrens avl_tree_t *tree; 196fa9e4066Sahrens uint64_t size; 197fa9e4066Sahrens 198fa9e4066Sahrens ASSERT(MUTEX_HELD(&vq->vq_lock)); 199fa9e4066Sahrens 200fa9e4066Sahrens *funcp = NULL; 201fa9e4066Sahrens 202fa9e4066Sahrens if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || 203fa9e4066Sahrens avl_numnodes(&vq->vq_deadline_tree) == 0) 204fa9e4066Sahrens return (NULL); 205fa9e4066Sahrens 206fa9e4066Sahrens fio = lio = avl_first(&vq->vq_deadline_tree); 207fa9e4066Sahrens 208fa9e4066Sahrens tree = fio->io_vdev_tree; 209fa9e4066Sahrens size = fio->io_size; 210fa9e4066Sahrens 211fa9e4066Sahrens while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && 212*614409b5Sahrens size + dio->io_size <= zfs_vdev_aggregation_limit) { 213fa9e4066Sahrens dio->io_delegate_next = fio; 214fa9e4066Sahrens fio = dio; 215fa9e4066Sahrens size += dio->io_size; 216fa9e4066Sahrens } 217fa9e4066Sahrens 218fa9e4066Sahrens while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && 219*614409b5Sahrens size + dio->io_size <= zfs_vdev_aggregation_limit) { 220fa9e4066Sahrens lio->io_delegate_next = dio; 221fa9e4066Sahrens lio = dio; 222fa9e4066Sahrens size += dio->io_size; 223fa9e4066Sahrens } 224fa9e4066Sahrens 225fa9e4066Sahrens if (fio != lio) { 226fa9e4066Sahrens char *buf = zio_buf_alloc(size); 227fa9e4066Sahrens uint64_t offset = 0; 228fa9e4066Sahrens int nagg = 0; 229fa9e4066Sahrens 230*614409b5Sahrens ASSERT(size <= zfs_vdev_aggregation_limit); 231fa9e4066Sahrens 232fa9e4066Sahrens aio = zio_vdev_child_io(fio, NULL, fio->io_vd, 233fa9e4066Sahrens fio->io_offset, buf, size, fio->io_type, 234fa9e4066Sahrens ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | 235ea8dc4b6Seschrock ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | 236ea8dc4b6Seschrock ZIO_FLAG_NOBOOKMARK, 237fa9e4066Sahrens vdev_queue_agg_io_done, NULL); 238fa9e4066Sahrens 239fa9e4066Sahrens aio->io_delegate_list = fio; 240fa9e4066Sahrens 241fa9e4066Sahrens for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { 242fa9e4066Sahrens ASSERT(dio->io_type == aio->io_type); 243ea8dc4b6Seschrock ASSERT(dio->io_vdev_tree == tree); 244fa9e4066Sahrens if (dio->io_type == ZIO_TYPE_WRITE) 245fa9e4066Sahrens bcopy(dio->io_data, buf + offset, dio->io_size); 246fa9e4066Sahrens offset += dio->io_size; 247ea8dc4b6Seschrock vdev_queue_io_remove(vq, dio); 248fa9e4066Sahrens zio_vdev_io_bypass(dio); 249fa9e4066Sahrens nagg++; 250fa9e4066Sahrens } 251fa9e4066Sahrens 252fa9e4066Sahrens ASSERT(offset == size); 253fa9e4066Sahrens 254fa9e4066Sahrens dprintf("%5s T=%llu off=%8llx agg=%3d " 255fa9e4066Sahrens "old=%5llx new=%5llx\n", 256fa9e4066Sahrens zio_type_name[fio->io_type], 257fa9e4066Sahrens fio->io_deadline, fio->io_offset, nagg, fio->io_size, size); 258fa9e4066Sahrens 259fa9e4066Sahrens avl_add(&vq->vq_pending_tree, aio); 260fa9e4066Sahrens 261fa9e4066Sahrens *funcp = zio_nowait; 262fa9e4066Sahrens return (aio); 263fa9e4066Sahrens } 264fa9e4066Sahrens 265ea8dc4b6Seschrock ASSERT(fio->io_vdev_tree == tree); 266ea8dc4b6Seschrock vdev_queue_io_remove(vq, fio); 267fa9e4066Sahrens 268fa9e4066Sahrens avl_add(&vq->vq_pending_tree, fio); 269fa9e4066Sahrens 270fa9e4066Sahrens *funcp = zio_next_stage; 271fa9e4066Sahrens 272fa9e4066Sahrens return (fio); 273fa9e4066Sahrens } 274fa9e4066Sahrens 275fa9e4066Sahrens zio_t * 276fa9e4066Sahrens vdev_queue_io(zio_t *zio) 277fa9e4066Sahrens { 278fa9e4066Sahrens vdev_queue_t *vq = &zio->io_vd->vdev_queue; 279fa9e4066Sahrens zio_t *nio; 280fa9e4066Sahrens zio_issue_func_t *func; 281fa9e4066Sahrens 282fa9e4066Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 283fa9e4066Sahrens 284fa9e4066Sahrens if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) 285fa9e4066Sahrens return (zio); 286fa9e4066Sahrens 287fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; 288fa9e4066Sahrens 289fa9e4066Sahrens if (zio->io_type == ZIO_TYPE_READ) 290fa9e4066Sahrens zio->io_vdev_tree = &vq->vq_read_tree; 291fa9e4066Sahrens else 292fa9e4066Sahrens zio->io_vdev_tree = &vq->vq_write_tree; 293fa9e4066Sahrens 294fa9e4066Sahrens mutex_enter(&vq->vq_lock); 295fa9e4066Sahrens 296*614409b5Sahrens zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + 297fa9e4066Sahrens zio->io_priority; 298fa9e4066Sahrens 299ea8dc4b6Seschrock vdev_queue_io_add(vq, zio); 300fa9e4066Sahrens 301*614409b5Sahrens nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func); 302fa9e4066Sahrens 303fa9e4066Sahrens mutex_exit(&vq->vq_lock); 304fa9e4066Sahrens 305fa9e4066Sahrens if (nio == NULL || func != zio_nowait) 306fa9e4066Sahrens return (nio); 307fa9e4066Sahrens 308fa9e4066Sahrens func(nio); 309fa9e4066Sahrens return (NULL); 310fa9e4066Sahrens } 311fa9e4066Sahrens 312fa9e4066Sahrens void 313fa9e4066Sahrens vdev_queue_io_done(zio_t *zio) 314fa9e4066Sahrens { 315fa9e4066Sahrens vdev_queue_t *vq = &zio->io_vd->vdev_queue; 316fa9e4066Sahrens zio_t *nio; 317fa9e4066Sahrens zio_issue_func_t *func; 318fa9e4066Sahrens int i; 319fa9e4066Sahrens 320fa9e4066Sahrens mutex_enter(&vq->vq_lock); 321fa9e4066Sahrens 322fa9e4066Sahrens avl_remove(&vq->vq_pending_tree, zio); 323fa9e4066Sahrens 324*614409b5Sahrens for (i = 0; i < zfs_vdev_ramp_rate; i++) { 325*614409b5Sahrens nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func); 326fa9e4066Sahrens if (nio == NULL) 327fa9e4066Sahrens break; 328fa9e4066Sahrens mutex_exit(&vq->vq_lock); 329fa9e4066Sahrens if (func == zio_next_stage) 330fa9e4066Sahrens zio_vdev_io_reissue(nio); 331fa9e4066Sahrens func(nio); 332fa9e4066Sahrens mutex_enter(&vq->vq_lock); 333fa9e4066Sahrens } 334fa9e4066Sahrens 335fa9e4066Sahrens mutex_exit(&vq->vq_lock); 336fa9e4066Sahrens } 337