/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>

/*
 * Virtual device vector for disk I/O scheduling.
36fa9e4066Sahrens */ 37fa9e4066Sahrens int 38fa9e4066Sahrens vdev_queue_deadline_compare(const void *x1, const void *x2) 39fa9e4066Sahrens { 40fa9e4066Sahrens const zio_t *z1 = x1; 41fa9e4066Sahrens const zio_t *z2 = x2; 42fa9e4066Sahrens 43fa9e4066Sahrens if (z1->io_deadline < z2->io_deadline) 44fa9e4066Sahrens return (-1); 45fa9e4066Sahrens if (z1->io_deadline > z2->io_deadline) 46fa9e4066Sahrens return (1); 47fa9e4066Sahrens 48fa9e4066Sahrens if (z1->io_offset < z2->io_offset) 49fa9e4066Sahrens return (-1); 50fa9e4066Sahrens if (z1->io_offset > z2->io_offset) 51fa9e4066Sahrens return (1); 52fa9e4066Sahrens 53fa9e4066Sahrens if (z1 < z2) 54fa9e4066Sahrens return (-1); 55fa9e4066Sahrens if (z1 > z2) 56fa9e4066Sahrens return (1); 57fa9e4066Sahrens 58fa9e4066Sahrens return (0); 59fa9e4066Sahrens } 60fa9e4066Sahrens 61fa9e4066Sahrens int 62fa9e4066Sahrens vdev_queue_offset_compare(const void *x1, const void *x2) 63fa9e4066Sahrens { 64fa9e4066Sahrens const zio_t *z1 = x1; 65fa9e4066Sahrens const zio_t *z2 = x2; 66fa9e4066Sahrens 67fa9e4066Sahrens if (z1->io_offset < z2->io_offset) 68fa9e4066Sahrens return (-1); 69fa9e4066Sahrens if (z1->io_offset > z2->io_offset) 70fa9e4066Sahrens return (1); 71fa9e4066Sahrens 72fa9e4066Sahrens if (z1 < z2) 73fa9e4066Sahrens return (-1); 74fa9e4066Sahrens if (z1 > z2) 75fa9e4066Sahrens return (1); 76fa9e4066Sahrens 77fa9e4066Sahrens return (0); 78fa9e4066Sahrens } 79fa9e4066Sahrens 80fa9e4066Sahrens void 81fa9e4066Sahrens vdev_queue_init(vdev_t *vd) 82fa9e4066Sahrens { 83fa9e4066Sahrens vdev_queue_t *vq = &vd->vdev_queue; 84fa9e4066Sahrens 85fa9e4066Sahrens mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); 86fa9e4066Sahrens 87fa9e4066Sahrens avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, 88fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_deadline_node)); 89fa9e4066Sahrens 90fa9e4066Sahrens avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, 91fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, 
io_offset_node)); 92fa9e4066Sahrens 93fa9e4066Sahrens avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, 94fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 95fa9e4066Sahrens 96fa9e4066Sahrens avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, 97fa9e4066Sahrens sizeof (zio_t), offsetof(struct zio, io_offset_node)); 98fa9e4066Sahrens } 99fa9e4066Sahrens 100fa9e4066Sahrens void 101fa9e4066Sahrens vdev_queue_fini(vdev_t *vd) 102fa9e4066Sahrens { 103fa9e4066Sahrens vdev_queue_t *vq = &vd->vdev_queue; 104fa9e4066Sahrens 105*ea8dc4b6Seschrock ASSERT(vq->vq_scrub_count == 0); 106*ea8dc4b6Seschrock 107fa9e4066Sahrens avl_destroy(&vq->vq_deadline_tree); 108fa9e4066Sahrens avl_destroy(&vq->vq_read_tree); 109fa9e4066Sahrens avl_destroy(&vq->vq_write_tree); 110fa9e4066Sahrens avl_destroy(&vq->vq_pending_tree); 111fa9e4066Sahrens 112fa9e4066Sahrens mutex_destroy(&vq->vq_lock); 113fa9e4066Sahrens } 114fa9e4066Sahrens 115*ea8dc4b6Seschrock static void 116*ea8dc4b6Seschrock vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) 117*ea8dc4b6Seschrock { 118*ea8dc4b6Seschrock avl_add(&vq->vq_deadline_tree, zio); 119*ea8dc4b6Seschrock avl_add(zio->io_vdev_tree, zio); 120*ea8dc4b6Seschrock 121*ea8dc4b6Seschrock if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && 122*ea8dc4b6Seschrock ++vq->vq_scrub_count >= vq->vq_scrub_limit) 123*ea8dc4b6Seschrock spa_scrub_throttle(zio->io_spa, 1); 124*ea8dc4b6Seschrock } 125*ea8dc4b6Seschrock 126*ea8dc4b6Seschrock static void 127*ea8dc4b6Seschrock vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) 128*ea8dc4b6Seschrock { 129*ea8dc4b6Seschrock if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && 130*ea8dc4b6Seschrock vq->vq_scrub_count-- >= vq->vq_scrub_limit) 131*ea8dc4b6Seschrock spa_scrub_throttle(zio->io_spa, -1); 132*ea8dc4b6Seschrock 133*ea8dc4b6Seschrock avl_remove(&vq->vq_deadline_tree, zio); 134*ea8dc4b6Seschrock avl_remove(zio->io_vdev_tree, zio); 135*ea8dc4b6Seschrock } 136*ea8dc4b6Seschrock 
137fa9e4066Sahrens static void 138fa9e4066Sahrens vdev_queue_agg_io_done(zio_t *aio) 139fa9e4066Sahrens { 140fa9e4066Sahrens zio_t *dio; 141fa9e4066Sahrens uint64_t offset = 0; 142fa9e4066Sahrens 143fa9e4066Sahrens while ((dio = aio->io_delegate_list) != NULL) { 144fa9e4066Sahrens if (aio->io_type == ZIO_TYPE_READ) 145fa9e4066Sahrens bcopy((char *)aio->io_data + offset, dio->io_data, 146fa9e4066Sahrens dio->io_size); 147fa9e4066Sahrens offset += dio->io_size; 148fa9e4066Sahrens aio->io_delegate_list = dio->io_delegate_next; 149fa9e4066Sahrens dio->io_delegate_next = NULL; 150fa9e4066Sahrens dio->io_error = aio->io_error; 151fa9e4066Sahrens zio_next_stage(dio); 152fa9e4066Sahrens } 153fa9e4066Sahrens ASSERT3U(offset, ==, aio->io_size); 154fa9e4066Sahrens 155fa9e4066Sahrens zio_buf_free(aio->io_data, aio->io_size); 156fa9e4066Sahrens } 157fa9e4066Sahrens 158fa9e4066Sahrens #define IS_ADJACENT(io, nio) \ 159fa9e4066Sahrens ((io)->io_offset + (io)->io_size == (nio)->io_offset) 160fa9e4066Sahrens 161fa9e4066Sahrens typedef void zio_issue_func_t(zio_t *); 162fa9e4066Sahrens 163fa9e4066Sahrens static zio_t * 164fa9e4066Sahrens vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, 165fa9e4066Sahrens zio_issue_func_t **funcp) 166fa9e4066Sahrens { 167fa9e4066Sahrens zio_t *fio, *lio, *aio, *dio; 168fa9e4066Sahrens avl_tree_t *tree; 169fa9e4066Sahrens uint64_t size; 170fa9e4066Sahrens 171fa9e4066Sahrens ASSERT(MUTEX_HELD(&vq->vq_lock)); 172fa9e4066Sahrens 173fa9e4066Sahrens *funcp = NULL; 174fa9e4066Sahrens 175fa9e4066Sahrens if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || 176fa9e4066Sahrens avl_numnodes(&vq->vq_deadline_tree) == 0) 177fa9e4066Sahrens return (NULL); 178fa9e4066Sahrens 179fa9e4066Sahrens fio = lio = avl_first(&vq->vq_deadline_tree); 180fa9e4066Sahrens 181fa9e4066Sahrens tree = fio->io_vdev_tree; 182fa9e4066Sahrens size = fio->io_size; 183fa9e4066Sahrens 184fa9e4066Sahrens while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) 
&& 185fa9e4066Sahrens size + dio->io_size <= vq->vq_agg_limit) { 186fa9e4066Sahrens dio->io_delegate_next = fio; 187fa9e4066Sahrens fio = dio; 188fa9e4066Sahrens size += dio->io_size; 189fa9e4066Sahrens } 190fa9e4066Sahrens 191fa9e4066Sahrens while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && 192fa9e4066Sahrens size + dio->io_size <= vq->vq_agg_limit) { 193fa9e4066Sahrens lio->io_delegate_next = dio; 194fa9e4066Sahrens lio = dio; 195fa9e4066Sahrens size += dio->io_size; 196fa9e4066Sahrens } 197fa9e4066Sahrens 198fa9e4066Sahrens if (fio != lio) { 199fa9e4066Sahrens char *buf = zio_buf_alloc(size); 200fa9e4066Sahrens uint64_t offset = 0; 201fa9e4066Sahrens int nagg = 0; 202fa9e4066Sahrens 203fa9e4066Sahrens ASSERT(size <= vq->vq_agg_limit); 204fa9e4066Sahrens 205fa9e4066Sahrens aio = zio_vdev_child_io(fio, NULL, fio->io_vd, 206fa9e4066Sahrens fio->io_offset, buf, size, fio->io_type, 207fa9e4066Sahrens ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | 208*ea8dc4b6Seschrock ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | 209*ea8dc4b6Seschrock ZIO_FLAG_NOBOOKMARK, 210fa9e4066Sahrens vdev_queue_agg_io_done, NULL); 211fa9e4066Sahrens 212fa9e4066Sahrens aio->io_delegate_list = fio; 213fa9e4066Sahrens 214fa9e4066Sahrens for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { 215fa9e4066Sahrens ASSERT(dio->io_type == aio->io_type); 216*ea8dc4b6Seschrock ASSERT(dio->io_vdev_tree == tree); 217fa9e4066Sahrens if (dio->io_type == ZIO_TYPE_WRITE) 218fa9e4066Sahrens bcopy(dio->io_data, buf + offset, dio->io_size); 219fa9e4066Sahrens offset += dio->io_size; 220*ea8dc4b6Seschrock vdev_queue_io_remove(vq, dio); 221fa9e4066Sahrens zio_vdev_io_bypass(dio); 222fa9e4066Sahrens nagg++; 223fa9e4066Sahrens } 224fa9e4066Sahrens 225fa9e4066Sahrens ASSERT(offset == size); 226fa9e4066Sahrens 227fa9e4066Sahrens dprintf("%5s T=%llu off=%8llx agg=%3d " 228fa9e4066Sahrens "old=%5llx new=%5llx\n", 229fa9e4066Sahrens zio_type_name[fio->io_type], 230fa9e4066Sahrens fio->io_deadline, 
fio->io_offset, nagg, fio->io_size, size); 231fa9e4066Sahrens 232fa9e4066Sahrens avl_add(&vq->vq_pending_tree, aio); 233fa9e4066Sahrens 234fa9e4066Sahrens *funcp = zio_nowait; 235fa9e4066Sahrens return (aio); 236fa9e4066Sahrens } 237fa9e4066Sahrens 238*ea8dc4b6Seschrock ASSERT(fio->io_vdev_tree == tree); 239*ea8dc4b6Seschrock vdev_queue_io_remove(vq, fio); 240fa9e4066Sahrens 241fa9e4066Sahrens avl_add(&vq->vq_pending_tree, fio); 242fa9e4066Sahrens 243fa9e4066Sahrens *funcp = zio_next_stage; 244fa9e4066Sahrens 245fa9e4066Sahrens return (fio); 246fa9e4066Sahrens } 247fa9e4066Sahrens 248fa9e4066Sahrens zio_t * 249fa9e4066Sahrens vdev_queue_io(zio_t *zio) 250fa9e4066Sahrens { 251fa9e4066Sahrens vdev_queue_t *vq = &zio->io_vd->vdev_queue; 252fa9e4066Sahrens zio_t *nio; 253fa9e4066Sahrens zio_issue_func_t *func; 254fa9e4066Sahrens 255fa9e4066Sahrens ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 256fa9e4066Sahrens 257fa9e4066Sahrens if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) 258fa9e4066Sahrens return (zio); 259fa9e4066Sahrens 260fa9e4066Sahrens zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; 261fa9e4066Sahrens 262fa9e4066Sahrens if (zio->io_type == ZIO_TYPE_READ) 263fa9e4066Sahrens zio->io_vdev_tree = &vq->vq_read_tree; 264fa9e4066Sahrens else 265fa9e4066Sahrens zio->io_vdev_tree = &vq->vq_write_tree; 266fa9e4066Sahrens 267fa9e4066Sahrens mutex_enter(&vq->vq_lock); 268fa9e4066Sahrens 269fa9e4066Sahrens zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) + 270fa9e4066Sahrens zio->io_priority; 271fa9e4066Sahrens 272*ea8dc4b6Seschrock vdev_queue_io_add(vq, zio); 273fa9e4066Sahrens 274fa9e4066Sahrens nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func); 275fa9e4066Sahrens 276fa9e4066Sahrens mutex_exit(&vq->vq_lock); 277fa9e4066Sahrens 278fa9e4066Sahrens if (nio == NULL || func != zio_nowait) 279fa9e4066Sahrens return (nio); 280fa9e4066Sahrens 281fa9e4066Sahrens func(nio); 282fa9e4066Sahrens return (NULL); 283fa9e4066Sahrens 
} 284fa9e4066Sahrens 285fa9e4066Sahrens void 286fa9e4066Sahrens vdev_queue_io_done(zio_t *zio) 287fa9e4066Sahrens { 288fa9e4066Sahrens vdev_queue_t *vq = &zio->io_vd->vdev_queue; 289fa9e4066Sahrens zio_t *nio; 290fa9e4066Sahrens zio_issue_func_t *func; 291fa9e4066Sahrens int i; 292fa9e4066Sahrens 293fa9e4066Sahrens mutex_enter(&vq->vq_lock); 294fa9e4066Sahrens 295fa9e4066Sahrens avl_remove(&vq->vq_pending_tree, zio); 296fa9e4066Sahrens 297fa9e4066Sahrens for (i = 0; i < vq->vq_ramp_rate; i++) { 298fa9e4066Sahrens nio = vdev_queue_io_to_issue(vq, vq->vq_max_pending, &func); 299fa9e4066Sahrens if (nio == NULL) 300fa9e4066Sahrens break; 301fa9e4066Sahrens mutex_exit(&vq->vq_lock); 302fa9e4066Sahrens if (func == zio_next_stage) 303fa9e4066Sahrens zio_vdev_io_reissue(nio); 304fa9e4066Sahrens func(nio); 305fa9e4066Sahrens mutex_enter(&vq->vq_lock); 306fa9e4066Sahrens } 307fa9e4066Sahrens 308fa9e4066Sahrens mutex_exit(&vq->vq_lock); 309fa9e4066Sahrens } 310