/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014 by Delphix. All rights reserved.
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/scsi/scsi.h>
#include <sys/scsi/impl/scsi_reset_notify.h>
#include <sys/scsi/generic/mode.h>
#include <sys/disp.h>
#include <sys/byteorder.h>
#include <sys/atomic.h>
#include <sys/sdt.h>
#include <sys/dkio.h>
#include <sys/dmu.h>
#include <sys/arc.h>
#include <sys/zvol.h>
#include <sys/zfs_rlock.h>

#include <sys/stmf.h>
#include <sys/lpif.h>
#include <sys/portif.h>
#include <sys/stmf_ioctl.h>
#include <sys/stmf_sbd_ioctl.h>

#include "stmf_sbd.h"
#include "sbd_impl.h"


/*
 * This file contains direct calls into the zfs module.
 * These functions mimic zvol_read and zvol_write, except that pointers
 * to the data buffers are passed instead of the data itself being copied.
 *
 * zfs internal interfaces referenced here:
 *
 * FUNCTIONS
 *	dmu_buf_hold_array_by_dnode()
 *	dmu_buf_rele_array()
 *
 *	arc_loan_buf()
 *	dmu_assign_arcbuf_dnode()
 *	dmu_return_arcbuf()
 *	arc_buf_size()
 *
 *	dmu_tx_create()
 *	dmu_tx_hold_write()
 *	dmu_tx_assign()
 *	dmu_tx_commit()
 *	dmu_tx_abort()
 *	zil_commit()
 *
 *	zfs_range_lock()
 *	zfs_range_unlock()
 *
 *	zvol_log_write_minor()
 *
 *	dmu_read_uio_dnode()
 *	dmu_write_uio_dnode()
 * MINOR DATA
 *	zv_volsize
 *	zv_volblocksize
 *	zv_flags - for WCE
 *	zv_objset - dmu_tx_create
 *	zv_zilog - zil_commit
 *	zv_znode - zfs_range_lock
 *	zv_dn - dmu_buf_hold_array_by_dnode, dmu_assign_arcbuf_dnode
 * GLOBAL DATA
 *	zvol_maxphys
 */
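/*
 * Illustrative sketch (not part of the build): how a caller might drive
 * the zero-copy read path defined below. The task handle and the
 * transport_xfer_data() step are hypothetical placeholders; only the
 * sbd_zvol_*() calls are defined in this file.
 *
 *	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
 *
 *	zvio->zvio_offset = byte_offset;	// caller-chosen LU offset
 *	if (sbd_zvol_alloc_read_bufs(sl, dbuf) == 0) {
 *		(void) transport_xfer_data(task, dbuf);	// hypothetical
 *		sbd_zvol_rele_read_bufs(sl, dbuf);
 *	}
 */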
/*
 * Take direct control of the volume instead of using the driver
 * interfaces provided by zvol.c. Gather parameters and handles
 * needed to make direct calls into zfs/dmu/zvol. The driver is
 * opened exclusively at this point, so these parameters cannot change.
 *
 * NOTE: the object size and WCE can change while the device
 * is open, so they must be fetched for every operation.
 */
int
sbd_zvol_get_volume_params(sbd_lu_t *sl)
{
	int ret;

	ret = zvol_get_volume_params(sl->sl_zvol_minor,
	    &sl->sl_blksize,		/* volume block size */
	    &sl->sl_max_xfer_len,	/* max data chunk size */
	    &sl->sl_zvol_minor_hdl,	/* minor soft state */
	    &sl->sl_zvol_objset_hdl,	/* dmu_tx_create */
	    &sl->sl_zvol_zil_hdl,	/* zil_commit */
	    &sl->sl_zvol_rl_hdl,	/* zfs_range_lock */
	    &sl->sl_zvol_dn_hdl);	/* dmu_buf_hold_array_by_dnode, */
					/* dmu_assign_arcbuf_dnode */

	if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) {
		cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to "
		    "small zvol blocksize (%d)\n", (int)sl->sl_blksize);
		ret = ENOTSUP;
	}

	return (ret);
}

/*
 * Return the number of elements in a scatter/gather list required for
 * the given span in the zvol. Elements are 1:1 with zvol blocks.
 */
uint32_t
sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len)
{
	uint64_t blksz = sl->sl_blksize;
	uint64_t endoff = off + len;
	uint64_t numsegs;

	numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz;
	return ((uint32_t)numsegs);
}
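/*
 * Worked example (illustrative, assuming an 8K volblocksize): a span at
 * off = 4096 with len = 8192 ends at 12288. P2ROUNDUP(12288, 8192) is
 * 16384 and P2ALIGN(4096, 8192) is 0, so (16384 - 0) / 8192 = 2
 * segments even though len equals one block: the 4096-byte tail of
 * block 0 plus the 4096-byte head of block 1.
 */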
/*
 * Return an array of dmu_buf_t pointers for the requested range.
 * The dmu buffers are either in cache or read in synchronously.
 * Fill in the dbuf sglist from the dmu_buf_t array.
 */
static void *RDTAG = "sbd_zvol_read";

int
sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	rl_t *rl;
	int numbufs, error;
	uint64_t len = dbuf->db_data_size;
	uint64_t offset = zvio->zvio_offset;
	dmu_buf_t **dbpp, *dbp;

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	/*
	 * The range lock is only held until the dmu buffers are read in
	 * and held; not during the caller's use of the data.
	 */
	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);

	error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl,
	    offset, len, TRUE, RDTAG, &numbufs, &dbpp,
	    DMU_READ_PREFETCH);

	zfs_range_unlock(rl);

	if (error == ECKSUM)
		error = EIO;

	if (error == 0) {
		/*
		 * Fill in db_sglist from the dmu_buf_t array.
		 */
		int i;
		stmf_sglist_ent_t *sgl;
		uint64_t odiff, seglen;

		zvio->zvio_dbp = dbpp;
		/* make sure db_sglist is large enough */
		if (dbuf->db_sglist_length != numbufs) {
			cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
			    dbuf->db_sglist_length, numbufs);
		}

		sgl = &dbuf->db_sglist[0];
		for (i = 0; i < numbufs; i++) {
			dbp = dbpp[i];
			odiff = offset - dbp->db_offset;
			ASSERT(odiff == 0 || i == 0);
			sgl->seg_addr = (uint8_t *)dbp->db_data + odiff;
			seglen = MIN(len, dbp->db_size - odiff);
			sgl->seg_length = (uint32_t)seglen;
			offset += seglen;
			len -= seglen;
			sgl++;
		}
		ASSERT(len == 0);
	}
	return (error);
}

/*
 * Release a dmu_buf_t array.
 */
/*ARGSUSED*/
void
sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;

	ASSERT(zvio->zvio_dbp);
	ASSERT(dbuf->db_sglist_length);

	dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
}
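/*
 * Illustrative sketch (not part of the build): the intended lifecycle of
 * the loaned-arc-buffer write path defined below. The task handle and
 * the receive_data_into_sglist() step are hypothetical placeholders;
 * only the sbd_zvol_*() calls are defined in this file.
 *
 *	zvio->zvio_offset = byte_offset;
 *	zvio->zvio_flags = ZVIO_COMMIT;
 *	if (sbd_zvol_alloc_write_bufs(sl, dbuf) == 0) {
 *		if (receive_data_into_sglist(task, dbuf) == 0)
 *			(void) sbd_zvol_rele_write_bufs(sl, dbuf);
 *		else
 *			sbd_zvol_rele_write_bufs_abort(sl, dbuf);
 *	}
 */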
/*
 * Allocate enough loaned arc buffers for the requested region.
 * Mimic the handling of the dmu_buf_t array used for reads as closely
 * as possible even though the arc_buf_t's are anonymous until released.
 * The buffers will match the zvol object block sizes and alignments
 * such that a data copy may be avoided when the buffers are assigned.
 */
int
sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	int blkshift, numbufs, i;
	uint64_t blksize;
	arc_buf_t **abp;
	stmf_sglist_ent_t *sgl;
	uint64_t len = dbuf->db_data_size;
	uint64_t offset = zvio->zvio_offset;

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	/*
	 * Break up the request into chunks to match
	 * the volume block size. Only full, aligned
	 * buffers will avoid the data copy in the dmu.
	 */
	/*
	 * calculate how many dbufs are needed
	 */
	blksize = sl->sl_blksize;
	ASSERT(ISP2(blksize));
	blkshift = highbit(blksize - 1);
	/*
	 * taken from dmu_buf_hold_array_by_dnode()
	 */
	numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
	    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	if (dbuf->db_sglist_length != numbufs) {
		cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
		    dbuf->db_sglist_length, numbufs);
	}
	/*
	 * allocate a holder for the needed arc_buf pointers
	 */
	abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
	/*
	 * The write operation uses loaned arc buffers so that
	 * the xfer_data is done outside of a dmu transaction.
	 * These buffers will exactly match the request, unlike
	 * the dmu buffers obtained from the read operation.
	 */
	/*
	 * allocate the arc buffers and fill in the stmf sglist
	 */
	sgl = &dbuf->db_sglist[0];
	for (i = 0; i < numbufs; i++) {
		uint64_t seglen;

		/* first block may not be aligned */
		seglen = P2NPHASE(offset, blksize);
		if (seglen == 0)
			seglen = blksize;
		seglen = MIN(seglen, len);
		abp[i] = arc_loan_buf(dmu_objset_spa(sl->sl_zvol_objset_hdl),
		    B_FALSE, (int)seglen);
		ASSERT(arc_buf_size(abp[i]) == (int)seglen);
		sgl->seg_addr = abp[i]->b_data;
		sgl->seg_length = (uint32_t)seglen;
		sgl++;
		offset += seglen;
		len -= seglen;
	}
	ASSERT(len == 0);

	zvio->zvio_abp = abp;
	return (0);
}
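/*
 * Worked example of the segment sizing above (illustrative, assuming an
 * 8K volblocksize): for offset = 6144 and len = 10240, the first pass
 * gets P2NPHASE(6144, 8192) = 2048, covering the tail of block 0. The
 * second pass is then block aligned, so P2NPHASE() returns 0 and seglen
 * becomes the full 8192-byte block, which is the only kind of buffer
 * that can be assigned in the dmu without a copy. MIN(seglen, len)
 * clamps the last segment when a request ends mid-block.
 */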
/*ARGSUSED*/
void
sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	int i;
	arc_buf_t **abp = zvio->zvio_abp;

	/* free arcbufs */
	for (i = 0; i < dbuf->db_sglist_length; i++)
		dmu_return_arcbuf(*abp++);
	kmem_free(zvio->zvio_abp,
	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
	zvio->zvio_abp = NULL;
}

/*
 * Release the arc_buf_t array allocated above and handle these cases:
 *
 * flags == 0 - create transaction and assign all arc bufs to offsets
 * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
 */
int
sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	dmu_tx_t *tx;
	int sync, i, error;
	rl_t *rl;
	arc_buf_t **abp = zvio->zvio_abp;
	int flags = zvio->zvio_flags;
	uint64_t toffset, offset = zvio->zvio_offset;
	uint64_t resid, len = dbuf->db_data_size;

	ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);

	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);

	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
	error = dmu_tx_assign(tx, TXG_WAIT);

	if (error) {
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
		return (error);
	}

	toffset = offset;
	resid = len;
	for (i = 0; i < dbuf->db_sglist_length; i++) {
		arc_buf_t *abuf;
		int size;

		abuf = abp[i];
		size = arc_buf_size(abuf);
		dmu_assign_arcbuf_dnode(sl->sl_zvol_dn_hdl, toffset, abuf,
		    tx);
		toffset += size;
		resid -= size;
	}
	ASSERT(resid == 0);

	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
	zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
	    (ssize_t)len, sync);
	dmu_tx_commit(tx);
	zfs_range_unlock(rl);
	kmem_free(zvio->zvio_abp,
	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
	zvio->zvio_abp = NULL;
	if (sync && (flags & ZVIO_COMMIT))
		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
	return (0);
}

/*
 * Copy interface for callers using direct zvol access.
 * Very similar to zvol_read, but the uio may have multiple iovec entries.
 */
int
sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
{
	int error;
	rl_t *rl;
	uint64_t len = (uint64_t)uio->uio_resid;
	uint64_t offset = (uint64_t)uio->uio_loffset;

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);

	error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len);

	zfs_range_unlock(rl);
	if (error == ECKSUM)
		error = EIO;
	return (error);
}

/*
 * Copy interface for callers using direct zvol access.
 * Very similar to zvol_write, but the uio may have multiple iovec entries.
 */
int
sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
{
	rl_t *rl;
	dmu_tx_t *tx;
	int error, sync;
	uint64_t len = (uint64_t)uio->uio_resid;
	uint64_t offset = (uint64_t)uio->uio_loffset;

	ASSERT(flags == 0 || flags == ZVIO_COMMIT);

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);

	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);

	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		error = dmu_write_uio_dnode(sl->sl_zvol_dn_hdl, uio, len, tx);
		if (error == 0) {
			zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
			    (ssize_t)len, sync);
		}
		dmu_tx_commit(tx);
	}
	zfs_range_unlock(rl);
	if (sync && (flags & ZVIO_COMMIT))
		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
	if (error == ECKSUM)
		error = EIO;
	return (error);
}
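/*
 * Illustrative sketch (not part of the build): how a caller might set up
 * a multi-iovec uio_t for the copy interfaces above. Field usage follows
 * the standard kernel uio conventions; the two source buffers and their
 * lengths are hypothetical.
 *
 *	iovec_t iov[2];
 *	uio_t uio;
 *
 *	iov[0].iov_base = (caddr_t)buf0;	// hypothetical buffers
 *	iov[0].iov_len = len0;
 *	iov[1].iov_base = (caddr_t)buf1;
 *	iov[1].iov_len = len1;
 *
 *	bzero(&uio, sizeof (uio));
 *	uio.uio_iov = iov;
 *	uio.uio_iovcnt = 2;
 *	uio.uio_loffset = byte_offset;
 *	uio.uio_resid = len0 + len1;
 *	uio.uio_segflg = UIO_SYSSPACE;
 *
 *	error = sbd_zvol_copy_write(sl, &uio, ZVIO_COMMIT);
 */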