1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2014 by Delphix. All rights reserved.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/file.h>
28 #include <sys/ddi.h>
29 #include <sys/sunddi.h>
30 #include <sys/modctl.h>
31 #include <sys/scsi/scsi.h>
32 #include <sys/scsi/impl/scsi_reset_notify.h>
33 #include <sys/scsi/generic/mode.h>
34 #include <sys/disp.h>
35 #include <sys/byteorder.h>
36 #include <sys/atomic.h>
37 #include <sys/sdt.h>
38 #include <sys/dkio.h>
39 #include <sys/dmu.h>
40 #include <sys/arc.h>
41 #include <sys/zvol.h>
42 #include <sys/zfs_rlock.h>
43 
44 #include <sys/stmf.h>
45 #include <sys/lpif.h>
46 #include <sys/portif.h>
47 #include <sys/stmf_ioctl.h>
48 #include <sys/stmf_sbd_ioctl.h>
49 
50 #include "stmf_sbd.h"
51 #include "sbd_impl.h"
52 
53 
54 /*
55  * This file contains direct calls into the zfs module.
56  * These functions mimic zvol_read and zvol_write except pointers
57  * to the data buffers are passed instead of copying the data itself.
58  *
59  * zfs internal interfaces referenced here:
60  *
61  * FUNCTIONS
62  *    dmu_buf_hold_array_by_dnode()
63  *    dmu_buf_rele_array()
64  *
65  *    arc_loan_buf()
 *    dmu_assign_arcbuf_dnode()
67  *    dmu_return_arcbuf()
68  *    arc_buf_size()
69  *
70  *    dmu_tx_create()
71  *    dmu_tx_hold_write()
72  *    dmu_tx_assign()
73  *    dmu_tx_commit(tx)
74  *    dmu_tx_abort(tx)
75  *    zil_commit()
76  *
77  *    zfs_range_lock()
78  *    zfs_range_unlock()
79  *
80  *    zvol_log_write()
81  *
 *    dmu_read_uio_dnode()
 *    dmu_write_uio_dnode()
84  * MINOR DATA
85  *    zv_volsize
86  *    zv_volblocksize
87  *    zv_flags		- for WCE
88  *    zv_objset		- dmu_tx_create
89  *    zv_zilog		- zil_commit
90  *    zv_znode		- zfs_range_lock
 *    zv_dn		- dmu_buf_hold_array_by_dnode, dmu_request_arcbuf
92  * GLOBAL DATA
93  *    zvol_maxphys
94  */
95 
96 /*
97  * Take direct control of the volume instead of using the driver
98  * interfaces provided by zvol.c. Gather parameters and handles
99  * needed to make direct calls into zfs/dmu/zvol. The driver is
100  * opened exclusively at this point, so these parameters cannot change.
101  *
102  * NOTE: the object size and WCE can change while the device
103  * is open, so they must be fetched for every operation.
104  */
105 int
106 sbd_zvol_get_volume_params(sbd_lu_t *sl)
107 {
108 	int ret;
109 
110 	ret = zvol_get_volume_params(sl->sl_zvol_minor,
111 	    &sl->sl_blksize,		/* volume block size */
112 	    &sl->sl_max_xfer_len,	/* max data chunk size */
113 	    &sl->sl_zvol_minor_hdl,	/* minor soft state */
114 	    &sl->sl_zvol_objset_hdl,	/* dmu_tx_create */
115 	    &sl->sl_zvol_zil_hdl,	/* zil_commit */
116 	    &sl->sl_zvol_rl_hdl,	/* zfs_range_lock */
117 	    &sl->sl_zvol_dn_hdl);	/* dmu_buf_hold_array_by_dnode, */
118 					/* dmu_request_arcbuf, */
119 					/* dmu_assign_arcbuf */
120 
121 	if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) {
122 		cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to "
123 		    "small zvol blocksize (%d)\n", (int)sl->sl_blksize);
124 		ret = ENOTSUP;
125 	}
126 
127 	return (ret);
128 }
129 
130 /*
131  * Return the number of elements in a scatter/gather list required for
132  * the given span in the zvol. Elements are 1:1 with zvol blocks.
133  */
134 uint32_t
135 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len)
136 {
137 	uint64_t blksz = sl->sl_blksize;
138 	uint64_t endoff = off + len;
139 	uint64_t numsegs;
140 
141 	numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz;
142 	return ((uint32_t)numsegs);
143 }
144 
145 /*
146  * Return an array of dmu_buf_t pointers for the requested range.
147  * The dmu buffers are either in cache or read in synchronously.
148  * Fill in the dbuf sglist from the dmu_buf_t array.
149  */
150 static void *RDTAG = "sbd_zvol_read";
151 
152 int
153 sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
154 {
155 	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
156 	rl_t		*rl;
157 	int		numbufs, error;
158 	uint64_t	len = dbuf->db_data_size;
159 	uint64_t	offset = zvio->zvio_offset;
160 	dmu_buf_t	**dbpp, *dbp;
161 
162 	/* Make sure request is reasonable */
163 	if (len > sl->sl_max_xfer_len)
164 		return (E2BIG);
165 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
166 		return (EIO);
167 
168 	/*
169 	 * The range lock is only held until the dmu buffers read in and
170 	 * held; not during the callers use of the data.
171 	 */
172 	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
173 
174 	error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl,
175 	    offset, len, TRUE, RDTAG, &numbufs, &dbpp,
176 	    DMU_READ_PREFETCH);
177 
178 	zfs_range_unlock(rl);
179 
180 	if (error == ECKSUM)
181 		error = EIO;
182 
183 	if (error == 0) {
184 		/*
185 		 * Fill in db_sglist from the dmu_buf_t array.
186 		 */
187 		int		i;
188 		stmf_sglist_ent_t *sgl;
189 		uint64_t	odiff, seglen;
190 
191 		zvio->zvio_dbp = dbpp;
192 		/* make sure db_sglist is large enough */
193 		if (dbuf->db_sglist_length != numbufs) {
194 			cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
195 			    dbuf->db_sglist_length, numbufs);
196 		}
197 
198 		sgl = &dbuf->db_sglist[0];
199 		for (i = 0; i < numbufs; i++) {
200 			dbp = dbpp[i];
201 			odiff =  offset - dbp->db_offset;
202 			ASSERT(odiff == 0 || i == 0);
203 			sgl->seg_addr = (uint8_t *)dbp->db_data + odiff;
204 			seglen = MIN(len, dbp->db_size - odiff);
205 			sgl->seg_length = (uint32_t)seglen;
206 			offset += seglen;
207 			len -= seglen;
208 			sgl++;
209 		}
210 		ASSERT(len == 0);
211 
212 	}
213 	return (error);
214 }
215 
216 /*
217  * Release a dmu_buf_t array.
218  */
219 /*ARGSUSED*/
220 void
221 sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
222 {
223 	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
224 
225 	ASSERT(zvio->zvio_dbp);
226 	ASSERT(dbuf->db_sglist_length);
227 
228 	dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
229 }
230 
231 /*
232  * Allocate enough loaned arc buffers for the requested region.
233  * Mimic the handling of the dmu_buf_t array used for reads as closely
234  * as possible even though the arc_buf_t's are anonymous until released.
235  * The buffers will match the zvol object blocks sizes and alignments
236  * such that a data copy may be avoided when the buffers are assigned.
237  */
238 int
239 sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
240 {
241 	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
242 	int		blkshift, numbufs, i;
243 	uint64_t	blksize;
244 	arc_buf_t	**abp;
245 	stmf_sglist_ent_t *sgl;
246 	uint64_t	len = dbuf->db_data_size;
247 	uint64_t	offset = zvio->zvio_offset;
248 
249 	/* Make sure request is reasonable */
250 	if (len > sl->sl_max_xfer_len)
251 		return (E2BIG);
252 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
253 		return (EIO);
254 
255 	/*
256 	 * Break up the request into chunks to match
257 	 * the volume block size. Only full, and aligned
258 	 * buffers will avoid the data copy in the dmu.
259 	 */
260 	/*
261 	 * calculate how may dbufs are needed
262 	 */
263 	blksize = sl->sl_blksize;
264 	ASSERT(ISP2(blksize));
265 	blkshift = highbit(blksize - 1);
266 	/*
267 	 * taken from dmu_buf_hold_array_by_dnode()
268 	 */
269 	numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
270 	    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
271 	if (dbuf->db_sglist_length != numbufs) {
272 		cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
273 		    dbuf->db_sglist_length, numbufs);
274 	}
275 	/*
276 	 * allocate a holder for the needed arc_buf pointers
277 	 */
278 	abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
279 	/*
280 	 * The write operation uses loaned arc buffers so that
281 	 * the xfer_data is done outside of a dmu transaction.
282 	 * These buffers will exactly match the request unlike
283 	 * the dmu buffers obtained from the read operation.
284 	 */
285 	/*
286 	 * allocate the arc buffers and fill in the stmf sglist
287 	 */
288 	sgl = &dbuf->db_sglist[0];
289 	for (i = 0; i < numbufs; i++) {
290 		uint64_t seglen;
291 
292 		/* first block may not be aligned */
293 		seglen = P2NPHASE(offset, blksize);
294 		if (seglen == 0)
295 			seglen = blksize;
296 		seglen = MIN(seglen, len);
297 		abp[i] = arc_loan_buf(dmu_objset_spa(sl->sl_zvol_objset_hdl),
298 		    B_FALSE, (int)seglen);
299 		ASSERT(arc_buf_size(abp[i]) == (int)seglen);
300 		sgl->seg_addr = abp[i]->b_data;
301 		sgl->seg_length = (uint32_t)seglen;
302 		sgl++;
303 		offset += seglen;
304 		len -= seglen;
305 	}
306 	ASSERT(len == 0);
307 
308 	zvio->zvio_abp = abp;
309 	return (0);
310 }
311 
312 /*ARGSUSED*/
313 void
314 sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
315 {
316 	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
317 	int i;
318 	arc_buf_t **abp = zvio->zvio_abp;
319 
320 	/* free arcbufs */
321 	for (i = 0; i < dbuf->db_sglist_length; i++)
322 		dmu_return_arcbuf(*abp++);
323 	kmem_free(zvio->zvio_abp,
324 	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
325 	zvio->zvio_abp = NULL;
326 }
327 
328 /*
329  * Release the arc_buf_t array allocated above and handle these cases :
330  *
331  * flags == 0 - create transaction and assign all arc bufs to offsets
332  * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
333  */
int
sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
	dmu_tx_t	*tx;
	int		sync, i, error;
	rl_t		*rl;
	arc_buf_t	**abp = zvio->zvio_abp;
	int		flags = zvio->zvio_flags;
	uint64_t	toffset, offset = zvio->zvio_offset;
	uint64_t	resid, len = dbuf->db_data_size;

	/*
	 * NOTE(review): ZVIO_ABORT passes this ASSERT but the function
	 * still commits; callers appear to use
	 * sbd_zvol_rele_write_bufs_abort() for the abort path — confirm.
	 */
	ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);

	/* Hold the range as writer for the duration of the transaction. */
	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);

	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
	error = dmu_tx_assign(tx, TXG_WAIT);

	if (error) {
		/* No txg available: return the loaned arc bufs instead. */
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
		return (error);
	}

	/* Assign each loaned arc buffer to its offset within the zvol. */
	toffset = offset;
	resid = len;
	for (i = 0; i < dbuf->db_sglist_length; i++) {
		arc_buf_t *abuf;
		int size;

		abuf = abp[i];
		size = arc_buf_size(abuf);
		dmu_assign_arcbuf_dnode(sl->sl_zvol_dn_hdl, toffset, abuf,
		    tx);
		toffset += size;
		resid -= size;
	}
	ASSERT(resid == 0);

	/* Log the write; 'sync' when the volume write cache is disabled. */
	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
	zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
	    (ssize_t)len, sync);
	dmu_tx_commit(tx);
	zfs_range_unlock(rl);
	/* The arc bufs now belong to the dmu; free only the holder array. */
	kmem_free(zvio->zvio_abp,
	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
	zvio->zvio_abp = NULL;
	if (sync && (flags & ZVIO_COMMIT))
		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
	return (0);
}
388 
389 /*
390  * Copy interface for callers using direct zvol access.
391  * Very similar to zvol_read but the uio may have multiple iovec entries.
392  */
393 int
394 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
395 {
396 	int		error;
397 	rl_t		*rl;
398 	uint64_t	len = (uint64_t)uio->uio_resid;
399 	uint64_t	offset = (uint64_t)uio->uio_loffset;
400 
401 	/* Make sure request is reasonable */
402 	if (len > sl->sl_max_xfer_len)
403 		return (E2BIG);
404 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
405 		return (EIO);
406 
407 	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
408 
409 	error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len);
410 
411 	zfs_range_unlock(rl);
412 	if (error == ECKSUM)
413 		error = EIO;
414 	return (error);
415 }
416 
417 /*
418  * Copy interface for callers using direct zvol access.
419  * Very similar to zvol_write but the uio may have multiple iovec entries.
420  */
421 int
422 sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
423 {
424 	rl_t		*rl;
425 	dmu_tx_t	*tx;
426 	int		error, sync;
427 	uint64_t	len = (uint64_t)uio->uio_resid;
428 	uint64_t	offset = (uint64_t)uio->uio_loffset;
429 
430 	ASSERT(flags == 0 || flags == ZVIO_COMMIT);
431 
432 	/* Make sure request is reasonable */
433 	if (len > sl->sl_max_xfer_len)
434 		return (E2BIG);
435 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
436 		return (EIO);
437 
438 	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
439 
440 	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
441 
442 	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
443 	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid);
444 	error = dmu_tx_assign(tx, TXG_WAIT);
445 	if (error) {
446 		dmu_tx_abort(tx);
447 	} else {
448 		error = dmu_write_uio_dnode(sl->sl_zvol_dn_hdl, uio, len, tx);
449 		if (error == 0) {
450 			zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
451 			    (ssize_t)len, sync);
452 		}
453 		dmu_tx_commit(tx);
454 	}
455 	zfs_range_unlock(rl);
456 	if (sync && (flags & ZVIO_COMMIT))
457 		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
458 	if (error == ECKSUM)
459 		error = EIO;
460 	return (error);
461 }
462