1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/file.h>
28 #include <sys/ddi.h>
29 #include <sys/sunddi.h>
30 #include <sys/modctl.h>
31 #include <sys/scsi/scsi.h>
32 #include <sys/scsi/impl/scsi_reset_notify.h>
33 #include <sys/scsi/generic/mode.h>
34 #include <sys/disp.h>
35 #include <sys/byteorder.h>
36 #include <sys/atomic.h>
37 #include <sys/sdt.h>
38 #include <sys/dkio.h>
39 #include <sys/dmu.h>
40 #include <sys/arc.h>
41 #include <sys/zvol.h>
42 #include <sys/zfs_rlock.h>
43 #include <sys/zil.h>
44 
45 #include <sys/stmf.h>
46 #include <sys/lpif.h>
47 #include <sys/portif.h>
48 #include <sys/stmf_ioctl.h>
49 #include <sys/stmf_sbd_ioctl.h>
50 
51 #include "stmf_sbd.h"
52 #include "sbd_impl.h"
53 
54 
55 /*
56  * This file contains direct calls into the zfs module.
57  * These functions mimic zvol_read and zvol_write except pointers
58  * to the data buffers are passed instead of copying the data itself.
59  *
60  * zfs internal interfaces referenced here:
61  *
62  * FUNCTIONS
63  *    dmu_buf_hold_array_by_dnode()
64  *    dmu_buf_rele_array()
65  *
66  *    arc_loan_buf()
67  *    dmu_assign_arcbuf()
68  *    dmu_return_arcbuf()
69  *    arc_buf_size()
70  *
71  *    dmu_tx_create()
72  *    dmu_tx_hold_write()
73  *    dmu_tx_assign()
74  *    dmu_tx_commit(tx)
75  *    dmu_tx_abort(tx)
76  *    zil_commit()
77  *
78  *    rangelock_enter()
79  *    rangelock_exit()
80  *
81  *    zvol_log_write()
82  *
83  *    dmu_read_uio()
84  *    dmu_write_uio()
85  * MINOR DATA
86  *    zv_volsize
87  *    zv_volblocksize
88  *    zv_flags		- for WCE
89  *    zv_objset		- dmu_tx_create
90  *    zv_zilog		- zil_commit
91  *    zv_znode		- rangelock_enter
 *    zv_dn		- dmu_buf_hold_array_by_dnode, dmu_request_arcbuf
93  * GLOBAL DATA
94  *    zvol_maxphys
95  */
96 
97 /*
98  * Take direct control of the volume instead of using the driver
99  * interfaces provided by zvol.c. Gather parameters and handles
100  * needed to make direct calls into zfs/dmu/zvol. The driver is
101  * opened exclusively at this point, so these parameters cannot change.
102  *
103  * NOTE: the object size and WCE can change while the device
104  * is open, so they must be fetched for every operation.
105  */
106 int
sbd_zvol_get_volume_params(sbd_lu_t * sl)107 sbd_zvol_get_volume_params(sbd_lu_t *sl)
108 {
109 	int ret;
110 
111 	ret = zvol_get_volume_params(sl->sl_zvol_minor,
112 	    &sl->sl_blksize,		/* volume block size */
113 	    &sl->sl_max_xfer_len,	/* max data chunk size */
114 	    &sl->sl_zvol_minor_hdl,	/* minor soft state */
115 	    &sl->sl_zvol_objset_hdl,	/* dmu_tx_create */
116 	    &sl->sl_zvol_zil_hdl,	/* zil_commit */
117 	    &sl->sl_zvol_rl_hdl,	/* locked_range_t */
118 	    &sl->sl_zvol_dn_hdl);	/* dmu_buf_hold_array_by_dnode, */
119 					/* dmu_request_arcbuf, */
120 					/* dmu_assign_arcbuf */
121 
122 	if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) {
123 		cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to "
124 		    "small zvol blocksize (%d)\n", (int)sl->sl_blksize);
125 		ret = ENOTSUP;
126 	}
127 
128 	return (ret);
129 }
130 
131 /*
132  * Return the number of elements in a scatter/gather list required for
133  * the given span in the zvol. Elements are 1:1 with zvol blocks.
134  */
135 uint32_t
sbd_zvol_numsegs(sbd_lu_t * sl,uint64_t off,uint32_t len)136 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len)
137 {
138 	uint64_t blksz = sl->sl_blksize;
139 	uint64_t endoff = off + len;
140 	uint64_t numsegs;
141 
142 	numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz;
143 	return ((uint32_t)numsegs);
144 }
145 
146 /*
147  * Return an array of dmu_buf_t pointers for the requested range.
148  * The dmu buffers are either in cache or read in synchronously.
149  * Fill in the dbuf sglist from the dmu_buf_t array.
150  */
151 static void *RDTAG = "sbd_zvol_read";
152 
153 int
sbd_zvol_alloc_read_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)154 sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
155 {
156 	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
157 	locked_range_t	*lr;
158 	int		numbufs, error;
159 	uint64_t	len = dbuf->db_data_size;
160 	uint64_t	offset = zvio->zvio_offset;
161 	dmu_buf_t	**dbpp, *dbp;
162 
163 	/* Make sure request is reasonable */
164 	if (len > sl->sl_max_xfer_len)
165 		return (E2BIG);
166 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
167 		return (EIO);
168 
169 	/*
170 	 * The range lock is only held until the dmu buffers read in and
171 	 * held; not during the callers use of the data.
172 	 */
173 	lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
174 
175 	error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl,
176 	    offset, len, TRUE, RDTAG, &numbufs, &dbpp,
177 	    DMU_READ_PREFETCH);
178 
179 	rangelock_exit(lr);
180 
181 	if (error == ECKSUM)
182 		error = EIO;
183 
184 	if (error == 0) {
185 		/*
186 		 * Fill in db_sglist from the dmu_buf_t array.
187 		 */
188 		int		i;
189 		stmf_sglist_ent_t *sgl;
190 		uint64_t	odiff, seglen;
191 
192 		zvio->zvio_dbp = dbpp;
193 		/* make sure db_sglist is large enough */
194 		if (dbuf->db_sglist_length != numbufs) {
195 			cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
196 			    dbuf->db_sglist_length, numbufs);
197 		}
198 
199 		sgl = &dbuf->db_sglist[0];
200 		for (i = 0; i < numbufs; i++) {
201 			dbp = dbpp[i];
202 			odiff =  offset - dbp->db_offset;
203 			ASSERT(odiff == 0 || i == 0);
204 			sgl->seg_addr = (uint8_t *)dbp->db_data + odiff;
205 			seglen = MIN(len, dbp->db_size - odiff);
206 			sgl->seg_length = (uint32_t)seglen;
207 			offset += seglen;
208 			len -= seglen;
209 			sgl++;
210 		}
211 		ASSERT(len == 0);
212 
213 	}
214 	return (error);
215 }
216 
217 /*
218  * Release a dmu_buf_t array.
219  */
220 /*ARGSUSED*/
221 void
sbd_zvol_rele_read_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)222 sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
223 {
224 	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
225 
226 	ASSERT(zvio->zvio_dbp);
227 	ASSERT(dbuf->db_sglist_length);
228 
229 	dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
230 }
231 
232 /*
233  * Allocate enough loaned arc buffers for the requested region.
234  * Mimic the handling of the dmu_buf_t array used for reads as closely
235  * as possible even though the arc_buf_t's are anonymous until released.
236  * The buffers will match the zvol object blocks sizes and alignments
237  * such that a data copy may be avoided when the buffers are assigned.
238  */
239 int
sbd_zvol_alloc_write_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)240 sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
241 {
242 	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
243 	int		blkshift, numbufs, i;
244 	uint64_t	blksize;
245 	arc_buf_t	**abp;
246 	stmf_sglist_ent_t *sgl;
247 	uint64_t	len = dbuf->db_data_size;
248 	uint64_t	offset = zvio->zvio_offset;
249 
250 	/* Make sure request is reasonable */
251 	if (len > sl->sl_max_xfer_len)
252 		return (E2BIG);
253 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
254 		return (EIO);
255 
256 	/*
257 	 * Break up the request into chunks to match
258 	 * the volume block size. Only full, and aligned
259 	 * buffers will avoid the data copy in the dmu.
260 	 */
261 	/*
262 	 * calculate how may dbufs are needed
263 	 */
264 	blksize = sl->sl_blksize;
265 	ASSERT(ISP2(blksize));
266 	blkshift = highbit(blksize - 1);
267 	/*
268 	 * taken from dmu_buf_hold_array_by_dnode()
269 	 */
270 	numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
271 	    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
272 	if (dbuf->db_sglist_length != numbufs) {
273 		cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
274 		    dbuf->db_sglist_length, numbufs);
275 	}
276 	/*
277 	 * allocate a holder for the needed arc_buf pointers
278 	 */
279 	abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
280 	/*
281 	 * The write operation uses loaned arc buffers so that
282 	 * the xfer_data is done outside of a dmu transaction.
283 	 * These buffers will exactly match the request unlike
284 	 * the dmu buffers obtained from the read operation.
285 	 */
286 	/*
287 	 * allocate the arc buffers and fill in the stmf sglist
288 	 */
289 	sgl = &dbuf->db_sglist[0];
290 	for (i = 0; i < numbufs; i++) {
291 		uint64_t seglen;
292 
293 		/* first block may not be aligned */
294 		seglen = P2NPHASE(offset, blksize);
295 		if (seglen == 0)
296 			seglen = blksize;
297 		seglen = MIN(seglen, len);
298 		abp[i] = arc_loan_buf(dmu_objset_spa(sl->sl_zvol_objset_hdl),
299 		    B_FALSE, (int)seglen);
300 		ASSERT(arc_buf_size(abp[i]) == (int)seglen);
301 		sgl->seg_addr = abp[i]->b_data;
302 		sgl->seg_length = (uint32_t)seglen;
303 		sgl++;
304 		offset += seglen;
305 		len -= seglen;
306 	}
307 	ASSERT(len == 0);
308 
309 	zvio->zvio_abp = abp;
310 	return (0);
311 }
312 
313 /*ARGSUSED*/
314 void
sbd_zvol_rele_write_bufs_abort(sbd_lu_t * sl,stmf_data_buf_t * dbuf)315 sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
316 {
317 	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
318 	int i;
319 	arc_buf_t **abp = zvio->zvio_abp;
320 
321 	/* free arcbufs */
322 	for (i = 0; i < dbuf->db_sglist_length; i++)
323 		dmu_return_arcbuf(*abp++);
324 	kmem_free(zvio->zvio_abp,
325 	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
326 	zvio->zvio_abp = NULL;
327 }
328 
329 /*
330  * Release the arc_buf_t array allocated above and handle these cases :
331  *
332  * flags == 0 - create transaction and assign all arc bufs to offsets
333  * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
334  */
335 int
sbd_zvol_rele_write_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)336 sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
337 {
338 	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
339 	dmu_tx_t	*tx;
340 	int		sync, i, error;
341 	locked_range_t	*lr;
342 	arc_buf_t	**abp = zvio->zvio_abp;
343 	int		flags = zvio->zvio_flags;
344 	uint64_t	toffset, offset = zvio->zvio_offset;
345 	uint64_t	resid, len = dbuf->db_data_size;
346 
347 	ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);
348 
349 	lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
350 
351 	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
352 	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
353 	error = dmu_tx_assign(tx, TXG_WAIT);
354 
355 	if (error) {
356 		dmu_tx_abort(tx);
357 		rangelock_exit(lr);
358 		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
359 		return (error);
360 	}
361 
362 	toffset = offset;
363 	resid = len;
364 	for (i = 0; i < dbuf->db_sglist_length; i++) {
365 		arc_buf_t *abuf;
366 		int size;
367 
368 		abuf = abp[i];
369 		size = arc_buf_size(abuf);
370 		(void) dmu_assign_arcbuf_by_dnode(sl->sl_zvol_dn_hdl,
371 		    toffset, abuf, tx);
372 		toffset += size;
373 		resid -= size;
374 	}
375 	ASSERT(resid == 0);
376 
377 	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
378 	zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
379 	    (ssize_t)len, sync);
380 	dmu_tx_commit(tx);
381 	rangelock_exit(lr);
382 	kmem_free(zvio->zvio_abp,
383 	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
384 	zvio->zvio_abp = NULL;
385 	if (sync && (flags & ZVIO_COMMIT))
386 		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
387 	return (0);
388 }
389 
390 /*
391  * Copy interface for callers using direct zvol access.
392  * Very similar to zvol_read but the uio may have multiple iovec entries.
393  */
394 int
sbd_zvol_copy_read(sbd_lu_t * sl,uio_t * uio)395 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
396 {
397 	uint64_t	len = (uint64_t)uio->uio_resid;
398 	uint64_t	offset = (uint64_t)uio->uio_loffset;
399 
400 	/* Make sure request is reasonable */
401 	if (len > sl->sl_max_xfer_len)
402 		return (E2BIG);
403 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
404 		return (EIO);
405 
406 	locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len,
407 	    RL_READER);
408 	int error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len);
409 	rangelock_exit(lr);
410 
411 	if (error == ECKSUM)
412 		error = EIO;
413 	return (error);
414 }
415 
416 /*
417  * Copy interface for callers using direct zvol access.
418  * Very similar to zvol_write but the uio may have multiple iovec entries.
419  */
420 int
sbd_zvol_copy_write(sbd_lu_t * sl,uio_t * uio,int flags)421 sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
422 {
423 	dmu_tx_t	*tx;
424 	int		error, sync;
425 	uint64_t	len = (uint64_t)uio->uio_resid;
426 	uint64_t	offset = (uint64_t)uio->uio_loffset;
427 
428 	ASSERT(flags == 0 || flags == ZVIO_COMMIT);
429 
430 	/* Make sure request is reasonable */
431 	if (len > sl->sl_max_xfer_len)
432 		return (E2BIG);
433 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
434 		return (EIO);
435 
436 	locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len,
437 	    RL_WRITER);
438 	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
439 
440 	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
441 	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid);
442 	error = dmu_tx_assign(tx, TXG_WAIT);
443 	if (error) {
444 		dmu_tx_abort(tx);
445 	} else {
446 		error = dmu_write_uio_dnode(sl->sl_zvol_dn_hdl, uio, len, tx);
447 		if (error == 0) {
448 			zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
449 			    (ssize_t)len, sync);
450 		}
451 		dmu_tx_commit(tx);
452 	}
453 	rangelock_exit(lr);
454 
455 	if (sync && (flags & ZVIO_COMMIT))
456 		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
457 	if (error == ECKSUM)
458 		error = EIO;
459 	return (error);
460 }
461