xref: /illumos-gate/usr/src/uts/common/fs/zfs/zvol.c (revision da6c28aa)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
22893a6d32Sahrens  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27fa9e4066Sahrens 
28fa9e4066Sahrens /*
29fa9e4066Sahrens  * ZFS volume emulation driver.
30fa9e4066Sahrens  *
31fa9e4066Sahrens  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
32fa9e4066Sahrens  * Volumes are accessed through the symbolic links named:
33fa9e4066Sahrens  *
34fa9e4066Sahrens  * /dev/zvol/dsk/<pool_name>/<dataset_name>
35fa9e4066Sahrens  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
36fa9e4066Sahrens  *
37fa9e4066Sahrens  * These links are created by the ZFS-specific devfsadm link generator.
38fa9e4066Sahrens  * Volumes are persistent through reboot.  No user command needs to be
39fa9e4066Sahrens  * run before opening and using a device.
40fa9e4066Sahrens  */
41fa9e4066Sahrens 
42fa9e4066Sahrens #include <sys/types.h>
43fa9e4066Sahrens #include <sys/param.h>
44fa9e4066Sahrens #include <sys/errno.h>
45fa9e4066Sahrens #include <sys/uio.h>
46fa9e4066Sahrens #include <sys/buf.h>
47fa9e4066Sahrens #include <sys/modctl.h>
48fa9e4066Sahrens #include <sys/open.h>
49fa9e4066Sahrens #include <sys/kmem.h>
50fa9e4066Sahrens #include <sys/conf.h>
51fa9e4066Sahrens #include <sys/cmn_err.h>
52fa9e4066Sahrens #include <sys/stat.h>
53fa9e4066Sahrens #include <sys/zap.h>
54fa9e4066Sahrens #include <sys/spa.h>
55fa9e4066Sahrens #include <sys/zio.h>
56fa9e4066Sahrens #include <sys/dsl_prop.h>
57fa9e4066Sahrens #include <sys/dkio.h>
58fa9e4066Sahrens #include <sys/efi_partition.h>
59fa9e4066Sahrens #include <sys/byteorder.h>
60fa9e4066Sahrens #include <sys/pathname.h>
61fa9e4066Sahrens #include <sys/ddi.h>
62fa9e4066Sahrens #include <sys/sunddi.h>
63fa9e4066Sahrens #include <sys/crc32.h>
64fa9e4066Sahrens #include <sys/dirent.h>
65fa9e4066Sahrens #include <sys/policy.h>
66fa9e4066Sahrens #include <sys/fs/zfs.h>
67fa9e4066Sahrens #include <sys/zfs_ioctl.h>
68fa9e4066Sahrens #include <sys/mkdev.h>
6922ac5be4Sperrin #include <sys/zil.h>
70c5c6ffa0Smaybee #include <sys/refcount.h>
71c2e6a7d6Sperrin #include <sys/zfs_znode.h>
72c2e6a7d6Sperrin #include <sys/zfs_rlock.h>
73fa9e4066Sahrens 
74fa9e4066Sahrens #include "zfs_namecheck.h"
75fa9e4066Sahrens 
76fa9e4066Sahrens #define	ZVOL_OBJ		1ULL
77fa9e4066Sahrens #define	ZVOL_ZAP_OBJ		2ULL
78fa9e4066Sahrens 
79fa9e4066Sahrens static void *zvol_state;
80fa9e4066Sahrens 
81fa9e4066Sahrens /*
82fa9e4066Sahrens  * This lock protects the zvol_state structure from being modified
83fa9e4066Sahrens  * while it's being used, e.g. an open that comes in before a create
84fa9e4066Sahrens  * finishes.  It also protects temporary opens of the dataset so that,
85fa9e4066Sahrens  * e.g., an open doesn't get a spurious EBUSY.
86fa9e4066Sahrens  */
87fa9e4066Sahrens static kmutex_t zvol_state_lock;
88fa9e4066Sahrens static uint32_t zvol_minors;
89fa9e4066Sahrens 
90fa9e4066Sahrens /*
91fa9e4066Sahrens  * The in-core state of each volume.
92fa9e4066Sahrens  */
93fa9e4066Sahrens typedef struct zvol_state {
94fa9e4066Sahrens 	char		zv_name[MAXPATHLEN]; /* pool/dd name */
95fa9e4066Sahrens 	uint64_t	zv_volsize;	/* amount of space we advertise */
9667bd71c6Sperrin 	uint64_t	zv_volblocksize; /* volume block size */
97fa9e4066Sahrens 	minor_t		zv_minor;	/* minor number */
98fa9e4066Sahrens 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
99fa9e4066Sahrens 	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
100fa9e4066Sahrens 	objset_t	*zv_objset;	/* objset handle */
101fa9e4066Sahrens 	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
102fa9e4066Sahrens 	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
103fa9e4066Sahrens 	uint32_t	zv_total_opens;	/* total open count */
10422ac5be4Sperrin 	zilog_t		*zv_zilog;	/* ZIL handle */
10522ac5be4Sperrin 	uint64_t	zv_txg_assign;	/* txg to assign during ZIL replay */
106c2e6a7d6Sperrin 	znode_t		zv_znode;	/* for range locking */
107fa9e4066Sahrens } zvol_state_t;
108fa9e4066Sahrens 
10967bd71c6Sperrin /*
11067bd71c6Sperrin  * zvol maximum transfer in one DMU tx.
11167bd71c6Sperrin  */
11267bd71c6Sperrin int zvol_maxphys = DMU_MAX_ACCESS/2;
11367bd71c6Sperrin 
114feb08c6bSbillm static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
11567bd71c6Sperrin 
116fa9e4066Sahrens static void
11791ebeef5Sahrens zvol_size_changed(zvol_state_t *zv, major_t maj)
118fa9e4066Sahrens {
11991ebeef5Sahrens 	dev_t dev = makedevice(maj, zv->zv_minor);
120fa9e4066Sahrens 
121fa9e4066Sahrens 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
122fa9e4066Sahrens 	    "Size", zv->zv_volsize) == DDI_SUCCESS);
123fa9e4066Sahrens 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
124fa9e4066Sahrens 	    "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
125fa9e4066Sahrens }
126fa9e4066Sahrens 
/*
 * Validate a proposed volume size against the volume's block size.
 *
 * Returns 0 when volsize is acceptable; EINVAL when volsize is zero,
 * when blocksize is zero, or when volsize is not a multiple of
 * blocksize; and, on ILP32 kernels only, EOVERFLOW when the last byte
 * of the volume would lie beyond SPEC_MAXOFFSET_T.
 */
int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (EINVAL);

	/* A zero blocksize would make the modulo below divide by zero. */
	if (blocksize == 0 || volsize % blocksize != 0)
		return (EINVAL);

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (EOVERFLOW);
#endif
	return (0);
}
142fa9e4066Sahrens 
143fa9e4066Sahrens int
144e9dbad6fSeschrock zvol_check_volblocksize(uint64_t volblocksize)
145fa9e4066Sahrens {
146e9dbad6fSeschrock 	if (volblocksize < SPA_MINBLOCKSIZE ||
147e9dbad6fSeschrock 	    volblocksize > SPA_MAXBLOCKSIZE ||
148e9dbad6fSeschrock 	    !ISP2(volblocksize))
149fa9e4066Sahrens 		return (EDOM);
150fa9e4066Sahrens 
151fa9e4066Sahrens 	return (0);
152fa9e4066Sahrens }
153fa9e4066Sahrens 
154fa9e4066Sahrens static void
155fa9e4066Sahrens zvol_readonly_changed_cb(void *arg, uint64_t newval)
156fa9e4066Sahrens {
157fa9e4066Sahrens 	zvol_state_t *zv = arg;
158fa9e4066Sahrens 
159fa9e4066Sahrens 	zv->zv_readonly = (uint8_t)newval;
160fa9e4066Sahrens }
161fa9e4066Sahrens 
162fa9e4066Sahrens int
163a2eea2e1Sahrens zvol_get_stats(objset_t *os, nvlist_t *nv)
164fa9e4066Sahrens {
165fa9e4066Sahrens 	int error;
166fa9e4066Sahrens 	dmu_object_info_t doi;
167a2eea2e1Sahrens 	uint64_t val;
168fa9e4066Sahrens 
169fa9e4066Sahrens 
170a2eea2e1Sahrens 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
171fa9e4066Sahrens 	if (error)
172fa9e4066Sahrens 		return (error);
173fa9e4066Sahrens 
174a2eea2e1Sahrens 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
175a2eea2e1Sahrens 
176fa9e4066Sahrens 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
177fa9e4066Sahrens 
178a2eea2e1Sahrens 	if (error == 0) {
179a2eea2e1Sahrens 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
180a2eea2e1Sahrens 		    doi.doi_data_block_size);
181a2eea2e1Sahrens 	}
182fa9e4066Sahrens 
183fa9e4066Sahrens 	return (error);
184fa9e4066Sahrens }
185fa9e4066Sahrens 
186fa9e4066Sahrens /*
187fa9e4066Sahrens  * Find a free minor number.
188fa9e4066Sahrens  */
189fa9e4066Sahrens static minor_t
190fa9e4066Sahrens zvol_minor_alloc(void)
191fa9e4066Sahrens {
192fa9e4066Sahrens 	minor_t minor;
193fa9e4066Sahrens 
194fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&zvol_state_lock));
195fa9e4066Sahrens 
196fa9e4066Sahrens 	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
197fa9e4066Sahrens 		if (ddi_get_soft_state(zvol_state, minor) == NULL)
198fa9e4066Sahrens 			return (minor);
199fa9e4066Sahrens 
200fa9e4066Sahrens 	return (0);
201fa9e4066Sahrens }
202fa9e4066Sahrens 
203fa9e4066Sahrens static zvol_state_t *
204e9dbad6fSeschrock zvol_minor_lookup(const char *name)
205fa9e4066Sahrens {
206fa9e4066Sahrens 	minor_t minor;
207fa9e4066Sahrens 	zvol_state_t *zv;
208fa9e4066Sahrens 
209fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&zvol_state_lock));
210fa9e4066Sahrens 
211fa9e4066Sahrens 	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
212fa9e4066Sahrens 		zv = ddi_get_soft_state(zvol_state, minor);
213fa9e4066Sahrens 		if (zv == NULL)
214fa9e4066Sahrens 			continue;
215fa9e4066Sahrens 		if (strcmp(zv->zv_name, name) == 0)
216fa9e4066Sahrens 			break;
217fa9e4066Sahrens 	}
218fa9e4066Sahrens 
219fa9e4066Sahrens 	return (zv);
220fa9e4066Sahrens }
221fa9e4066Sahrens 
222ecd6cf80Smarks /* ARGSUSED */
223fa9e4066Sahrens void
224ecd6cf80Smarks zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
225fa9e4066Sahrens {
226*da6c28aaSamw 	zfs_creat_t *zct = arg;
227*da6c28aaSamw 	nvlist_t *nvprops = zct->zct_props;
228fa9e4066Sahrens 	int error;
229e9dbad6fSeschrock 	uint64_t volblocksize, volsize;
230fa9e4066Sahrens 
231ecd6cf80Smarks 	VERIFY(nvlist_lookup_uint64(nvprops,
232e9dbad6fSeschrock 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
233ecd6cf80Smarks 	if (nvlist_lookup_uint64(nvprops,
234e9dbad6fSeschrock 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
235e9dbad6fSeschrock 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
236e9dbad6fSeschrock 
237e9dbad6fSeschrock 	/*
238e9dbad6fSeschrock 	 * These properites must be removed from the list so the generic
239e9dbad6fSeschrock 	 * property setting step won't apply to them.
240e9dbad6fSeschrock 	 */
241ecd6cf80Smarks 	VERIFY(nvlist_remove_all(nvprops,
242e9dbad6fSeschrock 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
243ecd6cf80Smarks 	(void) nvlist_remove_all(nvprops,
244e9dbad6fSeschrock 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
245e9dbad6fSeschrock 
246e9dbad6fSeschrock 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
247fa9e4066Sahrens 	    DMU_OT_NONE, 0, tx);
248fa9e4066Sahrens 	ASSERT(error == 0);
249fa9e4066Sahrens 
250fa9e4066Sahrens 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
251fa9e4066Sahrens 	    DMU_OT_NONE, 0, tx);
252fa9e4066Sahrens 	ASSERT(error == 0);
253fa9e4066Sahrens 
254e9dbad6fSeschrock 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
255fa9e4066Sahrens 	ASSERT(error == 0);
256fa9e4066Sahrens }
257fa9e4066Sahrens 
25822ac5be4Sperrin /*
25922ac5be4Sperrin  * Replay a TX_WRITE ZIL transaction that didn't get committed
26022ac5be4Sperrin  * after a system failure
26122ac5be4Sperrin  */
26222ac5be4Sperrin static int
26322ac5be4Sperrin zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
26422ac5be4Sperrin {
26522ac5be4Sperrin 	objset_t *os = zv->zv_objset;
26622ac5be4Sperrin 	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
26722ac5be4Sperrin 	uint64_t off = lr->lr_offset;
26822ac5be4Sperrin 	uint64_t len = lr->lr_length;
26922ac5be4Sperrin 	dmu_tx_t *tx;
27022ac5be4Sperrin 	int error;
27122ac5be4Sperrin 
27222ac5be4Sperrin 	if (byteswap)
27322ac5be4Sperrin 		byteswap_uint64_array(lr, sizeof (*lr));
27422ac5be4Sperrin 
27522ac5be4Sperrin 	tx = dmu_tx_create(os);
27622ac5be4Sperrin 	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
27722ac5be4Sperrin 	error = dmu_tx_assign(tx, zv->zv_txg_assign);
27822ac5be4Sperrin 	if (error) {
27922ac5be4Sperrin 		dmu_tx_abort(tx);
28022ac5be4Sperrin 	} else {
28122ac5be4Sperrin 		dmu_write(os, ZVOL_OBJ, off, len, data, tx);
28222ac5be4Sperrin 		dmu_tx_commit(tx);
28322ac5be4Sperrin 	}
28422ac5be4Sperrin 
28522ac5be4Sperrin 	return (error);
28622ac5be4Sperrin }
28722ac5be4Sperrin 
28822ac5be4Sperrin /* ARGSUSED */
28922ac5be4Sperrin static int
29022ac5be4Sperrin zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
29122ac5be4Sperrin {
29222ac5be4Sperrin 	return (ENOTSUP);
29322ac5be4Sperrin }
29422ac5be4Sperrin 
29522ac5be4Sperrin /*
29622ac5be4Sperrin  * Callback vectors for replaying records.
29722ac5be4Sperrin  * Only TX_WRITE is needed for zvol.
29822ac5be4Sperrin  */
29922ac5be4Sperrin zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
30022ac5be4Sperrin 	zvol_replay_err,	/* 0 no such transaction type */
30122ac5be4Sperrin 	zvol_replay_err,	/* TX_CREATE */
30222ac5be4Sperrin 	zvol_replay_err,	/* TX_MKDIR */
30322ac5be4Sperrin 	zvol_replay_err,	/* TX_MKXATTR */
30422ac5be4Sperrin 	zvol_replay_err,	/* TX_SYMLINK */
30522ac5be4Sperrin 	zvol_replay_err,	/* TX_REMOVE */
30622ac5be4Sperrin 	zvol_replay_err,	/* TX_RMDIR */
30722ac5be4Sperrin 	zvol_replay_err,	/* TX_LINK */
30822ac5be4Sperrin 	zvol_replay_err,	/* TX_RENAME */
30922ac5be4Sperrin 	zvol_replay_write,	/* TX_WRITE */
31022ac5be4Sperrin 	zvol_replay_err,	/* TX_TRUNCATE */
31122ac5be4Sperrin 	zvol_replay_err,	/* TX_SETATTR */
31222ac5be4Sperrin 	zvol_replay_err,	/* TX_ACL */
31322ac5be4Sperrin };
31422ac5be4Sperrin 
315fa9e4066Sahrens /*
316fa9e4066Sahrens  * Create a minor node for the specified volume.
317fa9e4066Sahrens  */
318fa9e4066Sahrens int
31991ebeef5Sahrens zvol_create_minor(const char *name, major_t maj)
320fa9e4066Sahrens {
321fa9e4066Sahrens 	zvol_state_t *zv;
322fa9e4066Sahrens 	objset_t *os;
32367bd71c6Sperrin 	dmu_object_info_t doi;
324fa9e4066Sahrens 	uint64_t volsize;
325fa9e4066Sahrens 	minor_t minor = 0;
326fa9e4066Sahrens 	struct pathname linkpath;
327fa9e4066Sahrens 	int ds_mode = DS_MODE_PRIMARY;
328fa9e4066Sahrens 	vnode_t *vp = NULL;
329fa9e4066Sahrens 	char *devpath;
330fa9e4066Sahrens 	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + 1 + strlen(name) + 1;
331fa9e4066Sahrens 	char chrbuf[30], blkbuf[30];
332fa9e4066Sahrens 	int error;
333fa9e4066Sahrens 
334fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
335fa9e4066Sahrens 
336fa9e4066Sahrens 	if ((zv = zvol_minor_lookup(name)) != NULL) {
337fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
338fa9e4066Sahrens 		return (EEXIST);
339fa9e4066Sahrens 	}
340fa9e4066Sahrens 
341fa9e4066Sahrens 	if (strchr(name, '@') != 0)
342fa9e4066Sahrens 		ds_mode |= DS_MODE_READONLY;
343fa9e4066Sahrens 
344fa9e4066Sahrens 	error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
345fa9e4066Sahrens 
346fa9e4066Sahrens 	if (error) {
347fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
348fa9e4066Sahrens 		return (error);
349fa9e4066Sahrens 	}
350fa9e4066Sahrens 
351fa9e4066Sahrens 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
352fa9e4066Sahrens 
353fa9e4066Sahrens 	if (error) {
354fa9e4066Sahrens 		dmu_objset_close(os);
355fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
356fa9e4066Sahrens 		return (error);
357fa9e4066Sahrens 	}
358fa9e4066Sahrens 
359fa9e4066Sahrens 	/*
360fa9e4066Sahrens 	 * If there's an existing /dev/zvol symlink, try to use the
361fa9e4066Sahrens 	 * same minor number we used last time.
362fa9e4066Sahrens 	 */
363fa9e4066Sahrens 	devpath = kmem_alloc(devpathlen, KM_SLEEP);
364fa9e4066Sahrens 
365fa9e4066Sahrens 	(void) sprintf(devpath, "%s/%s", ZVOL_FULL_DEV_DIR, name);
366fa9e4066Sahrens 
367fa9e4066Sahrens 	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
368fa9e4066Sahrens 
369fa9e4066Sahrens 	kmem_free(devpath, devpathlen);
370fa9e4066Sahrens 
371fa9e4066Sahrens 	if (error == 0 && vp->v_type != VLNK)
372fa9e4066Sahrens 		error = EINVAL;
373fa9e4066Sahrens 
374fa9e4066Sahrens 	if (error == 0) {
375fa9e4066Sahrens 		pn_alloc(&linkpath);
376fa9e4066Sahrens 		error = pn_getsymlink(vp, &linkpath, kcred);
377fa9e4066Sahrens 		if (error == 0) {
378fa9e4066Sahrens 			char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
379fa9e4066Sahrens 			if (ms != NULL) {
380fa9e4066Sahrens 				ms += strlen(ZVOL_PSEUDO_DEV);
381fa9e4066Sahrens 				minor = stoi(&ms);
382fa9e4066Sahrens 			}
383fa9e4066Sahrens 		}
384fa9e4066Sahrens 		pn_free(&linkpath);
385fa9e4066Sahrens 	}
386fa9e4066Sahrens 
387fa9e4066Sahrens 	if (vp != NULL)
388fa9e4066Sahrens 		VN_RELE(vp);
389fa9e4066Sahrens 
390fa9e4066Sahrens 	/*
391fa9e4066Sahrens 	 * If we found a minor but it's already in use, we must pick a new one.
392fa9e4066Sahrens 	 */
393fa9e4066Sahrens 	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
394fa9e4066Sahrens 		minor = 0;
395fa9e4066Sahrens 
396fa9e4066Sahrens 	if (minor == 0)
397fa9e4066Sahrens 		minor = zvol_minor_alloc();
398fa9e4066Sahrens 
399fa9e4066Sahrens 	if (minor == 0) {
400fa9e4066Sahrens 		dmu_objset_close(os);
401fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
402fa9e4066Sahrens 		return (ENXIO);
403fa9e4066Sahrens 	}
404fa9e4066Sahrens 
405fa9e4066Sahrens 	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
406fa9e4066Sahrens 		dmu_objset_close(os);
407fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
408fa9e4066Sahrens 		return (EAGAIN);
409fa9e4066Sahrens 	}
410fa9e4066Sahrens 
411e9dbad6fSeschrock 	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
412e9dbad6fSeschrock 	    (char *)name);
413fa9e4066Sahrens 
414fa9e4066Sahrens 	(void) sprintf(chrbuf, "%uc,raw", minor);
415fa9e4066Sahrens 
416fa9e4066Sahrens 	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
417fa9e4066Sahrens 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
418fa9e4066Sahrens 		ddi_soft_state_free(zvol_state, minor);
419fa9e4066Sahrens 		dmu_objset_close(os);
420fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
421fa9e4066Sahrens 		return (EAGAIN);
422fa9e4066Sahrens 	}
423fa9e4066Sahrens 
424fa9e4066Sahrens 	(void) sprintf(blkbuf, "%uc", minor);
425fa9e4066Sahrens 
426fa9e4066Sahrens 	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
427fa9e4066Sahrens 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
428fa9e4066Sahrens 		ddi_remove_minor_node(zfs_dip, chrbuf);
429fa9e4066Sahrens 		ddi_soft_state_free(zvol_state, minor);
430fa9e4066Sahrens 		dmu_objset_close(os);
431fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
432fa9e4066Sahrens 		return (EAGAIN);
433fa9e4066Sahrens 	}
434fa9e4066Sahrens 
435fa9e4066Sahrens 	zv = ddi_get_soft_state(zvol_state, minor);
436fa9e4066Sahrens 
437fa9e4066Sahrens 	(void) strcpy(zv->zv_name, name);
438fa9e4066Sahrens 	zv->zv_min_bs = DEV_BSHIFT;
439fa9e4066Sahrens 	zv->zv_minor = minor;
440fa9e4066Sahrens 	zv->zv_volsize = volsize;
441fa9e4066Sahrens 	zv->zv_objset = os;
442fa9e4066Sahrens 	zv->zv_mode = ds_mode;
44367bd71c6Sperrin 	zv->zv_zilog = zil_open(os, zvol_get_data);
444c2e6a7d6Sperrin 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
445c2e6a7d6Sperrin 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
446c2e6a7d6Sperrin 	    sizeof (rl_t), offsetof(rl_t, r_node));
447c2e6a7d6Sperrin 
44867bd71c6Sperrin 
44967bd71c6Sperrin 	/* get and cache the blocksize */
45067bd71c6Sperrin 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
45167bd71c6Sperrin 	ASSERT(error == 0);
45267bd71c6Sperrin 	zv->zv_volblocksize = doi.doi_data_block_size;
45322ac5be4Sperrin 
454893a6d32Sahrens 	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector);
455fa9e4066Sahrens 
45691ebeef5Sahrens 	zvol_size_changed(zv, maj);
457fa9e4066Sahrens 
458ea8dc4b6Seschrock 	/* XXX this should handle the possible i/o error */
459fa9e4066Sahrens 	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
460fa9e4066Sahrens 	    "readonly", zvol_readonly_changed_cb, zv) == 0);
461fa9e4066Sahrens 
462fa9e4066Sahrens 	zvol_minors++;
463fa9e4066Sahrens 
464fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
465fa9e4066Sahrens 
466fa9e4066Sahrens 	return (0);
467fa9e4066Sahrens }
468fa9e4066Sahrens 
469fa9e4066Sahrens /*
470fa9e4066Sahrens  * Remove minor node for the specified volume.
471fa9e4066Sahrens  */
472fa9e4066Sahrens int
473e9dbad6fSeschrock zvol_remove_minor(const char *name)
474fa9e4066Sahrens {
475fa9e4066Sahrens 	zvol_state_t *zv;
476fa9e4066Sahrens 	char namebuf[30];
477fa9e4066Sahrens 
478fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
479fa9e4066Sahrens 
480e9dbad6fSeschrock 	if ((zv = zvol_minor_lookup(name)) == NULL) {
481fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
482fa9e4066Sahrens 		return (ENXIO);
483fa9e4066Sahrens 	}
484fa9e4066Sahrens 
485fa9e4066Sahrens 	if (zv->zv_total_opens != 0) {
486fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
487fa9e4066Sahrens 		return (EBUSY);
488fa9e4066Sahrens 	}
489fa9e4066Sahrens 
490fa9e4066Sahrens 	(void) sprintf(namebuf, "%uc,raw", zv->zv_minor);
491fa9e4066Sahrens 	ddi_remove_minor_node(zfs_dip, namebuf);
492fa9e4066Sahrens 
493fa9e4066Sahrens 	(void) sprintf(namebuf, "%uc", zv->zv_minor);
494fa9e4066Sahrens 	ddi_remove_minor_node(zfs_dip, namebuf);
495fa9e4066Sahrens 
496fa9e4066Sahrens 	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
497fa9e4066Sahrens 	    "readonly", zvol_readonly_changed_cb, zv) == 0);
498fa9e4066Sahrens 
49922ac5be4Sperrin 	zil_close(zv->zv_zilog);
50022ac5be4Sperrin 	zv->zv_zilog = NULL;
501fa9e4066Sahrens 	dmu_objset_close(zv->zv_objset);
502fa9e4066Sahrens 	zv->zv_objset = NULL;
503c2e6a7d6Sperrin 	avl_destroy(&zv->zv_znode.z_range_avl);
504c2e6a7d6Sperrin 	mutex_destroy(&zv->zv_znode.z_range_lock);
505fa9e4066Sahrens 
506fa9e4066Sahrens 	ddi_soft_state_free(zvol_state, zv->zv_minor);
507fa9e4066Sahrens 
508fa9e4066Sahrens 	zvol_minors--;
509fa9e4066Sahrens 
510fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
511fa9e4066Sahrens 
512fa9e4066Sahrens 	return (0);
513fa9e4066Sahrens }
514fa9e4066Sahrens 
515fa9e4066Sahrens int
51691ebeef5Sahrens zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
517fa9e4066Sahrens {
518fa9e4066Sahrens 	zvol_state_t *zv;
519fa9e4066Sahrens 	dmu_tx_t *tx;
520fa9e4066Sahrens 	int error;
5215c5460e9Seschrock 	dmu_object_info_t doi;
522fa9e4066Sahrens 
523fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
524fa9e4066Sahrens 
525e9dbad6fSeschrock 	if ((zv = zvol_minor_lookup(name)) == NULL) {
526fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
527fa9e4066Sahrens 		return (ENXIO);
528fa9e4066Sahrens 	}
529fa9e4066Sahrens 
5305c5460e9Seschrock 	if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
531e9dbad6fSeschrock 	    (error = zvol_check_volsize(volsize,
532e9dbad6fSeschrock 	    doi.doi_data_block_size)) != 0) {
5335c5460e9Seschrock 		mutex_exit(&zvol_state_lock);
5345c5460e9Seschrock 		return (error);
5355c5460e9Seschrock 	}
5365c5460e9Seschrock 
537fa9e4066Sahrens 	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
538fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
539fa9e4066Sahrens 		return (EROFS);
540fa9e4066Sahrens 	}
541fa9e4066Sahrens 
542fa9e4066Sahrens 	tx = dmu_tx_create(zv->zv_objset);
543ea8dc4b6Seschrock 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
544e9dbad6fSeschrock 	dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
545fa9e4066Sahrens 	error = dmu_tx_assign(tx, TXG_WAIT);
546fa9e4066Sahrens 	if (error) {
547fa9e4066Sahrens 		dmu_tx_abort(tx);
548fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
549fa9e4066Sahrens 		return (error);
550fa9e4066Sahrens 	}
551fa9e4066Sahrens 
552fa9e4066Sahrens 	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
553e9dbad6fSeschrock 	    &volsize, tx);
554ea8dc4b6Seschrock 	if (error == 0) {
555e9dbad6fSeschrock 		error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
556fa9e4066Sahrens 		    DMU_OBJECT_END, tx);
557ea8dc4b6Seschrock 	}
558fa9e4066Sahrens 
559fa9e4066Sahrens 	dmu_tx_commit(tx);
560fa9e4066Sahrens 
561fa9e4066Sahrens 	if (error == 0) {
562e9dbad6fSeschrock 		zv->zv_volsize = volsize;
56391ebeef5Sahrens 		zvol_size_changed(zv, maj);
564fa9e4066Sahrens 	}
565fa9e4066Sahrens 
566fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
567fa9e4066Sahrens 
568fa9e4066Sahrens 	return (error);
569fa9e4066Sahrens }
570fa9e4066Sahrens 
571fa9e4066Sahrens int
572e9dbad6fSeschrock zvol_set_volblocksize(const char *name, uint64_t volblocksize)
573fa9e4066Sahrens {
574fa9e4066Sahrens 	zvol_state_t *zv;
575fa9e4066Sahrens 	dmu_tx_t *tx;
576fa9e4066Sahrens 	int error;
577fa9e4066Sahrens 
578fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
579fa9e4066Sahrens 
580e9dbad6fSeschrock 	if ((zv = zvol_minor_lookup(name)) == NULL) {
581fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
582fa9e4066Sahrens 		return (ENXIO);
583fa9e4066Sahrens 	}
584fa9e4066Sahrens 
585fa9e4066Sahrens 	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
586fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
587fa9e4066Sahrens 		return (EROFS);
588fa9e4066Sahrens 	}
589fa9e4066Sahrens 
590fa9e4066Sahrens 	tx = dmu_tx_create(zv->zv_objset);
591fa9e4066Sahrens 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
592fa9e4066Sahrens 	error = dmu_tx_assign(tx, TXG_WAIT);
593fa9e4066Sahrens 	if (error) {
594fa9e4066Sahrens 		dmu_tx_abort(tx);
595fa9e4066Sahrens 	} else {
596fa9e4066Sahrens 		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
597e9dbad6fSeschrock 		    volblocksize, 0, tx);
598fa9e4066Sahrens 		if (error == ENOTSUP)
599fa9e4066Sahrens 			error = EBUSY;
600fa9e4066Sahrens 		dmu_tx_commit(tx);
601fa9e4066Sahrens 	}
602fa9e4066Sahrens 
603fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
604fa9e4066Sahrens 
605fa9e4066Sahrens 	return (error);
606fa9e4066Sahrens }
607fa9e4066Sahrens 
608fa9e4066Sahrens /*ARGSUSED*/
609fa9e4066Sahrens int
610fa9e4066Sahrens zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
611fa9e4066Sahrens {
612fa9e4066Sahrens 	minor_t minor = getminor(*devp);
613fa9e4066Sahrens 	zvol_state_t *zv;
614fa9e4066Sahrens 
615fa9e4066Sahrens 	if (minor == 0)			/* This is the control device */
616fa9e4066Sahrens 		return (0);
617fa9e4066Sahrens 
618fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
619fa9e4066Sahrens 
620fa9e4066Sahrens 	zv = ddi_get_soft_state(zvol_state, minor);
621fa9e4066Sahrens 	if (zv == NULL) {
622fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
623fa9e4066Sahrens 		return (ENXIO);
624fa9e4066Sahrens 	}
625fa9e4066Sahrens 
626fa9e4066Sahrens 	ASSERT(zv->zv_objset != NULL);
627fa9e4066Sahrens 
628fa9e4066Sahrens 	if ((flag & FWRITE) &&
629fa9e4066Sahrens 	    (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) {
630fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
631fa9e4066Sahrens 		return (EROFS);
632fa9e4066Sahrens 	}
633fa9e4066Sahrens 
634fa9e4066Sahrens 	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
635fa9e4066Sahrens 		zv->zv_open_count[otyp]++;
636fa9e4066Sahrens 		zv->zv_total_opens++;
637fa9e4066Sahrens 	}
638fa9e4066Sahrens 
639fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
640fa9e4066Sahrens 
641fa9e4066Sahrens 	return (0);
642fa9e4066Sahrens }
643fa9e4066Sahrens 
644fa9e4066Sahrens /*ARGSUSED*/
645fa9e4066Sahrens int
646fa9e4066Sahrens zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
647fa9e4066Sahrens {
648fa9e4066Sahrens 	minor_t minor = getminor(dev);
649fa9e4066Sahrens 	zvol_state_t *zv;
650fa9e4066Sahrens 
651fa9e4066Sahrens 	if (minor == 0)		/* This is the control device */
652fa9e4066Sahrens 		return (0);
653fa9e4066Sahrens 
654fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
655fa9e4066Sahrens 
656fa9e4066Sahrens 	zv = ddi_get_soft_state(zvol_state, minor);
657fa9e4066Sahrens 	if (zv == NULL) {
658fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
659fa9e4066Sahrens 		return (ENXIO);
660fa9e4066Sahrens 	}
661fa9e4066Sahrens 
662fa9e4066Sahrens 	/*
663fa9e4066Sahrens 	 * The next statement is a workaround for the following DDI bug:
664fa9e4066Sahrens 	 * 6343604 specfs race: multiple "last-close" of the same device
665fa9e4066Sahrens 	 */
666fa9e4066Sahrens 	if (zv->zv_total_opens == 0) {
667fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
668fa9e4066Sahrens 		return (0);
669fa9e4066Sahrens 	}
670fa9e4066Sahrens 
671fa9e4066Sahrens 	/*
672fa9e4066Sahrens 	 * If the open count is zero, this is a spurious close.
673fa9e4066Sahrens 	 * That indicates a bug in the kernel / DDI framework.
674fa9e4066Sahrens 	 */
675fa9e4066Sahrens 	ASSERT(zv->zv_open_count[otyp] != 0);
676fa9e4066Sahrens 	ASSERT(zv->zv_total_opens != 0);
677fa9e4066Sahrens 
678fa9e4066Sahrens 	/*
679fa9e4066Sahrens 	 * You may get multiple opens, but only one close.
680fa9e4066Sahrens 	 */
681fa9e4066Sahrens 	zv->zv_open_count[otyp]--;
682fa9e4066Sahrens 	zv->zv_total_opens--;
683fa9e4066Sahrens 
684fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
685fa9e4066Sahrens 
686fa9e4066Sahrens 	return (0);
687fa9e4066Sahrens }
688fa9e4066Sahrens 
689feb08c6bSbillm static void
69067bd71c6Sperrin zvol_get_done(dmu_buf_t *db, void *vzgd)
69167bd71c6Sperrin {
69267bd71c6Sperrin 	zgd_t *zgd = (zgd_t *)vzgd;
693c2e6a7d6Sperrin 	rl_t *rl = zgd->zgd_rl;
69467bd71c6Sperrin 
69567bd71c6Sperrin 	dmu_buf_rele(db, vzgd);
696c2e6a7d6Sperrin 	zfs_range_unlock(rl);
69767bd71c6Sperrin 	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
69867bd71c6Sperrin 	kmem_free(zgd, sizeof (zgd_t));
69967bd71c6Sperrin }
70067bd71c6Sperrin 
70167bd71c6Sperrin /*
70267bd71c6Sperrin  * Get data to generate a TX_WRITE intent log record.
70367bd71c6Sperrin  */
704feb08c6bSbillm static int
70567bd71c6Sperrin zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
70667bd71c6Sperrin {
70767bd71c6Sperrin 	zvol_state_t *zv = arg;
70867bd71c6Sperrin 	objset_t *os = zv->zv_objset;
70967bd71c6Sperrin 	dmu_buf_t *db;
710c2e6a7d6Sperrin 	rl_t *rl;
71167bd71c6Sperrin 	zgd_t *zgd;
712c2e6a7d6Sperrin 	uint64_t boff; 			/* block starting offset */
713c2e6a7d6Sperrin 	int dlen = lr->lr_length;	/* length of user data */
71467bd71c6Sperrin 	int error;
71567bd71c6Sperrin 
71667bd71c6Sperrin 	ASSERT(zio);
717c2e6a7d6Sperrin 	ASSERT(dlen != 0);
718feb08c6bSbillm 
719c2e6a7d6Sperrin 	/*
720c2e6a7d6Sperrin 	 * Write records come in two flavors: immediate and indirect.
721c2e6a7d6Sperrin 	 * For small writes it's cheaper to store the data with the
722c2e6a7d6Sperrin 	 * log record (immediate); for large writes it's cheaper to
723c2e6a7d6Sperrin 	 * sync the data and get a pointer to it (indirect) so that
724c2e6a7d6Sperrin 	 * we don't have to write the data twice.
725c2e6a7d6Sperrin 	 */
726c2e6a7d6Sperrin 	if (buf != NULL) /* immediate write */
727c2e6a7d6Sperrin 		return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf));
72867bd71c6Sperrin 
72967bd71c6Sperrin 	zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
73067bd71c6Sperrin 	zgd->zgd_zilog = zv->zv_zilog;
73167bd71c6Sperrin 	zgd->zgd_bp = &lr->lr_blkptr;
73267bd71c6Sperrin 
73367bd71c6Sperrin 	/*
734c2e6a7d6Sperrin 	 * Lock the range of the block to ensure that when the data is
735c2e6a7d6Sperrin 	 * written out and it's checksum is being calculated that no other
736c2e6a7d6Sperrin 	 * thread can change the block.
73767bd71c6Sperrin 	 */
738c2e6a7d6Sperrin 	boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t);
739c2e6a7d6Sperrin 	rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize,
740c2e6a7d6Sperrin 	    RL_READER);
741c2e6a7d6Sperrin 	zgd->zgd_rl = rl;
742c2e6a7d6Sperrin 
743c2e6a7d6Sperrin 	VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
74467bd71c6Sperrin 	error = dmu_sync(zio, db, &lr->lr_blkptr,
74567bd71c6Sperrin 	    lr->lr_common.lrc_txg, zvol_get_done, zgd);
746feb08c6bSbillm 	if (error == 0)
74767bd71c6Sperrin 		zil_add_vdev(zv->zv_zilog,
74867bd71c6Sperrin 		    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
74967bd71c6Sperrin 	/*
75067bd71c6Sperrin 	 * If we get EINPROGRESS, then we need to wait for a
75167bd71c6Sperrin 	 * write IO initiated by dmu_sync() to complete before
75267bd71c6Sperrin 	 * we can release this dbuf.  We will finish everything
75367bd71c6Sperrin 	 * up in the zvol_get_done() callback.
75467bd71c6Sperrin 	 */
75567bd71c6Sperrin 	if (error == EINPROGRESS)
75667bd71c6Sperrin 		return (0);
75767bd71c6Sperrin 	dmu_buf_rele(db, zgd);
758c2e6a7d6Sperrin 	zfs_range_unlock(rl);
75967bd71c6Sperrin 	kmem_free(zgd, sizeof (zgd_t));
76067bd71c6Sperrin 	return (error);
76167bd71c6Sperrin }
76267bd71c6Sperrin 
763a24e15ceSperrin /*
764a24e15ceSperrin  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
76522ac5be4Sperrin  *
76622ac5be4Sperrin  * We store data in the log buffers if it's small enough.
76767bd71c6Sperrin  * Otherwise we will later flush the data out via dmu_sync().
76822ac5be4Sperrin  */
76967bd71c6Sperrin ssize_t zvol_immediate_write_sz = 32768;
77022ac5be4Sperrin 
771feb08c6bSbillm static void
772feb08c6bSbillm zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
77322ac5be4Sperrin {
774feb08c6bSbillm 	uint32_t blocksize = zv->zv_volblocksize;
77522ac5be4Sperrin 	lr_write_t *lr;
77622ac5be4Sperrin 
777a24e15ceSperrin 	while (len) {
778feb08c6bSbillm 		ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
779feb08c6bSbillm 		itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr));
780feb08c6bSbillm 
781feb08c6bSbillm 		itx->itx_wr_state =
782feb08c6bSbillm 		    len > zvol_immediate_write_sz ?  WR_INDIRECT : WR_NEED_COPY;
783feb08c6bSbillm 		itx->itx_private = zv;
784feb08c6bSbillm 		lr = (lr_write_t *)&itx->itx_lr;
785feb08c6bSbillm 		lr->lr_foid = ZVOL_OBJ;
786feb08c6bSbillm 		lr->lr_offset = off;
787feb08c6bSbillm 		lr->lr_length = nbytes;
788feb08c6bSbillm 		lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t);
789feb08c6bSbillm 		BP_ZERO(&lr->lr_blkptr);
790feb08c6bSbillm 
791feb08c6bSbillm 		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
792a24e15ceSperrin 		len -= nbytes;
793a24e15ceSperrin 		off += nbytes;
79422ac5be4Sperrin 	}
79522ac5be4Sperrin }
79622ac5be4Sperrin 
797fa9e4066Sahrens int
798fa9e4066Sahrens zvol_strategy(buf_t *bp)
799fa9e4066Sahrens {
800fa9e4066Sahrens 	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
801fa9e4066Sahrens 	uint64_t off, volsize;
802fa9e4066Sahrens 	size_t size, resid;
803fa9e4066Sahrens 	char *addr;
80422ac5be4Sperrin 	objset_t *os;
805c2e6a7d6Sperrin 	rl_t *rl;
806fa9e4066Sahrens 	int error = 0;
807c2e6a7d6Sperrin 	boolean_t reading;
808fa9e4066Sahrens 
809fa9e4066Sahrens 	if (zv == NULL) {
810fa9e4066Sahrens 		bioerror(bp, ENXIO);
811fa9e4066Sahrens 		biodone(bp);
812fa9e4066Sahrens 		return (0);
813fa9e4066Sahrens 	}
814fa9e4066Sahrens 
815fa9e4066Sahrens 	if (getminor(bp->b_edev) == 0) {
816fa9e4066Sahrens 		bioerror(bp, EINVAL);
817fa9e4066Sahrens 		biodone(bp);
818fa9e4066Sahrens 		return (0);
819fa9e4066Sahrens 	}
820fa9e4066Sahrens 
821a2eea2e1Sahrens 	if ((zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) &&
822a2eea2e1Sahrens 	    !(bp->b_flags & B_READ)) {
823fa9e4066Sahrens 		bioerror(bp, EROFS);
824fa9e4066Sahrens 		biodone(bp);
825fa9e4066Sahrens 		return (0);
826fa9e4066Sahrens 	}
827fa9e4066Sahrens 
828fa9e4066Sahrens 	off = ldbtob(bp->b_blkno);
829fa9e4066Sahrens 	volsize = zv->zv_volsize;
830fa9e4066Sahrens 
83122ac5be4Sperrin 	os = zv->zv_objset;
83222ac5be4Sperrin 	ASSERT(os != NULL);
833fa9e4066Sahrens 
834fa9e4066Sahrens 	bp_mapin(bp);
835fa9e4066Sahrens 	addr = bp->b_un.b_addr;
836fa9e4066Sahrens 	resid = bp->b_bcount;
837fa9e4066Sahrens 
838a24e15ceSperrin 	/*
839a24e15ceSperrin 	 * There must be no buffer changes when doing a dmu_sync() because
840a24e15ceSperrin 	 * we can't change the data whilst calculating the checksum.
841a24e15ceSperrin 	 */
842a24e15ceSperrin 	reading = bp->b_flags & B_READ;
843c2e6a7d6Sperrin 	rl = zfs_range_lock(&zv->zv_znode, off, resid,
844c2e6a7d6Sperrin 	    reading ? RL_READER : RL_WRITER);
845a24e15ceSperrin 
846fa9e4066Sahrens 	while (resid != 0 && off < volsize) {
847fa9e4066Sahrens 
84867bd71c6Sperrin 		size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
849fa9e4066Sahrens 
850fa9e4066Sahrens 		if (size > volsize - off)	/* don't write past the end */
851fa9e4066Sahrens 			size = volsize - off;
852fa9e4066Sahrens 
853a24e15ceSperrin 		if (reading) {
854a24e15ceSperrin 			error = dmu_read(os, ZVOL_OBJ, off, size, addr);
855fa9e4066Sahrens 		} else {
85622ac5be4Sperrin 			dmu_tx_t *tx = dmu_tx_create(os);
857fa9e4066Sahrens 			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
858fa9e4066Sahrens 			error = dmu_tx_assign(tx, TXG_WAIT);
859fa9e4066Sahrens 			if (error) {
860fa9e4066Sahrens 				dmu_tx_abort(tx);
861fa9e4066Sahrens 			} else {
86222ac5be4Sperrin 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
863feb08c6bSbillm 				zvol_log_write(zv, tx, off, size);
864fa9e4066Sahrens 				dmu_tx_commit(tx);
865fa9e4066Sahrens 			}
866fa9e4066Sahrens 		}
867fa9e4066Sahrens 		if (error)
868fa9e4066Sahrens 			break;
869fa9e4066Sahrens 		off += size;
870fa9e4066Sahrens 		addr += size;
871fa9e4066Sahrens 		resid -= size;
872fa9e4066Sahrens 	}
873c2e6a7d6Sperrin 	zfs_range_unlock(rl);
874fa9e4066Sahrens 
875fa9e4066Sahrens 	if ((bp->b_resid = resid) == bp->b_bcount)
876fa9e4066Sahrens 		bioerror(bp, off > volsize ? EINVAL : error);
877fa9e4066Sahrens 
878a6c67037Sperrin 	if (!(bp->b_flags & B_ASYNC) && !reading && !zil_disable)
879feb08c6bSbillm 		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
880feb08c6bSbillm 
881feb08c6bSbillm 	biodone(bp);
88222ac5be4Sperrin 
883fa9e4066Sahrens 	return (0);
884fa9e4066Sahrens }
885fa9e4066Sahrens 
88667bd71c6Sperrin /*
88767bd71c6Sperrin  * Set the buffer count to the zvol maximum transfer.
88867bd71c6Sperrin  * Using our own routine instead of the default minphys()
88967bd71c6Sperrin  * means that for larger writes we write bigger buffers on X86
89067bd71c6Sperrin  * (128K instead of 56K) and flush the disk write cache less often
89167bd71c6Sperrin  * (every zvol_maxphys - currently 1MB) instead of minphys (currently
89267bd71c6Sperrin  * 56K on X86 and 128K on sparc).
89367bd71c6Sperrin  */
89467bd71c6Sperrin void
89567bd71c6Sperrin zvol_minphys(struct buf *bp)
89667bd71c6Sperrin {
89767bd71c6Sperrin 	if (bp->b_bcount > zvol_maxphys)
89867bd71c6Sperrin 		bp->b_bcount = zvol_maxphys;
89967bd71c6Sperrin }
90067bd71c6Sperrin 
901fa9e4066Sahrens /*ARGSUSED*/
902fa9e4066Sahrens int
903feb08c6bSbillm zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
904fa9e4066Sahrens {
905c7ca1008Sgw 	minor_t minor = getminor(dev);
906c7ca1008Sgw 	zvol_state_t *zv;
907c2e6a7d6Sperrin 	rl_t *rl;
908feb08c6bSbillm 	int error = 0;
909fa9e4066Sahrens 
910c7ca1008Sgw 	if (minor == 0)			/* This is the control device */
911c7ca1008Sgw 		return (ENXIO);
912c7ca1008Sgw 
913c7ca1008Sgw 	zv = ddi_get_soft_state(zvol_state, minor);
914c7ca1008Sgw 	if (zv == NULL)
915c7ca1008Sgw 		return (ENXIO);
916c7ca1008Sgw 
917c2e6a7d6Sperrin 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
918c2e6a7d6Sperrin 	    RL_READER);
919feb08c6bSbillm 	while (uio->uio_resid > 0) {
920feb08c6bSbillm 		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
921fa9e4066Sahrens 
922feb08c6bSbillm 		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
923feb08c6bSbillm 		if (error)
924feb08c6bSbillm 			break;
925feb08c6bSbillm 	}
926c2e6a7d6Sperrin 	zfs_range_unlock(rl);
927feb08c6bSbillm 	return (error);
928fa9e4066Sahrens }
929fa9e4066Sahrens 
930fa9e4066Sahrens /*ARGSUSED*/
931fa9e4066Sahrens int
932feb08c6bSbillm zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
933fa9e4066Sahrens {
934c7ca1008Sgw 	minor_t minor = getminor(dev);
935c7ca1008Sgw 	zvol_state_t *zv;
936c2e6a7d6Sperrin 	rl_t *rl;
937feb08c6bSbillm 	int error = 0;
938feb08c6bSbillm 
939c7ca1008Sgw 	if (minor == 0)			/* This is the control device */
940c7ca1008Sgw 		return (ENXIO);
941c7ca1008Sgw 
942c7ca1008Sgw 	zv = ddi_get_soft_state(zvol_state, minor);
943c7ca1008Sgw 	if (zv == NULL)
944c7ca1008Sgw 		return (ENXIO);
945c7ca1008Sgw 
946c2e6a7d6Sperrin 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
947c2e6a7d6Sperrin 	    RL_WRITER);
948feb08c6bSbillm 	while (uio->uio_resid > 0) {
949feb08c6bSbillm 		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
950feb08c6bSbillm 		uint64_t off = uio->uio_loffset;
951feb08c6bSbillm 
952feb08c6bSbillm 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
953feb08c6bSbillm 		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
954feb08c6bSbillm 		error = dmu_tx_assign(tx, TXG_WAIT);
955feb08c6bSbillm 		if (error) {
956feb08c6bSbillm 			dmu_tx_abort(tx);
957feb08c6bSbillm 			break;
958feb08c6bSbillm 		}
959feb08c6bSbillm 		error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx);
960feb08c6bSbillm 		if (error == 0)
961feb08c6bSbillm 			zvol_log_write(zv, tx, off, bytes);
962feb08c6bSbillm 		dmu_tx_commit(tx);
963feb08c6bSbillm 
964feb08c6bSbillm 		if (error)
965feb08c6bSbillm 			break;
966feb08c6bSbillm 	}
967c2e6a7d6Sperrin 	zfs_range_unlock(rl);
968feb08c6bSbillm 	return (error);
969fa9e4066Sahrens }
970fa9e4066Sahrens 
971fa9e4066Sahrens /*
972fa9e4066Sahrens  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
973fa9e4066Sahrens  */
974fa9e4066Sahrens /*ARGSUSED*/
975fa9e4066Sahrens int
976fa9e4066Sahrens zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
977fa9e4066Sahrens {
978fa9e4066Sahrens 	zvol_state_t *zv;
979af2c4821Smaybee 	struct dk_cinfo dki;
980fa9e4066Sahrens 	struct dk_minfo dkm;
981fa9e4066Sahrens 	dk_efi_t efi;
982af2c4821Smaybee 	struct dk_callback *dkc;
983fa9e4066Sahrens 	struct uuid uuid = EFI_RESERVED;
984fa9e4066Sahrens 	uint32_t crc;
985fa9e4066Sahrens 	int error = 0;
986fa9e4066Sahrens 
987fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
988fa9e4066Sahrens 
989fa9e4066Sahrens 	zv = ddi_get_soft_state(zvol_state, getminor(dev));
990fa9e4066Sahrens 
991fa9e4066Sahrens 	if (zv == NULL) {
992fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
993fa9e4066Sahrens 		return (ENXIO);
994fa9e4066Sahrens 	}
995fa9e4066Sahrens 
996fa9e4066Sahrens 	switch (cmd) {
997fa9e4066Sahrens 
998fa9e4066Sahrens 	case DKIOCINFO:
999af2c4821Smaybee 		bzero(&dki, sizeof (dki));
1000af2c4821Smaybee 		(void) strcpy(dki.dki_cname, "zvol");
1001af2c4821Smaybee 		(void) strcpy(dki.dki_dname, "zvol");
1002af2c4821Smaybee 		dki.dki_ctype = DKC_UNKNOWN;
1003af2c4821Smaybee 		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
1004fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
1005af2c4821Smaybee 		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1006fa9e4066Sahrens 			error = EFAULT;
1007fa9e4066Sahrens 		return (error);
1008fa9e4066Sahrens 
1009fa9e4066Sahrens 	case DKIOCGMEDIAINFO:
1010fa9e4066Sahrens 		bzero(&dkm, sizeof (dkm));
1011fa9e4066Sahrens 		dkm.dki_lbsize = 1U << zv->zv_min_bs;
1012fa9e4066Sahrens 		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1013fa9e4066Sahrens 		dkm.dki_media_type = DK_UNKNOWN;
1014fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
1015fa9e4066Sahrens 		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1016fa9e4066Sahrens 			error = EFAULT;
1017fa9e4066Sahrens 		return (error);
1018fa9e4066Sahrens 
1019fa9e4066Sahrens 	case DKIOCGETEFI:
1020fa9e4066Sahrens 		if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) {
1021fa9e4066Sahrens 			mutex_exit(&zvol_state_lock);
1022fa9e4066Sahrens 			return (EFAULT);
1023fa9e4066Sahrens 		}
102468a5ac4dSmaybee 		efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
1025fa9e4066Sahrens 
102668a5ac4dSmaybee 		/*
102768a5ac4dSmaybee 		 * Some clients may attempt to request a PMBR for the
102868a5ac4dSmaybee 		 * zvol.  Currently this interface will return ENOTTY to
102968a5ac4dSmaybee 		 * such requests.  These requests could be supported by
103068a5ac4dSmaybee 		 * adding a check for lba == 0 and consing up an appropriate
103168a5ac4dSmaybee 		 * RMBR.
103268a5ac4dSmaybee 		 */
103368a5ac4dSmaybee 		if (efi.dki_lba == 1) {
103468a5ac4dSmaybee 			efi_gpt_t gpt;
103568a5ac4dSmaybee 			efi_gpe_t gpe;
103668a5ac4dSmaybee 
103768a5ac4dSmaybee 			bzero(&gpt, sizeof (gpt));
103868a5ac4dSmaybee 			bzero(&gpe, sizeof (gpe));
103968a5ac4dSmaybee 
104068a5ac4dSmaybee 			if (efi.dki_length < sizeof (gpt)) {
104168a5ac4dSmaybee 				mutex_exit(&zvol_state_lock);
104268a5ac4dSmaybee 				return (EINVAL);
104368a5ac4dSmaybee 			}
1044fa9e4066Sahrens 
104568a5ac4dSmaybee 			gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
104668a5ac4dSmaybee 			gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
104768a5ac4dSmaybee 			gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
104868a5ac4dSmaybee 			gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
104968a5ac4dSmaybee 			gpt.efi_gpt_LastUsableLBA =
105068a5ac4dSmaybee 			    LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
105168a5ac4dSmaybee 			gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1052523be69cSmaybee 			gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
105368a5ac4dSmaybee 			gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe));
1054fa9e4066Sahrens 
105568a5ac4dSmaybee 			UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
105668a5ac4dSmaybee 			gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA;
105768a5ac4dSmaybee 			gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA;
105868a5ac4dSmaybee 
105968a5ac4dSmaybee 			CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
106068a5ac4dSmaybee 			gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1061fa9e4066Sahrens 
106268a5ac4dSmaybee 			CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
106368a5ac4dSmaybee 			gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1064fa9e4066Sahrens 
106568a5ac4dSmaybee 			mutex_exit(&zvol_state_lock);
106668a5ac4dSmaybee 			if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag))
106768a5ac4dSmaybee 				error = EFAULT;
106868a5ac4dSmaybee 		} else if (efi.dki_lba == 2) {
106968a5ac4dSmaybee 			efi_gpe_t gpe;
1070fa9e4066Sahrens 
107168a5ac4dSmaybee 			bzero(&gpe, sizeof (gpe));
1072fa9e4066Sahrens 
107368a5ac4dSmaybee 			if (efi.dki_length < sizeof (gpe)) {
107468a5ac4dSmaybee 				mutex_exit(&zvol_state_lock);
107568a5ac4dSmaybee 				return (EINVAL);
107668a5ac4dSmaybee 			}
1077fa9e4066Sahrens 
107868a5ac4dSmaybee 			UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
107968a5ac4dSmaybee 			gpe.efi_gpe_StartingLBA = LE_64(34ULL);
108068a5ac4dSmaybee 			gpe.efi_gpe_EndingLBA =
108168a5ac4dSmaybee 			    LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
1082fa9e4066Sahrens 
108368a5ac4dSmaybee 			mutex_exit(&zvol_state_lock);
108468a5ac4dSmaybee 			if (ddi_copyout(&gpe, efi.dki_data, sizeof (gpe), flag))
108568a5ac4dSmaybee 				error = EFAULT;
108668a5ac4dSmaybee 		} else {
108768a5ac4dSmaybee 			mutex_exit(&zvol_state_lock);
108868a5ac4dSmaybee 			error = EINVAL;
108968a5ac4dSmaybee 		}
1090fa9e4066Sahrens 		return (error);
1091fa9e4066Sahrens 
1092feb08c6bSbillm 	case DKIOCFLUSHWRITECACHE:
1093af2c4821Smaybee 		dkc = (struct dk_callback *)arg;
1094feb08c6bSbillm 		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
1095af2c4821Smaybee 		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1096af2c4821Smaybee 			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
1097af2c4821Smaybee 			error = 0;
1098af2c4821Smaybee 		}
1099feb08c6bSbillm 		break;
1100feb08c6bSbillm 
1101b6130eadSmaybee 	case DKIOCGGEOM:
1102b6130eadSmaybee 	case DKIOCGVTOC:
1103b6130eadSmaybee 		/* commands using these (like prtvtoc) expect ENOTSUP */
1104b6130eadSmaybee 		error = ENOTSUP;
1105b6130eadSmaybee 		break;
1106b6130eadSmaybee 
1107fa9e4066Sahrens 	default:
110868a5ac4dSmaybee 		error = ENOTTY;
1109fa9e4066Sahrens 		break;
1110fa9e4066Sahrens 
1111fa9e4066Sahrens 	}
1112fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
1113fa9e4066Sahrens 	return (error);
1114fa9e4066Sahrens }
1115fa9e4066Sahrens 
1116fa9e4066Sahrens int
1117fa9e4066Sahrens zvol_busy(void)
1118fa9e4066Sahrens {
1119fa9e4066Sahrens 	return (zvol_minors != 0);
1120fa9e4066Sahrens }
1121fa9e4066Sahrens 
1122fa9e4066Sahrens void
1123fa9e4066Sahrens zvol_init(void)
1124fa9e4066Sahrens {
1125fa9e4066Sahrens 	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
1126fa9e4066Sahrens 	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
1127fa9e4066Sahrens }
1128fa9e4066Sahrens 
1129fa9e4066Sahrens void
1130fa9e4066Sahrens zvol_fini(void)
1131fa9e4066Sahrens {
1132fa9e4066Sahrens 	mutex_destroy(&zvol_state_lock);
1133fa9e4066Sahrens 	ddi_soft_state_fini(&zvol_state);
1134fa9e4066Sahrens }
1135