/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2019, Joyent, Inc.
 */

/*
 * Virtual disk server
 */


#include <sys/types.h>
#include <sys/conf.h>
#include <sys/crc32.h>
#include <sys/ddi.h>
#include <sys/dkio.h>
#include <sys/file.h>
#include <sys/fs/hsfs_isospec.h>
#include <sys/mdeg.h>
#include <sys/mhd.h>
#include <sys/modhash.h>
#include <sys/note.h>
#include <sys/pathname.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/sysmacros.h>
#include <sys/vio_common.h>
#include <sys/vio_util.h>
#include <sys/vdsk_mailbox.h>
#include <sys/vdsk_common.h>
#include <sys/vtoc.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/scsi/impl/uscsi.h>
#include <sys/ontrap.h>
#include <vm/seg_map.h>

#define	ONE_MEGABYTE	(1ULL << 20)
#define	ONE_GIGABYTE	(1ULL << 30)
#define	ONE_TERABYTE	(1ULL << 40)

/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000 /* 1 msec */
#define	VDS_DEV_DELAY		10000000 /* 10 secs */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_BLOCK_DEVICE_OPTS	"vds-block-device-opts"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20
#define	VD_SETUP_ERROR		0x40

/* Number of backup labels */
#define	VD_DSKIMG_NUM_BACKUP	5

/* Timeout for SCSI I/O */
#define	VD_SCSI_RDWR_TIMEOUT	30	/* 30 secs */

/*
 * Default number of threads for the I/O queue. In many cases, we will not
 * receive more than 8 I/O requests at the same time. However, there are
 * cases (for example during an OS installation) where we can have many
 * more (up to the limit of the DRing size).
 */
#define	VD_IOQ_NTHREADS		8

/* Maximum number of logical partitions */
#define	VD_MAXPART	(NDKMAP + 1)

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/* Logical block address for EFI */
#define	VD_EFI_LBA_GPT		1	/* LBA of the GPT */
#define	VD_EFI_LBA_GPE		2	/* LBA of the GPE */

#define	VD_EFI_DEV_SET(dev, vdsk, ioctl)	\
	VDSK_EFI_DEV_SET(dev, vdsk, ioctl,	\
	    (vdsk)->vdisk_bsize, (vdsk)->vdisk_size)

/*
 * Flags defining the behavior for flushing asynchronous writes used to
 * perform some write I/O requests.
 *
 * The VD_AWFLUSH_IMMEDIATE flag enables immediate flushing of asynchronous
 * writes. This ensures that data are committed to the backend before the
 * I/O request reply is sent to the guest domain, so no data can be lost
 * if the service domain unexpectedly crashes.
 *
 * The VD_AWFLUSH_DEFER flag indicates that flushing is deferred to another
 * thread while the request is immediately marked as completed. In that case,
 * a guest domain can receive a reply that its write request is completed
 * before the data have actually been flushed to disk.
 *
 * Flags VD_AWFLUSH_IMMEDIATE and VD_AWFLUSH_DEFER are mutually exclusive.
 */
#define	VD_AWFLUSH_IMMEDIATE	0x01	/* immediate flushing */
#define	VD_AWFLUSH_DEFER	0x02	/* defer flushing */
#define	VD_AWFLUSH_GROUP	0x04	/* group requests before flushing */

/* Driver types */
typedef enum vd_driver {
	VD_DRIVER_UNKNOWN = 0,	/* driver type unknown  */
	VD_DRIVER_DISK,		/* disk driver */
	VD_DRIVER_VOLUME	/* volume driver */
} vd_driver_t;

#define	VD_DRIVER_NAME_LEN	64

#define	VDS_NUM_DRIVERS	(sizeof (vds_driver_types) / sizeof (vd_driver_type_t))

typedef struct vd_driver_type {
	char name[VD_DRIVER_NAME_LEN];	/* driver name */
	vd_driver_t type;		/* driver type (disk or volume) */
} vd_driver_type_t;

/*
 * There is no reliable way to determine whether a device represents a disk
 * or a volume, especially with pseudo devices. So we maintain a list of
 * well-known drivers and the type of device they represent (either a disk
 * or a volume).
 *
 * The list can be extended by adding a "driver-type-list" entry in vds.conf
 * with the following syntax:
 *
 *	driver-type-list="<driver>:<type>", ... ,"<driver>:<type>";
 *
 * Where:
 *	<driver> is the name of a driver (limited to 64 characters)
 *	<type> is either the string "disk" or "volume"
 *
 * Invalid entries in "driver-type-list" will be ignored.
 *
 * For example, the following line in vds.conf:
 *
 *	driver-type-list="foo:disk","bar:volume";
 *
 * defines that "foo" is a disk driver, and driver "bar" is a volume driver.
 *
 * When a list is defined in vds.conf, it is checked before the built-in list
 * (vds_driver_types[]) so that any definition from the built-in list can be
 * overridden using vds.conf.
 */
vd_driver_type_t vds_driver_types[] = {
	{ "dad",	VD_DRIVER_DISK },	/* Solaris */
	{ "did",	VD_DRIVER_DISK },	/* Sun Cluster */
	{ "dlmfdrv",	VD_DRIVER_DISK },	/* Hitachi HDLM */
	{ "emcp",	VD_DRIVER_DISK },	/* EMC Powerpath */
	{ "lofi",	VD_DRIVER_VOLUME },	/* Solaris */
	{ "md",		VD_DRIVER_VOLUME },	/* Solaris - SVM */
	{ "sd",		VD_DRIVER_DISK },	/* Solaris */
	{ "ssd",	VD_DRIVER_DISK },	/* Solaris */
	{ "vdc",	VD_DRIVER_DISK },	/* Solaris */
	{ "vxdmp",	VD_DRIVER_DISK },	/* Veritas */
	{ "vxio",	VD_DRIVER_VOLUME },	/* Veritas - VxVM */
	{ "zfs",	VD_DRIVER_VOLUME }	/* Solaris */
};
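
/*
 * Illustrative sketch only (not part of the driver): one way a single
 * "<driver>:<type>" entry from the "driver-type-list" property could be
 * split and validated. The helper name vds_parse_driver_type() is
 * hypothetical; the actual parsing done by vds may differ.
 *
 *	static int
 *	vds_parse_driver_type(const char *entry, vd_driver_type_t *dtp)
 *	{
 *		const char *sep = strchr(entry, ':');
 *
 *		if (sep == NULL || sep == entry ||
 *		    (sep - entry) >= VD_DRIVER_NAME_LEN)
 *			return (EINVAL);
 *		bcopy(entry, dtp->name, sep - entry);
 *		dtp->name[sep - entry] = '\0';
 *		if (strcmp(sep + 1, "disk") == 0)
 *			dtp->type = VD_DRIVER_DISK;
 *		else if (strcmp(sep + 1, "volume") == 0)
 *			dtp->type = VD_DRIVER_VOLUME;
 *		else
 *			return (EINVAL);
 *		return (0);
 *	}
 */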

/* Return a cpp token as a string */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 */
#define	PRN(...)	_PRN("?%s():  "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
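
/*
 * For example (illustration only), a call such as:
 *
 *	PRN("invalid size %lu", size);
 *
 * made from a function named vd_setup_vd() expands to:
 *
 *	cmn_err(CE_CONT, "?%s():  invalid size %lu%s", __func__, size, "");
 *
 * i.e. the message is prefixed with "vd_setup_vd():  " and, because of the
 * leading '?', is only printed to the console on verbose boots (it always
 * goes to the message log).
 */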

/* Return a pointer to the "i"th vdisk dring element */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/* Return the virtual disk client's type as a string (for use in messages) */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE_V1_0) ? "dring client" :    \
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))

/* Read disk label from a disk image */
#define	VD_DSKIMG_LABEL_READ(vd, labelp) \
	vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Write disk label to a disk image */
#define	VD_DSKIMG_LABEL_WRITE(vd, labelp)	\
	vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \
	    0, sizeof (struct dk_label))

/* Identify if a backend is a disk image */
#define	VD_DSKIMG(vd)	((vd)->vdisk_type == VD_DISK_TYPE_DISK &&	\
	((vd)->file || (vd)->volume))

/* Next index in a write queue */
#define	VD_WRITE_INDEX_NEXT(vd, id)		\
	((((id) + 1) >= vd->dring_len)? 0 : (id) + 1)

/* Message for disk access rights reset failure */
#define	VD_RESET_ACCESS_FAILURE_MSG \
	"Failed to reset disk access rights for disk %s"

/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL,		NULL }
};

#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);

/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL,	VD_ID_PROP },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};

/*
 * Options for the VD_BLOCK_DEVICE_OPTS property.
 */
#define	VD_OPT_RDONLY		0x1	/* read-only  */
#define	VD_OPT_SLICE		0x2	/* single slice */
#define	VD_OPT_EXCLUSIVE	0x4	/* exclusive access */

#define	VD_OPTION_NLEN	128

typedef struct vd_option {
	char vdo_name[VD_OPTION_NLEN];
	uint64_t vdo_value;
} vd_option_t;

vd_option_t vd_bdev_options[] = {
	{ "ro",		VD_OPT_RDONLY },
	{ "slice",	VD_OPT_SLICE },
	{ "excl",	VD_OPT_EXCLUSIVE }
};
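
/*
 * For example (illustration only; the exact property syntax here is an
 * assumption): exporting a backend read-only and as a single slice would
 * correspond to an options string such as "ro,slice" in the
 * vds-block-device-opts property, which maps to
 * (VD_OPT_RDONLY | VD_OPT_SLICE) using the table above.
 */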

/* Debugging macros */
#ifdef DEBUG

static int	vd_msglevel = 0;

#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);

char *
vd_decode_state(int state)
{
	char *str;

#define	CASE_STATE(_s)	case _s: str = #_s; break;

	switch (state) {
	CASE_STATE(VD_STATE_INIT)
	CASE_STATE(VD_STATE_VER)
	CASE_STATE(VD_STATE_ATTR)
	CASE_STATE(VD_STATE_DRING)
	CASE_STATE(VD_STATE_RDX)
	CASE_STATE(VD_STATE_DATA)
	default: str = "unknown"; break;
	}

#undef CASE_STATE

	return (str);
}

void
vd_decode_tag(vio_msg_t *msg)
{
	char *tstr, *sstr, *estr;

#define	CASE_TYPE(_s)	case _s: tstr = #_s; break;

	switch (msg->tag.vio_msgtype) {
	CASE_TYPE(VIO_TYPE_CTRL)
	CASE_TYPE(VIO_TYPE_DATA)
	CASE_TYPE(VIO_TYPE_ERR)
	default: tstr = "unknown"; break;
	}

#undef CASE_TYPE

#define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;

	switch (msg->tag.vio_subtype) {
	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
	default: sstr = "unknown"; break;
	}

#undef CASE_SUBTYPE

#define	CASE_ENV(_s)	case _s: estr = #_s; break;

	switch (msg->tag.vio_subtype_env) {
	CASE_ENV(VIO_VER_INFO)
	CASE_ENV(VIO_ATTR_INFO)
	CASE_ENV(VIO_DRING_REG)
	CASE_ENV(VIO_DRING_UNREG)
	CASE_ENV(VIO_RDX)
	CASE_ENV(VIO_PKT_DATA)
	CASE_ENV(VIO_DESC_DATA)
	CASE_ENV(VIO_DRING_DATA)
	default: estr = "unknown"; break;
	}

#undef CASE_ENV

	PR1("(%x/%x/%x) message : (%s/%s/%s)",
	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
	    msg->tag.vio_subtype_env, tstr, sstr, estr);
}

#else	/* !DEBUG */

#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)

#endif	/* DEBUG */


/*
 * Soft state structure for a vds instance
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations  */
	vd_driver_type_t *driver_types;	/* extra driver types (from vds.conf) */
	int		num_drivers;	/* num of extra driver types */
} vds_t;

/*
 * Types of descriptor-processing tasks
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;

/*
 * Structure describing the task for processing a descriptor
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
	int			status;		/* status of processing task */
	int	(*completef)(struct vd_task *task); /* completion func ptr */
	uint32_t		write_index;	/* index in the write_queue */
} vd_task_t;

/*
 * Soft state structure for a virtual disk instance
 */
typedef struct vd {
	uint64_t		id;		/* vdisk id */
	uint_t			initialized;	/* vdisk initialization flags */
	uint64_t		operations;	/* bitmask of VD_OPs exported */
	vio_ver_t		version;	/* ver negotiated with client */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ddi_taskq_t		*ioq;		/* queue for I/O */
	uint32_t		write_index;	/* next write index */
	buf_t			**write_queue;	/* queue for async writes */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	int			open_flags;	/* open flags */
	uint_t			nslices;	/* number of slices we export */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	size_t			vdisk_bsize;	/* blk size of the vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	vd_media_t		vdisk_media;	/* media type of backing dev. */
	boolean_t		is_atapi_dev;	/* Is this an IDE CD-ROM dev? */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	size_t			backend_bsize;	/* blk size of backend device */
	int			vio_bshift;	/* shift for blk conversion */
	boolean_t		volume;		/* is vDisk backed by volume */
	boolean_t		zvol;		/* is vDisk backed by a zvol */
	boolean_t		file;		/* is vDisk backed by a file? */
	boolean_t		scsi;		/* is vDisk backed by scsi? */
	vnode_t			*file_vnode;	/* file vnode */
	size_t			dskimg_size;	/* size of disk image */
	ddi_devid_t		dskimg_devid;	/* devid for disk image */
	int			efi_reserved;	/* EFI reserved slice */
	caddr_t			flabel;		/* fake label for slice type */
	uint_t			flabel_size;	/* fake label size */
	uint_t			flabel_limit;	/* limit of the fake label */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct extvtoc		vtoc;		/* synthetic for slice type */
	vd_slice_t		slices[VD_MAXPART]; /* logical partitions */
	boolean_t		ownership;	/* disk ownership status */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	uint8_t			dring_mtype;	/* dring mem map type */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;

/*
 * Macros to manipulate the fake label (flabel) for single slice disks.
 *
 * If we fake a VTOC label then the fake label consists of only one block
 * containing the VTOC label (struct dk_label).
 *
 * If we fake an EFI label then the fake label consists of a blank block
 * followed by a GPT (efi_gpt_t) and a GPE (efi_gpe_t).
 */
#define	VD_LABEL_VTOC_SIZE(lba)					\
	P2ROUNDUP(sizeof (struct dk_label), (lba))

#define	VD_LABEL_EFI_SIZE(lba)					\
	P2ROUNDUP(2 * (lba) + sizeof (efi_gpe_t) * VD_MAXPART,	\
	    (lba))

#define	VD_LABEL_VTOC(vd)	\
		((struct dk_label *)(void *)((vd)->flabel))

#define	VD_LABEL_EFI_GPT(vd, lba)	\
		((efi_gpt_t *)(void *)((vd)->flabel + (lba)))
#define	VD_LABEL_EFI_GPE(vd, lba)	\
		((efi_gpe_t *)(void *)((vd)->flabel + 2 * (lba)))

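/*
 * A worked example (for illustration, assuming 512-byte blocks, a 512-byte
 * struct dk_label, a 128-byte efi_gpe_t and VD_MAXPART = 9):
 *
 *	VD_LABEL_VTOC_SIZE(512) = P2ROUNDUP(512, 512) = 512
 *	VD_LABEL_EFI_SIZE(512)  = P2ROUNDUP(2 * 512 + 128 * 9, 512)
 *	                        = P2ROUNDUP(2176, 512) = 2560
 *
 * i.e. the fake EFI label occupies 5 blocks: a blank block, the GPT block
 * and three blocks holding the GPE array.
 */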

typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	int	(*complete)(vd_task_t *task);
} vds_operation_t;

typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	int		(*copyin)(void *vd_buf, size_t, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
	/* write is true if the operation writes any data to the backend */
	boolean_t	write;
} vd_ioctl_t;

/* Define trivial copyin/copyout conversion function flag */
#define	VD_IDENTITY_IN	((int (*)(void *, size_t, void *))-1)
#define	VD_IDENTITY_OUT	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_RETRIES;
static int	vds_ldc_delay = VDS_LDC_DELAY;
static int	vds_dev_retries = VDS_RETRIES;
static int	vds_dev_delay = VDS_DEV_DELAY;
static void	*vds_state;

static short	vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT;
static int	vd_scsi_debug = USCSI_SILENT;

/*
 * Number of threads in the taskq handling vdisk I/O. This can be set up to
 * the size of the DRing, which is the maximum number of I/O requests we can
 * receive in parallel. Note that using a higher number of threads can
 * improve performance but also consumes more resources when there are many
 * vdisks.
 */
static int	vd_ioq_nthreads = VD_IOQ_NTHREADS;

/*
 * Tunable to define the behavior for flushing asynchronous writes used to
 * perform some write I/O requests. The default behavior is to group as
 * many asynchronous writes as possible and to flush them immediately.
 *
 * If the tunable is set to 0 then explicit flushing is disabled. In that
 * case, data will be flushed by traditional mechanisms (such as fsflush),
 * but this might not happen immediately.
 */
static int	vd_awflush = VD_AWFLUSH_IMMEDIATE | VD_AWFLUSH_GROUP;
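
/*
 * For example (illustration only), deferred flushing could be selected by
 * adding the following line to /etc/system in the service domain and
 * rebooting:
 *
 *	set vds:vd_awflush = 0x2
 *
 * where 0x2 is the value of VD_AWFLUSH_DEFER.
 */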

/*
 * Tunable to define the behavior of the service domain if the vdisk server
 * fails to reset disk exclusive access when an LDC channel is reset. When an
 * LDC channel is reset, the vdisk server will try to reset disk exclusive
 * access by releasing any SCSI-2 reservation or resetting the disk. If these
 * actions fail then the default behavior (vd_reset_access_failure = 0) is to
 * print a warning message. This default behavior can be changed by setting
 * the vd_reset_access_failure variable to A_REBOOT (= 0x1), which will cause
 * the service domain to reboot, or to A_DUMP (= 0x5), which will cause the
 * service domain to panic. In both cases, the reset of the service domain
 * should trigger a reset of the SCSI buses and hopefully clear any SCSI-2
 * reservation.
 */
static int	vd_reset_access_failure = 0;

/*
 * Tunable for backward compatibility. When this variable is set to B_TRUE,
 * all disk volumes (ZFS, SVM, VxVM volumes) will be exported as single
 * slice disks whether or not they have the "slice" option set. This is
 * to provide a simple backward compatibility mechanism when upgrading
 * the vds driver and using a domain configuration created before the
 * "slice" option was available.
 */
static boolean_t vd_volume_force_slice = B_FALSE;

/*
 * The label of disk images created with some earlier versions of the virtual
 * disk software is not entirely correct and has an incorrect v_sanity field
 * (usually 0) instead of VTOC_SANE. This creates a compatibility problem with
 * these images because we are now validating that the disk label (and the
 * sanity) is correct when a disk image is opened.
 *
 * This tunable is set to false to not validate the sanity field and ensure
 * compatibility. If the tunable is set to true, we will do strict checking
 * of the sanity field, but this can create compatibility problems with old
 * disk images.
 */
static boolean_t vd_dskimg_validate_sanity = B_FALSE;

/*
 * Enables the use of LDC_DIRECT_MAP when mapping in imported descriptor rings.
 */
static boolean_t vd_direct_mapped_drings = B_TRUE;

/*
 * When a backend is exported as a single-slice disk then we entirely fake
 * its disk label. So it can be exported either with a VTOC label or with
 * an EFI label. If vd_slice_label is set to VD_DISK_LABEL_VTOC then all
 * single-slice disks will be exported with a VTOC label; and if it is set
 * to VD_DISK_LABEL_EFI then all single-slice disks will be exported with
 * an EFI label.
 *
 * If vd_slice_label is set to VD_DISK_LABEL_UNK and the backend is a disk
 * or volume device then it will be exported with the same type of label as
 * defined on the device. Otherwise, if the backend is a file, then it will
 * be exported with the disk label type set in the vd_file_slice_label
 * variable.
 *
 * Note that if the backend size is greater than 1TB then it will always be
 * exported with an EFI label no matter what the setting is.
 */
static vd_disk_label_t vd_slice_label = VD_DISK_LABEL_UNK;

static vd_disk_label_t vd_file_slice_label = VD_DISK_LABEL_VTOC;

/*
 * Tunable for backward compatibility. If this variable is set to B_TRUE then
 * single-slice disks are exported as disks with only one slice instead of
 * faking a complete disk partitioning.
 */
static boolean_t vd_slice_single_slice = B_FALSE;

/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 */
static const vio_ver_t	vds_version[] = {{1, 1}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);

static void vd_free_dring_task(vd_t *vdp);
static int vd_setup_vd(vd_t *vd);
static int vd_setup_single_slice_disk(vd_t *vd);
static int vd_setup_slice_image(vd_t *vd);
static int vd_setup_disk_image(vd_t *vd);
static int vd_backend_check_size(vd_t *vd);
static boolean_t vd_enabled(vd_t *vd);
static ushort_t vd_lbl2cksum(struct dk_label *label);
static int vd_dskimg_validate_geometry(vd_t *vd);
static boolean_t vd_dskimg_is_iso_image(vd_t *vd);
static void vd_set_exported_operations(vd_t *vd);
static void vd_reset_access(vd_t *vd);
static int vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg);
static int vds_efi_alloc_and_read(vd_t *, efi_gpt_t **, efi_gpe_t **);
static void vds_efi_free(vd_t *, efi_gpt_t *, efi_gpe_t *);
static void vds_driver_types_free(vds_t *vds);
static void vd_vtocgeom_to_label(struct extvtoc *vtoc, struct dk_geom *geom,
    struct dk_label *label);
static void vd_label_to_vtocgeom(struct dk_label *label, struct extvtoc *vtoc,
    struct dk_geom *geom);
static boolean_t vd_slice_geom_isvalid(vd_t *vd, struct dk_geom *geom);
static boolean_t vd_slice_vtoc_isvalid(vd_t *vd, struct extvtoc *vtoc);

extern int is_pseudo_device(dev_info_t *);

/*
 * Function:
 *	vd_get_readable_size
 *
 * Description:
 *	Convert a given size in bytes to a human readable format in
 *	kilobytes, megabytes, gigabytes or terabytes.
 *
 * Parameters:
 *	full_size	- the size to convert in bytes.
 *	size		- the converted size.
 *	unit		- the unit of the converted size: 'K' (kilobyte),
 *			  'M' (Megabyte), 'G' (Gigabyte), 'T' (Terabyte).
 *
 * Return Code:
 *	none
 */
static void
vd_get_readable_size(size_t full_size, size_t *size, char *unit)
{
	if (full_size < (1ULL << 20)) {
		*size = full_size >> 10;
		*unit = 'K'; /* Kilobyte */
	} else if (full_size < (1ULL << 30)) {
		*size = full_size >> 20;
		*unit = 'M'; /* Megabyte */
	} else if (full_size < (1ULL << 40)) {
		*size = full_size >> 30;
		*unit = 'G'; /* Gigabyte */
	} else {
		*size = full_size >> 40;
		*unit = 'T'; /* Terabyte */
	}
}
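
/*
 * For example (illustration only):
 *
 *	size_t size;
 *	char unit;
 *
 *	vd_get_readable_size(8 * ONE_GIGABYTE, &size, &unit);
 *
 * returns size = 8 and unit = 'G', while a full_size of 1536 bytes would
 * return size = 1 and unit = 'K' (converted sizes are truncated, not
 * rounded).
 */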

/*
 * Function:
 *	vd_dskimg_io_params
 *
 * Description:
 *	Convert virtual disk I/O parameters (slice, block, length) to
 *	(offset, length) relative to the disk image and according to
 *	the virtual disk partitioning.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	slice		- slice to which the I/O parameters apply.
 *			  VD_SLICE_NONE indicates that the parameters
 *			  are relative to the entire virtual disk.
 *	blkp		- pointer to the starting block relative to the
 *			  slice; return the starting block relative to
 *			  the disk image.
 *	lenp		- pointer to the number of bytes requested; return
 *			  the number of bytes that can effectively be used.
 *
 * Return Code:
 *	0		- I/O parameters have been successfully converted;
 *			  blkp and lenp point to the converted values.
 *	ENODATA		- no data are available for the given I/O parameters;
 *			  this occurs if the starting block is past the limit
 *			  of the slice.
 *	EINVAL		- I/O parameters are invalid.
 */
static int
vd_dskimg_io_params(vd_t *vd, int slice, size_t *blkp, size_t *lenp)
{
	size_t blk = *blkp;
	size_t len = *lenp;
	size_t offset, maxlen;

	ASSERT(vd->file || VD_DSKIMG(vd));
	ASSERT(len > 0);
	ASSERT(vd->vdisk_bsize == DEV_BSIZE);

	/*
	 * If a file is exported as a slice then we don't care about the vtoc.
	 * In that case, the vtoc is a fake mainly to make newfs happy and we
	 * handle any I/O as a raw disk access so that we can have access to
	 * the entire backend.
	 */
	if (vd->vdisk_type == VD_DISK_TYPE_SLICE || slice == VD_SLICE_NONE) {
		/* raw disk access */
		offset = blk * DEV_BSIZE;
		if (offset >= vd->dskimg_size) {
			/* offset past the end of the disk */
			PR0("offset (0x%lx) >= size (0x%lx)",
			    offset, vd->dskimg_size);
			return (ENODATA);
		}
		maxlen = vd->dskimg_size - offset;
	} else {
		ASSERT(slice >= 0 && slice < V_NUMPAR);

		/*
		 * v1.0 vDisk clients depended on the server not verifying
		 * the label of a unformatted disk.  This "feature" is
		 * maintained for backward compatibility but all versions
		 * from v1.1 onwards must do the right thing.
		 */
		if (vd->vdisk_label == VD_DISK_LABEL_UNK &&
		    vio_ver_is_supported(vd->version, 1, 1)) {
			(void) vd_dskimg_validate_geometry(vd);
			if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
				PR0("Unknown disk label, can't do I/O "
				    "from slice %d", slice);
				return (EINVAL);
			}
		}

		if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
			ASSERT(vd->vtoc.v_sectorsz == DEV_BSIZE);
		} else {
			ASSERT(vd->vdisk_label == VD_DISK_LABEL_EFI);
		}

		if (blk >= vd->slices[slice].nblocks) {
			/* address past the end of the slice */
			PR0("req_addr (0x%lx) >= psize (0x%lx)",
			    blk, vd->slices[slice].nblocks);
			return (ENODATA);
		}

		offset = (vd->slices[slice].start + blk) * DEV_BSIZE;
		maxlen = (vd->slices[slice].nblocks - blk) * DEV_BSIZE;
	}

	/*
	 * If the requested size is greater than the size
	 * of the partition, truncate the read/write.
	 */
	if (len > maxlen) {
		PR0("I/O size truncated to %lu bytes from %lu bytes",
		    maxlen, len);
		len = maxlen;
	}

	/*
	 * We have to ensure that we are reading/writing into the mmap
	 * range. If we have a partial disk image (e.g. an image of
	 * s0 instead of s2) the system can try to access slices that
	 * are not included in the disk image.
	 */
	if ((offset + len) > vd->dskimg_size) {
		PR0("offset + nbytes (0x%lx + 0x%lx) > "
		    "dskimg_size (0x%lx)", offset, len, vd->dskimg_size);
		return (EINVAL);
	}

	*blkp = offset / DEV_BSIZE;
	*lenp = len;

	return (0);
}
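
/*
 * A worked example (for illustration, values are hypothetical): with
 * DEV_BSIZE = 512 and a slice starting at block 1000 with 100 blocks, a
 * request for slice block *blkp = 90 with *lenp = 10240 (20 blocks) is
 * converted to disk image block 1090 with the length truncated to 5120
 * bytes (the 10 blocks left in the slice). A request starting at slice
 * block 100 or beyond would return ENODATA.
 */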

/*
 * Function:
 *	vd_dskimg_rw
 *
 * Description:
 *	Read or write to a disk image. It handles the case where the disk
 *	image is a file or a volume exported as a full disk, or a file
 *	exported as a single-slice disk. Reads and writes to volumes exported
 *	as single-slice disks are done directly through the LDI interface.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	slice		- slice on which the operation is performed,
 *			  VD_SLICE_NONE indicates that the operation
 *			  is done using an absolute disk offset.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	n >= 0		- success, n indicates the number of bytes read
 *			  or written.
 *	-1		- error.
 */
static ssize_t
vd_dskimg_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t offset,
    size_t len)
{
	ssize_t resid;
	struct buf buf;
	int status;

	ASSERT(vd->file || VD_DSKIMG(vd));
	ASSERT(len > 0);
	ASSERT(vd->vdisk_bsize == DEV_BSIZE);

	if ((status = vd_dskimg_io_params(vd, slice, &offset, &len)) != 0)
		return ((status == ENODATA)? 0: -1);

	if (vd->volume) {

		bioinit(&buf);
		buf.b_flags	= B_BUSY |
		    ((operation == VD_OP_BREAD)? B_READ : B_WRITE);
		buf.b_bcount	= len;
		buf.b_lblkno	= offset;
		buf.b_edev	= vd->dev[0];
		buf.b_un.b_addr = data;

		/*
		 * We use ldi_strategy() and not ldi_read()/ldi_write() because
		 * the read/write functions of the underlying driver may try to
		 * lock pages of the data buffer, and this requires the data
		 * buffer to be kmem_alloc'ed (and not allocated on the stack).
		 *
		 * Also, using ldi_strategy() ensures that writes are
		 * immediately committed and not cached as may be the case
		 * with ldi_write() (for example with a ZFS volume).
		 */
		if (ldi_strategy(vd->ldi_handle[0], &buf) != 0) {
			biofini(&buf);
			return (-1);
		}

		if (biowait(&buf) != 0) {
			biofini(&buf);
			return (-1);
		}

		resid = buf.b_resid;
		biofini(&buf);

		ASSERT(resid <= len);
		return (len - resid);
	}

	ASSERT(vd->file);

	status = vn_rdwr((operation == VD_OP_BREAD)? UIO_READ : UIO_WRITE,
	    vd->file_vnode, data, len, offset * DEV_BSIZE, UIO_SYSSPACE, FSYNC,
	    RLIM64_INFINITY, kcred, &resid);

	if (status != 0)
		return (-1);

	return (len);
}

/*
 * Function:
 *	vd_build_default_label
 *
 * Description:
 *	Return a default label for a given disk size. This is used when the
 *	disk does not have a valid VTOC so that the user can get a valid
 *	default configuration. The default label has all slice sizes set to 0
 *	(except slice 2 which is the entire disk) to force the user to write
 *	a valid label onto the disk image.
 *
 * Parameters:
 *	disk_size	- the disk size in bytes
 *	bsize		- the disk block size in bytes
 *	label		- the returned default label.
 *
 * Return Code:
 *	none.
 */
static void
vd_build_default_label(size_t disk_size, size_t bsize, struct dk_label *label)
{
	size_t size;
	char unit;

	ASSERT(bsize > 0);

	bzero(label, sizeof (struct dk_label));

	/*
	 * Ideally we would like the cylinder size (nsect * nhead) to be the
	 * same whatever the disk size is. That way the VTOC label could be
	 * easily updated in case the disk size is increased (keeping the
	 * same cylinder size allows preserving the existing partitioning
	 * when updating the VTOC label). But it is not possible to have
	 * one fixed cylinder size that covers all disk sizes.
	 *
	 * So we define different cylinder sizes depending on the disk size.
	 * The cylinder size is chosen so that we don't have too few cylinders
	 * for a small disk image, or so many on a big disk image that you
	 * waste space for backup superblocks or cylinder group structures.
	 * Also we must have a reasonable number of cylinders and sectors so
	 * that newfs can run using default values.
	 *
	 *	+-----------+--------+---------+--------+
	 *	| disk_size |  < 2MB | 2MB-8GB | >= 8GB |
	 *	+-----------+--------+---------+--------+
	 *	| nhead	    |	 1   |	   1   |    96  |
	 *	| nsect	    |  200   |   600   |   768  |
	 *	+-----------+--------+---------+--------+
	 *
	 * Other parameters are computed from these values:
	 *
	 *	pcyl = disk_size / (nhead * nsect * 512)
	 *	acyl = (pcyl > 2)? 2 : 0
	 *	ncyl = pcyl - acyl
	 *
	 * The maximum number of cylinders is 65535, so this allows defining
	 * a geometry for a disk size up to 65535 * 96 * 768 * 512 = 2.25 TB,
	 * which is more than enough to cover the maximum size allowed by the
	 * extended VTOC format (2TB).
	 */
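
	/*
	 * A worked example (for illustration): for a 4 GB disk image with
	 * 512-byte blocks, the code below picks nhead = 1 and nsect = 600,
	 * so one cylinder is 1 * 600 * 512 = 307200 bytes and:
	 *
	 *	pcyl = 4294967296 / 307200 = 13981
	 *	acyl = 2
	 *	ncyl = 13981 - 2 = 13979
	 */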

	if (disk_size >= 8 * ONE_GIGABYTE) {

		label->dkl_nhead = 96;
		label->dkl_nsect = 768;

	} else if (disk_size >= 2 * ONE_MEGABYTE) {

		label->dkl_nhead = 1;
		label->dkl_nsect = 600;

	} else {

		label->dkl_nhead = 1;
		label->dkl_nsect = 200;
	}

	label->dkl_pcyl = disk_size /
	    (label->dkl_nsect * label->dkl_nhead * bsize);

	if (label->dkl_pcyl == 0)
		label->dkl_pcyl = 1;

	label->dkl_acyl = 0;

	if (label->dkl_pcyl > 2)
		label->dkl_acyl = 2;

	label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
	label->dkl_write_reinstruct = 0;
	label->dkl_read_reinstruct = 0;
	label->dkl_rpm = 7200;
	label->dkl_apc = 0;
	label->dkl_intrlv = 0;

	PR0("requested disk size: %ld bytes\n", disk_size);
	PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
	    label->dkl_nhead, label->dkl_nsect);
	PR0("provided disk size: %ld bytes\n", (uint64_t)
	    (label->dkl_pcyl * label->dkl_nhead *
	    label->dkl_nsect * bsize));

	vd_get_readable_size(disk_size, &size, &unit);

	/*
	 * We must have a correct label name otherwise format(1m) will
	 * not recognize the disk as labeled.
	 */
	(void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
	    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
	    size, unit,
	    label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
	    label->dkl_nsect);

	/* default VTOC */
	label->dkl_vtoc.v_version = V_EXTVERSION;
	label->dkl_vtoc.v_nparts = V_NUMPAR;
	label->dkl_vtoc.v_sanity = VTOC_SANE;
	label->dkl_vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_tag = V_BACKUP;
	label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_cylno = 0;
	label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_nblk = label->dkl_ncyl *
	    label->dkl_nhead * label->dkl_nsect;
	label->dkl_magic = DKL_MAGIC;
	label->dkl_cksum = vd_lbl2cksum(label);
}

/*
 * Function:
 *	vd_dskimg_set_vtoc
 *
 * Description:
 *	Set the vtoc of a disk image by writing the label and backup
 *	labels into the disk image backend.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	label		- the data to be written.
 *
 * Return Code:
 *	0		- success.
 *	n > 0		- error, n indicates the errno code.
 */
static int
vd_dskimg_set_vtoc(vd_t *vd, struct dk_label *label)
{
	size_t blk, sec, cyl, head, cnt;

	ASSERT(VD_DSKIMG(vd));

	if (VD_DSKIMG_LABEL_WRITE(vd, label) < 0) {
		PR0("failed to write disk label");
		return (EIO);
	}

	/*
	 * Backup labels are on the last alternate cylinder's
	 * first five odd sectors.
	 */
	if (label->dkl_acyl == 0) {
		PR0("no alternate cylinder, cannot store backup labels");
		return (0);
	}

	cyl = label->dkl_ncyl  + label->dkl_acyl - 1;
	head = label->dkl_nhead - 1;

	blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) - label->dkl_apc)) +
	    (head * label->dkl_nsect);
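
	/*
	 * A worked example (for illustration): with ncyl = 13979, acyl = 2,
	 * nhead = 1, nsect = 600 and apc = 0, the last alternate cylinder
	 * gives cyl = 13980 and head = 0, so blk = 13980 * 600 = 8388000;
	 * the loop below then writes the backup labels at blocks 8388001,
	 * 8388003, 8388005, 8388007 and 8388009 (the first five odd
	 * sectors).
	 */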

	/*
	 * Write the backup labels. Make sure we don't try to write past
	 * the last cylinder.
	 */
	sec = 1;

	for (cnt = 0; cnt < VD_DSKIMG_NUM_BACKUP; cnt++) {

		if (sec >= label->dkl_nsect) {
			PR0("not enough sectors to store all backup labels");
			return (0);
		}

		if (vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
		    (caddr_t)label, blk + sec, sizeof (struct dk_label)) < 0) {
			PR0("error writing backup label at block %lu\n",
			    blk + sec);
			return (EIO);
		}

		PR1("wrote backup label at block %lu\n", blk + sec);

		sec += 2;
	}

	return (0);
}

/*
 * Function:
 *	vd_dskimg_get_devid_block
 *
 * Description:
 *	Return the block number where the device id is stored.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	blkp		- pointer to the block number
 *
 * Return Code:
 *	0		- success
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_dskimg_get_devid_block(vd_t *vd, size_t *blkp)
{
	diskaddr_t spc, head, cyl;

	ASSERT(VD_DSKIMG(vd));

	if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
		/*
		 * If no label is defined we don't know where to find
		 * a device id.
		 */
		return (ENOSPC);
	}

	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
		/*
		 * For an EFI disk, the devid is at the beginning of
		 * the reserved slice
		 */
		if (vd->efi_reserved == -1) {
			PR0("EFI disk has no reserved slice");
			return (ENOSPC);
		}

		*blkp = vd->slices[vd->efi_reserved].start;
		return (0);
	}

	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);

	/* this geometry doesn't allow us to have a devid */
	if (vd->dk_geom.dkg_acyl < 2) {
		PR0("not enough alternate cylinders available for devid "
		    "(acyl=%u)", vd->dk_geom.dkg_acyl);
		return (ENOSPC);
	}

	/* the devid is on the track next to the last cylinder */
	cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2;
	spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
	head = vd->dk_geom.dkg_nhead - 1;

	*blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) +
	    (head * vd->dk_geom.dkg_nsect) + 1;

	return (0);
}

/*
 * Return the checksum of a disk block containing an on-disk devid.
 */
static uint_t
vd_dkdevid2cksum(struct dk_devid *dkdevid)
{
	uint_t chksum, *ip;
	int i;

	chksum = 0;
	ip = (void *)dkdevid;
	for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++)
		chksum ^= ip[i];

	return (chksum);
}

/*
 * Function:
 *	vd_dskimg_read_devid
 *
 * Description:
 *	Read the device id stored on a disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the return address of the device ID.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	EINVAL		- no valid device id was found
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_dskimg_read_devid(vd_t *vd, ddi_devid_t *devid)
{
	struct dk_devid *dkdevid;
	size_t blk;
	uint_t chksum;
	int status, sz;

	ASSERT(vd->vdisk_bsize == DEV_BSIZE);

	if ((status = vd_dskimg_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* get the devid */
	if ((vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk,
	    DEV_BSIZE)) < 0) {
		PR0("error reading devid block at %lu", blk);
		status = EIO;
		goto done;
	}

	/* validate the revision */
	if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) ||
	    (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) {
		PR0("invalid devid found at block %lu (bad revision)", blk);
		status = EINVAL;
		goto done;
	}

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* compare the checksums */
	if (DKD_GETCHKSUM(dkdevid) != chksum) {
		PR0("invalid devid found at block %lu (bad checksum)", blk);
		status = EINVAL;
		goto done;
	}

	/* validate the device id */
	if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) {
		PR0("invalid devid found at block %lu", blk);
		status = EINVAL;
		goto done;
	}

	PR1("devid read at block %lu", blk);

	sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid);
	*devid = kmem_alloc(sz, KM_SLEEP);
	bcopy(&dkdevid->dkd_devid, *devid, sz);

done:
	kmem_free(dkdevid, DEV_BSIZE);
	return (status);
}

/*
 * Function:
 *	vd_dskimg_write_devid
 *
 * Description:
 *	Write a device id into disk image.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	devid		- the device ID to store.
 *
 * Return Code:
 *	0		- success
 *	EIO		- I/O error while trying to access the disk image
 *	ENOSPC		- disk has no space to store a device id
 */
static int
vd_dskimg_write_devid(vd_t *vd, ddi_devid_t devid)
{
	struct dk_devid *dkdevid;
	uint_t chksum;
	size_t blk;
	int status;

	ASSERT(vd->vdisk_bsize == DEV_BSIZE);

	if (devid == NULL) {
		/* nothing to write */
		return (0);
	}

	if ((status = vd_dskimg_get_devid_block(vd, &blk)) != 0)
		return (status);

	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);

	/* set revision */
	dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB;
	dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB;

	/* copy devid */
	bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid));

	/* compute checksum */
	chksum = vd_dkdevid2cksum(dkdevid);

	/* set checksum */
	DKD_FORMCHKSUM(chksum, dkdevid);

	/* store the devid */
	if ((status = vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
	    (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) {
		PR0("Error writing devid block at %lu", blk);
		status = EIO;
	} else {
		PR1("devid written at block %lu", blk);
		status = 0;
	}

	kmem_free(dkdevid, DEV_BSIZE);
	return (status);
}

/*
 * Function:
 *	vd_do_scsi_rdwr
 *
 * Description:
 *	Read or write to a SCSI disk using an absolute disk offset.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_do_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len)
{
	struct uscsi_cmd ucmd;
	union scsi_cdb cdb;
	int nsectors, nblk;
	int max_sectors;
	int status, rval;

	ASSERT(!vd->file);
	ASSERT(!vd->volume);
	ASSERT(vd->vdisk_bsize > 0);

	max_sectors = vd->max_xfer_sz;
	nblk = (len / vd->vdisk_bsize);

	if (len % vd->vdisk_bsize != 0)
		return (EINVAL);

	/*
	 * Build and execute the uscsi ioctl.  We build a group0, group1
	 * or group4 command as necessary, since some targets
	 * do not support group1 commands.
	 */
	while (nblk) {

		bzero(&ucmd, sizeof (ucmd));
		bzero(&cdb, sizeof (cdb));

		nsectors = (max_sectors < nblk) ? max_sectors : nblk;

		/*
		 * Some of the optical drives on sun4v machines are ATAPI
		 * devices which use Group 1 Read/Write commands so we need
		 * to explicitly check a flag which is set when a domain
		 * is bound.
		 */
		if (blk < (2 << 20) && nsectors <= 0xff && !vd->is_atapi_dev) {
			FORMG0ADDR(&cdb, blk);
			FORMG0COUNT(&cdb, (uchar_t)nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP0;
		} else if (blk > 0xffffffff) {
			FORMG4LONGADDR(&cdb, blk);
			FORMG4COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP4;
			cdb.scc_cmd |= SCMD_GROUP4;
		} else {
			FORMG1ADDR(&cdb, blk);
			FORMG1COUNT(&cdb, nsectors);
			ucmd.uscsi_cdblen = CDB_GROUP1;
			cdb.scc_cmd |= SCMD_GROUP1;
		}
		ucmd.uscsi_cdb = (caddr_t)&cdb;
		ucmd.uscsi_bufaddr = data;
		ucmd.uscsi_buflen = nsectors * vd->backend_bsize;
		ucmd.uscsi_timeout = vd_scsi_rdwr_timeout;
		/*
		 * Set flags so that the command is isolated from normal
		 * commands and no error message is printed.
		 */
		ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT;

		if (operation == VD_OP_BREAD) {
			cdb.scc_cmd |= SCMD_READ;
			ucmd.uscsi_flags |= USCSI_READ;
		} else {
			cdb.scc_cmd |= SCMD_WRITE;
		}

		status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE],
		    USCSICMD, (intptr_t)&ucmd, (vd->open_flags | FKIOCTL),
		    kcred, &rval);

		if (status == 0)
			status = ucmd.uscsi_status;

		if (status != 0)
			break;

		/*
		 * Check if partial DMA breakup is required. If so, reduce
		 * the request size by half and retry the last request.
		 */
		if (ucmd.uscsi_resid == ucmd.uscsi_buflen) {
			max_sectors >>= 1;
			if (max_sectors <= 0) {
				status = EIO;
				break;
			}
			continue;
		}

		if (ucmd.uscsi_resid != 0) {
			status = EIO;
			break;
		}

		blk += nsectors;
		nblk -= nsectors;
		data += nsectors * vd->vdisk_bsize;
	}

	return (status);
}

/*
 * Function:
 *	vd_scsi_rdwr
 *
 * Description:
 *	Wrapper function to read or write to a SCSI disk using an absolute
 *	disk offset. It checks the blocksize of the underlying device and,
 *	if necessary, adjusts the buffers accordingly before calling
 *	vd_do_scsi_rdwr() to do the actual read or write.
 *
 * Parameters:
 *	vd		- disk on which the operation is performed.
 *	operation	- operation to execute: read (VD_OP_BREAD) or
 *			  write (VD_OP_BWRITE).
 *	data		- buffer where data are read to or written from.
 *	blk		- starting block for the operation.
 *	len		- number of bytes to read or write.
 *
 * Return Code:
 *	0		- success
 *	n != 0		- error.
 */
static int
vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t vblk, size_t vlen)
{
	int	rv;

	size_t	pblk;	/* physical device block number of data on device */
	size_t	delta;	/* relative offset between pblk and vblk */
	size_t	pnblk;	/* number of physical blocks to be read from device */
	size_t	plen;	/* length of data to be read from physical device */
	char	*buf;	/* buffer area to fit physical device's block size */

	if (vd->backend_bsize == 0) {
		/*
		 * The block size was not available during the attach,
		 * try to update it now.
		 */
		if (vd_backend_check_size(vd) != 0)
			return (EIO);
	}

	/*
	 * If the vdisk block size and the block size of the underlying device
	 * match we can skip straight to vd_do_scsi_rdwr(), otherwise we need
	 * to create a buffer large enough to handle the device's block size
	 * and adjust the block to be read from and the amount of data to
	 * read to correspond with the device's block size.
	 */
	if (vd->vdisk_bsize == vd->backend_bsize)
		return (vd_do_scsi_rdwr(vd, operation, data, vblk, vlen));

	if (vd->vdisk_bsize > vd->backend_bsize)
		return (EINVAL);

	/*
	 * Writing of physical block sizes larger than the virtual block size
	 * is not supported. This would be added if/when support for guests
	 * writing to DVDs is implemented.
	 */
	if (operation == VD_OP_BWRITE)
		return (ENOTSUP);

	/* BEGIN CSTYLED */
	/*
	 * Below is a diagram showing the relationship between the physical
	 * and virtual blocks. If the virtual blocks marked by 'X' below are
	 * requested, then the physical blocks denoted by 'Y' are read.
	 *
	 *           vblk
	 *             |      vlen
	 *             |<--------------->|
	 *             v                 v
	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-   virtual disk:
	 *    |  |  |  |XX|XX|XX|XX|XX|XX|  |  |  |  |  |  } block size is
	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-   vd->vdisk_bsize
	 *          :  :                 :  :
	 *         >:==:< delta          :  :
	 *          :  :                 :  :
	 *  --+-----+-----+-----+-----+-----+-----+-----+--   physical disk:
	 *    |     |YY:YY|YYYYY|YYYYY|YY:YY|     |     |   } block size is
	 *  --+-----+-----+-----+-----+-----+-----+-----+--   vd->backend_bsize
	 *          ^                       ^
	 *          |<--------------------->|
	 *          |         plen
	 *	   pblk
	 */
	/* END CSTYLED */
	pblk = (vblk * vd->vdisk_bsize) / vd->backend_bsize;
	delta = (vblk * vd->vdisk_bsize) - (pblk * vd->backend_bsize);
	pnblk = ((delta + vlen - 1) / vd->backend_bsize) + 1;
	plen = pnblk * vd->backend_bsize;
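
	/*
	 * A worked example (for illustration): reading vlen = 1536 bytes at
	 * vblk = 5 from a vdisk with vdisk_bsize = 512 backed by a device
	 * with backend_bsize = 2048 (e.g. a DVD) gives:
	 *
	 *	pblk  = (5 * 512) / 2048 = 1
	 *	delta = (5 * 512) - (1 * 2048) = 512
	 *	pnblk = ((512 + 1536 - 1) / 2048) + 1 = 1
	 *	plen  = 2048
	 *
	 * so a single 2048-byte physical block is read and the 1536 bytes
	 * at offset delta within it are copied back to the caller's buffer.
	 */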

	PR2("vblk %lx:pblk %lx: vlen %ld:plen %ld", vblk, pblk, vlen, plen);

	buf = kmem_zalloc(sizeof (caddr_t) * plen, KM_SLEEP);
	rv = vd_do_scsi_rdwr(vd, operation, (caddr_t)buf, pblk, plen);
	bcopy(buf + delta, data, vlen);

	kmem_free(buf, sizeof (caddr_t) * plen);

	return (rv);
}

/*
 * Function:
 *	vd_slice_flabel_read
 *
 * Description:
 *	This function simulates a read operation from the fake label of
 *	a single-slice disk.
 *
 * Parameters:
 *	vd		- single-slice disk to read from
 *	data		- buffer where data should be read to
 *	offset		- byte offset where the read should start
 *	length		- number of bytes to read
 *
 * Return Code:
 *	n >= 0		- success, n indicates the number of bytes read
 *	-1		- error
 */
static ssize_t
vd_slice_flabel_read(vd_t *vd, caddr_t data, size_t offset, size_t length)
{
	size_t n = 0;
	uint_t limit = vd->flabel_limit * vd->vdisk_bsize;

	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
	ASSERT(vd->flabel != NULL);

	/* if offset is past the fake label limit there's nothing to read */
	if (offset >= limit)
		return (0);

	/* data with offset 0 to flabel_size are read from flabel */
	if (offset < vd->flabel_size) {

		if (offset + length <= vd->flabel_size) {
			bcopy(vd->flabel + offset, data, length);
			return (length);
		}

		n = vd->flabel_size - offset;
		bcopy(vd->flabel + offset, data, n);
		data += n;
	}

	/* data with offset from flabel_size to flabel_limit are all zeros */
	if (offset + length <= limit) {
		bzero(data, length - n);
		return (length);
	}

	bzero(data, limit - offset - n);
	return (limit - offset);
}
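
/*
 * A worked example (for illustration, values are hypothetical): assume
 * vdisk_bsize = 512, flabel_size = 512 and flabel_limit = 16 blocks, so
 * limit = 8192 bytes. A read of 1024 bytes at offset 0 returns the 512
 * fake label bytes followed by 512 zeros. A read of 1024 bytes at offset
 * 7680 hits the limit and returns 512 zeroed bytes. A read at offset 8192
 * or beyond returns 0.
 */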

/*
 * Function:
 *	vd_slice_flabel_write
 *
 * Description:
 *	This function simulates a write operation to the fake label of
 *	a single-slice disk. Write operations are actually faked and return
 *	success although the label is never changed. This is mostly to
 *	simulate a successful label update.
 *
 * Parameters:
 *	vd		- single-slice disk to write to
 *	data		- buffer where data should be written from
 *	offset		- byte offset where the write should start
 *	length		- number of bytes to be written
 *
 * Return Code:
 *	n >= 0		- success, n indicates the number of bytes written
 *	-1		- error
 */
static ssize_t
vd_slice_flabel_write(vd_t *vd, caddr_t data, size_t offset, size_t length)
{
	uint_t limit = vd->flabel_limit * vd->vdisk_bsize;
	struct dk_label *label;
	struct dk_geom geom;
	struct extvtoc vtoc;

	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
	ASSERT(vd->flabel != NULL);

	if (offset >= limit)
		return (0);

	/*
	 * If this is a request to overwrite the VTOC disk label, check that
	 * the new label is similar to the previous one and return that the
	 * write was successful, but note that nothing is actually overwritten.
	 */
	if (vd->vdisk_label == VD_DISK_LABEL_VTOC &&
	    offset == 0 && length == vd->vdisk_bsize) {
		label = (void *)data;

		/* check that this is a valid label */
		if (label->dkl_magic != DKL_MAGIC ||
		    label->dkl_cksum != vd_lbl2cksum(label))
			return (-1);

		/* check the vtoc and geometry */
		vd_label_to_vtocgeom(label, &vtoc, &geom);
		if (vd_slice_geom_isvalid(vd, &geom) &&
		    vd_slice_vtoc_isvalid(vd, &vtoc))
			return (length);
	}

	/* fail any other write */
	return (-1);
}
1696
1697/*
1698 * Function:
1699 *	vd_slice_fake_rdwr
1700 *
1701 * Description:
1702 *	This function simulates a raw read or write operation to a single-slice
1703 *	disk. It only handles the faked part of the operation i.e. I/Os to
1704 *	blocks which have no mapping with the vdisk backend (I/Os to the
1705 *	beginning and to the end of the vdisk).
1706 *
1707 *	The function returns 0 is the operation	is completed and it has been
1708 *	entirely handled as a fake read or write. In that case, lengthp points
1709 *	to the number of bytes not read or written. Values returned by datap
1710 *	and blkp are undefined.
1711 *
1712 *	If the fake operation has succeeded but the read or write is not
1713 *	complete (i.e. the read/write operation extends beyond the blocks
1714 *	we fake) then the function returns EAGAIN and datap, blkp and lengthp
1715 *	pointers point to the parameters for completing the operation.
1716 *
1717 *	In case of an error, for example if the slice is empty or parameters
1718 *	are invalid, then the function returns a non-zero value different
1719 *	from EAGAIN. In that case, the returned values of datap, blkp and
1720 *	lengthp are undefined.
1721 *
1722 * Parameters:
1723 *	vd		- single-slice disk on which the operation is performed
1724 *	slice		- slice on which the operation is performed,
1725 *			  VD_SLICE_NONE indicates that the operation
1726 *			  is done using an absolute disk offset.
1727 *	operation	- operation to execute: read (VD_OP_BREAD) or
1728 *			  write (VD_OP_BWRITE).
1729 *	datap		- pointer to the buffer where data are read to
1730 *			  or written from. Return the pointer where remaining
1731 *			  data have to be read to or written from.
1732 *	blkp		- pointer to the starting block for the operation.
1733 *			  Return the starting block relative to the vdisk
1734 *			  backend for the remaining operation.
1735 *	lengthp		- pointer to the number of bytes to read or write.
1736 *			  This should be a multiple of vdisk_bsize. Return the
1737 *			  remaining number of bytes to read or write.
1738 *
1739 * Return Code:
1740 *	0		- read/write operation is completed
1741 *	EAGAIN		- read/write operation is not completed
1742 *	other values	- error
1743 */
1744static int
1745vd_slice_fake_rdwr(vd_t *vd, int slice, int operation, caddr_t *datap,
1746    size_t *blkp, size_t *lengthp)
1747{
1748	struct dk_label *label;
1749	caddr_t data;
1750	size_t blk, length, csize;
1751	size_t ablk, asize, aoff, alen;
1752	ssize_t n;
1753	int sec, status;
1754	size_t bsize = vd->vdisk_bsize;
1755
1756	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
1757	ASSERT(slice != 0);
1758
1759	data = *datap;
1760	blk = *blkp;
1761	length = *lengthp;
1762
1763	/*
1764	 * If this is not a raw I/O or an I/O from a full disk slice then
1765	 * this is an I/O to/from an empty slice.
1766	 */
1767	if (slice != VD_SLICE_NONE &&
1768	    (slice != VD_ENTIRE_DISK_SLICE ||
1769	    vd->vdisk_label != VD_DISK_LABEL_VTOC) &&
1770	    (slice != VD_EFI_WD_SLICE ||
1771	    vd->vdisk_label != VD_DISK_LABEL_EFI)) {
1772		return (EIO);
1773	}
1774
1775	if (length % bsize != 0)
1776		return (EINVAL);
1777
1778	/* handle any I/O with the fake label */
1779	if (operation == VD_OP_BWRITE)
1780		n = vd_slice_flabel_write(vd, data, blk * bsize, length);
1781	else
1782		n = vd_slice_flabel_read(vd, data, blk * bsize, length);
1783
1784	if (n == -1)
1785		return (EINVAL);
1786
1787	ASSERT(n % bsize == 0);
1788
1789	/* adjust I/O arguments */
1790	data += n;
1791	blk += n / bsize;
1792	length -= n;
1793
1794	/* check if there's something else to process */
1795	if (length == 0) {
1796		status = 0;
1797		goto done;
1798	}
1799
1800	if (vd->vdisk_label == VD_DISK_LABEL_VTOC &&
1801	    slice == VD_ENTIRE_DISK_SLICE) {
1802		status = EAGAIN;
1803		goto done;
1804	}
1805
1806	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
1807		asize = EFI_MIN_RESV_SIZE + (EFI_MIN_ARRAY_SIZE / bsize) + 1;
1808		ablk = vd->vdisk_size - asize;
1809	} else {
1810		ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
1811		ASSERT(vd->dk_geom.dkg_apc == 0);
1812
1813		csize = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
1814		ablk = vd->dk_geom.dkg_ncyl * csize;
1815		asize = vd->dk_geom.dkg_acyl * csize;
1816	}
1817
1818	alen = length / bsize;
1819	aoff = blk;
1820
1821	/* if we have reached the last block then the I/O is completed */
1822	if (aoff == ablk + asize) {
1823		status = 0;
1824		goto done;
1825	}
1826
1827	/* if we are past the last block then return an error */
1828	if (aoff > ablk + asize)
1829		return (EIO);
1830
1831	/* check if there is any I/O to the end of the disk */
1832	if (aoff + alen < ablk) {
1833		status = EAGAIN;
1834		goto done;
1835	}
1836
1837	/* we don't allow any write to the end of the disk */
1838	if (operation == VD_OP_BWRITE)
1839		return (EIO);
1840
1841	if (aoff < ablk) {
1842		alen -= (ablk - aoff);
1843		aoff = ablk;
1844	}
1845
1846	if (aoff + alen > ablk + asize) {
1847		alen = ablk + asize - aoff;
1848	}
1849
1850	alen *= bsize;
1851
1852	if (operation == VD_OP_BREAD) {
1853		bzero(data + (aoff - blk) * bsize, alen);
1854
1855		if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
1856			/* check if we read backup labels */
1857			label = VD_LABEL_VTOC(vd);
1858			ablk += (label->dkl_acyl - 1) * csize +
1859			    (label->dkl_nhead - 1) * label->dkl_nsect;
1860
1861			for (sec = 1; (sec < 5 * 2 + 1); sec += 2) {
1862
1863				if (ablk + sec >= blk &&
1864				    ablk + sec < blk + (length / bsize)) {
1865					bcopy(label, data +
1866					    (ablk + sec - blk) * bsize,
1867					    sizeof (struct dk_label));
1868				}
1869			}
1870		}
1871	}
1872
1873	length -= alen;
1874
1875	status = (length == 0)? 0: EAGAIN;
1876
1877done:
1878	ASSERT(length == 0 || blk >= vd->flabel_limit);
1879
1880	/*
1881	 * Return the parameters for the remaining I/O. The starting block is
1882	 * adjusted so that it is relative to the vdisk backend.
1883	 */
1884	*datap = data;
1885	*blkp = blk - vd->flabel_limit;
1886	*lengthp = length;
1887
1888	return (status);
1889}
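
/*
 * A sketch of the VTOC backup label placement handled above, with
 * illustrative numbers: assuming 2 alternate cylinders, 96 heads and
 * 768 sectors per track (csize = 96 * 768 = 73728 blocks), the
 * alternate cylinders start at
 *
 *	ablk = dkg_ncyl * csize
 *
 * and the backup labels live in the last track of the last alternate
 * cylinder, i.e. starting at
 *
 *	ablk + (dkl_acyl - 1) * csize + (dkl_nhead - 1) * dkl_nsect
 *
 * in the odd sectors 1, 3, 5, 7 and 9 of that track (5 copies, see
 * VD_DSKIMG_NUM_BACKUP).
 */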
1890
1891static int
1892vd_flush_write(vd_t *vd)
1893{
1894	int status, rval;
1895
1896	if (vd->file) {
1897		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL);
1898	} else {
1899		status = ldi_ioctl(vd->ldi_handle[0], DKIOCFLUSHWRITECACHE,
1900		    (intptr_t)NULL, vd->open_flags | FKIOCTL, kcred, &rval);
1901	}
1902
1903	return (status);
1904}
1905
1906static void
1907vd_bio_task(void *arg)
1908{
1909	struct buf *buf = (struct buf *)arg;
1910	vd_task_t *task = (vd_task_t *)buf->b_private;
1911	vd_t *vd = task->vd;
1912	ssize_t resid;
1913	int status;
1914
1915	ASSERT(vd->vdisk_bsize == DEV_BSIZE);
1916
1917	if (vd->zvol) {
1918
1919		status = ldi_strategy(vd->ldi_handle[0], buf);
1920
1921	} else {
1922
1923		ASSERT(vd->file);
1924
1925		status = vn_rdwr((buf->b_flags & B_READ)? UIO_READ : UIO_WRITE,
1926		    vd->file_vnode, buf->b_un.b_addr, buf->b_bcount,
1927		    buf->b_lblkno * DEV_BSIZE, UIO_SYSSPACE, 0,
1928		    RLIM64_INFINITY, kcred, &resid);
1929
1930		if (status == 0) {
1931			buf->b_resid = resid;
1932			biodone(buf);
1933			return;
1934		}
1935	}
1936
1937	if (status != 0) {
1938		bioerror(buf, status);
1939		biodone(buf);
1940	}
1941}
1942
1943/*
1944 * We define our own biodone function so that buffers used for
1945 * asynchronous writes are not released when biodone() is called.
1946 */
1947static int
1948vd_biodone(struct buf *bp)
1949{
1950	ASSERT((bp->b_flags & B_DONE) == 0);
1951	ASSERT(SEMA_HELD(&bp->b_sem));
1952
1953	bp->b_flags |= B_DONE;
1954	sema_v(&bp->b_io);
1955
1956	return (0);
1957}
1958
1959/*
1960 * Return Values
1961 *	EINPROGRESS	- operation was successfully started
1962 *	EIO		- encountered LDC / task error
1963 *	0		- operation completed successfully
1964 *
1965 * Side Effect
1966 *     sets request->status = <disk operation status>
1967 */
1968static int
1969vd_start_bio(vd_task_t *task)
1970{
1971	int			rv, status = 0;
1972	vd_t			*vd		= task->vd;
1973	vd_dring_payload_t	*request	= task->request;
1974	struct buf		*buf		= &task->buf;
1975	uint8_t			mtype;
1976	int			slice;
1977	char			*bufaddr = 0;
1978	size_t			buflen;
1979	size_t			offset, length, nbytes;
1980
1981	ASSERT(vd != NULL);
1982	ASSERT(request != NULL);
1983
1984	slice = request->slice;
1985
1986	ASSERT(slice == VD_SLICE_NONE || slice < vd->nslices);
1987	ASSERT((request->operation == VD_OP_BREAD) ||
1988	    (request->operation == VD_OP_BWRITE));
1989
1990	if (request->nbytes == 0) {
1991		/* no service for trivial requests */
1992		request->status = EINVAL;
1993		return (0);
1994	}
1995
1996	PR1("%s %lu bytes at block %lu",
1997	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
1998	    request->nbytes, request->addr);
1999
2000	/*
2001	 * We have to check the open flags because the functions processing
2002	 * the read/write request will not do it.
2003	 */
2004	if (request->operation == VD_OP_BWRITE && !(vd->open_flags & FWRITE)) {
2005		PR0("write fails because backend is opened read-only");
2006		request->nbytes = 0;
2007		request->status = EROFS;
2008		return (0);
2009	}
2010
2011	mtype = LDC_SHADOW_MAP;
2012
2013	/* Map memory exported by client */
2014	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
2015	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
2016	    &bufaddr, NULL);
2017	if (status != 0) {
2018		PR0("ldc_mem_map() returned err %d ", status);
2019		return (EIO);
2020	}
2021
2022	/*
2023	 * The buffer size has to be 8-byte aligned, so the client should have
2024	 * sent a buffer whose size is rounded up to the next 8-byte multiple.
2025	 */
2026	buflen = P2ROUNDUP(request->nbytes, 8);
2027
2028	status = ldc_mem_acquire(task->mhdl, 0, buflen);
2029	if (status != 0) {
2030		(void) ldc_mem_unmap(task->mhdl);
2031		PR0("ldc_mem_acquire() returned err %d ", status);
2032		return (EIO);
2033	}
2034
2035	offset = request->addr;
2036	nbytes = request->nbytes;
2037	length = nbytes;
2038
2039	/* default number of bytes returned by the I/O */
2040	request->nbytes = 0;
2041
2042	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
2043
2044		if (slice != 0) {
2045			/* handle any fake I/O */
2046			rv = vd_slice_fake_rdwr(vd, slice, request->operation,
2047			    &bufaddr, &offset, &length);
2048
2049			/* record the number of bytes from the fake I/O */
2050			request->nbytes = nbytes - length;
2051
2052			if (rv == 0) {
2053				request->status = 0;
2054				goto io_done;
2055			}
2056
2057			if (rv != EAGAIN) {
2058				request->nbytes = 0;
2059				request->status = EIO;
2060				goto io_done;
2061			}
2062
2063			/*
2064			 * vd_slice_fake_rdwr() returned EAGAIN, so there is
2065			 * still data to read or write.
2066			 */
2067			ASSERT(length != 0);
2068
2069			/*
2070			 * We need to continue the I/O from the slice backend to
2071			 * complete the request. The variables bufaddr, offset
2072			 * and length have been adjusted to have the right
2073			 * information to do the remaining I/O from the backend.
2074			 * The backend is entirely mapped to slice 0 so we just
2075			 * have to complete the I/O from that slice.
2076			 */
2077			slice = 0;
2078		}
2079
2080	} else if (vd->volume || vd->file) {
2081
2082		rv = vd_dskimg_io_params(vd, slice, &offset, &length);
2083		if (rv != 0) {
2084			request->status = (rv == ENODATA)? 0: EIO;
2085			goto io_done;
2086		}
2087		slice = 0;
2088
2089	} else if (slice == VD_SLICE_NONE) {
2090
2091		/*
2092		 * This is not a disk image so it is a real disk. We
2093		 * assume that the underlying device driver supports
2094		 * USCSICMD ioctls. This is the case for all SCSI devices
2095		 * (sd, ssd, ...).
2096		 *
2097		 * In the future if we have non-SCSI disks we would need
2098		 * to invoke the appropriate function to do I/O using an
2099		 * absolute disk offset (for example using DIOCTL_RWCMD
2100		 * for IDE disks).
2101		 */
2102		rv = vd_scsi_rdwr(vd, request->operation, bufaddr, offset,
2103		    length);
2104		if (rv != 0) {
2105			request->status = EIO;
2106		} else {
2107			request->nbytes = length;
2108			request->status = 0;
2109		}
2110		goto io_done;
2111	}
2112
2113	/* Start the block I/O */
2114	bioinit(buf);
2115	buf->b_flags	= B_BUSY;
2116	buf->b_bcount	= length;
2117	buf->b_lblkno	= offset;
2118	buf->b_bufsize	= buflen;
2119	buf->b_edev	= vd->dev[slice];
2120	buf->b_un.b_addr = bufaddr;
2121	buf->b_iodone	= vd_biodone;
2122
2123	if (vd->file || vd->zvol) {
2124		/*
2125		 * I/Os to a file are dispatched to an I/O queue, so that several
2126		 * I/Os can be processed in parallel. We also do that for ZFS
2127		 * volumes because the ZFS volume strategy() function will only
2128		 * return after the I/O is completed (instead of just starting
2129		 * the I/O).
2130		 */
2131
2132		if (request->operation == VD_OP_BREAD) {
2133			buf->b_flags |= B_READ;
2134		} else {
2135			/*
2136			 * For ZFS volumes and files, we do an asynchronous
2137			 * write and we will wait for the completion of the
2138			 * write in vd_complete_bio() by flushing the volume
2139			 * or file.
2140			 *
2141			 * This is done for performance reasons, so that we can
2142			 * group together several write requests into a single
2143			 * flush operation.
2144			 */
2145			buf->b_flags |= B_WRITE | B_ASYNC;
2146
2147			/*
2148			 * We keep track of the write so that we can group
2149			 * requests when flushing. The write queue has the
2150			 * same number of slots as the dring so this prevents
2151			 * the write queue from wrapping and overwriting
2152			 * existing entries: if the write queue gets full
2153			 * then that means that the dring is full so we stop
2154			 * receiving new requests until an existing request
2155			 * is processed, removed from the write queue and
2156			 * then from the dring.
2157			 */
2158			task->write_index = vd->write_index;
2159			vd->write_queue[task->write_index] = buf;
2160			vd->write_index =
2161			    VD_WRITE_INDEX_NEXT(vd, vd->write_index);
2162		}
2163
2164		buf->b_private = task;
2165
2166		ASSERT(vd->ioq != NULL);
2167
2168		request->status = 0;
2169		(void) ddi_taskq_dispatch(task->vd->ioq, vd_bio_task, buf,
2170		    DDI_SLEEP);
2171
2172	} else {
2173
2174		if (request->operation == VD_OP_BREAD) {
2175			buf->b_flags |= B_READ;
2176		} else {
2177			buf->b_flags |= B_WRITE;
2178		}
2179
2180		/* convert VIO block number to buf block number */
2181		buf->b_lblkno = offset << vd->vio_bshift;
2182
2183		request->status = ldi_strategy(vd->ldi_handle[slice], buf);
2184	}
2185
2186	/*
2187	 * This is to indicate to the caller that the request
2188	 * needs to be finished by vd_complete_bio() by calling
2189	 * biowait() there and waiting for that to return before
2190	 * triggering the notification of the vDisk client.
2191	 *
2192	 * This is necessary when writing to real disks as
2193	 * otherwise calls to ldi_strategy() would be serialized
2194	 * behind the calls to biowait() and performance would
2195	 * suffer.
2196	 */
2197	if (request->status == 0)
2198		return (EINPROGRESS);
2199
2200	biofini(buf);
2201
2202io_done:
2203	/* Clean up after error or completion */
2204	rv = ldc_mem_release(task->mhdl, 0, buflen);
2205	if (rv) {
2206		PR0("ldc_mem_release() returned err %d ", rv);
2207		status = EIO;
2208	}
2209	rv = ldc_mem_unmap(task->mhdl);
2210	if (rv) {
2211		PR0("ldc_mem_unmap() returned err %d ", rv);
2212		status = EIO;
2213	}
2214
2215	return (status);
2216}
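
/*
 * A minimal sketch of the asynchronous write bookkeeping set up in
 * vd_start_bio() above, assuming VD_WRITE_INDEX_NEXT() simply advances
 * the index modulo the queue (i.e. dring) length:
 *
 *	write_queue: [buf0][buf1][buf2][NULL] ...
 *
 * Each B_ASYNC write is recorded in its slot before being dispatched to
 * the I/O queue. When the completion task for buf0 runs,
 * vd_complete_bio() clears slot 0 and then, with VD_AWFLUSH_GROUP set,
 * walks slots 1 and 2, biowait()s on those writes and clears them too,
 * so that a single vd_flush_write() call commits all three writes.
 */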
2217
2218/*
2219 * This function should only be called from vd_notify to ensure that requests
2220 * are responded to in the order that they are received.
2221 */
2222static int
2223send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
2224{
2225	int	status;
2226	size_t	nbytes;
2227
2228	do {
2229		nbytes = msglen;
2230		status = ldc_write(ldc_handle, msg, &nbytes);
2231		if (status != EWOULDBLOCK)
2232			break;
2233		drv_usecwait(vds_ldc_delay);
2234	} while (status == EWOULDBLOCK);
2235
2236	if (status != 0) {
2237		if (status != ECONNRESET)
2238			PR0("ldc_write() returned errno %d", status);
2239		return (status);
2240	} else if (nbytes != msglen) {
2241		PR0("ldc_write() performed only partial write");
2242		return (EIO);
2243	}
2244
2245	PR1("SENT %lu bytes", msglen);
2246	return (0);
2247}
2248
2249static void
2250vd_need_reset(vd_t *vd, boolean_t reset_ldc)
2251{
2252	mutex_enter(&vd->lock);
2253	vd->reset_state	= B_TRUE;
2254	vd->reset_ldc	= reset_ldc;
2255	mutex_exit(&vd->lock);
2256}
2257
2258/*
2259 * Reset the state of the connection with a client, if needed; reset the LDC
2260 * transport as well, if needed.  This function should only be called from
2261 * vd_recv_msg(), as it waits for tasks - otherwise a deadlock can occur.
2262 */
2263static void
2264vd_reset_if_needed(vd_t *vd)
2265{
2266	int	status = 0;
2267
2268	mutex_enter(&vd->lock);
2269	if (!vd->reset_state) {
2270		ASSERT(!vd->reset_ldc);
2271		mutex_exit(&vd->lock);
2272		return;
2273	}
2274	mutex_exit(&vd->lock);
2275
2276	PR0("Resetting connection state with %s", VD_CLIENT(vd));
2277
2278	/*
2279	 * Let any asynchronous I/O complete before possibly pulling the rug
2280	 * out from under it; defer checking vd->reset_ldc, as one of the
2281	 * asynchronous tasks might set it
2282	 */
2283	if (vd->ioq != NULL)
2284		ddi_taskq_wait(vd->ioq);
2285	ddi_taskq_wait(vd->completionq);
2286
2287	status = vd_flush_write(vd);
2288	if (status) {
2289		PR0("flushwrite returned error %d", status);
2290	}
2291
2292	if ((vd->initialized & VD_DRING) &&
2293	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
2294		PR0("ldc_mem_dring_unmap() returned errno %d", status);
2295
2296	vd_free_dring_task(vd);
2297
2298	/* Free the staging buffer for msgs */
2299	if (vd->vio_msgp != NULL) {
2300		kmem_free(vd->vio_msgp, vd->max_msglen);
2301		vd->vio_msgp = NULL;
2302	}
2303
2304	/* Free the inband message buffer */
2305	if (vd->inband_task.msg != NULL) {
2306		kmem_free(vd->inband_task.msg, vd->max_msglen);
2307		vd->inband_task.msg = NULL;
2308	}
2309
2310	mutex_enter(&vd->lock);
2311
2312	if (vd->reset_ldc)
2313		PR0("taking down LDC channel");
2314	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
2315		PR0("ldc_down() returned errno %d", status);
2316
2317	/* Reset exclusive access rights */
2318	vd_reset_access(vd);
2319
2320	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
2321	vd->state	= VD_STATE_INIT;
2322	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */
2323
2324	/* Allocate the staging buffer */
2325	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
2326
2327	PR0("calling ldc_up\n");
2328	(void) ldc_up(vd->ldc_handle);
2329
2330	vd->reset_state	= B_FALSE;
2331	vd->reset_ldc	= B_FALSE;
2332
2333	mutex_exit(&vd->lock);
2334}
2335
2336static void vd_recv_msg(void *arg);
2337
2338static void
2339vd_mark_in_reset(vd_t *vd)
2340{
2341	int status;
2342
2343	PR0("vd_mark_in_reset: marking vd in reset\n");
2344
2345	vd_need_reset(vd, B_FALSE);
2346	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
2347	if (status == DDI_FAILURE) {
2348		PR0("cannot schedule task to recv msg\n");
2349		vd_need_reset(vd, B_TRUE);
2350		return;
2351	}
2352}
2353
2354static int
2355vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
2356{
2357	boolean_t		accepted;
2358	int			status;
2359	on_trap_data_t		otd;
2360	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
2361
2362	if (vd->reset_state)
2363		return (0);
2364
2365	/* Acquire the element */
2366	if ((status = VIO_DRING_ACQUIRE(&otd, vd->dring_mtype,
2367	    vd->dring_handle, idx, idx)) != 0) {
2368		if (status == ECONNRESET) {
2369			vd_mark_in_reset(vd);
2370			return (0);
2371		} else {
2372			return (status);
2373		}
2374	}
2375
2376	/* Set the element's status and mark it done */
2377	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
2378	if (accepted) {
2379		elem->payload.nbytes	= elem_nbytes;
2380		elem->payload.status	= elem_status;
2381		elem->hdr.dstate	= VIO_DESC_DONE;
2382	} else {
2383		/* Perhaps client timed out waiting for I/O... */
2384		PR0("element %u no longer \"accepted\"", idx);
2385		VD_DUMP_DRING_ELEM(elem);
2386	}
2387	/* Release the element */
2388	if ((status = VIO_DRING_RELEASE(vd->dring_mtype,
2389	    vd->dring_handle, idx, idx)) != 0) {
2390		if (status == ECONNRESET) {
2391			vd_mark_in_reset(vd);
2392			return (0);
2393		} else {
2394			PR0("VIO_DRING_RELEASE() returned errno %d",
2395			    status);
2396			return (status);
2397		}
2398	}
2399
2400	return (accepted ? 0 : EINVAL);
2401}
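
/*
 * Sketch of the descriptor element handshake implemented above (the
 * VIO_DESC_* states are defined by the common VIO code):
 *
 *	client:	sets dstate = VIO_DESC_ACCEPTED, sends the request
 *	server:	performs the I/O, then vd_mark_elem_done() fills in
 *		payload.nbytes/payload.status and sets
 *		dstate = VIO_DESC_DONE
 *	client:	sees VIO_DESC_DONE and reclaims the element
 *
 * An element found in any state other than VIO_DESC_ACCEPTED is treated
 * as abandoned by the client and the task completes with EINVAL.
 */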
2402
2403/*
2404 * Return Values
2405 *	0	- operation completed successfully
2406 *	EIO	- encountered LDC / task error
2407 *
2408 * Side Effect
2409 *	sets request->status = <disk operation status>
2410 */
2411static int
2412vd_complete_bio(vd_task_t *task)
2413{
2414	int			status		= 0;
2415	int			rv		= 0;
2416	vd_t			*vd		= task->vd;
2417	vd_dring_payload_t	*request	= task->request;
2418	struct buf		*buf		= &task->buf;
2419	int			wid, nwrites;
2420
2421
2422	ASSERT(vd != NULL);
2423	ASSERT(request != NULL);
2424	ASSERT(task->msg != NULL);
2425	ASSERT(task->msglen >= sizeof (*task->msg));
2426
2427	if (buf->b_flags & B_DONE) {
2428		/*
2429		 * If the I/O is already done then we don't call biowait()
2430		 * because biowait() might already have been called when
2431		 * flushing a previous asynchronous write. So we just
2432		 * retrieve the status of the request.
2433		 */
2434		request->status = geterror(buf);
2435	} else {
2436		/*
2437		 * Wait for the I/O. For synchronous I/O, biowait() will return
2438		 * when the I/O has completed. For an asynchronous write, it
2439		 * will return once the write has been submitted to the backend,
2440		 * but the write may not have been committed yet.
2441		 */
2442		request->status = biowait(buf);
2443	}
2444
2445	if (buf->b_flags & B_ASYNC) {
2446		/*
2447		 * Asynchronous writes are used when writing to a file or a
2448		 * ZFS volume. In that case the bio notification indicates
2449		 * that the write has started. We have to flush the backend
2450		 * to ensure that the write has been committed before marking
2451		 * the request as completed.
2452		 */
2453		ASSERT(task->request->operation == VD_OP_BWRITE);
2454
2455		wid = task->write_index;
2456
2457		/* check if the write has already been flushed */
2458		if (vd->write_queue[wid] != NULL) {
2459
2460			vd->write_queue[wid] = NULL;
2461			wid = VD_WRITE_INDEX_NEXT(vd, wid);
2462
2463			/*
2464			 * Because flushing is time consuming, it is worth
2465			 * waiting for any other writes so that they can be
2466			 * included in this single flush request.
2467			 */
2468			if (vd_awflush & VD_AWFLUSH_GROUP) {
2469				nwrites = 1;
2470				while (vd->write_queue[wid] != NULL) {
2471					(void) biowait(vd->write_queue[wid]);
2472					vd->write_queue[wid] = NULL;
2473					wid = VD_WRITE_INDEX_NEXT(vd, wid);
2474					nwrites++;
2475				}
2476				DTRACE_PROBE2(flushgrp, vd_task_t *, task,
2477				    int, nwrites);
2478			}
2479
2480			if (vd_awflush & VD_AWFLUSH_IMMEDIATE) {
2481				request->status = vd_flush_write(vd);
2482			} else if (vd_awflush & VD_AWFLUSH_DEFER) {
2483				(void) taskq_dispatch(system_taskq,
2484				    (void (*)(void *))vd_flush_write, vd,
2485				    DDI_SLEEP);
2486				request->status = 0;
2487			}
2488		}
2489	}
2490
2491	/* Update the number of bytes read/written */
2492	request->nbytes += buf->b_bcount - buf->b_resid;
2493
2494	/* Release the buffer */
2495	if (!vd->reset_state)
2496		status = ldc_mem_release(task->mhdl, 0, buf->b_bufsize);
2497	if (status) {
2498		PR0("ldc_mem_release() returned errno %d copying to "
2499		    "client", status);
2500		if (status == ECONNRESET) {
2501			vd_mark_in_reset(vd);
2502		}
2503		rv = EIO;
2504	}
2505
2506	/* Unmap the memory, even if in reset */
2507	status = ldc_mem_unmap(task->mhdl);
2508	if (status) {
2509		PR0("ldc_mem_unmap() returned errno %d copying to client",
2510		    status);
2511		if (status == ECONNRESET) {
2512			vd_mark_in_reset(vd);
2513		}
2514		rv = EIO;
2515	}
2516
2517	biofini(buf);
2518
2519	return (rv);
2520}
2521
2522/*
2523 * Description:
2524 *	This function is called by the two taskq completion functions
2525 *	[ vd_complete_notify() and vd_serial_notify() ] to send the
2526 *	message to the client.
2527 *
2528 * Parameters:
2529 *	task	- structure containing the task to be completed
2530 *
2531 * Return Values
2532 *	None
2533 */
2534static void
2535vd_notify(vd_task_t *task)
2536{
2537	int	status;
2538
2539	ASSERT(task != NULL);
2540	ASSERT(task->vd != NULL);
2541
2542	/*
2543	 * Send the "ack" or "nack" back to the client; if sending the message
2544	 * via LDC fails, arrange to reset both the connection state and LDC
2545	 * itself
2546	 */
2547	PR2("Sending %s",
2548	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
2549
2550	status = send_msg(task->vd->ldc_handle, task->msg, task->msglen);
2551	switch (status) {
2552	case 0:
2553		break;
2554	case ECONNRESET:
2555		vd_mark_in_reset(task->vd);
2556		break;
2557	default:
2558		PR0("initiating full reset");
2559		vd_need_reset(task->vd, B_TRUE);
2560		break;
2561	}
2562
2563	DTRACE_PROBE1(task__end, vd_task_t *, task);
2564}
2565
2566/*
2567 * Description:
2568 *	Mark the Dring entry as Done and (if necessary) send an ACK/NACK to
2569 *	the vDisk client
2570 *
2571 * Parameters:
2572 *	task		- structure containing the request sent from client
2573 *
2574 * Return Values
2575 *	None
2576 */
2577static void
2578vd_complete_notify(vd_task_t *task)
2579{
2580	int			status		= 0;
2581	vd_t			*vd		= task->vd;
2582	vd_dring_payload_t	*request	= task->request;
2583
2584	/* Update the dring element for a dring client */
2585	if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE_V1_0)) {
2586		status = vd_mark_elem_done(vd, task->index,
2587		    request->status, request->nbytes);
2588		if (status == ECONNRESET)
2589			vd_mark_in_reset(vd);
2590		else if (status == EACCES)
2591			vd_need_reset(vd, B_TRUE);
2592	}
2593
2594	/*
2595	 * If a transport error occurred while marking the element done or
2596	 * previously while executing the task, arrange to "nack" the message
2597	 * when the final task in the descriptor element range completes
2598	 */
2599	if ((status != 0) || (task->status != 0))
2600		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
2601
2602	/*
2603	 * Only the final task for a range of elements will respond to and
2604	 * free the message
2605	 */
2606	if (task->type == VD_NONFINAL_RANGE_TASK) {
2607		return;
2608	}
2609
2610	/*
2611	 * We should only send an ACK/NACK here if we are not currently in
2612	 * reset as, depending on how we reset, the dring may have been
2613	 * blown away and we don't want to ACK/NACK a message that isn't
2614	 * there.
2615	 */
2616	if (!vd->reset_state)
2617		vd_notify(task);
2618}
2619
2620/*
2621 * Description:
2622 *	This is the basic completion function called to handle inband data
2623 *	requests and handshake messages. All it needs to do is trigger a
2624 *	message to the client that the request is completed.
2625 *
2626 * Parameters:
2627 *	arg	- opaque pointer to structure containing task to be completed
2628 *
2629 * Return Values
2630 *	None
2631 */
2632static void
2633vd_serial_notify(void *arg)
2634{
2635	vd_task_t		*task = (vd_task_t *)arg;
2636
2637	ASSERT(task != NULL);
2638	vd_notify(task);
2639}
2640
2641/* ARGSUSED */
2642static int
2643vd_geom2dk_geom(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
2644{
2645	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
2646	return (0);
2647}
2648
2649/* ARGSUSED */
2650static int
2651vd_vtoc2vtoc(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
2652{
2653	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct extvtoc *)ioctl_arg);
2654	return (0);
2655}
2656
2657static void
2658dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
2659{
2660	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
2661}
2662
2663static void
2664vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
2665{
2666	VTOC2VD_VTOC((struct extvtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
2667}
2668
2669static int
2670vd_get_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
2671{
2672	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
2673	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
2674	size_t data_len;
2675
2676	data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t));
2677	if (vd_efi->length > data_len)
2678		return (EINVAL);
2679
2680	dk_efi->dki_lba = vd_efi->lba;
2681	dk_efi->dki_length = vd_efi->length;
2682	dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP);
2683	return (0);
2684}
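
/*
 * A note on the size computation above: the EFI data presumably starts
 * at the last (uint64_t) field of vd_efi_t, so the fixed header part of
 * the message is sizeof (vd_efi_t) - sizeof (uint64_t) bytes and the
 * remainder of vd_buf_len is available for dki_length bytes of data.
 * For example, a one-block (512-byte) GPT request is valid as long as
 *
 *	vd_buf_len >= sizeof (vd_efi_t) - sizeof (uint64_t) + 512
 */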
2685
2686static void
2687vd_get_efi_out(void *ioctl_arg, void *vd_buf)
2688{
2689	int len;
2690	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
2691	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
2692
2693	len = vd_efi->length;
2694	DK_EFI2VD_EFI(dk_efi, vd_efi);
2695	kmem_free(dk_efi->dki_data, len);
2696}
2697
2698static int
2699vd_set_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
2700{
2701	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
2702	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
2703	size_t data_len;
2704
2705	data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t));
2706	if (vd_efi->length > data_len)
2707		return (EINVAL);
2708
2709	dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP);
2710	VD_EFI2DK_EFI(vd_efi, dk_efi);
2711	return (0);
2712}
2713
2714static void
2715vd_set_efi_out(void *ioctl_arg, void *vd_buf)
2716{
2717	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
2718	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
2719
2720	kmem_free(dk_efi->dki_data, vd_efi->length);
2721}
2722
2723static int
2724vd_scsicmd_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
2725{
2726	size_t vd_scsi_len;
2727	vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf;
2728	struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg;
2729
2730	/* check buffer size */
2731	vd_scsi_len = VD_SCSI_SIZE;
2732	vd_scsi_len += P2ROUNDUP(vd_scsi->cdb_len, sizeof (uint64_t));
2733	vd_scsi_len += P2ROUNDUP(vd_scsi->sense_len, sizeof (uint64_t));
2734	vd_scsi_len += P2ROUNDUP(vd_scsi->datain_len, sizeof (uint64_t));
2735	vd_scsi_len += P2ROUNDUP(vd_scsi->dataout_len, sizeof (uint64_t));
2736
2737	ASSERT(vd_scsi_len % sizeof (uint64_t) == 0);
2738
2739	if (vd_buf_len < vd_scsi_len)
2740		return (EINVAL);
2741
2742	/* set flags */
2743	uscsi->uscsi_flags = vd_scsi_debug;
2744
2745	if (vd_scsi->options & VD_SCSI_OPT_NORETRY) {
2746		uscsi->uscsi_flags |= USCSI_ISOLATE;
2747		uscsi->uscsi_flags |= USCSI_DIAGNOSE;
2748	}
2749
2750	/* task attribute */
2751	switch (vd_scsi->task_attribute) {
2752	case VD_SCSI_TASK_ACA:
2753		uscsi->uscsi_flags |= USCSI_HEAD;
2754		break;
2755	case VD_SCSI_TASK_HQUEUE:
2756		uscsi->uscsi_flags |= USCSI_HTAG;
2757		break;
2758	case VD_SCSI_TASK_ORDERED:
2759		uscsi->uscsi_flags |= USCSI_OTAG;
2760		break;
2761	default:
2762		uscsi->uscsi_flags |= USCSI_NOTAG;
2763		break;
2764	}
2765
2766	/* timeout */
2767	uscsi->uscsi_timeout = vd_scsi->timeout;
2768
2769	/* cdb data */
2770	uscsi->uscsi_cdb = (caddr_t)VD_SCSI_DATA_CDB(vd_scsi);
2771	uscsi->uscsi_cdblen = vd_scsi->cdb_len;
2772
2773	/* sense buffer */
2774	if (vd_scsi->sense_len != 0) {
2775		uscsi->uscsi_flags |= USCSI_RQENABLE;
2776		uscsi->uscsi_rqbuf = (caddr_t)VD_SCSI_DATA_SENSE(vd_scsi);
2777		uscsi->uscsi_rqlen = vd_scsi->sense_len;
2778	}
2779
2780	if (vd_scsi->datain_len != 0 && vd_scsi->dataout_len != 0) {
2781		/* uscsi does not support simultaneous read and write */
2782		return (EINVAL);
2783	}
2784
2785	/* request data-in */
2786	if (vd_scsi->datain_len != 0) {
2787		uscsi->uscsi_flags |= USCSI_READ;
2788		uscsi->uscsi_buflen = vd_scsi->datain_len;
2789		uscsi->uscsi_bufaddr = (char *)VD_SCSI_DATA_IN(vd_scsi);
2790	}
2791
2792	/* request data-out */
2793	if (vd_scsi->dataout_len != 0) {
2794		uscsi->uscsi_buflen = vd_scsi->dataout_len;
2795		uscsi->uscsi_bufaddr = (char *)VD_SCSI_DATA_OUT(vd_scsi);
2796	}
2797
2798	return (0);
2799}
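
/*
 * Illustrative layout of the vd_scsi_t message decoded above: the fixed
 * header is followed by the CDB, sense, data-in and data-out areas,
 * each padded to an 8-byte boundary. For example, a 6-byte CDB with a
 * 20-byte sense buffer and 512 bytes of data-in needs
 *
 *	VD_SCSI_SIZE + P2ROUNDUP(6, 8) + P2ROUNDUP(20, 8) +
 *	    P2ROUNDUP(512, 8) + P2ROUNDUP(0, 8)
 *	= VD_SCSI_SIZE + 8 + 24 + 512 bytes
 *
 * and any shorter vd_buf_len is rejected with EINVAL.
 */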
2800
2801static void
2802vd_scsicmd_out(void *ioctl_arg, void *vd_buf)
2803{
2804	vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf;
2805	struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg;
2806
2807	/* output fields */
2808	vd_scsi->cmd_status = uscsi->uscsi_status;
2809
2810	/* sense data */
2811	if ((uscsi->uscsi_flags & USCSI_RQENABLE) &&
2812	    (uscsi->uscsi_status == STATUS_CHECK ||
2813	    uscsi->uscsi_status == STATUS_TERMINATED)) {
2814		vd_scsi->sense_status = uscsi->uscsi_rqstatus;
2815		if (uscsi->uscsi_rqstatus == STATUS_GOOD)
2816			vd_scsi->sense_len -= uscsi->uscsi_rqresid;
2817		else
2818			vd_scsi->sense_len = 0;
2819	} else {
2820		vd_scsi->sense_len = 0;
2821	}
2822
2823	if (uscsi->uscsi_status != STATUS_GOOD) {
2824		vd_scsi->dataout_len = 0;
2825		vd_scsi->datain_len = 0;
2826		return;
2827	}
2828
2829	if (uscsi->uscsi_flags & USCSI_READ) {
2830		/* request data (read) */
2831		vd_scsi->datain_len -= uscsi->uscsi_resid;
2832		vd_scsi->dataout_len = 0;
2833	} else {
2834		/* request data (write) */
2835		vd_scsi->datain_len = 0;
2836		vd_scsi->dataout_len -= uscsi->uscsi_resid;
2837	}
2838}
2839
2840static ushort_t
2841vd_lbl2cksum(struct dk_label *label)
2842{
2843	int	count;
2844	ushort_t sum, *sp;
2845
2846	count =	(sizeof (struct dk_label)) / (sizeof (short)) - 1;
2847	sp = (ushort_t *)label;
2848	sum = 0;
2849	while (count--) {
2850		sum ^= *sp++;
2851	}
2852
2853	return (sum);
2854}
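
/*
 * The label checksum is the XOR of all the 16-bit words of the label
 * except the last one (dkl_cksum itself). A label can therefore be
 * verified as
 *
 *	label->dkl_cksum == vd_lbl2cksum(label)
 *
 * or, equivalently, by XOR-ing all the words including dkl_cksum and
 * checking that the result is 0.
 */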
2855
2856/*
2857 * Copy information from a vtoc and dk_geom structures to a dk_label structure.
2858 */
2859static void
2860vd_vtocgeom_to_label(struct extvtoc *vtoc, struct dk_geom *geom,
2861    struct dk_label *label)
2862{
2863	int i;
2864
2865	ASSERT(vtoc->v_nparts == V_NUMPAR);
2866	ASSERT(vtoc->v_sanity == VTOC_SANE);
2867
2868	bzero(label, sizeof (struct dk_label));
2869
2870	label->dkl_ncyl = geom->dkg_ncyl;
2871	label->dkl_acyl = geom->dkg_acyl;
2872	label->dkl_pcyl = geom->dkg_pcyl;
2873	label->dkl_nhead = geom->dkg_nhead;
2874	label->dkl_nsect = geom->dkg_nsect;
2875	label->dkl_intrlv = geom->dkg_intrlv;
2876	label->dkl_apc = geom->dkg_apc;
2877	label->dkl_rpm = geom->dkg_rpm;
2878	label->dkl_write_reinstruct = geom->dkg_write_reinstruct;
2879	label->dkl_read_reinstruct = geom->dkg_read_reinstruct;
2880
2881	label->dkl_vtoc.v_nparts = V_NUMPAR;
2882	label->dkl_vtoc.v_sanity = VTOC_SANE;
2883	label->dkl_vtoc.v_version = vtoc->v_version;
2884	for (i = 0; i < V_NUMPAR; i++) {
2885		label->dkl_vtoc.v_timestamp[i] = vtoc->timestamp[i];
2886		label->dkl_vtoc.v_part[i].p_tag = vtoc->v_part[i].p_tag;
2887		label->dkl_vtoc.v_part[i].p_flag = vtoc->v_part[i].p_flag;
2888		label->dkl_map[i].dkl_cylno = vtoc->v_part[i].p_start /
2889		    (label->dkl_nhead * label->dkl_nsect);
2890		label->dkl_map[i].dkl_nblk = vtoc->v_part[i].p_size;
2891	}
2892
2893	/*
2894	 * The bootinfo array can not be copied with bcopy() because
2895	 * elements are of type long in vtoc (so 64-bit) and of type
2896	 * int in dk_vtoc (so 32-bit).
2897	 */
2898	label->dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0];
2899	label->dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1];
2900	label->dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2];
2901	bcopy(vtoc->v_asciilabel, label->dkl_asciilabel, LEN_DKL_ASCII);
2902	bcopy(vtoc->v_volume, label->dkl_vtoc.v_volume, LEN_DKL_VVOL);
2903
2904	/* re-compute checksum */
2905	label->dkl_magic = DKL_MAGIC;
2906	label->dkl_cksum = vd_lbl2cksum(label);
2907}
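
/*
 * Note that the dk_label partition map is cylinder-based, so the
 * conversion above is only exact when p_start is a multiple of the
 * cylinder size. For example, with 16 heads and 63 sectors per track
 * (a cylinder of 1008 blocks), a slice starting at block 2016 yields
 * dkl_cylno = 2, and its p_size is carried over unchanged as dkl_nblk.
 */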
2908
2909/*
2910 * Copy information from a dk_label structure to a vtoc and dk_geom structures.
2911 */
2912static void
2913vd_label_to_vtocgeom(struct dk_label *label, struct extvtoc *vtoc,
2914    struct dk_geom *geom)
2915{
2916	int i;
2917
2918	bzero(vtoc, sizeof (struct extvtoc));
2919	bzero(geom, sizeof (struct dk_geom));
2920
2921	geom->dkg_ncyl = label->dkl_ncyl;
2922	geom->dkg_acyl = label->dkl_acyl;
2923	geom->dkg_nhead = label->dkl_nhead;
2924	geom->dkg_nsect = label->dkl_nsect;
2925	geom->dkg_intrlv = label->dkl_intrlv;
2926	geom->dkg_apc = label->dkl_apc;
2927	geom->dkg_rpm = label->dkl_rpm;
2928	geom->dkg_pcyl = label->dkl_pcyl;
2929	geom->dkg_write_reinstruct = label->dkl_write_reinstruct;
2930	geom->dkg_read_reinstruct = label->dkl_read_reinstruct;
2931
2932	vtoc->v_sanity = label->dkl_vtoc.v_sanity;
2933	vtoc->v_version = label->dkl_vtoc.v_version;
2934	vtoc->v_sectorsz = DEV_BSIZE;
2935	vtoc->v_nparts = label->dkl_vtoc.v_nparts;
2936
2937	for (i = 0; i < vtoc->v_nparts; i++) {
2938		vtoc->v_part[i].p_tag = label->dkl_vtoc.v_part[i].p_tag;
2939		vtoc->v_part[i].p_flag = label->dkl_vtoc.v_part[i].p_flag;
2940		vtoc->v_part[i].p_start = label->dkl_map[i].dkl_cylno *
2941		    (label->dkl_nhead * label->dkl_nsect);
2942		vtoc->v_part[i].p_size = label->dkl_map[i].dkl_nblk;
2943		vtoc->timestamp[i] = label->dkl_vtoc.v_timestamp[i];
2944	}
2945
2946	/*
2947	 * The bootinfo array can not be copied with bcopy() because
2948	 * elements are of type long in vtoc (so 64-bit) and of type
2949	 * int in dk_vtoc (so 32-bit).
2950	 */
2951	vtoc->v_bootinfo[0] = label->dkl_vtoc.v_bootinfo[0];
2952	vtoc->v_bootinfo[1] = label->dkl_vtoc.v_bootinfo[1];
2953	vtoc->v_bootinfo[2] = label->dkl_vtoc.v_bootinfo[2];
2954	bcopy(label->dkl_asciilabel, vtoc->v_asciilabel, LEN_DKL_ASCII);
2955	bcopy(label->dkl_vtoc.v_volume, vtoc->v_volume, LEN_DKL_VVOL);
2956}
2957
2958/*
2959 * Check if a geometry is valid for a single-slice disk. A geometry is
2960 * considered valid if the main attributes of the geometry match the
2961 * attributes of the fake geometry we have created.
2962 */
2963static boolean_t
2964vd_slice_geom_isvalid(vd_t *vd, struct dk_geom *geom)
2965{
2966	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
2967	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
2968
2969	if (geom->dkg_ncyl != vd->dk_geom.dkg_ncyl ||
2970	    geom->dkg_acyl != vd->dk_geom.dkg_acyl ||
2971	    geom->dkg_nsect != vd->dk_geom.dkg_nsect ||
2972	    geom->dkg_pcyl != vd->dk_geom.dkg_pcyl)
2973		return (B_FALSE);
2974
2975	return (B_TRUE);
2976}
2977
2978/*
2979 * Check if a vtoc is valid for a single-slice disk. A vtoc is considered
2980 * valid if the main attributes of the vtoc match the attributes of the
2981 * fake vtoc we have created.
2982 */
2983static boolean_t
2984vd_slice_vtoc_isvalid(vd_t *vd, struct extvtoc *vtoc)
2985{
2986	size_t csize;
2987	int i;
2988
2989	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
2990	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
2991
2992	if (vtoc->v_sanity != vd->vtoc.v_sanity ||
2993	    vtoc->v_version != vd->vtoc.v_version ||
2994	    vtoc->v_nparts != vd->vtoc.v_nparts ||
2995	    strcmp(vtoc->v_volume, vd->vtoc.v_volume) != 0 ||
2996	    strcmp(vtoc->v_asciilabel, vd->vtoc.v_asciilabel) != 0)
2997		return (B_FALSE);
2998
2999	/* slice 2 should be unchanged */
3000	if (vtoc->v_part[VD_ENTIRE_DISK_SLICE].p_start !=
3001	    vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_start ||
3002	    vtoc->v_part[VD_ENTIRE_DISK_SLICE].p_size !=
3003	    vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_size)
3004		return (B_FALSE);
3005
3006	/*
3007	 * Slice 0 should be mostly unchanged and cover most of the disk.
3008 * However we allow some flexibility with respect to the start and the size
3009	 * of this slice mainly because we can't exactly know how it will
3010	 * be defined by the OS installer.
3011	 *
3012	 * We allow slice 0 to be defined as starting on any of the first
3013	 * 4 cylinders.
3014	 */
3015	csize = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
3016
3017	if (vtoc->v_part[0].p_start > 4 * csize ||
3018	    vtoc->v_part[0].p_size > vtoc->v_part[VD_ENTIRE_DISK_SLICE].p_size)
3019			return (B_FALSE);
3020
3021	if (vd->vtoc.v_part[0].p_size >= 4 * csize &&
3022	    vtoc->v_part[0].p_size < vd->vtoc.v_part[0].p_size - 4 * csize)
3023			return (B_FALSE);
3024
3025	/* any other slice should have a size of 0 */
3026	for (i = 1; i < vtoc->v_nparts; i++) {
3027		if (i != VD_ENTIRE_DISK_SLICE &&
3028		    vtoc->v_part[i].p_size != 0)
3029			return (B_FALSE);
3030	}
3031
3032	return (B_TRUE);
3033}
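
/*
 * For example (with illustrative numbers): in addition to the sanity,
 * version, volume name and ascii label checks, and assuming 96 heads
 * and 768 sectors per track (csize = 73728 blocks), a new vtoc is
 * accepted as long as slice 0 starts within the first 4 cylinders
 * (p_start <= 294912), slice 0 is not shrunk by more than 4 cylinders
 * relative to the fake vtoc (when the fake slice spans at least 4
 * cylinders), slice 2 is unchanged, and every other slice has a size
 * of 0.
 */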
3034
3035/*
3036 * Handle ioctls to a disk slice.
3037 *
3038 * Return Values
3039 *	0	- Indicates that there are no errors in disk operations
3040 *	ENOTSUP	- Unknown disk label type or unsupported DKIO ioctl
3041 *	EINVAL	- Invalid request (bad geometry, vtoc or EFI parameters)
3042 *
3043 */
3044static int
3045vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
3046{
3047	dk_efi_t *dk_ioc;
3048	struct extvtoc *vtoc;
3049	struct dk_geom *geom;
3050	size_t len, lba;
3051
3052	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
3053
3054	if (cmd == DKIOCFLUSHWRITECACHE)
3055		return (vd_flush_write(vd));
3056
3057	switch (vd->vdisk_label) {
3058
3059	/* ioctls for a single slice disk with a VTOC label */
3060	case VD_DISK_LABEL_VTOC:
3061
3062		switch (cmd) {
3063
3064		case DKIOCGGEOM:
3065			ASSERT(ioctl_arg != NULL);
3066			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
3067			return (0);
3068
3069		case DKIOCGEXTVTOC:
3070			ASSERT(ioctl_arg != NULL);
3071			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
3072			return (0);
3073
3074		case DKIOCSGEOM:
3075			ASSERT(ioctl_arg != NULL);
3076			if (vd_slice_single_slice)
3077				return (ENOTSUP);
3078
3079			/* fake success only if new geometry is valid */
3080			geom = (struct dk_geom *)ioctl_arg;
3081			if (!vd_slice_geom_isvalid(vd, geom))
3082				return (EINVAL);
3083
3084			return (0);
3085
3086		case DKIOCSEXTVTOC:
3087			ASSERT(ioctl_arg != NULL);
3088			if (vd_slice_single_slice)
3089				return (ENOTSUP);
3090
3091			/* fake success only if the new vtoc is valid */
3092			vtoc = (struct extvtoc *)ioctl_arg;
3093			if (!vd_slice_vtoc_isvalid(vd, vtoc))
3094				return (EINVAL);
3095
3096			return (0);
3097
3098		default:
3099			return (ENOTSUP);
3100		}
3101
3102	/* ioctls for a single slice disk with an EFI label */
3103	case VD_DISK_LABEL_EFI:
3104
3105		if (cmd != DKIOCGETEFI && cmd != DKIOCSETEFI)
3106			return (ENOTSUP);
3107
3108		ASSERT(ioctl_arg != NULL);
3109		dk_ioc = (dk_efi_t *)ioctl_arg;
3110
3111		len = dk_ioc->dki_length;
3112		lba = dk_ioc->dki_lba;
3113
3114		if ((lba != VD_EFI_LBA_GPT && lba != VD_EFI_LBA_GPE) ||
3115		    (lba == VD_EFI_LBA_GPT && len < sizeof (efi_gpt_t)) ||
3116		    (lba == VD_EFI_LBA_GPE && len < sizeof (efi_gpe_t)))
3117			return (EINVAL);
3118
3119		switch (cmd) {
3120		case DKIOCGETEFI:
3121			len = vd_slice_flabel_read(vd,
3122			    (caddr_t)dk_ioc->dki_data,
3123			    lba * vd->vdisk_bsize, len);
3124
3125			ASSERT(len > 0);
3126
3127			return (0);
3128
3129		case DKIOCSETEFI:
3130			if (vd_slice_single_slice)
3131				return (ENOTSUP);
3132
3133			/* we currently don't support writing EFI */
3134			return (EIO);
3135		}
3136
3137	default:
3138		/* Unknown disk label type */
3139		return (ENOTSUP);
3140	}
3141}
3142
3143static int
3144vds_efi_alloc_and_read(vd_t *vd, efi_gpt_t **gpt, efi_gpe_t **gpe)
3145{
3146	vd_efi_dev_t edev;
3147	int status;
3148
3149	VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl);
3150
3151	status = vd_efi_alloc_and_read(&edev, gpt, gpe);
3152
3153	return (status);
3154}
3155
3156static void
3157vds_efi_free(vd_t *vd, efi_gpt_t *gpt, efi_gpe_t *gpe)
3158{
3159	vd_efi_dev_t edev;
3160
3161	VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl);
3162
3163	vd_efi_free(&edev, gpt, gpe);
3164}
3165
3166static int
3167vd_dskimg_validate_efi(vd_t *vd)
3168{
3169	efi_gpt_t *gpt;
3170	efi_gpe_t *gpe;
3171	int i, nparts, status;
3172	struct uuid efi_reserved = EFI_RESERVED;
3173
3174	if ((status = vds_efi_alloc_and_read(vd, &gpt, &gpe)) != 0)
3175		return (status);
3176
3177	bzero(&vd->vtoc, sizeof (struct extvtoc));
3178	bzero(&vd->dk_geom, sizeof (struct dk_geom));
3179	bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART);
3180
3181	vd->efi_reserved = -1;
3182
3183	nparts = gpt->efi_gpt_NumberOfPartitionEntries;
3184
3185	for (i = 0; i < nparts && i < VD_MAXPART; i++) {
3186
3187		if (gpe[i].efi_gpe_StartingLBA == 0 &&
3188		    gpe[i].efi_gpe_EndingLBA == 0) {
3189			continue;
3190		}
3191
3192		vd->slices[i].start = gpe[i].efi_gpe_StartingLBA;
3193		vd->slices[i].nblocks = gpe[i].efi_gpe_EndingLBA -
3194		    gpe[i].efi_gpe_StartingLBA + 1;
3195
3196		if (bcmp(&gpe[i].efi_gpe_PartitionTypeGUID, &efi_reserved,
3197		    sizeof (struct uuid)) == 0)
3198			vd->efi_reserved = i;
3199
3200	}
3201
3202	ASSERT(vd->vdisk_size != 0);
3203	vd->slices[VD_EFI_WD_SLICE].start = 0;
3204	vd->slices[VD_EFI_WD_SLICE].nblocks = vd->vdisk_size;
3205
3206	vds_efi_free(vd, gpt, gpe);
3207
3208	return (status);
3209}
3210
3211/*
3212 * Function:
3213 *	vd_dskimg_validate_geometry
3214 *
3215 * Description:
3216 *	Read the label and validate the geometry of a disk image. The driver
3217 *	label, vtoc and geometry information are updated according to the
3218 *	label read from the disk image.
3219 *
3220 *	If no valid label is found, the label is set to unknown and the
3221 *	function returns EINVAL, but a default vtoc and geometry are provided
3222 *	to the driver. If an EFI label is found, ENOTSUP is returned.
3223 *
3224 * Parameters:
3225 *	vd	- disk on which the operation is performed.
3226 *
3227 * Return Code:
3228 *	0	- success.
3229 *	EIO	- error reading the label from the disk image.
3230 *	EINVAL	- unknown disk label.
3231 *	ENOTSUP	- geometry not applicable (EFI label).
3232 */
3233static int
3234vd_dskimg_validate_geometry(vd_t *vd)
3235{
3236	struct dk_label label;
3237	struct dk_geom *geom = &vd->dk_geom;
3238	struct extvtoc *vtoc = &vd->vtoc;
3239	int i;
3240	int status = 0;
3241
3242	ASSERT(VD_DSKIMG(vd));
3243
3244	if (VD_DSKIMG_LABEL_READ(vd, &label) < 0)
3245		return (EIO);
3246
3247	if (label.dkl_magic != DKL_MAGIC ||
3248	    label.dkl_cksum != vd_lbl2cksum(&label) ||
3249	    (vd_dskimg_validate_sanity &&
3250	    label.dkl_vtoc.v_sanity != VTOC_SANE) ||
3251	    label.dkl_vtoc.v_nparts != V_NUMPAR) {
3252
3253		if (vd_dskimg_validate_efi(vd) == 0) {
3254			vd->vdisk_label = VD_DISK_LABEL_EFI;
3255			return (ENOTSUP);
3256		}
3257
3258		vd->vdisk_label = VD_DISK_LABEL_UNK;
3259		vd_build_default_label(vd->dskimg_size, vd->vdisk_bsize,
3260		    &label);
3261		status = EINVAL;
3262	} else {
3263		vd->vdisk_label = VD_DISK_LABEL_VTOC;
3264	}
3265
3266	/* Update the driver geometry and vtoc */
3267	vd_label_to_vtocgeom(&label, vtoc, geom);
3268
3269	/* Update logical partitions */
3270	bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART);
3271	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
3272		for (i = 0; i < vtoc->v_nparts; i++) {
3273			vd->slices[i].start = vtoc->v_part[i].p_start;
3274			vd->slices[i].nblocks = vtoc->v_part[i].p_size;
3275		}
3276	}
3277
3278	return (status);
3279}
3280
3281/*
3282 * Handle ioctls to a disk image.
3283 *
3284 * Return Values
3285 *	0	- Indicates that there are no errors
3286 *	!= 0	- Disk operation returned an error
3287 */
3288static int
3289vd_do_dskimg_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
3290{
3291	struct dk_label label;
3292	struct dk_geom *geom;
3293	struct extvtoc *vtoc;
3294	dk_efi_t *efi;
3295	int rc;
3296
3297	ASSERT(VD_DSKIMG(vd));
3298
3299	switch (cmd) {
3300
3301	case DKIOCGGEOM:
3302		ASSERT(ioctl_arg != NULL);
3303		geom = (struct dk_geom *)ioctl_arg;
3304
3305		rc = vd_dskimg_validate_geometry(vd);
3306		if (rc != 0 && rc != EINVAL)
3307			return (rc);
3308		bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom));
3309		return (0);
3310
3311	case DKIOCGEXTVTOC:
3312		ASSERT(ioctl_arg != NULL);
3313		vtoc = (struct extvtoc *)ioctl_arg;
3314
3315		rc = vd_dskimg_validate_geometry(vd);
3316		if (rc != 0 && rc != EINVAL)
3317			return (rc);
3318		bcopy(&vd->vtoc, vtoc, sizeof (struct extvtoc));
3319		return (0);
3320
3321	case DKIOCSGEOM:
3322		ASSERT(ioctl_arg != NULL);
3323		geom = (struct dk_geom *)ioctl_arg;
3324
3325		if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0)
3326			return (EINVAL);
3327
3328		/*
3329		 * The current device geometry is not updated, just the driver
3330		 * "notion" of it. The device geometry will be effectively
3331		 * updated when a label is written to the device by a subsequent
3332		 * DKIOCSEXTVTOC.
3333		 */
3334		bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom));
3335		return (0);
3336
3337	case DKIOCSEXTVTOC:
3338		ASSERT(ioctl_arg != NULL);
3339		ASSERT(vd->dk_geom.dkg_nhead != 0 &&
3340		    vd->dk_geom.dkg_nsect != 0);
3341		vtoc = (struct extvtoc *)ioctl_arg;
3342
3343		if (vtoc->v_sanity != VTOC_SANE ||
3344		    vtoc->v_sectorsz != DEV_BSIZE ||
3345		    vtoc->v_nparts != V_NUMPAR)
3346			return (EINVAL);
3347
3348		vd_vtocgeom_to_label(vtoc, &vd->dk_geom, &label);
3349
3350		/* write label to the disk image */
3351		if ((rc = vd_dskimg_set_vtoc(vd, &label)) != 0)
3352			return (rc);
3353
3354		break;
3355
3356	case DKIOCFLUSHWRITECACHE:
3357		return (vd_flush_write(vd));
3358
3359	case DKIOCGETEFI:
3360		ASSERT(ioctl_arg != NULL);
3361		efi = (dk_efi_t *)ioctl_arg;
3362
3363		if (vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BREAD,
3364		    (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0)
3365			return (EIO);
3366
3367		return (0);
3368
3369	case DKIOCSETEFI:
3370		ASSERT(ioctl_arg != NULL);
3371		efi = (dk_efi_t *)ioctl_arg;
3372
3373		if (vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
3374		    (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0)
3375			return (EIO);
3376
3377		break;
3378
3380	default:
3381		return (ENOTSUP);
3382	}
3383
3384	ASSERT(cmd == DKIOCSEXTVTOC || cmd == DKIOCSETEFI);
3385
3386	/* label has changed, revalidate the geometry */
3387	(void) vd_dskimg_validate_geometry(vd);
3388
3389	/*
3390	 * The disk geometry may have changed, so we need to write
3391	 * the devid (if there is one) so that it is stored at the
3392	 * right location.
3393	 */
3394	if (vd_dskimg_write_devid(vd, vd->dskimg_devid) != 0) {
3395		PR0("Failed to write devid");
3396	}
3397
3398	return (0);
3399}
3400
3401static int
3402vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg)
3403{
3404	int rval = 0, status;
3405	struct vtoc vtoc;
3406
3407	/*
3408	 * Call the appropriate function to execute the ioctl depending
3409	 * on the type of vdisk.
3410	 */
3411	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
3412
3413		/* slice, file or volume exported as a single slice disk */
3414		status = vd_do_slice_ioctl(vd, cmd, arg);
3415
3416	} else if (VD_DSKIMG(vd)) {
3417
3418		/* file or volume exported as a full disk */
3419		status = vd_do_dskimg_ioctl(vd, cmd, arg);
3420
3421	} else {
3422
3423		/* disk device exported as a full disk */
3424		status = ldi_ioctl(vd->ldi_handle[0], cmd, (intptr_t)arg,
3425		    vd->open_flags | FKIOCTL, kcred, &rval);
3426
3427		/*
3428		 * By default VTOC ioctls are done using ioctls for the
3429		 * extended VTOC. Some drivers (in particular non-Sun drivers)
3430		 * may not support these ioctls. In that case, we fallback to
3431		 * the regular VTOC ioctls.
3432		 */
3433		if (status == ENOTTY) {
3434			switch (cmd) {
3435
3436			case DKIOCGEXTVTOC:
3437				cmd = DKIOCGVTOC;
3438				status = ldi_ioctl(vd->ldi_handle[0], cmd,
3439				    (intptr_t)&vtoc, vd->open_flags | FKIOCTL,
3440				    kcred, &rval);
3441				vtoctoextvtoc(vtoc,
3442				    (*(struct extvtoc *)(void *)arg));
3443				break;
3444
3445			case DKIOCSEXTVTOC:
3446				cmd = DKIOCSVTOC;
3447				extvtoctovtoc((*(struct extvtoc *)(void *)arg),
3448				    vtoc);
3449				status = ldi_ioctl(vd->ldi_handle[0], cmd,
3450				    (intptr_t)&vtoc, vd->open_flags | FKIOCTL,
3451				    kcred, &rval);
3452				break;
3453			}
3454		}
3455	}
3456
3457#ifdef DEBUG
3458	if (rval != 0) {
3459		PR0("ioctl %x set rval = %d, which is not being returned"
3460		    " to caller", cmd, rval);
3461	}
3462#endif /* DEBUG */
3463
3464	return (status);
3465}
3466
3467/*
3468 * Description:
3469 *	This is the function that processes the ioctl requests (farming
3470 *	them out to functions that handle slices, files or whole disks)
3471 *
3472 * Return Values
3473 *     0		- ioctl operation completed successfully
3474 *     != 0		- The LDC error value encountered
3475 *			  (propagated back up the call stack as a task error)
3476 *
3477 * Side Effect
3478 *     sets request->status to the return value of the ioctl function.
3479 */
3480static int
3481vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
3482{
3483	int	status = 0;
3484	size_t	nbytes = request->nbytes;	/* modifiable copy */
3485
3486
3487	ASSERT(request->slice < vd->nslices);
3488	PR0("Performing %s", ioctl->operation_name);
3489
3490	/* Get data from client and convert, if necessary */
3491	if (ioctl->copyin != NULL)  {
3492		ASSERT(nbytes != 0 && buf != NULL);
3493		PR1("Getting \"arg\" data from client");
3494		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
3495		    request->cookie, request->ncookies,
3496		    LDC_COPY_IN)) != 0) {
3497			PR0("ldc_mem_copy() returned errno %d "
3498			    "copying from client", status);
3499			return (status);
3500		}
3501
3502		/* Convert client's data, if necessary */
3503		if (ioctl->copyin == VD_IDENTITY_IN) {
3504			/* use client buffer */
3505			ioctl->arg = buf;
3506		} else {
3507			/* convert client vdisk operation data to ioctl data */
3508			status = (ioctl->copyin)(buf, nbytes,
3509			    (void *)ioctl->arg);
3510			if (status != 0) {
3511				request->status = status;
3512				return (0);
3513			}
3514		}
3515	}
3516
3517	if (ioctl->operation == VD_OP_SCSICMD) {
3518		struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl->arg;
3519
3520		/* check write permission */
3521		if (!(vd->open_flags & FWRITE) &&
3522		    !(uscsi->uscsi_flags & USCSI_READ)) {
3523			PR0("uscsi fails because backend is opened read-only");
3524			request->status = EROFS;
3525			return (0);
3526		}
3527	}
3528
3529	/*
3530	 * Send the ioctl to the disk backend.
3531	 */
3532	request->status = vd_backend_ioctl(vd, ioctl->cmd, ioctl->arg);
3533
3534	if (request->status != 0) {
3535		PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status);
3536		if (ioctl->operation == VD_OP_SCSICMD &&
3537		    ((struct uscsi_cmd *)ioctl->arg)->uscsi_status != 0)
3538			/*
3539			 * USCSICMD has reported an error and the uscsi_status
3540			 * field is not zero. This means that the SCSI command
3541			 * has completed but with an error. So we mark the
3542			 * VD operation as successfully completed so that
3543			 * clients can check the SCSI status field for
3544			 * SCSI errors.
3545			 */
3546			request->status = 0;
3547		else
3548			return (0);
3549	}
3550
3551	/* Convert data and send to client, if necessary */
3552	if (ioctl->copyout != NULL)  {
3553		ASSERT(nbytes != 0 && buf != NULL);
3554		PR1("Sending \"arg\" data to client");
3555
3556		/* Convert ioctl data to vdisk operation data, if necessary */
3557		if (ioctl->copyout != VD_IDENTITY_OUT)
3558			(ioctl->copyout)((void *)ioctl->arg, buf);
3559
3560		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
3561		    request->cookie, request->ncookies,
3562		    LDC_COPY_OUT)) != 0) {
3563			PR0("ldc_mem_copy() returned errno %d "
3564			    "copying to client", status);
3565			return (status);
3566		}
3567	}
3568
3569	return (status);
3570}
3571
3572#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
3573
3574/*
3575 * Description:
3576 *	This generic function is called by the task queue to complete
3577 *	the processing of the tasks. The specific completion function
3578 *	is passed in as a field in the task pointer.
3579 *
3580 * Parameters:
3581 *	arg	- opaque pointer to structure containing task to be completed
3582 *
3583 * Return Values
3584 *	None
3585 */
3586static void
3587vd_complete(void *arg)
3588{
3589	vd_task_t	*task = (vd_task_t *)arg;
3590
3591	ASSERT(task != NULL);
3592	ASSERT(task->status == EINPROGRESS);
3593	ASSERT(task->completef != NULL);
3594
3595	task->status = task->completef(task);
3596	if (task->status)
3597		PR0("%s: Error %d completing task", __func__, task->status);
3598
3599	/* Now notify the vDisk client */
3600	vd_complete_notify(task);
3601}
3602
3603static int
3604vd_ioctl(vd_task_t *task)
3605{
3606	int			i, status;
3607	void			*buf = NULL;
3608	struct dk_geom		dk_geom = {0};
3609	struct extvtoc		vtoc = {0};
3610	struct dk_efi		dk_efi = {0};
3611	struct uscsi_cmd	uscsi = {0};
3612	vd_t			*vd		= task->vd;
3613	vd_dring_payload_t	*request	= task->request;
3614	vd_ioctl_t		ioctl[] = {
3615		/* Command (no-copy) operations */
3616		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
3617		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
3618		    NULL, NULL, NULL, B_TRUE},
3619
3620		/* "Get" (copy-out) operations */
3621		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
3622		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
3623		    NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_FALSE},
3624		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
3625		    RNDSIZE(vd_geom_t),
3626		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
3627		    &dk_geom, NULL, dk_geom2vd_geom, B_FALSE},
3628		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
3629		    DKIOCGEXTVTOC, STRINGIZE(DKIOCGEXTVTOC),
3630		    &vtoc, NULL, vtoc2vd_vtoc, B_FALSE},
3631		{VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t),
3632		    DKIOCGETEFI, STRINGIZE(DKIOCGETEFI),
3633		    &dk_efi, vd_get_efi_in, vd_get_efi_out, B_FALSE},
3634
3635		/* "Set" (copy-in) operations */
3636		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
3637		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
3638		    NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_TRUE},
3639		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
3640		    RNDSIZE(vd_geom_t),
3641		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
3642		    &dk_geom, vd_geom2dk_geom, NULL, B_TRUE},
3643		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
3644		    DKIOCSEXTVTOC, STRINGIZE(DKIOCSEXTVTOC),
3645		    &vtoc, vd_vtoc2vtoc, NULL, B_TRUE},
3646		{VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t),
3647		    DKIOCSETEFI, STRINGIZE(DKIOCSETEFI),
3648		    &dk_efi, vd_set_efi_in, vd_set_efi_out, B_TRUE},
3649
3650		{VD_OP_SCSICMD, STRINGIZE(VD_OP_SCSICMD), RNDSIZE(vd_scsi_t),
3651		    USCSICMD, STRINGIZE(USCSICMD),
3652		    &uscsi, vd_scsicmd_in, vd_scsicmd_out, B_FALSE},
3653	};
3654	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));
3655
3656
3657	ASSERT(vd != NULL);
3658	ASSERT(request != NULL);
3659	ASSERT(request->slice < vd->nslices);
3660
3661	/*
3662	 * Determine ioctl corresponding to caller's "operation" and
3663	 * validate caller's "nbytes"
3664	 */
3665	for (i = 0; i < nioctls; i++) {
3666		if (request->operation == ioctl[i].operation) {
3667			/* LDC memory operations require 8-byte multiples */
3668			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);
3669
3670			if (request->operation == VD_OP_GET_EFI ||
3671			    request->operation == VD_OP_SET_EFI ||
3672			    request->operation == VD_OP_SCSICMD) {
3673				if (request->nbytes >= ioctl[i].nbytes)
3674					break;
3675				PR0("%s:  Expected at least nbytes = %lu, "
3676				    "got %lu", ioctl[i].operation_name,
3677				    ioctl[i].nbytes, request->nbytes);
3678				return (EINVAL);
3679			}
3680
3681			if (request->nbytes != ioctl[i].nbytes) {
3682				PR0("%s:  Expected nbytes = %lu, got %lu",
3683				    ioctl[i].operation_name, ioctl[i].nbytes,
3684				    request->nbytes);
3685				return (EINVAL);
3686			}
3687
3688			break;
3689		}
3690	}
3691
3692	VERIFY(i < nioctls); /* because "operation" already validated */
3693
3694	if (!(vd->open_flags & FWRITE) && ioctl[i].write) {
3695		PR0("%s fails because backend is opened read-only",
3696		    ioctl[i].operation_name);
3697		request->status = EROFS;
3698		return (0);
3699	}
3700
3701	if (request->nbytes)
3702		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
3703	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
3704	if (request->nbytes)
3705		kmem_free(buf, request->nbytes);
3706
3707	return (status);
3708}
3709
3710static int
3711vd_get_devid(vd_task_t *task)
3712{
3713	vd_t *vd = task->vd;
3714	vd_dring_payload_t *request = task->request;
3715	vd_devid_t *vd_devid;
3716	impl_devid_t *devid;
3717	int status, bufid_len, devid_len, len, sz;
3718	int bufbytes;
3719
3720	PR1("Get Device ID, nbytes=%ld", request->nbytes);
3721
3722	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
3723		/*
		 * We don't support devids for single-slice disks: we have no
		 * space to store a fabricated devid, and for physical disk
		 * slices we can't use the devid of the disk because exporting
		 * multiple slices from the same disk would produce identical
		 * devids.
3729		 */
3730		PR2("No Device ID for slices");
3731		request->status = ENOTSUP;
3732		return (0);
3733	}
3734
3735	if (VD_DSKIMG(vd)) {
3736		if (vd->dskimg_devid == NULL) {
3737			PR2("No Device ID");
3738			request->status = ENOENT;
3739			return (0);
3740		} else {
3741			sz = ddi_devid_sizeof(vd->dskimg_devid);
3742			devid = kmem_alloc(sz, KM_SLEEP);
3743			bcopy(vd->dskimg_devid, devid, sz);
3744		}
3745	} else {
3746		if (ddi_lyr_get_devid(vd->dev[request->slice],
3747		    (ddi_devid_t *)&devid) != DDI_SUCCESS) {
3748			PR2("No Device ID");
3749			request->status = ENOENT;
3750			return (0);
3751		}
3752	}
3753
3754	bufid_len = request->nbytes - sizeof (vd_devid_t) + 1;
3755	devid_len = DEVID_GETLEN(devid);
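
	/*
	 * bufid_len is the room left in the client's buffer for the devid
	 * bytes themselves: the total buffer size minus the vd_devid_t
	 * header, plus 1 because the one-byte id[] placeholder is already
	 * counted in sizeof (vd_devid_t).
	 */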
3756
3757	/*
3758	 * Save the buffer size here for use in deallocation.
3759	 * The actual number of bytes copied is returned in
3760	 * the 'nbytes' field of the request structure.
3761	 */
3762	bufbytes = request->nbytes;
3763
3764	vd_devid = kmem_zalloc(bufbytes, KM_SLEEP);
3765	vd_devid->length = devid_len;
3766	vd_devid->type = DEVID_GETTYPE(devid);
3767
	len = (devid_len > bufid_len) ? bufid_len : devid_len;
3769
3770	bcopy(devid->did_id, vd_devid->id, len);
3771
3772	request->status = 0;
3773
3774	/* LDC memory operations require 8-byte multiples */
3775	ASSERT(request->nbytes % sizeof (uint64_t) == 0);
3776
3777	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
3778	    &request->nbytes, request->cookie, request->ncookies,
3779	    LDC_COPY_OUT)) != 0) {
3780		PR0("ldc_mem_copy() returned errno %d copying to client",
3781		    status);
3782	}
3783	PR1("post mem_copy: nbytes=%ld", request->nbytes);
3784
3785	kmem_free(vd_devid, bufbytes);
3786	ddi_devid_free((ddi_devid_t)devid);
3787
3788	return (status);
3789}
3790
3791static int
3792vd_scsi_reset(vd_t *vd)
3793{
3794	int rval, status;
3795	struct uscsi_cmd uscsi = { 0 };
3796
3797	uscsi.uscsi_flags = vd_scsi_debug | USCSI_RESET;
3798	uscsi.uscsi_timeout = vd_scsi_rdwr_timeout;
3799
3800	status = ldi_ioctl(vd->ldi_handle[0], USCSICMD, (intptr_t)&uscsi,
3801	    (vd->open_flags | FKIOCTL), kcred, &rval);
3802
3803	return (status);
3804}
3805
3806static int
3807vd_reset(vd_task_t *task)
3808{
3809	vd_t *vd = task->vd;
3810	vd_dring_payload_t *request = task->request;
3811
3812	ASSERT(request->operation == VD_OP_RESET);
3813	ASSERT(vd->scsi);
3814
3815	PR0("Performing VD_OP_RESET");
3816
3817	if (request->nbytes != 0) {
3818		PR0("VD_OP_RESET:  Expected nbytes = 0, got %lu",
3819		    request->nbytes);
3820		return (EINVAL);
3821	}
3822
3823	request->status = vd_scsi_reset(vd);
3824
3825	return (0);
3826}
3827
3828static int
3829vd_get_capacity(vd_task_t *task)
3830{
3831	int rv;
3832	size_t nbytes;
3833	vd_t *vd = task->vd;
3834	vd_dring_payload_t *request = task->request;
3835	vd_capacity_t vd_cap = { 0 };
3836
3837	ASSERT(request->operation == VD_OP_GET_CAPACITY);
3838
3839	PR0("Performing VD_OP_GET_CAPACITY");
3840
3841	nbytes = request->nbytes;
3842
3843	if (nbytes != RNDSIZE(vd_capacity_t)) {
3844		PR0("VD_OP_GET_CAPACITY:  Expected nbytes = %lu, got %lu",
3845		    RNDSIZE(vd_capacity_t), nbytes);
3846		return (EINVAL);
3847	}
3848
3849	/*
3850	 * Check the backend size in case it has changed. If the check fails
3851	 * then we will return the last known size.
3852	 */
3853
3854	(void) vd_backend_check_size(vd);
3855	ASSERT(vd->vdisk_size != 0);
3856
3857	request->status = 0;
3858
3859	vd_cap.vdisk_block_size = vd->vdisk_bsize;
3860	vd_cap.vdisk_size = vd->vdisk_size;
3861
3862	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&vd_cap, 0, &nbytes,
3863	    request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) {
3864		PR0("ldc_mem_copy() returned errno %d copying to client", rv);
3865		return (rv);
3866	}
3867
3868	return (0);
3869}
3870
3871static int
3872vd_get_access(vd_task_t *task)
3873{
3874	uint64_t access;
3875	int rv, rval = 0;
3876	size_t nbytes;
3877	vd_t *vd = task->vd;
3878	vd_dring_payload_t *request = task->request;
3879
3880	ASSERT(request->operation == VD_OP_GET_ACCESS);
3881	ASSERT(vd->scsi);
3882
3883	PR0("Performing VD_OP_GET_ACCESS");
3884
3885	nbytes = request->nbytes;
3886
3887	if (nbytes != sizeof (uint64_t)) {
3888		PR0("VD_OP_GET_ACCESS:  Expected nbytes = %lu, got %lu",
3889		    sizeof (uint64_t), nbytes);
3890		return (EINVAL);
3891	}
3892
3893	request->status = ldi_ioctl(vd->ldi_handle[request->slice], MHIOCSTATUS,
3894	    (intptr_t)NULL, (vd->open_flags | FKIOCTL), kcred, &rval);
3895
3896	if (request->status != 0)
3897		return (0);
3898
	access = (rval == 0) ? VD_ACCESS_ALLOWED : VD_ACCESS_DENIED;
3900
3901	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&access, 0, &nbytes,
3902	    request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) {
3903		PR0("ldc_mem_copy() returned errno %d copying to client", rv);
3904		return (rv);
3905	}
3906
3907	return (0);
3908}
3909
3910static int
3911vd_set_access(vd_task_t *task)
3912{
3913	uint64_t flags;
3914	int rv, rval;
3915	size_t nbytes;
3916	vd_t *vd = task->vd;
3917	vd_dring_payload_t *request = task->request;
3918
3919	ASSERT(request->operation == VD_OP_SET_ACCESS);
3920	ASSERT(vd->scsi);
3921
3922	nbytes = request->nbytes;
3923
3924	if (nbytes != sizeof (uint64_t)) {
3925		PR0("VD_OP_SET_ACCESS:  Expected nbytes = %lu, got %lu",
3926		    sizeof (uint64_t), nbytes);
3927		return (EINVAL);
3928	}
3929
3930	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&flags, 0, &nbytes,
3931	    request->cookie, request->ncookies, LDC_COPY_IN)) != 0) {
3932		PR0("ldc_mem_copy() returned errno %d copying from client", rv);
3933		return (rv);
3934	}
3935
3936	if (flags == VD_ACCESS_SET_CLEAR) {
3937		PR0("Performing VD_OP_SET_ACCESS (CLEAR)");
3938		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
3939		    MHIOCRELEASE, (intptr_t)NULL, (vd->open_flags | FKIOCTL),
3940		    kcred, &rval);
3941		if (request->status == 0)
3942			vd->ownership = B_FALSE;
3943		return (0);
3944	}
3945
3946	/*
3947	 * As per the VIO spec, the PREEMPT and PRESERVE flags are only valid
3948	 * when the EXCLUSIVE flag is set.
3949	 */
3950	if (!(flags & VD_ACCESS_SET_EXCLUSIVE)) {
3951		PR0("Invalid VD_OP_SET_ACCESS flags: 0x%lx", flags);
3952		request->status = EINVAL;
3953		return (0);
3954	}
3955
3956	switch (flags & (VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE)) {
3957
3958	case VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE:
3959		/*
		 * Flags EXCLUSIVE, PREEMPT and PRESERVE. We have to acquire
		 * exclusive access rights and preserve them, and we can use
		 * preemption. So we can use the MHIOCTKOWN ioctl directly.
3963		 */
3964		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT|PRESERVE)");
3965		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
3966		    MHIOCTKOWN, (intptr_t)NULL, (vd->open_flags | FKIOCTL),
3967		    kcred, &rval);
3968		break;
3969
3970	case VD_ACCESS_SET_PRESERVE:
3971		/*
3972		 * Flags EXCLUSIVE and PRESERVE. We have to acquire exclusive
3973		 * access rights and preserve them, but not preempt any other
		 * host. So we need to use the MHIOCTKOWN ioctl to enable the
		 * "preserve" feature, but we cannot call it directly
		 * because it uses preemption. So before that, we use the
3977		 * MHIOCQRESERVE ioctl to ensure we can get exclusive rights
3978		 * without preempting anyone.
3979		 */
3980		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PRESERVE)");
3981		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
3982		    MHIOCQRESERVE, (intptr_t)NULL, (vd->open_flags | FKIOCTL),
3983		    kcred, &rval);
3984		if (request->status != 0)
3985			break;
3986		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
3987		    MHIOCTKOWN, (intptr_t)NULL, (vd->open_flags | FKIOCTL),
3988		    kcred, &rval);
3989		break;
3990
3991	case VD_ACCESS_SET_PREEMPT:
3992		/*
3993		 * Flags EXCLUSIVE and PREEMPT. We have to acquire exclusive
		 * access rights and we can use preemption. So we try a SCSI
		 * reservation; if it fails, we reset the disk to clear any
		 * existing reservation and try to reserve again.
3997		 */
3998		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT)");
3999		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
4000		    MHIOCQRESERVE, (intptr_t)NULL, (vd->open_flags | FKIOCTL),
4001		    kcred, &rval);
4002		if (request->status == 0)
4003			break;
4004
4005		/* reset the disk */
4006		(void) vd_scsi_reset(vd);
4007
4008		/* try again even if the reset has failed */
4009		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
4010		    MHIOCQRESERVE, (intptr_t)NULL, (vd->open_flags | FKIOCTL),
4011		    kcred, &rval);
4012		break;
4013
4014	case 0:
4015		/* Flag EXCLUSIVE only. Just issue a SCSI reservation */
4016		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE)");
4017		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
4018		    MHIOCQRESERVE, (intptr_t)NULL, (vd->open_flags | FKIOCTL),
4019		    kcred, &rval);
4020		break;
4021	}
4022
4023	if (request->status == 0)
4024		vd->ownership = B_TRUE;
4025	else
4026		PR0("VD_OP_SET_ACCESS: error %d", request->status);
4027
4028	return (0);
4029}
4030
4031static void
4032vd_reset_access(vd_t *vd)
4033{
4034	int status, rval;
4035
4036	if (vd->file || vd->volume || !vd->ownership)
4037		return;
4038
4039	PR0("Releasing disk ownership");
4040	status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, (intptr_t)NULL,
4041	    (vd->open_flags | FKIOCTL), kcred, &rval);
4042
4043	/*
4044	 * An EACCES failure means that there is a reservation conflict,
4045	 * so we are not the owner of the disk anymore.
4046	 */
4047	if (status == 0 || status == EACCES) {
4048		vd->ownership = B_FALSE;
4049		return;
4050	}
4051
	PR0("Failed to release ownership, error %d", status);
4053
4054	/*
	 * We have failed to release ownership; try resetting the disk
	 * to clear any reservations.
4057	 */
4058	PR0("Resetting disk");
4059	status = vd_scsi_reset(vd);
4060
4061	if (status != 0)
		PR0("Failed to reset disk, error %d", status);
4063
4064	/* whatever the result of the reset is, we try the release again */
4065	status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, (intptr_t)NULL,
4066	    (vd->open_flags | FKIOCTL), kcred, &rval);
4067
4068	if (status == 0 || status == EACCES) {
4069		vd->ownership = B_FALSE;
4070		return;
4071	}
4072
	PR0("Failed to release ownership, error %d", status);
4074
4075	/*
4076	 * At this point we have done our best to try to reset the
4077	 * access rights to the disk and we don't know if we still
4078	 * own a reservation and if any mechanism to preserve the
4079	 * ownership is still in place. The ultimate solution would
4080	 * be to reset the system but this is usually not what we
4081	 * want to happen.
4082	 */
4083
4084	if (vd_reset_access_failure == A_REBOOT) {
4085		cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG
4086		    ", rebooting the system", vd->device_path);
4087		(void) uadmin(A_SHUTDOWN, AD_BOOT, (uintptr_t)NULL);
4088	} else if (vd_reset_access_failure == A_DUMP) {
4089		panic(VD_RESET_ACCESS_FAILURE_MSG, vd->device_path);
4090	}
4091
4092	cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG, vd->device_path);
4093}
4094
4095/*
4096 * Define the supported operations once the functions for performing them have
4097 * been defined
4098 */
4099static const vds_operation_t	vds_operation[] = {
4100#define	X(_s)	#_s, _s
4101	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
4102	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
4103	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
4104	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
4105	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
4106	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
4107	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
4108	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
4109	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
4110	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
4111	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
4112	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
4113	{X(VD_OP_SCSICMD),	vd_ioctl,	NULL},
4114	{X(VD_OP_RESET),	vd_reset,	NULL},
4115	{X(VD_OP_GET_CAPACITY),	vd_get_capacity, NULL},
4116	{X(VD_OP_SET_ACCESS),	vd_set_access,	NULL},
4117	{X(VD_OP_GET_ACCESS),	vd_get_access,	NULL},
4118#undef	X
4119};
4120
4121static const size_t	vds_noperations =
4122	(sizeof (vds_operation))/(sizeof (vds_operation[0]));
4123
4124/*
4125 * Process a task specifying a client I/O request
4126 *
4127 * Parameters:
4128 *	task		- structure containing the request sent from client
4129 *
4130 * Return Value
4131 *	0	- success
4132 *	ENOTSUP	- Unknown/Unsupported VD_OP_XXX operation
4133 *	EINVAL	- Invalid disk slice
4134 *	!= 0	- some other non-zero return value from start function
4135 */
4136static int
4137vd_do_process_task(vd_task_t *task)
4138{
4139	int			i;
4140	vd_t			*vd		= task->vd;
4141	vd_dring_payload_t	*request	= task->request;
4142
4143	ASSERT(vd != NULL);
4144	ASSERT(request != NULL);
4145
4146	/* Find the requested operation */
4147	for (i = 0; i < vds_noperations; i++) {
4148		if (request->operation == vds_operation[i].operation) {
4149			/* all operations should have a start func */
4150			ASSERT(vds_operation[i].start != NULL);
4151
4152			task->completef = vds_operation[i].complete;
4153			break;
4154		}
4155	}
4156
4157	/*
	 * We need to check both that the requested operation is permitted
	 * for the particular client that sent it and that the loop above
	 * actually found the operation type (if it did not, the requested
	 * operation is unknown/unimplemented).
4162	 */
4163	if ((VD_OP_SUPPORTED(vd->operations, request->operation) == B_FALSE) ||
4164	    (i == vds_noperations)) {
4165		PR0("Unsupported operation %u", request->operation);
4166		request->status = ENOTSUP;
4167		return (0);
4168	}
4169
4170	/* Range-check slice */
4171	if (request->slice >= vd->nslices &&
4172	    ((vd->vdisk_type != VD_DISK_TYPE_DISK && vd_slice_single_slice) ||
4173	    request->slice != VD_SLICE_NONE)) {
4174		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
4175		    request->slice, (vd->nslices - 1));
4176		request->status = EINVAL;
4177		return (0);
4178	}
4179
4180	/*
4181	 * Call the function pointer that starts the operation.
4182	 */
4183	return (vds_operation[i].start(task));
4184}
4185
4186/*
4187 * Description:
4188 *	This function is called by both the in-band and descriptor ring
4189 *	message processing functions paths to actually execute the task
4190 *	requested by the vDisk client. It in turn calls its worker
4191 *	function, vd_do_process_task(), to carry our the request.
4192 *
4193 *	Any transport errors (e.g. LDC errors, vDisk protocol errors) are
4194 *	saved in the 'status' field of the task and are propagated back
4195 *	up the call stack to trigger a NACK
4196 *
4197 *	Any request errors (e.g. ENOTTY from an ioctl) are saved in
4198 *	the 'status' field of the request and result in an ACK being sent
4199 *	by the completion handler.
4200 *
4201 * Parameters:
4202 *	task		- structure containing the request sent from client
4203 *
4204 * Return Value
4205 *	0		- successful synchronous request.
4206 *	!= 0		- transport error (e.g. LDC errors, vDisk protocol)
4207 *	EINPROGRESS	- task will be finished in a completion handler
4208 */
4209static int
4210vd_process_task(vd_task_t *task)
4211{
4212	vd_t	*vd = task->vd;
4213	int	status;
4214
4215	DTRACE_PROBE1(task__start, vd_task_t *, task);
4216
	task->status = vd_do_process_task(task);
4218
4219	/*
4220	 * If the task processing function returned EINPROGRESS indicating
4221	 * that the task needs completing then schedule a taskq entry to
4222	 * finish it now.
4223	 *
4224	 * Otherwise the task processing function returned either zero
4225	 * indicating that the task was finished in the start function (and we
4226	 * don't need to wait in a completion function) or the start function
4227	 * returned an error - in both cases all that needs to happen is the
4228	 * notification to the vDisk client higher up the call stack.
4229	 * If the task was using a Descriptor Ring, we need to mark it as done
4230	 * at this stage.
4231	 */
4232	if (task->status == EINPROGRESS) {
4233		/* Queue a task to complete the operation */
4234		(void) ddi_taskq_dispatch(vd->completionq, vd_complete,
4235		    task, DDI_SLEEP);
4236		return (EINPROGRESS);
4237	}
4238
4239	if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE_V1_0)) {
4240		/* Update the dring element if it's a dring client */
4241		status = vd_mark_elem_done(vd, task->index,
4242		    task->request->status, task->request->nbytes);
4243		if (status == ECONNRESET)
4244			vd_mark_in_reset(vd);
4245		else if (status == EACCES)
4246			vd_need_reset(vd, B_TRUE);
4247	}
4248
4249	return (task->status);
4250}
4251
4252/*
4253 * Return true if the "type", "subtype", and "env" fields of the "tag" first
4254 * argument match the corresponding remaining arguments; otherwise, return false
4255 */
4256boolean_t
4257vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
4258{
4259	return ((tag->vio_msgtype == type) &&
4260	    (tag->vio_subtype == subtype) &&
4261	    (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
4262}
4263
4264/*
4265 * Check whether the major/minor version specified in "ver_msg" is supported
4266 * by this server.
4267 */
4268static boolean_t
4269vds_supported_version(vio_ver_msg_t *ver_msg)
4270{
4271	for (int i = 0; i < vds_num_versions; i++) {
4272		ASSERT(vds_version[i].major > 0);
4273		ASSERT((i == 0) ||
4274		    (vds_version[i].major < vds_version[i-1].major));
4275
4276		/*
4277		 * If the major versions match, adjust the minor version, if
4278		 * necessary, down to the highest value supported by this
4279		 * server and return true so this message will get "ack"ed;
4280		 * the client should also support all minor versions lower
4281		 * than the value it sent
4282		 */
4283		if (ver_msg->ver_major == vds_version[i].major) {
4284			if (ver_msg->ver_minor > vds_version[i].minor) {
4285				PR0("Adjusting minor version from %u to %u",
4286				    ver_msg->ver_minor, vds_version[i].minor);
4287				ver_msg->ver_minor = vds_version[i].minor;
4288			}
4289			return (B_TRUE);
4290		}
4291
4292		/*
4293		 * If the message contains a higher major version number, set
4294		 * the message's major/minor versions to the current values
4295		 * and return false, so this message will get "nack"ed with
4296		 * these values, and the client will potentially try again
4297		 * with the same or a lower version
4298		 */
4299		if (ver_msg->ver_major > vds_version[i].major) {
4300			ver_msg->ver_major = vds_version[i].major;
4301			ver_msg->ver_minor = vds_version[i].minor;
4302			return (B_FALSE);
4303		}
4304
4305		/*
4306		 * Otherwise, the message's major version is less than the
4307		 * current major version, so continue the loop to the next
4308		 * (lower) supported version
4309		 */
4310	}
4311
4312	/*
4313	 * No common version was found; "ground" the version pair in the
4314	 * message to terminate negotiation
4315	 */
4316	ver_msg->ver_major = 0;
4317	ver_msg->ver_minor = 0;
4318	return (B_FALSE);
4319}
4320
4321/*
4322 * Process a version message from a client.  vds expects to receive version
4323 * messages from clients seeking service, but never issues version messages
4324 * itself; therefore, vds can ACK or NACK client version messages, but does
4325 * not expect to receive version-message ACKs or NACKs (and will treat such
4326 * messages as invalid).
4327 */
4328static int
4329vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
4330{
4331	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;
4332
4333
4334	ASSERT(msglen >= sizeof (msg->tag));
4335
4336	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
4337	    VIO_VER_INFO)) {
4338		return (ENOMSG);	/* not a version message */
4339	}
4340
4341	if (msglen != sizeof (*ver_msg)) {
4342		PR0("Expected %lu-byte version message; "
4343		    "received %lu bytes", sizeof (*ver_msg), msglen);
4344		return (EBADMSG);
4345	}
4346
4347	if (ver_msg->dev_class != VDEV_DISK) {
4348		PR0("Expected device class %u (disk); received %u",
4349		    VDEV_DISK, ver_msg->dev_class);
4350		return (EBADMSG);
4351	}
4352
4353	/*
4354	 * We're talking to the expected kind of client; set our device class
4355	 * for "ack/nack" back to the client
4356	 */
4357	ver_msg->dev_class = VDEV_DISK_SERVER;
4358
4359	/*
4360	 * Check whether the (valid) version message specifies a version
4361	 * supported by this server.  If the version is not supported, return
4362	 * EBADMSG so the message will get "nack"ed; vds_supported_version()
4363	 * will have updated the message with a supported version for the
4364	 * client to consider
4365	 */
4366	if (!vds_supported_version(ver_msg))
4367		return (EBADMSG);
4368
4369
4370	/*
4371	 * A version has been agreed upon; use the client's SID for
4372	 * communication on this channel now
4373	 */
4374	ASSERT(!(vd->initialized & VD_SID));
4375	vd->sid = ver_msg->tag.vio_sid;
4376	vd->initialized |= VD_SID;
4377
4378	/*
4379	 * Store the negotiated major and minor version values in the "vd" data
4380	 * structure so that we can check if certain operations are supported
4381	 * by the client.
4382	 */
4383	vd->version.major = ver_msg->ver_major;
4384	vd->version.minor = ver_msg->ver_minor;
4385
4386	PR0("Using major version %u, minor version %u",
4387	    ver_msg->ver_major, ver_msg->ver_minor);
4388	return (0);
4389}
4390
4391static void
4392vd_set_exported_operations(vd_t *vd)
4393{
4394	vd->operations = 0;	/* clear field */
4395
4396	/*
4397	 * We need to check from the highest version supported to the
4398	 * lowest because versions with a higher minor number implicitly
4399	 * support versions with a lower minor number.
4400	 */
4401	if (vio_ver_is_supported(vd->version, 1, 1)) {
4402		ASSERT(vd->open_flags & FREAD);
4403		vd->operations |= VD_OP_MASK_READ | (1 << VD_OP_GET_CAPACITY);
4404
4405		if (vd->open_flags & FWRITE)
4406			vd->operations |= VD_OP_MASK_WRITE;
4407
4408		if (vd->scsi)
4409			vd->operations |= VD_OP_MASK_SCSI;
4410
4411		if (VD_DSKIMG(vd) && vd_dskimg_is_iso_image(vd)) {
4412			/*
			 * We can't write to ISO images, so make sure write
			 * support is not set in case the administrator did
			 * not use "options=ro" when doing an ldm add-vdsdev.
4416			 */
4417			vd->operations &= ~VD_OP_MASK_WRITE;
4418		}
4419	} else if (vio_ver_is_supported(vd->version, 1, 0)) {
4420		vd->operations = VD_OP_MASK_READ | VD_OP_MASK_WRITE;
4421	}
4422
4423	/* we should have already agreed on a version */
4424	ASSERT(vd->operations != 0);
4425}
4426
4427static int
4428vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
4429{
4430	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;
4431	int		status, retry = 0;
4432
4433
4434	ASSERT(msglen >= sizeof (msg->tag));
4435
4436	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
4437	    VIO_ATTR_INFO)) {
4438		PR0("Message is not an attribute message");
4439		return (ENOMSG);
4440	}
4441
4442	if (msglen != sizeof (*attr_msg)) {
4443		PR0("Expected %lu-byte attribute message; "
4444		    "received %lu bytes", sizeof (*attr_msg), msglen);
4445		return (EBADMSG);
4446	}
4447
4448	if (attr_msg->max_xfer_sz == 0) {
4449		PR0("Received maximum transfer size of 0 from client");
4450		return (EBADMSG);
4451	}
4452
4453	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
4454	    (attr_msg->xfer_mode != VIO_DRING_MODE_V1_0)) {
4455		PR0("Client requested unsupported transfer mode");
4456		return (EBADMSG);
4457	}
4458
4459	/*
	 * Check if the underlying disk is ready; if not, try accessing
	 * the device again. Open the vdisk device and extract info
	 * about it, as this is needed to respond to the attr info msg.
4463	 */
4464	if ((vd->initialized & VD_DISK_READY) == 0) {
4465		PR0("Retry setting up disk (%s)", vd->device_path);
4466		do {
4467			status = vd_setup_vd(vd);
4468			if (status != EAGAIN || ++retry > vds_dev_retries)
4469				break;
4470
4471			/* incremental delay */
4472			delay(drv_usectohz(vds_dev_delay));
4473
4474			/* if vdisk is no longer enabled - return error */
4475			if (!vd_enabled(vd))
4476				return (ENXIO);
4477
4478		} while (status == EAGAIN);
4479
4480		if (status)
4481			return (ENXIO);
4482
4483		vd->initialized |= VD_DISK_READY;
4484		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
4485		PR0("vdisk_type = %s, volume = %s, file = %s, nslices = %u",
4486		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
4487		    (vd->volume ? "yes" : "no"),
4488		    (vd->file ? "yes" : "no"),
4489		    vd->nslices);
4490	}
4491
4492	/* Success:  valid message and transfer mode */
4493	vd->xfer_mode = attr_msg->xfer_mode;
4494
4495	if (vd->xfer_mode == VIO_DESC_MODE) {
4496
4497		/*
4498		 * The vd_dring_inband_msg_t contains one cookie; need room
4499		 * for up to n-1 more cookies, where "n" is the number of full
4500		 * pages plus possibly one partial page required to cover
4501		 * "max_xfer_sz".  Add room for one more cookie if
4502		 * "max_xfer_sz" isn't an integral multiple of the page size.
4503		 * Must first get the maximum transfer size in bytes.
4504		 */
4505		size_t	max_xfer_bytes = attr_msg->vdisk_block_size ?
4506		    attr_msg->vdisk_block_size * attr_msg->max_xfer_sz :
4507		    attr_msg->max_xfer_sz;
4508		size_t	max_inband_msglen =
4509		    sizeof (vd_dring_inband_msg_t) +
4510		    ((max_xfer_bytes/PAGESIZE +
4511		    ((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
4512		    (sizeof (ldc_mem_cookie_t)));
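
		/*
		 * Worked example (illustrative numbers): with an 8K
		 * PAGESIZE, a 512-byte vdisk_block_size and max_xfer_sz
		 * of 33, max_xfer_bytes is 16896, i.e. two full pages
		 * plus a partial third, so room for 3 cookies is added
		 * on top of sizeof (vd_dring_inband_msg_t).
		 */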
4513
4514		/*
4515		 * Set the maximum expected message length to
4516		 * accommodate in-band-descriptor messages with all
4517		 * their cookies
4518		 */
4519		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
4520
4521		/*
4522		 * Initialize the data structure for processing in-band I/O
4523		 * request descriptors
4524		 */
4525		vd->inband_task.vd	= vd;
4526		vd->inband_task.msg	= kmem_alloc(vd->max_msglen, KM_SLEEP);
4527		vd->inband_task.index	= 0;
4528		vd->inband_task.type	= VD_FINAL_RANGE_TASK;	/* range == 1 */
4529	}
4530
4531	/* Return the device's block size and max transfer size to the client */
4532	attr_msg->vdisk_block_size	= vd->vdisk_bsize;
4533	attr_msg->max_xfer_sz		= vd->max_xfer_sz;
4534
4535	attr_msg->vdisk_size = vd->vdisk_size;
	attr_msg->vdisk_type = (vd_slice_single_slice) ? vd->vdisk_type :
4537	    VD_DISK_TYPE_DISK;
4538	attr_msg->vdisk_media = vd->vdisk_media;
4539
4540	/* Discover and save the list of supported VD_OP_XXX operations */
4541	vd_set_exported_operations(vd);
4542	attr_msg->operations = vd->operations;
4543
4544	PR0("%s", VD_CLIENT(vd));
4545
4546	ASSERT(vd->dring_task == NULL);
4547
4548	return (0);
4549}
4550
4551static int
4552vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
4553{
4554	int			status;
4555	size_t			expected;
4556	ldc_mem_info_t		dring_minfo;
4557	uint8_t			mtype;
4558	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;
4559
4560
4561	ASSERT(msglen >= sizeof (msg->tag));
4562
4563	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
4564	    VIO_DRING_REG)) {
4565		PR0("Message is not a register-dring message");
4566		return (ENOMSG);
4567	}
4568
4569	if (msglen < sizeof (*reg_msg)) {
4570		PR0("Expected at least %lu-byte register-dring message; "
4571		    "received %lu bytes", sizeof (*reg_msg), msglen);
4572		return (EBADMSG);
4573	}
4574
4575	expected = sizeof (*reg_msg) +
4576	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
4577	if (msglen != expected) {
4578		PR0("Expected %lu-byte register-dring message; "
4579		    "received %lu bytes", expected, msglen);
4580		return (EBADMSG);
4581	}
4582
4583	if (vd->initialized & VD_DRING) {
4584		PR0("A dring was previously registered; only support one");
4585		return (EBADMSG);
4586	}
4587
4588	if (reg_msg->num_descriptors > INT32_MAX) {
4589		PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)",
		    reg_msg->num_descriptors, INT32_MAX, STRINGIZE(INT32_MAX));
4591		return (EBADMSG);
4592	}
4593
4594	if (reg_msg->ncookies != 1) {
4595		/*
4596		 * In addition to fixing the assertion in the success case
4597		 * below, supporting drings which require more than one
4598		 * "cookie" requires increasing the value of vd->max_msglen
4599		 * somewhere in the code path prior to receiving the message
4600		 * which results in calling this function.  Note that without
4601		 * making this change, the larger message size required to
4602		 * accommodate multiple cookies cannot be successfully
4603		 * received, so this function will not even get called.
4604		 * Gracefully accommodating more dring cookies might
4605		 * reasonably demand exchanging an additional attribute or
4606		 * making a minor protocol adjustment
4607		 */
4608		PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
4609		return (EBADMSG);
4610	}
4611
4612	if (vd_direct_mapped_drings)
4613		mtype = LDC_DIRECT_MAP;
4614	else
4615		mtype = LDC_SHADOW_MAP;
4616
4617	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
4618	    reg_msg->ncookies, reg_msg->num_descriptors,
4619	    reg_msg->descriptor_size, mtype, &vd->dring_handle);
4620	if (status != 0) {
4621		PR0("ldc_mem_dring_map() returned errno %d", status);
4622		return (status);
4623	}
4624
4625	/*
4626	 * To remove the need for this assertion, must call
4627	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
4628	 * successful call to ldc_mem_dring_map()
4629	 */
4630	ASSERT(reg_msg->ncookies == 1);
4631
4632	if ((status =
4633	    ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
4634		PR0("ldc_mem_dring_info() returned errno %d", status);
4635		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
4636			PR0("ldc_mem_dring_unmap() returned errno %d", status);
4637		return (status);
4638	}
4639
4640	if (dring_minfo.vaddr == NULL) {
4641		PR0("Descriptor ring virtual address is NULL");
4642		return (ENXIO);
4643	}
4644
4645
4646	/* Initialize for valid message and mapped dring */
4647	vd->initialized |= VD_DRING;
4648	vd->dring_ident = 1;	/* "There Can Be Only One" */
4649	vd->dring = dring_minfo.vaddr;
4650	vd->descriptor_size = reg_msg->descriptor_size;
4651	vd->dring_len = reg_msg->num_descriptors;
4652	vd->dring_mtype = dring_minfo.mtype;
4653	reg_msg->dring_ident = vd->dring_ident;
4654	PR1("descriptor size = %u, dring length = %u",
4655	    vd->descriptor_size, vd->dring_len);
4656
4657	/*
4658	 * Allocate and initialize a "shadow" array of data structures for
4659	 * tasks to process I/O requests in dring elements
4660	 */
4661	vd->dring_task =
4662	    kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
4663	for (int i = 0; i < vd->dring_len; i++) {
4664		vd->dring_task[i].vd		= vd;
4665		vd->dring_task[i].index		= i;
4666
4667		status = ldc_mem_alloc_handle(vd->ldc_handle,
4668		    &(vd->dring_task[i].mhdl));
4669		if (status) {
4670			PR0("ldc_mem_alloc_handle() returned err %d ", status);
4671			return (ENXIO);
4672		}
4673
4674		/*
4675		 * The descriptor payload varies in length. Calculate its
4676		 * size by subtracting the header size from the total
4677		 * descriptor size.
4678		 */
4679		vd->dring_task[i].request = kmem_zalloc((vd->descriptor_size -
4680		    sizeof (vio_dring_entry_hdr_t)), KM_SLEEP);
4681		vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
4682	}
4683
4684	if (vd->file || vd->zvol) {
4685		vd->write_queue =
4686		    kmem_zalloc(sizeof (buf_t *) * vd->dring_len, KM_SLEEP);
4687	}
4688
4689	return (0);
4690}
4691
4692static int
4693vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
4694{
4695	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;
4696
4697
4698	ASSERT(msglen >= sizeof (msg->tag));
4699
4700	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
4701	    VIO_DRING_UNREG)) {
4702		PR0("Message is not an unregister-dring message");
4703		return (ENOMSG);
4704	}
4705
4706	if (msglen != sizeof (*unreg_msg)) {
4707		PR0("Expected %lu-byte unregister-dring message; "
4708		    "received %lu bytes", sizeof (*unreg_msg), msglen);
4709		return (EBADMSG);
4710	}
4711
4712	if (unreg_msg->dring_ident != vd->dring_ident) {
4713		PR0("Expected dring ident %lu; received %lu",
4714		    vd->dring_ident, unreg_msg->dring_ident);
4715		return (EBADMSG);
4716	}
4717
4718	return (0);
4719}
4720
4721static int
4722process_rdx_msg(vio_msg_t *msg, size_t msglen)
4723{
4724	ASSERT(msglen >= sizeof (msg->tag));
4725
4726	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) {
4727		PR0("Message is not an RDX message");
4728		return (ENOMSG);
4729	}
4730
4731	if (msglen != sizeof (vio_rdx_msg_t)) {
4732		PR0("Expected %lu-byte RDX message; received %lu bytes",
4733		    sizeof (vio_rdx_msg_t), msglen);
4734		return (EBADMSG);
4735	}
4736
4737	PR0("Valid RDX message");
4738	return (0);
4739}
4740
4741static int
4742vd_check_seq_num(vd_t *vd, uint64_t seq_num)
4743{
4744	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
4745		PR0("Received seq_num %lu; expected %lu",
4746		    seq_num, (vd->seq_num + 1));
4747		PR0("initiating soft reset");
4748		vd_need_reset(vd, B_FALSE);
4749		return (1);
4750	}
4751
4752	vd->seq_num = seq_num;
4753	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
4754	return (0);
4755}
4756
4757/*
4758 * Return the expected size of an inband-descriptor message with all the
4759 * cookies it claims to include
4760 */
4761static size_t
4762expected_inband_size(vd_dring_inband_msg_t *msg)
4763{
4764	return ((sizeof (*msg)) +
4765	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
4766}
4767
4768/*
4769 * Process an in-band descriptor message:  used with clients like OBP, with
4770 * which vds exchanges descriptors within VIO message payloads, rather than
4771 * operating on them within a descriptor ring
4772 */
4773static int
4774vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
4775{
4776	size_t			expected;
4777	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;
4778
4779
4780	ASSERT(msglen >= sizeof (msg->tag));
4781
4782	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
4783	    VIO_DESC_DATA)) {
4784		PR1("Message is not an in-band-descriptor message");
4785		return (ENOMSG);
4786	}
4787
4788	if (msglen < sizeof (*desc_msg)) {
4789		PR0("Expected at least %lu-byte descriptor message; "
4790		    "received %lu bytes", sizeof (*desc_msg), msglen);
4791		return (EBADMSG);
4792	}
4793
4794	if (msglen != (expected = expected_inband_size(desc_msg))) {
4795		PR0("Expected %lu-byte descriptor message; "
4796		    "received %lu bytes", expected, msglen);
4797		return (EBADMSG);
4798	}
4799
4800	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0)
4801		return (EBADMSG);
4802
4803	/*
4804	 * Valid message:  Set up the in-band descriptor task and process the
4805	 * request.  Arrange to acknowledge the client's message, unless an
4806	 * error processing the descriptor task results in setting
4807	 * VIO_SUBTYPE_NACK
4808	 */
4809	PR1("Valid in-band-descriptor message");
4810	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
4811
4812	ASSERT(vd->inband_task.msg != NULL);
4813
4814	bcopy(msg, vd->inband_task.msg, msglen);
4815	vd->inband_task.msglen	= msglen;
4816
4817	/*
4818	 * The task request is now the payload of the message
4819	 * that was just copied into the body of the task.
4820	 */
4821	desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg;
4822	vd->inband_task.request	= &desc_msg->payload;
4823
4824	return (vd_process_task(&vd->inband_task));
4825}
4826
4827static int
4828vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
4829    vio_msg_t *msg, size_t msglen)
4830{
4831	int			status;
4832	boolean_t		ready;
4833	on_trap_data_t		otd;
4834	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
4835
4836	/* Accept the updated dring element */
4837	if ((status = VIO_DRING_ACQUIRE(&otd, vd->dring_mtype,
4838	    vd->dring_handle, idx, idx)) != 0) {
4839		return (status);
4840	}
4841	ready = (elem->hdr.dstate == VIO_DESC_READY);
4842	if (ready) {
4843		elem->hdr.dstate = VIO_DESC_ACCEPTED;
4844		bcopy(&elem->payload, vd->dring_task[idx].request,
4845		    (vd->descriptor_size - sizeof (vio_dring_entry_hdr_t)));
4846	} else {
4847		PR0("descriptor %u not ready", idx);
4848		VD_DUMP_DRING_ELEM(elem);
4849	}
4850	if ((status = VIO_DRING_RELEASE(vd->dring_mtype,
4851	    vd->dring_handle, idx, idx)) != 0) {
4852		PR0("VIO_DRING_RELEASE() returned errno %d", status);
4853		return (status);
4854	}
4855	if (!ready)
4856		return (EBUSY);
4857
4858
4859	/* Initialize a task and process the accepted element */
4860	PR1("Processing dring element %u", idx);
4861	vd->dring_task[idx].type	= type;
4862
4863	/* duplicate msg buf for cookies etc. */
4864	bcopy(msg, vd->dring_task[idx].msg, msglen);
4865
4866	vd->dring_task[idx].msglen	= msglen;
4867	return (vd_process_task(&vd->dring_task[idx]));
4868}
4869
4870static int
4871vd_process_element_range(vd_t *vd, int start, int end,
4872    vio_msg_t *msg, size_t msglen)
4873{
4874	int		i, n, nelem, status = 0;
4875	boolean_t	inprogress = B_FALSE;
4876	vd_task_type_t	type;
4877
4878
4879	ASSERT(start >= 0);
4880	ASSERT(end >= 0);
4881
4882	/*
4883	 * Arrange to acknowledge the client's message, unless an error
4884	 * processing one of the dring elements results in setting
4885	 * VIO_SUBTYPE_NACK
4886	 */
4887	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
4888
4889	/*
4890	 * Process the dring elements in the range
4891	 */
4892	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
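	/*
	 * For example (illustrative numbers), with dring_len = 32,
	 * start = 30 and end = 1, the range wraps around the ring and
	 * nelem = (1 + 32) - 30 + 1 = 4, covering elements 30, 31, 0, 1.
	 */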
4893	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
4894		((vio_dring_msg_t *)msg)->end_idx = i;
4895		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
4896		status = vd_process_element(vd, type, i, msg, msglen);
4897		if (status == EINPROGRESS)
4898			inprogress = B_TRUE;
4899		else if (status != 0)
4900			break;
4901	}
4902
4903	/*
4904	 * If some, but not all, operations of a multi-element range are in
4905	 * progress, wait for other operations to complete before returning
4906	 * (which will result in "ack" or "nack" of the message).  Note that
4907	 * all outstanding operations will need to complete, not just the ones
 * corresponding to the current range of dring elements; however, as
4909	 * this situation is an error case, performance is less critical.
4910	 */
4911	if ((nelem > 1) && (status != EINPROGRESS) && inprogress) {
4912		if (vd->ioq != NULL)
4913			ddi_taskq_wait(vd->ioq);
4914		ddi_taskq_wait(vd->completionq);
4915	}
4916
4917	return (status);
4918}
4919
4920static int
4921vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
4922{
4923	vio_dring_msg_t	*dring_msg = (vio_dring_msg_t *)msg;
4924
4925
4926	ASSERT(msglen >= sizeof (msg->tag));
4927
4928	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
4929	    VIO_DRING_DATA)) {
4930		PR1("Message is not a dring-data message");
4931		return (ENOMSG);
4932	}
4933
4934	if (msglen != sizeof (*dring_msg)) {
4935		PR0("Expected %lu-byte dring message; received %lu bytes",
4936		    sizeof (*dring_msg), msglen);
4937		return (EBADMSG);
4938	}
4939
4940	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0)
4941		return (EBADMSG);
4942
4943	if (dring_msg->dring_ident != vd->dring_ident) {
4944		PR0("Expected dring ident %lu; received ident %lu",
4945		    vd->dring_ident, dring_msg->dring_ident);
4946		return (EBADMSG);
4947	}
4948
4949	if (dring_msg->start_idx >= vd->dring_len) {
4950		PR0("\"start_idx\" = %u; must be less than %u",
4951		    dring_msg->start_idx, vd->dring_len);
4952		return (EBADMSG);
4953	}
4954
4955	if ((dring_msg->end_idx < 0) ||
4956	    (dring_msg->end_idx >= vd->dring_len)) {
4957		PR0("\"end_idx\" = %u; must be >= 0 and less than %u",
4958		    dring_msg->end_idx, vd->dring_len);
4959		return (EBADMSG);
4960	}
4961
4962	/* Valid message; process range of updated dring elements */
4963	PR1("Processing descriptor range, start = %u, end = %u",
4964	    dring_msg->start_idx, dring_msg->end_idx);
4965	return (vd_process_element_range(vd, dring_msg->start_idx,
4966	    dring_msg->end_idx, msg, msglen));
4967}
4968
4969static int
4970recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
4971{
4972	int	retry, status;
4973	size_t	size = *nbytes;
4974
4975
4976	for (retry = 0, status = ETIMEDOUT;
4977	    retry < vds_ldc_retries && status == ETIMEDOUT;
4978	    retry++) {
4979		PR1("ldc_read() attempt %d", (retry + 1));
4980		*nbytes = size;
4981		status = ldc_read(ldc_handle, msg, nbytes);
4982	}
4983
4984	if (status) {
4985		PR0("ldc_read() returned errno %d", status);
4986		if (status != ECONNRESET)
4987			return (ENOMSG);
4988		return (status);
4989	} else if (*nbytes == 0) {
4990		PR1("ldc_read() returned 0 and no message read");
4991		return (ENOMSG);
4992	}
4993
4994	PR1("RCVD %lu-byte message", *nbytes);
4995	return (0);
4996}
4997
4998static int
4999vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
5000{
5001	int		status;
5002
5003
5004	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
5005	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
5006#ifdef	DEBUG
5007	vd_decode_tag(msg);
5008#endif
5009
5010	/*
5011	 * Validate session ID up front, since it applies to all messages
5012	 * once set
5013	 */
5014	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
5015		PR0("Expected SID %u, received %u", vd->sid,
5016		    msg->tag.vio_sid);
5017		return (EBADMSG);
5018	}
5019
5020	PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state));
5021
5022	/*
5023	 * Process the received message based on connection state
5024	 */
5025	switch (vd->state) {
5026	case VD_STATE_INIT:	/* expect version message */
5027		if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0)
5028			return (status);
5029
5030		/* Version negotiated, move to that state */
5031		vd->state = VD_STATE_VER;
5032		return (0);
5033
5034	case VD_STATE_VER:	/* expect attribute message */
5035		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
5036			return (status);
5037
5038		/* Attributes exchanged, move to that state */
5039		vd->state = VD_STATE_ATTR;
5040		return (0);
5041
5042	case VD_STATE_ATTR:
5043		switch (vd->xfer_mode) {
5044		case VIO_DESC_MODE:	/* expect RDX message */
5045			if ((status = process_rdx_msg(msg, msglen)) != 0)
5046				return (status);
5047
5048			/* Ready to receive in-band descriptors */
5049			vd->state = VD_STATE_DATA;
5050			return (0);
5051
5052		case VIO_DRING_MODE_V1_0:  /* expect register-dring message */
5053			if ((status =
5054			    vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
5055				return (status);
5056
5057			/* One dring negotiated, move to that state */
5058			vd->state = VD_STATE_DRING;
5059			return (0);
5060
5061		default:
			ASSERT(!"Unsupported transfer mode");
5063			PR0("Unsupported transfer mode");
5064			return (ENOTSUP);
5065		}
5066
5067	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
5068		if ((status = process_rdx_msg(msg, msglen)) == 0) {
5069			/* Ready to receive data */
5070			vd->state = VD_STATE_DATA;
5071			return (0);
5072		} else if (status != ENOMSG) {
5073			return (status);
5074		}
5075
5076
5077		/*
5078		 * If another register-dring message is received, stay in
5079		 * dring state in case the client sends RDX; although the
5080		 * protocol allows multiple drings, this server does not
5081		 * support using more than one
5082		 */
5083		if ((status =
5084		    vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
5085			return (status);
5086
5087		/*
5088		 * Acknowledge an unregister-dring message, but reset the
5089		 * connection anyway:  Although the protocol allows
5090		 * unregistering drings, this server cannot serve a vdisk
5091		 * without its only dring
5092		 */
5093		status = vd_process_dring_unreg_msg(vd, msg, msglen);
5094		return ((status == 0) ? ENOTSUP : status);
5095
5096	case VD_STATE_DATA:
5097		switch (vd->xfer_mode) {
5098		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
5099			return (vd_process_desc_msg(vd, msg, msglen));
5100
5101		case VIO_DRING_MODE_V1_0: /* expect dring-data or unreg-dring */
5102			/*
5103			 * Typically expect dring-data messages, so handle
5104			 * them first
5105			 */
5106			if ((status = vd_process_dring_msg(vd, msg,
5107			    msglen)) != ENOMSG)
5108				return (status);
5109
5110			/*
5111			 * Acknowledge an unregister-dring message, but reset
5112			 * the connection anyway:  Although the protocol
5113			 * allows unregistering drings, this server cannot
5114			 * serve a vdisk without its only dring
5115			 */
5116			status = vd_process_dring_unreg_msg(vd, msg, msglen);
5117			return ((status == 0) ? ENOTSUP : status);
5118
5119		default:
			ASSERT(!"Unsupported transfer mode");
5121			PR0("Unsupported transfer mode");
5122			return (ENOTSUP);
5123		}
5124
5125	default:
		ASSERT(!"Invalid client connection state");
5127		PR0("Invalid client connection state");
5128		return (ENOTSUP);
5129	}
5130}
5131
5132static int
5133vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
5134{
5135	int		status;
5136	boolean_t	reset_ldc = B_FALSE;
5137	vd_task_t	task;
5138
5139	/*
5140	 * Check that the message is at least big enough for a "tag", so that
5141	 * message processing can proceed based on tag-specified message type
5142	 */
5143	if (msglen < sizeof (vio_msg_tag_t)) {
5144		PR0("Received short (%lu-byte) message", msglen);
5145		/* Can't "nack" short message, so drop the big hammer */
5146		PR0("initiating full reset");
5147		vd_need_reset(vd, B_TRUE);
5148		return (EBADMSG);
5149	}
5150
5151	/*
5152	 * Process the message
5153	 */
5154	switch (status = vd_do_process_msg(vd, msg, msglen)) {
5155	case 0:
5156		/* "ack" valid, successfully-processed messages */
5157		msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
5158		break;
5159
5160	case EINPROGRESS:
5161		/* The completion handler will "ack" or "nack" the message */
5162		return (EINPROGRESS);
5163	case ENOMSG:
5164		PR0("Received unexpected message");
5165		_NOTE(FALLTHROUGH);
5166	case EBADMSG:
5167	case ENOTSUP:
5168		/* "transport" error will cause NACK of invalid messages */
5169		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
5170		break;
5171
5172	default:
5173		/* "transport" error will cause NACK of invalid messages */
5174		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
5175		/* An LDC error probably occurred, so try resetting it */
5176		reset_ldc = B_TRUE;
5177		break;
5178	}
5179
5180	PR1("\tResulting in state %d (%s)", vd->state,
5181	    vd_decode_state(vd->state));
5182
5183	/* populate the task so we can dispatch it on the taskq */
5184	task.vd = vd;
5185	task.msg = msg;
5186	task.msglen = msglen;
5187
5188	/*
5189	 * Queue a task to send the notification that the operation completed.
5190	 * We need to ensure that requests are responded to in the correct
5191	 * order and since the taskq is processed serially this ordering
5192	 * is maintained.
5193	 */
5194	(void) ddi_taskq_dispatch(vd->completionq, vd_serial_notify,
5195	    &task, DDI_SLEEP);
5196
5197	/*
	 * To ensure handshake negotiations do not happen out of order,
	 * requests that come through this path must not be processed in
	 * parallel, so we wait here until the response is sent to the client.
5201	 */
5202	ddi_taskq_wait(vd->completionq);
5203
5204	/* Arrange to reset the connection for nack'ed or failed messages */
5205	if ((status != 0) || reset_ldc) {
5206		PR0("initiating %s reset",
5207		    (reset_ldc) ? "full" : "soft");
5208		vd_need_reset(vd, reset_ldc);
5209	}
5210
5211	return (status);
5212}
5213
5214static boolean_t
5215vd_enabled(vd_t *vd)
5216{
5217	boolean_t	enabled;
5218
5219	mutex_enter(&vd->lock);
5220	enabled = vd->enabled;
5221	mutex_exit(&vd->lock);
5222	return (enabled);
5223}
5224
5225static void
5226vd_recv_msg(void *arg)
5227{
5228	vd_t	*vd = (vd_t *)arg;
5229	int	rv = 0, status = 0;
5230
5231	ASSERT(vd != NULL);
5232
5233	PR2("New task to receive incoming message(s)");
5234
5235
5236	while (vd_enabled(vd) && status == 0) {
5237		size_t		msglen, msgsize;
5238		ldc_status_t	lstatus;
5239
5240		/*
5241		 * Receive and process a message
5242		 */
5243		vd_reset_if_needed(vd);	/* can change vd->max_msglen */
5244
5245		/*
5246		 * check if channel is UP - else break out of loop
5247		 */
		status = ldc_status(vd->ldc_handle, &lstatus);
		if (status != 0 || lstatus != LDC_UP) {
5250			PR0("channel not up (status=%d), exiting recv loop\n",
5251			    lstatus);
5252			break;
5253		}
5254
5255		ASSERT(vd->max_msglen != 0);
5256
5257		msgsize = vd->max_msglen; /* stable copy for alloc/free */
5258		msglen	= msgsize;	  /* actual len after recv_msg() */
5259
5260		status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen);
5261		switch (status) {
5262		case 0:
5263			rv = vd_process_msg(vd, (void *)vd->vio_msgp, msglen);
5264			/* check if max_msglen changed */
5265			if (msgsize != vd->max_msglen) {
5266				PR0("max_msglen changed 0x%lx to 0x%lx bytes\n",
5267				    msgsize, vd->max_msglen);
5268				kmem_free(vd->vio_msgp, msgsize);
5269				vd->vio_msgp =
5270				    kmem_alloc(vd->max_msglen, KM_SLEEP);
5271			}
5272			if (rv == EINPROGRESS)
5273				continue;
5274			break;
5275
5276		case ENOMSG:
5277			break;
5278
5279		case ECONNRESET:
5280			PR0("initiating soft reset (ECONNRESET)\n");
5281			vd_need_reset(vd, B_FALSE);
5282			status = 0;
5283			break;
5284
5285		default:
5286			/* Probably an LDC failure; arrange to reset it */
5287			PR0("initiating full reset (status=0x%x)", status);
5288			vd_need_reset(vd, B_TRUE);
5289			break;
5290		}
5291	}
5292
5293	PR2("Task finished");
5294}
5295
5296static uint_t
5297vd_handle_ldc_events(uint64_t event, caddr_t arg)
5298{
5299	vd_t	*vd = (vd_t *)(void *)arg;
5300	int	status;
5301
5302	ASSERT(vd != NULL);
5303
5304	if (!vd_enabled(vd))
5305		return (LDC_SUCCESS);
5306
5307	if (event & LDC_EVT_DOWN) {
5308		PR0("LDC_EVT_DOWN: LDC channel went down");
5309
5310		vd_need_reset(vd, B_TRUE);
5311		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
5312		    DDI_SLEEP);
5313		if (status == DDI_FAILURE) {
5314			PR0("cannot schedule task to recv msg\n");
5315			vd_need_reset(vd, B_TRUE);
5316		}
5317	}
5318
5319	if (event & LDC_EVT_RESET) {
5320		PR0("LDC_EVT_RESET: LDC channel was reset");
5321
5322		if (vd->state != VD_STATE_INIT) {
			PR0("scheduling soft reset");
5324			vd_need_reset(vd, B_FALSE);
5325			status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
5326			    vd, DDI_SLEEP);
5327			if (status == DDI_FAILURE) {
5328				PR0("cannot schedule task to recv msg\n");
5329				vd_need_reset(vd, B_TRUE);
5330			}
5331
5332		} else {
5333			PR0("channel already reset, ignoring...\n");
5334			PR0("doing ldc up...\n");
5335			(void) ldc_up(vd->ldc_handle);
5336		}
5337
5338		return (LDC_SUCCESS);
5339	}
5340
5341	if (event & LDC_EVT_UP) {
5342		PR0("EVT_UP: LDC is up\nResetting client connection state");
5343		PR0("initiating soft reset");
5344		vd_need_reset(vd, B_FALSE);
5345		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
5346		    vd, DDI_SLEEP);
5347		if (status == DDI_FAILURE) {
5348			PR0("cannot schedule task to recv msg\n");
5349			vd_need_reset(vd, B_TRUE);
5350			return (LDC_SUCCESS);
5351		}
5352	}
5353
5354	if (event & LDC_EVT_READ) {
		PR1("New data available");
5358		/* Queue a task to receive the new data */
5359		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
5360		    DDI_SLEEP);
5361
5362		if (status == DDI_FAILURE) {
5363			PR0("cannot schedule task to recv msg\n");
5364			vd_need_reset(vd, B_TRUE);
5365		}
5366	}
5367
5368	return (LDC_SUCCESS);
5369}
5370
5371static uint_t
5372vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
5373{
5374	_NOTE(ARGUNUSED(key, val))
5375	(*((uint_t *)arg))++;
5376	return (MH_WALK_TERMINATE);
5377}
5378
5379
5380static int
5381vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5382{
5383	uint_t	vd_present = 0;
5384	minor_t	instance;
5385	vds_t	*vds;
5386
5387
5388	switch (cmd) {
5389	case DDI_DETACH:
5390		/* the real work happens below */
5391		break;
5392	case DDI_SUSPEND:
5393		PR0("No action required for DDI_SUSPEND");
5394		return (DDI_SUCCESS);
5395	default:
5396		PR0("Unrecognized \"cmd\"");
5397		return (DDI_FAILURE);
5398	}
5399
5400	ASSERT(cmd == DDI_DETACH);
5401	instance = ddi_get_instance(dip);
5402	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
5403		PR0("Could not get state for instance %u", instance);
5404		ddi_soft_state_free(vds_state, instance);
5405		return (DDI_FAILURE);
5406	}
5407
	/* Do not detach while serving any vdisks */
5409	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
5410	if (vd_present) {
5411		PR0("Not detaching because serving vdisks");
5412		return (DDI_FAILURE);
5413	}
5414
5415	PR0("Detaching");
5416	if (vds->initialized & VDS_MDEG) {
5417		(void) mdeg_unregister(vds->mdeg);
5418		kmem_free(vds->ispecp->specp, sizeof (vds_prop_template));
5419		kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t));
5420		vds->ispecp = NULL;
5421		vds->mdeg = 0;
5422	}
5423
5424	vds_driver_types_free(vds);
5425
5426	if (vds->initialized & VDS_LDI)
5427		(void) ldi_ident_release(vds->ldi_ident);
5428	mod_hash_destroy_hash(vds->vd_table);
5429	ddi_soft_state_free(vds_state, instance);
5430	return (DDI_SUCCESS);
5431}
5432
5433/*
5434 * Description:
5435 *	This function checks to see if the disk image being used as a
5436 *	virtual disk is an ISO image. An ISO image is a special case
5437 *	which can be booted/installed from like a CD/DVD.
5438 *
5439 * Parameters:
5440 *	vd		- disk on which the operation is performed.
5441 *
5442 * Return Code:
5443 *	B_TRUE		- The disk image is an ISO 9660 compliant image
5444 *	B_FALSE		- just a regular disk image
5445 */
5446static boolean_t
5447vd_dskimg_is_iso_image(vd_t *vd)
5448{
5449	char	iso_buf[ISO_SECTOR_SIZE];
5450	int	i, rv;
5451	uint_t	sec;
5452
5453	ASSERT(VD_DSKIMG(vd));
5454
5455	/*
5456	 * If we have already discovered and saved this info we can
5457	 * short-circuit the check and avoid reading the disk image.
5458	 */
5459	if (vd->vdisk_media == VD_MEDIA_DVD || vd->vdisk_media == VD_MEDIA_CD)
5460		return (B_TRUE);
5461
5462	/*
5463	 * We wish to read the sector that should contain the 2nd ISO volume
5464	 * descriptor. The second field in this descriptor is called the
	 * Standard Identifier and is set to CD001 for a CD-ROM that
	 * complies with the ISO 9660 standard.
5467	 */
5468	sec = (ISO_VOLDESC_SEC * ISO_SECTOR_SIZE) / vd->vdisk_bsize;
5469	rv = vd_dskimg_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)iso_buf,
5470	    sec, ISO_SECTOR_SIZE);
5471
5472	if (rv < 0)
5473		return (B_FALSE);
5474
5475	for (i = 0; i < ISO_ID_STRLEN; i++) {
5476		if (ISO_STD_ID(iso_buf)[i] != ISO_ID_STRING[i])
5477			return (B_FALSE);
5478	}
5479
5480	return (B_TRUE);
5481}
5482
5483/*
5484 * Description:
5485 *	This function checks to see if the virtual device is an ATAPI
5486 *	device. ATAPI devices use Group 1 Read/Write commands, so
5487 *	any USCSI calls vds makes need to take this into account.
5488 *
5489 * Parameters:
5490 *	vd		- disk on which the operation is performed.
5491 *
5492 * Return Code:
5493 *	B_TRUE		- The virtual disk is backed by an ATAPI device
5494 *	B_FALSE		- not an ATAPI device (presumably SCSI)
5495 */
5496static boolean_t
5497vd_is_atapi_device(vd_t *vd)
5498{
5499	boolean_t	is_atapi = B_FALSE;
5500	char		*variantp;
5501	int		rv;
5502
5503	ASSERT(vd->ldi_handle[0] != NULL);
5504	ASSERT(!vd->file);
5505
5506	rv = ldi_prop_lookup_string(vd->ldi_handle[0],
5507	    (LDI_DEV_T_ANY | DDI_PROP_DONTPASS), "variant", &variantp);
5508	if (rv == DDI_PROP_SUCCESS) {
5509		PR0("'variant' property exists for %s", vd->device_path);
5510		if (strcmp(variantp, "atapi") == 0)
5511			is_atapi = B_TRUE;
5512		ddi_prop_free(variantp);
5513	}
5514
5515	rv = ldi_prop_exists(vd->ldi_handle[0], LDI_DEV_T_ANY, "atapi");
5516	if (rv) {
5517		PR0("'atapi' property exists for %s", vd->device_path);
5518		is_atapi = B_TRUE;
5519	}
5520
5521	return (is_atapi);
5522}
5523
5524static int
5525vd_setup_full_disk(vd_t *vd)
5526{
5527	int		status;
5528	major_t		major = getmajor(vd->dev[0]);
5529	minor_t		minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
5530
5531	ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
5532
5533	/* set the disk size, block size and the media type of the disk */
5534	status = vd_backend_check_size(vd);
5535
5536	if (status != 0) {
5537		if (!vd->scsi) {
5538			/* unexpected failure */
5539			PRN("Check size failed for %s (errno %d)",
5540			    vd->device_path, status);
5541			return (EIO);
5542		}
5543
5544		/*
5545		 * The function can fail for SCSI disks which are present but
		 * reserved by another system. In that case, we know neither
		 * the size of the disk nor its block size.
5548		 */
5549		vd->vdisk_size = VD_SIZE_UNKNOWN;
5550		vd->vdisk_bsize = 0;
5551		vd->backend_bsize = 0;
5552		vd->vdisk_media = VD_MEDIA_FIXED;
5553	}
5554
5555	/* Move dev number and LDI handle to entire-disk-slice array elements */
5556	vd->dev[VD_ENTIRE_DISK_SLICE]		= vd->dev[0];
5557	vd->dev[0]				= 0;
5558	vd->ldi_handle[VD_ENTIRE_DISK_SLICE]	= vd->ldi_handle[0];
5559	vd->ldi_handle[0]			= NULL;
5560
5561	/* Initialize device numbers for remaining slices and open them */
5562	for (int slice = 0; slice < vd->nslices; slice++) {
5563		/*
5564		 * Skip the entire-disk slice, as it's already open and its
5565		 * device known
5566		 */
5567		if (slice == VD_ENTIRE_DISK_SLICE)
5568			continue;
5569		ASSERT(vd->dev[slice] == 0);
5570		ASSERT(vd->ldi_handle[slice] == NULL);
5571
5572		/*
5573		 * Construct the device number for the current slice
5574		 */
5575		vd->dev[slice] = makedevice(major, (minor + slice));
5576
5577		/*
5578		 * Open all slices of the disk to serve them to the client.
5579		 * Slices are opened exclusively to prevent other threads or
5580		 * processes in the service domain from performing I/O to
5581		 * slices being accessed by a client.  Failure to open a slice
5582		 * results in vds not serving this disk, as the client could
5583		 * attempt (and should be able) to access any slice immediately.
5584		 * Any slices successfully opened before a failure will get
5585		 * closed by vds_destroy_vd() as a result of the error returned
5586		 * by this function.
5587		 *
		 * Opening an empty slice fails unless FNDELAY is used, so the
		 * open below is retried with FNDELAY if the first attempt
		 * fails.
5590		 */
5591		PR0("Opening device major %u, minor %u = slice %u",
5592		    major, minor, slice);
5593
5594		/*
		 * Try to open the device. As noted above, this can fail if,
		 * for example, we are opening an empty slice, so on failure
		 * we retry the open with the FNDELAY flag.
5598		 */
5599		status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
5600		    vd->open_flags, kcred, &vd->ldi_handle[slice],
5601		    vd->vds->ldi_ident);
5602
5603		if (status != 0) {
5604			status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
5605			    vd->open_flags | FNDELAY, kcred,
5606			    &vd->ldi_handle[slice], vd->vds->ldi_ident);
5607		}
5608
5609		if (status != 0) {
5610			PRN("ldi_open_by_dev() returned errno %d "
5611			    "for slice %u", status, slice);
5612			/* vds_destroy_vd() will close any open slices */
5613			vd->ldi_handle[slice] = NULL;
5614			return (status);
5615		}
5616	}
5617
5618	return (0);
5619}
5620
5621/*
5622 * When a slice or a volume is exported as a single-slice disk, we want
5623 * the disk backend (i.e. the slice or volume) to be entirely mapped as
5624 * a slice without the addition of any metadata.
5625 *
5626 * So when exporting the disk as a VTOC disk, we fake a disk with the following
5627 * layout:
5628 *                flabel +--- flabel_limit
5629 *                 <->   V
5630 *                 0 1   C                          D  E
5631 *                 +-+---+--------------------------+--+
5632 *  virtual disk:  |L|XXX|           slice 0        |AA|
5633 *                 +-+---+--------------------------+--+
5634 *                  ^    :                          :
5635 *                  |    :                          :
5636 *      VTOC LABEL--+    :                          :
5637 *                       +--------------------------+
5638 *  disk backend:        |     slice/volume/file    |
5639 *                       +--------------------------+
5640 *                       0                          N
5641 *
5642 * N is the number of blocks in the slice/volume/file.
5643 *
5644 * We simulate a disk with N+M blocks, where M is the number of blocks
 * simulated at the beginning and at the end of the disk (blocks 0-C
5646 * and D-E).
5647 *
 * The first blocks (0 to C-1) are emulated and cannot be changed. Blocks C
 * to D-1 define slice 0 and are mapped to the backend. Finally we emulate 2
5650 * alternate cylinders at the end of the disk (blocks D-E). In summary we have:
5651 *
5652 * - block 0 (L) returns a fake VTOC label
5653 * - blocks 1 to C-1 (X) are unused and return 0
5654 * - blocks C to D-1 are mapped to the exported slice or volume
5655 * - blocks D and E (A) are blocks defining alternate cylinders (2 cylinders)
5656 *
5657 * Note: because we define a fake disk geometry, it is possible that the length
 * of the backend is not a multiple of the cylinder size; in that case, the
5659 * very end of the backend will not map to any block of the virtual disk.
5660 */
5661static int
5662vd_setup_partition_vtoc(vd_t *vd)
5663{
5664	char *device_path = vd->device_path;
5665	char unit;
5666	size_t size, csize;
5667
5668	/* Initialize dk_geom structure for single-slice device */
5669	if (vd->dk_geom.dkg_nsect == 0) {
5670		PRN("%s geometry claims 0 sectors per track", device_path);
5671		return (EIO);
5672	}
5673	if (vd->dk_geom.dkg_nhead == 0) {
5674		PRN("%s geometry claims 0 heads", device_path);
5675		return (EIO);
5676	}
5677
5678	/* size of a cylinder in block */
5679	csize = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
5680
5681	/*
5682	 * Add extra cylinders: we emulate the first cylinder (which contains
5683	 * the disk label).
5684	 */
5685	vd->dk_geom.dkg_ncyl = vd->vdisk_size / csize + 1;
5686
5687	/* we emulate 2 alternate cylinders */
5688	vd->dk_geom.dkg_acyl = 2;
5689	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
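	/*
	 * Continuing the example above: with N = 2097152 and csize = 73728,
	 * dkg_ncyl = 2097152 / 73728 + 1 = 29 and dkg_pcyl = 29 + 2 = 31.
	 */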
5690
5691
5692	/* Initialize vtoc structure for single-slice device */
5693	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
5694	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
5695	vd->vtoc.v_part[0].p_flag = 0;
5696	/*
5697	 * Partition 0 starts on cylinder 1 and its size has to be
	 * a whole number of cylinders.
5699	 */
5700	vd->vtoc.v_part[0].p_start = csize; /* start on cylinder 1 */
5701	vd->vtoc.v_part[0].p_size = (vd->vdisk_size / csize) * csize;
5702
5703	if (vd_slice_single_slice) {
5704		vd->vtoc.v_nparts = 1;
5705		bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
5706		    MIN(sizeof (VD_ASCIILABEL),
5707		    sizeof (vd->vtoc.v_asciilabel)));
5708		bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
5709		    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
5710	} else {
5711		/* adjust the number of slices */
5712		vd->nslices = V_NUMPAR;
5713		vd->vtoc.v_nparts = V_NUMPAR;
5714
5715		/* define slice 2 representing the entire disk */
5716		vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_tag = V_BACKUP;
5717		vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_flag = 0;
5718		vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_start = 0;
5719		vd->vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_size =
5720		    vd->dk_geom.dkg_ncyl * csize;
5721
5722		vd_get_readable_size(vd->vdisk_size * vd->vdisk_bsize,
5723		    &size, &unit);
5724
5725		/*
5726		 * Set some attributes of the geometry to what format(1m) uses
5727		 * so that writing a default label using format(1m) does not
5728		 * produce any error.
5729		 */
5730		vd->dk_geom.dkg_bcyl = 0;
5731		vd->dk_geom.dkg_intrlv = 1;
5732		vd->dk_geom.dkg_write_reinstruct = 0;
5733		vd->dk_geom.dkg_read_reinstruct = 0;
5734
5735		/*
		 * We must have a correct label name, otherwise format(1m)
		 * will not recognize the disk as labeled.
5738		 */
5739		(void) snprintf(vd->vtoc.v_asciilabel, LEN_DKL_ASCII,
5740		    "SUN-DiskSlice-%ld%cB cyl %d alt %d hd %d sec %d",
5741		    size, unit,
5742		    vd->dk_geom.dkg_ncyl, vd->dk_geom.dkg_acyl,
5743		    vd->dk_geom.dkg_nhead, vd->dk_geom.dkg_nsect);
5744		bzero(vd->vtoc.v_volume, sizeof (vd->vtoc.v_volume));
5745
5746		/* create a fake label from the vtoc and geometry */
5747		vd->flabel_limit = (uint_t)csize;
5748		vd->flabel_size = VD_LABEL_VTOC_SIZE(vd->vdisk_bsize);
5749		vd->flabel = kmem_zalloc(vd->flabel_size, KM_SLEEP);
5750		vd_vtocgeom_to_label(&vd->vtoc, &vd->dk_geom,
5751		    VD_LABEL_VTOC(vd));
5752	}
5753
	/* adjust the vdisk_size: we emulate 3 extra cylinders */
5755	vd->vdisk_size += csize * 3;
5756
5757	return (0);
5758}
5759
5760/*
5761 * When a slice, volume or file is exported as a single-slice disk, we want
5762 * the disk backend (i.e. the slice, volume or file) to be entirely mapped
5763 * as a slice without the addition of any metadata.
5764 *
5765 * So when exporting the disk as an EFI disk, we fake a disk with the following
5766 * layout: (assuming the block size is 512 bytes)
5767 *
5768 *                  flabel        +--- flabel_limit
5769 *                 <------>       v
5770 *                 0 1 2  L      34                        34+N      P
5771 *                 +-+-+--+-------+--------------------------+-------+
5772 *  virtual disk:  |X|T|EE|XXXXXXX|           slice 0        |RRRRRRR|
5773 *                 +-+-+--+-------+--------------------------+-------+
5774 *                    ^ ^         :                          :
5775 *                    | |         :                          :
5776 *                GPT-+ +-GPE     :                          :
5777 *                                +--------------------------+
5778 *  disk backend:                 |     slice/volume/file    |
5779 *                                +--------------------------+
5780 *                                0                          N
5781 *
5782 * N is the number of blocks in the slice/volume/file.
5783 *
5784 * We simulate a disk with N+M blocks, where M is the number of blocks
 * simulated at the beginning and at the end of the disk (blocks 0-33
 * and 34+N+1 to P).
5787 *
 * The first 34 blocks (0 to 33) are emulated and cannot be changed. Blocks 34
 * to 34+N define slice 0 and are mapped to the exported backend, and we
 * emulate some blocks at the end of the disk (blocks 34+N+1 to P) as the EFI
 * reserved partition.
5792 *
 * - block 0 (X) is unused and returns 0
 * - block 1 (T) returns a fake EFI GPT (via DKIOCGETEFI)
 * - blocks 2 to L-1 (E) define a fake EFI GPE (via DKIOCGETEFI)
 * - blocks L to 33 (X) are unused and return 0
 * - blocks 34 to 34+N are mapped to the exported slice, volume or file
 * - blocks 34+N+1 to P define a fake reserved partition and backup label;
 *   reads of these blocks return 0
5800 *
5801 * Note: if the backend size is not a multiple of the vdisk block size then
5802 * the very end of the backend will not map to any block of the virtual disk.
5803 */
5804static int
5805vd_setup_partition_efi(vd_t *vd)
5806{
5807	efi_gpt_t *gpt;
5808	efi_gpe_t *gpe;
5809	struct uuid uuid = EFI_USR;
5810	struct uuid efi_reserved = EFI_RESERVED;
5811	uint32_t crc;
5812	uint64_t s0_start, s0_end, first_u_lba;
5813	size_t bsize;
5814
5815	ASSERT(vd->vdisk_bsize > 0);
5816
5817	bsize = vd->vdisk_bsize;
5818	/*
5819	 * The minimum size for the label is 16K (EFI_MIN_ARRAY_SIZE)
5820	 * for GPEs plus one block for the GPT and one for PMBR.
5821	 */
5822	first_u_lba = (EFI_MIN_ARRAY_SIZE / bsize) + 2;
5823	vd->flabel_limit = (uint_t)first_u_lba;
5824	vd->flabel_size = VD_LABEL_EFI_SIZE(bsize);
5825	vd->flabel = kmem_zalloc(vd->flabel_size, KM_SLEEP);
5826	gpt = VD_LABEL_EFI_GPT(vd, bsize);
5827	gpe = VD_LABEL_EFI_GPE(vd, bsize);
5828
5829	/*
5830	 * Adjust the vdisk_size, we emulate the first few blocks
5831	 * for the disk label.
5832	 */
5833	vd->vdisk_size += first_u_lba;
5834	s0_start = first_u_lba;
5835	s0_end = vd->vdisk_size - 1;
5836
5837	gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE);
5838	gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
5839	gpt->efi_gpt_HeaderSize = LE_32(EFI_HEADER_SIZE);
5840	gpt->efi_gpt_FirstUsableLBA = LE_64(first_u_lba);
5841	gpt->efi_gpt_PartitionEntryLBA = LE_64(2ULL);
5842	gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t));
5843
5844	UUID_LE_CONVERT(gpe[0].efi_gpe_PartitionTypeGUID, uuid);
5845	gpe[0].efi_gpe_StartingLBA = LE_64(s0_start);
5846	gpe[0].efi_gpe_EndingLBA = LE_64(s0_end);
5847
5848	if (vd_slice_single_slice) {
5849		gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1);
5850	} else {
5851		/* adjust the number of slices */
5852		gpt->efi_gpt_NumberOfPartitionEntries = LE_32(VD_MAXPART);
5853		vd->nslices = V_NUMPAR;
5854
5855		/* define a fake reserved partition */
5856		UUID_LE_CONVERT(gpe[VD_MAXPART - 1].efi_gpe_PartitionTypeGUID,
5857		    efi_reserved);
5858		gpe[VD_MAXPART - 1].efi_gpe_StartingLBA =
5859		    LE_64(s0_end + 1);
5860		gpe[VD_MAXPART - 1].efi_gpe_EndingLBA =
5861		    LE_64(s0_end + EFI_MIN_RESV_SIZE);
5862
5863		/* adjust the vdisk_size to include the reserved slice */
5864		vd->vdisk_size += EFI_MIN_RESV_SIZE;
5865	}
5866
5867	gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1);
5868
5869	/* adjust the vdisk size for the backup GPT and GPE */
5870	vd->vdisk_size += (EFI_MIN_ARRAY_SIZE / bsize) + 1;
5871	gpt->efi_gpt_AlternateLBA = LE_64(vd->vdisk_size - 1);
5872
5873	CRC32(crc, gpe, sizeof (efi_gpe_t) * VD_MAXPART, -1U, crc32_table);
5874	gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
5875
5876	CRC32(crc, gpt, EFI_HEADER_SIZE, -1U, crc32_table);
5877	gpt->efi_gpt_HeaderCRC32 = LE_32(~crc);
5878
5879	return (0);
5880}
5881
5882/*
5883 * Setup for a virtual disk whose backend is a file (exported as a single slice
5884 * or as a full disk). In that case, the backend is accessed using the vnode
5885 * interface.
5886 */
5887static int
5888vd_setup_backend_vnode(vd_t *vd)
5889{
5890	int		rval, status;
5891	dev_t		dev;
5892	char		*file_path = vd->device_path;
5893	ldi_handle_t	lhandle;
5894	struct dk_cinfo	dk_cinfo;
5895
5896	ASSERT(!vd->volume);
5897
5898	if ((status = vn_open(file_path, UIO_SYSSPACE, vd->open_flags | FOFFMAX,
5899	    0, &vd->file_vnode, 0, 0)) != 0) {
5900		if ((status == ENXIO || status == ENODEV || status == ENOENT ||
5901		    status == EROFS) && (!(vd->initialized & VD_SETUP_ERROR) &&
5902		    !(DEVI_IS_ATTACHING(vd->vds->dip)))) {
5903			PRN("vn_open(%s) = errno %d", file_path, status);
5904		}
5905		return (status);
5906	}
5907
5908	/*
5909	 * We set vd->file now so that vds_destroy_vd will take care of
5910	 * closing the file and releasing the vnode in case of an error.
5911	 */
5912	vd->file = B_TRUE;
5913
5914	vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */
5915
5916	/*
5917	 * Get max_xfer_sz from the device where the file is.
5918	 */
5919	dev = vd->file_vnode->v_vfsp->vfs_dev;
5920	PR0("underlying device of %s = (%d, %d)\n", file_path,
5921	    getmajor(dev), getminor(dev));
5922
5923	status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, kcred, &lhandle,
5924	    vd->vds->ldi_ident);
5925
5926	if (status != 0) {
5927		PR0("ldi_open() returned errno %d for underlying device",
5928		    status);
5929	} else {
5930		if ((status = ldi_ioctl(lhandle, DKIOCINFO,
5931		    (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
5932		    &rval)) != 0) {
5933			PR0("ldi_ioctl(DKIOCINFO) returned errno %d for "
5934			    "underlying device", status);
5935		} else {
5936			/*
5937			 * Store the device's max transfer size for
5938			 * return to the client
5939			 */
5940			vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
5941		}
5942
5943		PR0("close the underlying device");
5944		(void) ldi_close(lhandle, FREAD, kcred);
5945	}
5946
5947	PR0("using file %s on device (%d, %d), max_xfer = %u blks",
5948	    file_path, getmajor(dev), getminor(dev), vd->max_xfer_sz);
5949
5950	if (vd->vdisk_type == VD_DISK_TYPE_SLICE)
5951		status = vd_setup_slice_image(vd);
5952	else
5953		status = vd_setup_disk_image(vd);
5954
5955	return (status);
5956}
5957
5958static int
5959vd_setup_slice_image(vd_t *vd)
5960{
5961	struct dk_label label;
5962	int status;
5963
5964	if ((status = vd_backend_check_size(vd)) != 0) {
5965		PRN("Check size failed for %s (errno %d)",
5966		    vd->device_path, status);
5967		return (EIO);
5968	}
5969
5970	vd->vdisk_media = VD_MEDIA_FIXED;
5971	vd->vdisk_label = (vd_slice_label == VD_DISK_LABEL_UNK)?
5972	    vd_file_slice_label : vd_slice_label;
5973
5974	if (vd->vdisk_label == VD_DISK_LABEL_EFI ||
5975	    vd->dskimg_size >= 2 * ONE_TERABYTE) {
5976		status = vd_setup_partition_efi(vd);
5977	} else {
5978		/*
5979		 * We build a default label to get a geometry for
5980		 * the vdisk. Then the partition setup function will
5981		 * adjust the vtoc so that it defines a single-slice
5982		 * disk.
5983		 */
5984		vd_build_default_label(vd->dskimg_size, vd->vdisk_bsize,
5985		    &label);
5986		vd_label_to_vtocgeom(&label, &vd->vtoc, &vd->dk_geom);
5987		status = vd_setup_partition_vtoc(vd);
5988	}
5989
5990	return (status);
5991}
5992
5993static int
5994vd_setup_disk_image(vd_t *vd)
5995{
5996	int status;
5997	char *backend_path = vd->device_path;
5998
5999	if ((status = vd_backend_check_size(vd)) != 0) {
6000		PRN("Check size failed for %s (errno %d)",
6001		    backend_path, status);
6002		return (EIO);
6003	}
6004
6005	/* size should be at least sizeof(dk_label) */
6006	if (vd->dskimg_size < sizeof (struct dk_label)) {
6007		PRN("Size of file has to be at least %ld bytes",
6008		    sizeof (struct dk_label));
6009		return (EIO);
6010	}
6011
6012	/*
6013	 * Find and validate the geometry of a disk image.
6014	 */
6015	status = vd_dskimg_validate_geometry(vd);
6016	if (status != 0 && status != EINVAL && status != ENOTSUP) {
6017		PRN("Failed to read label from %s", backend_path);
6018		return (EIO);
6019	}
6020
6021	if (vd_dskimg_is_iso_image(vd)) {
6022		/*
6023		 * Indicate whether to call this a CD or DVD from the size
6024		 * of the ISO image (images for both drive types are stored
		 * in the ISO-9660 format). CDs can store up to just under
		 * 1 GB.
		 */
6027		if ((vd->vdisk_size * vd->vdisk_bsize) > ONE_GIGABYTE)
6028			vd->vdisk_media = VD_MEDIA_DVD;
6029		else
6030			vd->vdisk_media = VD_MEDIA_CD;
6031	} else {
6032		vd->vdisk_media = VD_MEDIA_FIXED;
6033	}
6034
6035	/* Setup devid for the disk image */
6036
6037	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
6038
6039		status = vd_dskimg_read_devid(vd, &vd->dskimg_devid);
6040
6041		if (status == 0) {
6042			/* a valid devid was found */
6043			return (0);
6044		}
6045
6046		if (status != EINVAL) {
6047			/*
6048			 * There was an error while trying to read the devid.
6049			 * So this disk image may have a devid but we are
6050			 * unable to read it.
6051			 */
6052			PR0("can not read devid for %s", backend_path);
6053			vd->dskimg_devid = NULL;
6054			return (0);
6055		}
6056	}
6057
6058	/*
6059	 * No valid device id was found so we create one. Note that a failure
6060	 * to create a device id is not fatal and does not prevent the disk
6061	 * image from being attached.
6062	 */
6063	PR1("creating devid for %s", backend_path);
6064
6065	if (ddi_devid_init(vd->vds->dip, DEVID_FAB, 0, 0,
6066	    &vd->dskimg_devid) != DDI_SUCCESS) {
6067		PR0("fail to create devid for %s", backend_path);
6068		vd->dskimg_devid = NULL;
6069		return (0);
6070	}
6071
6072	/*
6073	 * Write devid to the disk image. The devid is stored into the disk
6074	 * image if we have a valid label; otherwise the devid will be stored
6075	 * when the user writes a valid label.
6076	 */
6077	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
6078		if (vd_dskimg_write_devid(vd, vd->dskimg_devid) != 0) {
6079			PR0("fail to write devid for %s", backend_path);
6080			ddi_devid_free(vd->dskimg_devid);
6081			vd->dskimg_devid = NULL;
6082		}
6083	}
6084
6085	return (0);
6086}
6087
6088
6089/*
6090 * Description:
6091 *	Open a device using its device path (supplied by ldm(1m))
6092 *
6093 * Parameters:
6094 *	vd	- pointer to structure containing the vDisk info
6095 *	flags	- open flags
6096 *
6097 * Return Value
6098 *	0	- success
6099 *	!= 0	- some other non-zero return value from ldi(9F) functions
6100 */
6101static int
6102vd_open_using_ldi_by_name(vd_t *vd, int flags)
6103{
6104	int		status;
6105	char		*device_path = vd->device_path;
6106
6107	/* Attempt to open device */
6108	status = ldi_open_by_name(device_path, flags, kcred,
6109	    &vd->ldi_handle[0], vd->vds->ldi_ident);
6110
6111	/*
6112	 * The open can fail for example if we are opening an empty slice.
6113	 * In case of a failure, we try the open again but this time with
6114	 * the FNDELAY flag.
6115	 */
6116	if (status != 0)
6117		status = ldi_open_by_name(device_path, flags | FNDELAY,
6118		    kcred, &vd->ldi_handle[0], vd->vds->ldi_ident);
6119
6120	if (status != 0) {
6121		PR0("ldi_open_by_name(%s) = errno %d", device_path, status);
6122		vd->ldi_handle[0] = NULL;
6123		return (status);
6124	}
6125
6126	return (0);
6127}
6128
6129/*
 * Setup for a virtual disk whose backend is a device (a physical disk,
6131 * slice or volume device) exported as a full disk or as a slice. In these
6132 * cases, the backend is accessed using the LDI interface.
6133 */
6134static int
6135vd_setup_backend_ldi(vd_t *vd)
6136{
6137	int		rval, status;
6138	struct dk_cinfo	dk_cinfo;
6139	char		*device_path = vd->device_path;
6140
6141	/* device has been opened by vd_identify_dev() */
6142	ASSERT(vd->ldi_handle[0] != NULL);
	ASSERT(vd->dev[0] != 0);
6144
6145	vd->file = B_FALSE;
6146
6147	/* Verify backing device supports dk_cinfo */
6148	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
6149	    (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
6150	    &rval)) != 0) {
6151		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
6152		    status, device_path);
6153		return (status);
6154	}
6155	if (dk_cinfo.dki_partition >= V_NUMPAR) {
6156		PRN("slice %u >= maximum slice %u for %s",
6157		    dk_cinfo.dki_partition, V_NUMPAR, device_path);
6158		return (EIO);
6159	}
6160
6161	/*
	 * The device has been opened read-only by vd_identify_dev(); re-open
	 * it read-write if the write flag is set, unless it is an optical
	 * device such as a CD-ROM. For now we do not permit writes to optical
	 * devices, and thus do not export write operations to the client.
	 *
	 * Future: if/when we implement support for guest domains writing to
	 * optical devices, we will need to do further checking of the media
	 * type to distinguish between read-only and writable discs.
6170	 */
6171	if (dk_cinfo.dki_ctype == DKC_CDROM) {
6172
6173		vd->open_flags &= ~FWRITE;
6174
6175	} else if (vd->open_flags & FWRITE) {
6176
6177		(void) ldi_close(vd->ldi_handle[0], vd->open_flags & ~FWRITE,
6178		    kcred);
6179		status = vd_open_using_ldi_by_name(vd, vd->open_flags);
6180		if (status != 0) {
6181			PR0("Failed to open (%s) = errno %d",
6182			    device_path, status);
6183			return (status);
6184		}
6185	}
6186
6187	/* Store the device's max transfer size for return to the client */
6188	vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
6189
6190	/*
6191	 * We need to work out if it's an ATAPI (IDE CD-ROM) or SCSI device so
6192	 * that we can use the correct CDB group when sending USCSI commands.
6193	 */
6194	vd->is_atapi_dev = vd_is_atapi_device(vd);
6195
6196	/*
6197	 * Export a full disk.
6198	 *
6199	 * The exported device can be either a volume, a disk or a CD/DVD
6200	 * device.  We export a device as a full disk if we have an entire
6201	 * disk slice (slice 2) and if this slice is exported as a full disk
6202	 * and not as a single slice disk. A CD or DVD device is exported
6203	 * as a full disk (even if it isn't s2). A volume is exported as a
6204	 * full disk as long as the "slice" option is not specified.
6205	 */
6206	if (vd->vdisk_type == VD_DISK_TYPE_DISK) {
6207
6208		if (vd->volume) {
6209			/* setup disk image */
6210			return (vd_setup_disk_image(vd));
6211		}
6212
6213		if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE ||
6214		    dk_cinfo.dki_ctype == DKC_CDROM) {
6215			ASSERT(!vd->volume);
6216			if (dk_cinfo.dki_ctype == DKC_SCSI_CCS)
6217				vd->scsi = B_TRUE;
6218			return (vd_setup_full_disk(vd));
6219		}
6220	}
6221
6222	/*
6223	 * Export a single slice disk.
6224	 *
6225	 * The exported device can be either a volume device or a disk slice. If
6226	 * it is a disk slice different from slice 2 then it is always exported
6227	 * as a single slice disk even if the "slice" option is not specified.
6228	 * If it is disk slice 2 or a volume device then it is exported as a
6229	 * single slice disk only if the "slice" option is specified.
6230	 */
6231	return (vd_setup_single_slice_disk(vd));
6232}
6233
6234static int
6235vd_setup_single_slice_disk(vd_t *vd)
6236{
6237	int status, rval;
6238	struct dk_label label;
6239	char *device_path = vd->device_path;
6240	struct vtoc vtoc;
6241
6242	vd->vdisk_media = VD_MEDIA_FIXED;
6243
6244	if (vd->volume) {
6245		ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
6246	}
6247
6248	/*
6249	 * We export the slice as a single slice disk even if the "slice"
6250	 * option was not specified.
6251	 */
6252	vd->vdisk_type  = VD_DISK_TYPE_SLICE;
6253	vd->nslices	= 1;
6254
6255	/* Get size of backing device */
6256	if ((status = vd_backend_check_size(vd)) != 0) {
6257		PRN("Check size failed for %s (errno %d)", device_path, status);
6258		return (EIO);
6259	}
6260
6261	/*
6262	 * When exporting a slice or a device as a single slice disk, we don't
6263	 * care about any partitioning exposed by the backend. The goal is just
6264	 * to export the backend as a flat storage. We provide a fake partition
6265	 * table (either a VTOC or EFI), which presents only one slice, to
6266	 * accommodate tools expecting a disk label. The selection of the label
6267	 * type (VTOC or EFI) depends on the value of the vd_slice_label
6268	 * variable.
6269	 */
6270	if (vd_slice_label == VD_DISK_LABEL_EFI ||
6271	    vd->vdisk_size >= ONE_TERABYTE / vd->vdisk_bsize) {
6272		vd->vdisk_label = VD_DISK_LABEL_EFI;
6273	} else {
6274		status = ldi_ioctl(vd->ldi_handle[0], DKIOCGEXTVTOC,
6275		    (intptr_t)&vd->vtoc, (vd->open_flags | FKIOCTL),
6276		    kcred, &rval);
6277
6278		if (status == ENOTTY) {
6279			/* try with the non-extended vtoc ioctl */
6280			status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC,
6281			    (intptr_t)&vtoc, (vd->open_flags | FKIOCTL),
6282			    kcred, &rval);
6283			vtoctoextvtoc(vtoc, vd->vtoc);
6284		}
6285
6286		if (status == 0) {
6287			status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
6288			    (intptr_t)&vd->dk_geom, (vd->open_flags | FKIOCTL),
6289			    kcred, &rval);
6290
6291			if (status != 0) {
6292				PRN("ldi_ioctl(DKIOCGEOM) returned errno %d "
6293				    "for %s", status, device_path);
6294				return (status);
6295			}
6296			vd->vdisk_label = VD_DISK_LABEL_VTOC;
6297
6298		} else if (vd_slice_label == VD_DISK_LABEL_VTOC) {
6299
6300			vd->vdisk_label = VD_DISK_LABEL_VTOC;
6301			vd_build_default_label(vd->vdisk_size * vd->vdisk_bsize,
6302			    vd->vdisk_bsize, &label);
6303			vd_label_to_vtocgeom(&label, &vd->vtoc, &vd->dk_geom);
6304
6305		} else {
6306			vd->vdisk_label = VD_DISK_LABEL_EFI;
6307		}
6308	}
6309
6310	if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
6311		/* export with a fake VTOC label */
6312		status = vd_setup_partition_vtoc(vd);
6313
6314	} else {
6315		/* export with a fake EFI label */
6316		status = vd_setup_partition_efi(vd);
6317	}
6318
6319	return (status);
6320}
6321
6322/*
 * This function is invoked when setting up the vdisk backend and when
 * processing the VD_OP_GET_CAPACITY operation. It checks the backend size
 * and sets the following attributes of the vd structure:
6326 *
6327 * - vdisk_bsize: block size for the virtual disk used by the VIO protocol. Its
6328 *   value is 512 bytes (DEV_BSIZE) when the backend is a file, a volume or a
6329 *   CD/DVD. When the backend is a disk or a disk slice then it has the value
6330 *   of the logical block size of that disk (as returned by the DKIOCGMEDIAINFO
6331 *   ioctl). This block size is expected to be a power of 2 and a multiple of
6332 *   512.
6333 *
6334 * - vdisk_size: size of the virtual disk expressed as a number of vdisk_bsize
6335 *   blocks.
6336 *
6337 * vdisk_size and vdisk_bsize are sent to the vdisk client during the connection
6338 * handshake and in the result of a VD_OP_GET_CAPACITY operation.
6339 *
6340 * - backend_bsize: block size of the backend device. backend_bsize has the same
6341 *   value as vdisk_bsize except when the backend is a CD/DVD. In that case,
6342 *   vdisk_bsize is set to 512 (DEV_BSIZE) while backend_bsize is set to the
6343 *   effective logical block size of the CD/DVD (usually 2048).
6344 *
6345 * - dskimg_size: size of the backend when the backend is a disk image. This
6346 *   attribute is set only when the backend is a file or a volume, otherwise it
6347 *   is unused.
6348 *
 * - vio_bshift: number of bits to shift to convert a VIO block number (which
 *   uses a block size of vdisk_bsize) to a buf(9S) block number (which uses a
 *   block size of 512 bytes), i.e. vdisk_bsize = 512 x 2 ^ vio_bshift.
6352 *
6353 * - vdisk_media: media of the virtual disk. This function only sets this
6354 *   attribute for physical disk and CD/DVD. For other backend types, this
6355 *   attribute is set in the setup function of the backend.
6356 */
6357static int
6358vd_backend_check_size(vd_t *vd)
6359{
6360	size_t backend_size, backend_bsize, vdisk_bsize;
6361	size_t old_size, new_size;
6362	struct dk_minfo minfo;
6363	vattr_t vattr;
6364	int rval, rv, media, nshift = 0;
6365	uint32_t n;
6366
6367	if (vd->file) {
6368
6369		/* file (slice or full disk) */
6370		vattr.va_mask = AT_SIZE;
6371		rv = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred, NULL);
6372		if (rv != 0) {
6373			PR0("VOP_GETATTR(%s) = errno %d", vd->device_path, rv);
6374			return (rv);
6375		}
6376		backend_size = vattr.va_size;
6377		backend_bsize = DEV_BSIZE;
6378		vdisk_bsize = DEV_BSIZE;
6379
6380	} else if (vd->volume) {
6381
6382		/* volume (slice or full disk) */
6383		rv = ldi_get_size(vd->ldi_handle[0], &backend_size);
6384		if (rv != DDI_SUCCESS) {
6385			PR0("ldi_get_size() failed for %s", vd->device_path);
6386			return (EIO);
6387		}
6388		backend_bsize = DEV_BSIZE;
6389		vdisk_bsize = DEV_BSIZE;
6390
6391	} else {
6392
6393		/* physical disk or slice */
6394		rv = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO,
6395		    (intptr_t)&minfo, (vd->open_flags | FKIOCTL),
6396		    kcred, &rval);
6397		if (rv != 0) {
6398			PR0("DKIOCGMEDIAINFO failed for %s (err=%d)",
6399			    vd->device_path, rv);
6400			return (rv);
6401		}
6402
6403		if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
6404			rv = ldi_get_size(vd->ldi_handle[0], &backend_size);
6405			if (rv != DDI_SUCCESS) {
6406				PR0("ldi_get_size() failed for %s",
6407				    vd->device_path);
6408				return (EIO);
6409			}
6410		} else {
6411			ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
6412			backend_size = minfo.dki_capacity * minfo.dki_lbsize;
6413		}
6414
6415		backend_bsize = minfo.dki_lbsize;
6416		media = DK_MEDIATYPE2VD_MEDIATYPE(minfo.dki_media_type);
6417
6418		/*
6419		 * If the device is a CD or a DVD then we force the vdisk block
6420		 * size to 512 bytes (DEV_BSIZE). In that case, vdisk_bsize can
		 * be different from backend_bsize.
6422		 */
6423		if (media == VD_MEDIA_CD || media == VD_MEDIA_DVD)
6424			vdisk_bsize = DEV_BSIZE;
6425		else
6426			vdisk_bsize = backend_bsize;
6427	}
6428
6429	/* check vdisk block size */
6430	if (vdisk_bsize == 0 || vdisk_bsize % DEV_BSIZE != 0)
6431		return (EINVAL);
6432
6433	old_size = vd->vdisk_size;
6434	new_size = backend_size / vdisk_bsize;
6435
6436	/* check if size has changed */
6437	if (old_size != VD_SIZE_UNKNOWN && old_size == new_size &&
6438	    vd->vdisk_bsize == vdisk_bsize)
6439		return (0);
6440
6441	/* cache info for blk conversion */
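	/* e.g. vdisk_bsize = 2048: n goes 4 -> 2 -> 1, giving nshift = 2 */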
6442	for (n = vdisk_bsize / DEV_BSIZE; n > 1; n >>= 1) {
6443		if ((n & 0x1) != 0) {
			/* vdisk_bsize is not a power of 2 */
6445			return (EINVAL);
6446		}
6447		nshift++;
6448	}
6449
6450	vd->vio_bshift = nshift;
6451	vd->vdisk_size = new_size;
6452	vd->vdisk_bsize = vdisk_bsize;
6453	vd->backend_bsize = backend_bsize;
6454
6455	if (vd->file || vd->volume)
6456		vd->dskimg_size = backend_size;
6457
6458	/*
6459	 * If we are exporting a single-slice disk and the size of the backend
6460	 * has changed then we regenerate the partition setup so that the
	 * partitioning matches the new backend size.
6462	 */
6463
6464	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
6465		/* slice or file or device exported as a slice */
6466		if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
6467			rv = vd_setup_partition_vtoc(vd);
6468			if (rv != 0) {
6469				PR0("vd_setup_partition_vtoc() failed for %s "
6470				    "(err = %d)", vd->device_path, rv);
6471				return (rv);
6472			}
6473		} else {
6474			rv = vd_setup_partition_efi(vd);
6475			if (rv != 0) {
6476				PR0("vd_setup_partition_efi() failed for %s "
6477				    "(err = %d)", vd->device_path, rv);
6478				return (rv);
6479			}
6480		}
6481
6482	} else if (!vd->file && !vd->volume) {
6483		/* physical disk */
6484		ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
6485		vd->vdisk_media = media;
6486	}
6487
6488	return (0);
6489}
6490
6491/*
6492 * Description:
6493 *	Open a device using its device path and identify if this is
6494 *	a disk device or a volume device.
6495 *
6496 * Parameters:
6497 *	vd	- pointer to structure containing the vDisk info
6498 *	dtype	- return the driver type of the device
6499 *
6500 * Return Value
6501 *	0	- success
6502 *	!= 0	- some other non-zero return value from ldi(9F) functions
6503 */
6504static int
6505vd_identify_dev(vd_t *vd, int *dtype)
6506{
6507	int status, i;
6508	char *device_path = vd->device_path;
6509	char *drv_name;
6510	int drv_type;
6511	vds_t *vds = vd->vds;
6512
6513	status = vd_open_using_ldi_by_name(vd, vd->open_flags & ~FWRITE);
6514	if (status != 0) {
6515		PR0("Failed to open (%s) = errno %d", device_path, status);
6516		return (status);
6517	}
6518
6519	/* Get device number of backing device */
6520	if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) {
6521		PRN("ldi_get_dev() returned errno %d for %s",
6522		    status, device_path);
6523		return (status);
6524	}
6525
6526	/*
	 * We start by checking whether the driver is in the list from
	 * vds.conf so that we can override the built-in list using vds.conf.
6529	 */
6530	drv_name = ddi_major_to_name(getmajor(vd->dev[0]));
6531	drv_type = VD_DRIVER_UNKNOWN;
6532
6533	/* check vds.conf list */
6534	for (i = 0; i < vds->num_drivers; i++) {
6535		if (vds->driver_types[i].type == VD_DRIVER_UNKNOWN) {
6536			/* ignore invalid entries */
6537			continue;
6538		}
6539		if (strcmp(drv_name, vds->driver_types[i].name) == 0) {
6540			drv_type = vds->driver_types[i].type;
6541			goto done;
6542		}
6543	}
6544
6545	/* check built-in list */
6546	for (i = 0; i < VDS_NUM_DRIVERS; i++) {
6547		if (strcmp(drv_name, vds_driver_types[i].name) == 0) {
6548			drv_type = vds_driver_types[i].type;
6549			goto done;
6550		}
6551	}
6552
6553done:
6554	PR0("driver %s identified as %s", drv_name,
6555	    (drv_type == VD_DRIVER_DISK)? "DISK" :
6556	    (drv_type == VD_DRIVER_VOLUME)? "VOLUME" : "UNKNOWN");
6557
6558	if (strcmp(drv_name, "zfs") == 0)
6559		vd->zvol = B_TRUE;
6560
6561	*dtype = drv_type;
6562
6563	return (0);
6564}
6565
6566static int
6567