1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26/*
27 * LDoms virtual disk client (vdc) device driver
28 *
29 * This driver runs on a guest logical domain and communicates with the virtual
30 * disk server (vds) driver running on the service domain which is exporting
31 * virtualized "disks" to the guest logical domain.
32 *
33 * The driver can be divided into four sections:
34 *
35 * 1) generic device driver housekeeping
36 *	_init, _fini, attach, detach, ops structures, etc.
37 *
38 * 2) communication channel setup
39 *	Setup the communications link over the LDC channel that vdc uses to
40 *	talk to the vDisk server. Initialise the descriptor ring which
41 *	allows the LDC clients to transfer data via memory mappings.
42 *
43 * 3) Support exported to upper layers (filesystems, etc)
44 *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
45 *	ioctl calls. vdc will copy the data to be written to the descriptor
46 *	ring or maps the buffer to store the data read by the vDisk
47 *	server into the descriptor ring. It then sends a message to the
48 *	vDisk server requesting it to complete the operation.
49 *
50 * 4) Handling responses from vDisk server.
51 *	The vDisk server will ACK some or all of the messages vdc sends to it
52 *	(this is configured during the handshake). Upon receipt of an ACK
53 *	vdc will check the descriptor ring and signal to the upper layer
54 *	code waiting on the IO.
55 */
56
57#include <sys/atomic.h>
58#include <sys/conf.h>
59#include <sys/disp.h>
60#include <sys/ddi.h>
61#include <sys/dkio.h>
62#include <sys/efi_partition.h>
63#include <sys/fcntl.h>
64#include <sys/file.h>
65#include <sys/kstat.h>
66#include <sys/mach_descrip.h>
67#include <sys/modctl.h>
68#include <sys/mdeg.h>
69#include <sys/note.h>
70#include <sys/open.h>
71#include <sys/random.h>
72#include <sys/sdt.h>
73#include <sys/stat.h>
74#include <sys/sunddi.h>
75#include <sys/types.h>
76#include <sys/promif.h>
77#include <sys/var.h>
78#include <sys/vtoc.h>
79#include <sys/archsystm.h>
80#include <sys/sysmacros.h>
81
82#include <sys/cdio.h>
83#include <sys/dktp/fdisk.h>
84#include <sys/dktp/dadkio.h>
85#include <sys/fs/dv_node.h>
86#include <sys/mhd.h>
87#include <sys/scsi/generic/sense.h>
88#include <sys/scsi/impl/uscsi.h>
89#include <sys/scsi/impl/services.h>
90#include <sys/scsi/targets/sddef.h>
91
92#include <sys/ldoms.h>
93#include <sys/ldc.h>
94#include <sys/vio_common.h>
95#include <sys/vio_mailbox.h>
96#include <sys/vio_util.h>
97#include <sys/vdsk_common.h>
98#include <sys/vdsk_mailbox.h>
99#include <sys/vdc.h>
100
101#define	VD_OLDVTOC_LIMIT	0x7fffffff
102
103/*
104 * function prototypes
105 */
106
107/* standard driver functions */
108static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
109static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
110static int	vdc_strategy(struct buf *buf);
111static int	vdc_print(dev_t dev, char *str);
112static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
113static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
114static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
115static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
116			cred_t *credp, int *rvalp);
117static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
118static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);
119
120static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
121			void *arg, void **resultp);
122static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
123static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
124static int	vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
125		    int mod_flags, char *name, caddr_t valuep, int *lengthp);
126
127/* setup */
128static void	vdc_min(struct buf *bufp);
129static int	vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen);
130static int	vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr);
131static int	vdc_start_ldc_connection(vdc_t *vdc);
132static int	vdc_create_device_nodes(vdc_t *vdc);
133static int	vdc_create_device_nodes_efi(vdc_t *vdc);
134static int	vdc_create_device_nodes_vtoc(vdc_t *vdc);
135static void	vdc_create_io_kstats(vdc_t *vdc);
136static void	vdc_create_err_kstats(vdc_t *vdc);
137static void	vdc_set_err_kstats(vdc_t *vdc);
138static int	vdc_get_md_node(dev_info_t *dip, md_t **mdpp,
139		    mde_cookie_t *vd_nodep);
140static int	vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep);
141static void	vdc_fini_ports(vdc_t *vdc);
142static void	vdc_switch_server(vdc_t *vdcp);
143static int	vdc_do_ldc_up(vdc_t *vdc);
144static void	vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr);
145static int	vdc_init_descriptor_ring(vdc_t *vdc);
146static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
147static int	vdc_setup_devid(vdc_t *vdc);
148static void	vdc_store_label_efi(vdc_t *, efi_gpt_t *, efi_gpe_t *);
149static void	vdc_store_label_vtoc(vdc_t *, struct dk_geom *,
150		    struct extvtoc *);
151static void	vdc_store_label_unk(vdc_t *vdc);
152static boolean_t vdc_is_opened(vdc_t *vdc);
153static void	vdc_update_size(vdc_t *vdc, size_t, size_t, size_t);
154static int	vdc_update_vio_bsize(vdc_t *vdc, uint32_t);
155
156/* handshake with vds */
157static int		vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver);
158static int		vdc_ver_negotiation(vdc_t *vdcp);
159static int		vdc_init_attr_negotiation(vdc_t *vdc);
160static int		vdc_attr_negotiation(vdc_t *vdcp);
161static int		vdc_init_dring_negotiate(vdc_t *vdc);
162static int		vdc_dring_negotiation(vdc_t *vdcp);
163static int		vdc_send_rdx(vdc_t *vdcp);
164static int		vdc_rdx_exchange(vdc_t *vdcp);
165static boolean_t	vdc_is_supported_version(vio_ver_msg_t *ver_msg);
166
167/* processing incoming messages from vDisk server */
168static void	vdc_process_msg_thread(vdc_t *vdc);
169static int	vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp);
170
171static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
172static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t *msg);
173static int	vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg);
174static int	vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
175static int	vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
176static int	vdc_send_request(vdc_t *vdcp, int operation,
177		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
178		    buf_t *bufp, vio_desc_direction_t dir, int flags);
179static int	vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
180static int	vdc_populate_descriptor(vdc_t *vdcp, int operation,
181		    caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
182		    buf_t *bufp, vio_desc_direction_t dir, int flags);
183static int	vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
184		    size_t nbytes, int slice, diskaddr_t offset,
185		    vio_desc_direction_t dir, boolean_t);
186static int	vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes,
187		    int slice, diskaddr_t offset, struct buf *bufp,
188		    vio_desc_direction_t dir, int flags);
189
190static int	vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
191static int	vdc_drain_response(vdc_t *vdcp, struct buf *buf);
192static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
193static int	vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
194static int	vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);
195
196/* dkio */
197static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode,
198		    int *rvalp);
199static int	vd_process_efi_ioctl(void *vdisk, int cmd, uintptr_t arg);
200static void	vdc_create_fake_geometry(vdc_t *vdc);
201static int	vdc_validate_geometry(vdc_t *vdc);
202static void	vdc_validate(vdc_t *vdc);
203static void	vdc_validate_task(void *arg);
204static int	vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
205		    int mode, int dir);
206static int	vdc_get_wce_convert(vdc_t *vdc, void *from, void *to,
207		    int mode, int dir);
208static int	vdc_set_wce_convert(vdc_t *vdc, void *from, void *to,
209		    int mode, int dir);
210static int	vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
211		    int mode, int dir);
212static int	vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
213		    int mode, int dir);
214static int	vdc_get_extvtoc_convert(vdc_t *vdc, void *from, void *to,
215		    int mode, int dir);
216static int	vdc_set_extvtoc_convert(vdc_t *vdc, void *from, void *to,
217		    int mode, int dir);
218static int	vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
219		    int mode, int dir);
220static int	vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
221		    int mode, int dir);
222static int	vdc_get_efi_convert(vdc_t *vdc, void *from, void *to,
223		    int mode, int dir);
224static int	vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
225		    int mode, int dir);
226
227static void	vdc_ownership_update(vdc_t *vdc, int ownership_flags);
228static int	vdc_access_set(vdc_t *vdc, uint64_t flags);
229static vdc_io_t	*vdc_eio_queue(vdc_t *vdc, int index);
230static void	vdc_eio_unqueue(vdc_t *vdc, clock_t deadline,
231		    boolean_t complete_io);
232static int	vdc_eio_check(vdc_t *vdc, int flags);
233static void	vdc_eio_thread(void *arg);
234
235/*
236 * Module variables
237 */
238
239/*
240 * Number of handshake retries with the current server before switching to
241 * a different server. These retries are done so that we stick with the same
242 * server if vdc receives a LDC reset event during the initiation of the
243 * handshake. This can happen if vdc reset the LDC channel and then immediately
244 * retry a connexion before it has received the LDC reset event.
245 *
246 * If there is only one server then we "switch" to the same server. We also
247 * switch if the handshake has reached the attribute negotiate step whatever
248 * the number of handshake retries might be.
249 */
250static uint_t vdc_hshake_retries = VDC_HSHAKE_RETRIES;
251
252/*
253 * If the handshake done during the attach fails then the two following
254 * variables will also be used to control the number of retries for the
255 * next handshakes. In that case, when a handshake is done after the
256 * attach (i.e. the vdc lifecycle is VDC_ONLINE_PENDING) then the handshake
257 * will be retried until we have done an attribution negotiation with each
258 * server, with a specified minimum total number of negotations (the value
259 * of the vdc_hattr_min_initial or vdc_hattr_min variable).
260 *
261 * This prevents new I/Os on a newly used vdisk to block forever if the
262 * attribute negotiations can not be done, and to limit the amount of time
263 * before I/Os will fail. Basically, attribute negotiations will fail when
264 * the service is up but the backend does not exist. In that case, vds will
265 * typically retry to access the backend during 50 seconds. So I/Os will fail
266 * after the following amount of time:
267 *
268 *	50 seconds x max(number of servers, vdc->hattr_min)
269 *
270 * After that the handshake done during the attach has failed then the next
271 * handshake will use vdc_attr_min_initial. This handshake will correspond to
272 * the very first I/O to the device. If this handshake also fails then
273 * vdc_hattr_min will be used for subsequent handshakes. We typically allow
274 * more retries for the first handshake (VDC_HATTR_MIN_INITIAL = 3) to give more
275 * time for the backend to become available (50s x VDC_HATTR_MIN_INITIAL = 150s)
276 * in case this is a critical vdisk (e.g. vdisk access during boot). Then we use
277 * a smaller value (VDC_HATTR_MIN = 1) to avoid waiting too long for each I/O.
278 */
279static uint_t vdc_hattr_min_initial = VDC_HATTR_MIN_INITIAL;
280static uint_t vdc_hattr_min = VDC_HATTR_MIN;
281
282/*
283 * Tunable variables to control how long vdc waits before timing out on
284 * various operations
285 */
286static int	vdc_timeout = 0; /* units: seconds */
287static int	vdc_ldcup_timeout = 1; /* units: seconds */
288
289static uint64_t vdc_hz_min_ldc_delay;
290static uint64_t vdc_min_timeout_ldc = 1 * MILLISEC;
291static uint64_t vdc_hz_max_ldc_delay;
292static uint64_t vdc_max_timeout_ldc = 100 * MILLISEC;
293
294static uint64_t vdc_ldc_read_init_delay = 1 * MILLISEC;
295static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC;
296
297/* values for dumping - need to run in a tighter loop */
298static uint64_t	vdc_usec_timeout_dump = 100 * MILLISEC;	/* 0.1s units: ns */
299static int	vdc_dump_retries = 100;
300
301static uint16_t	vdc_scsi_timeout = 60;	/* 60s units: seconds  */
302
303static uint64_t vdc_ownership_delay = 6 * MICROSEC; /* 6s units: usec */
304
305/* Count of the number of vdc instances attached */
306static volatile uint32_t	vdc_instance_count = 0;
307
308/* Tunable to log all SCSI errors */
309static boolean_t vdc_scsi_log_error = B_FALSE;
310
311/* Soft state pointer */
312static void	*vdc_state;
313
314/*
315 * Controlling the verbosity of the error/debug messages
316 *
317 * vdc_msglevel - controls level of messages
318 * vdc_matchinst - 64-bit variable where each bit corresponds
319 *                 to the vdc instance the vdc_msglevel applies.
320 */
321int		vdc_msglevel = 0x0;
322uint64_t	vdc_matchinst = 0ull;
323
324/*
325 * Supported vDisk protocol version pairs.
326 *
327 * The first array entry is the latest and preferred version.
328 */
329static const vio_ver_t	vdc_version[] = {{1, 1}};
330
331static struct cb_ops vdc_cb_ops = {
332	vdc_open,	/* cb_open */
333	vdc_close,	/* cb_close */
334	vdc_strategy,	/* cb_strategy */
335	vdc_print,	/* cb_print */
336	vdc_dump,	/* cb_dump */
337	vdc_read,	/* cb_read */
338	vdc_write,	/* cb_write */
339	vdc_ioctl,	/* cb_ioctl */
340	nodev,		/* cb_devmap */
341	nodev,		/* cb_mmap */
342	nodev,		/* cb_segmap */
343	nochpoll,	/* cb_chpoll */
344	vdc_prop_op,	/* cb_prop_op */
345	NULL,		/* cb_str */
346	D_MP | D_64BIT,	/* cb_flag */
347	CB_REV,		/* cb_rev */
348	vdc_aread,	/* cb_aread */
349	vdc_awrite	/* cb_awrite */
350};
351
352static struct dev_ops vdc_ops = {
353	DEVO_REV,	/* devo_rev */
354	0,		/* devo_refcnt */
355	vdc_getinfo,	/* devo_getinfo */
356	nulldev,	/* devo_identify */
357	nulldev,	/* devo_probe */
358	vdc_attach,	/* devo_attach */
359	vdc_detach,	/* devo_detach */
360	nodev,		/* devo_reset */
361	&vdc_cb_ops,	/* devo_cb_ops */
362	NULL,		/* devo_bus_ops */
363	nulldev,	/* devo_power */
364	ddi_quiesce_not_needed,	/* devo_quiesce */
365};
366
367static struct modldrv modldrv = {
368	&mod_driverops,
369	"virtual disk client",
370	&vdc_ops,
371};
372
373static struct modlinkage modlinkage = {
374	MODREV_1,
375	&modldrv,
376	NULL
377};
378
379/* -------------------------------------------------------------------------- */
380
381/*
382 * Device Driver housekeeping and setup
383 */
384
385int
386_init(void)
387{
388	int	status;
389
390	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
391		return (status);
392	if ((status = mod_install(&modlinkage)) != 0)
393		ddi_soft_state_fini(&vdc_state);
394	return (status);
395}
396
397int
398_info(struct modinfo *modinfop)
399{
400	return (mod_info(&modlinkage, modinfop));
401}
402
403int
404_fini(void)
405{
406	int	status;
407
408	if ((status = mod_remove(&modlinkage)) != 0)
409		return (status);
410	ddi_soft_state_fini(&vdc_state);
411	return (0);
412}
413
414static int
415vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,  void *arg, void **resultp)
416{
417	_NOTE(ARGUNUSED(dip))
418
419	int	instance = VDCUNIT((dev_t)arg);
420	vdc_t	*vdc = NULL;
421
422	switch (cmd) {
423	case DDI_INFO_DEVT2DEVINFO:
424		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
425			*resultp = NULL;
426			return (DDI_FAILURE);
427		}
428		*resultp = vdc->dip;
429		return (DDI_SUCCESS);
430	case DDI_INFO_DEVT2INSTANCE:
431		*resultp = (void *)(uintptr_t)instance;
432		return (DDI_SUCCESS);
433	default:
434		*resultp = NULL;
435		return (DDI_FAILURE);
436	}
437}
438
439static int
440vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
441{
442	kt_did_t eio_tid, ownership_tid;
443	int	instance;
444	int	rv;
445	vdc_server_t *srvr;
446	vdc_t	*vdc = NULL;
447
448	switch (cmd) {
449	case DDI_DETACH:
450		/* the real work happens below */
451		break;
452	case DDI_SUSPEND:
453		/* nothing to do for this non-device */
454		return (DDI_SUCCESS);
455	default:
456		return (DDI_FAILURE);
457	}
458
459	ASSERT(cmd == DDI_DETACH);
460	instance = ddi_get_instance(dip);
461	DMSGX(1, "[%d] Entered\n", instance);
462
463	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
464		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
465		return (DDI_FAILURE);
466	}
467
468	if (vdc_is_opened(vdc)) {
469		DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
470		return (DDI_FAILURE);
471	}
472
473	if (vdc->dkio_flush_pending) {
474		DMSG(vdc, 0,
475		    "[%d] Cannot detach: %d outstanding DKIO flushes\n",
476		    instance, vdc->dkio_flush_pending);
477		return (DDI_FAILURE);
478	}
479
480	if (vdc->validate_pending) {
481		DMSG(vdc, 0,
482		    "[%d] Cannot detach: %d outstanding validate request\n",
483		    instance, vdc->validate_pending);
484		return (DDI_FAILURE);
485	}
486
487	DMSG(vdc, 0, "[%d] proceeding...\n", instance);
488
489	/* If we took ownership, release ownership */
490	mutex_enter(&vdc->ownership_lock);
491	if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
492		rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR);
493		if (rv == 0) {
494			vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
495		}
496	}
497	mutex_exit(&vdc->ownership_lock);
498
499	/* mark instance as detaching */
500	mutex_enter(&vdc->lock);
501	vdc->lifecycle	= VDC_LC_DETACHING;
502	mutex_exit(&vdc->lock);
503
504	/*
505	 * Try and disable callbacks to prevent another handshake. We have to
506	 * disable callbacks for all servers.
507	 */
508	for (srvr = vdc->server_list; srvr != NULL; srvr = srvr->next) {
509		rv = ldc_set_cb_mode(srvr->ldc_handle, LDC_CB_DISABLE);
510		DMSG(vdc, 0, "callback disabled (ldc=%lu, rv=%d)\n",
511		    srvr->ldc_id, rv);
512	}
513
514	if (vdc->initialized & VDC_THREAD) {
515		mutex_enter(&vdc->read_lock);
516		if ((vdc->read_state == VDC_READ_WAITING) ||
517		    (vdc->read_state == VDC_READ_RESET)) {
518			vdc->read_state = VDC_READ_RESET;
519			cv_signal(&vdc->read_cv);
520		}
521
522		mutex_exit(&vdc->read_lock);
523
524		/* wake up any thread waiting for connection to come online */
525		mutex_enter(&vdc->lock);
526		if (vdc->state == VDC_STATE_INIT_WAITING) {
527			DMSG(vdc, 0,
528			    "[%d] write reset - move to resetting state...\n",
529			    instance);
530			vdc->state = VDC_STATE_RESETTING;
531			cv_signal(&vdc->initwait_cv);
532		} else if (vdc->state == VDC_STATE_FAILED) {
533			vdc->io_pending = B_TRUE;
534			cv_signal(&vdc->io_pending_cv);
535		}
536		mutex_exit(&vdc->lock);
537
538		/* now wait until state transitions to VDC_STATE_DETACH */
539		thread_join(vdc->msg_proc_thr->t_did);
540		ASSERT(vdc->state == VDC_STATE_DETACH);
541		DMSG(vdc, 0, "[%d] Reset thread exit and join ..\n",
542		    vdc->instance);
543	}
544
545	mutex_enter(&vdc->lock);
546
547	if (vdc->initialized & VDC_DRING)
548		vdc_destroy_descriptor_ring(vdc);
549
550	vdc_fini_ports(vdc);
551
552	if (vdc->eio_thread) {
553		eio_tid = vdc->eio_thread->t_did;
554		vdc->failfast_interval = 0;
555		ASSERT(vdc->num_servers == 0);
556		cv_signal(&vdc->eio_cv);
557	} else {
558		eio_tid = 0;
559	}
560
561	if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
562		ownership_tid = vdc->ownership_thread->t_did;
563		vdc->ownership = VDC_OWNERSHIP_NONE;
564		cv_signal(&vdc->ownership_cv);
565	} else {
566		ownership_tid = 0;
567	}
568
569	mutex_exit(&vdc->lock);
570
571	if (eio_tid != 0)
572		thread_join(eio_tid);
573
574	if (ownership_tid != 0)
575		thread_join(ownership_tid);
576
577	if (vdc->initialized & VDC_MINOR)
578		ddi_remove_minor_node(dip, NULL);
579
580	if (vdc->io_stats) {
581		kstat_delete(vdc->io_stats);
582		vdc->io_stats = NULL;
583	}
584
585	if (vdc->err_stats) {
586		kstat_delete(vdc->err_stats);
587		vdc->err_stats = NULL;
588	}
589
590	if (vdc->initialized & VDC_LOCKS) {
591		mutex_destroy(&vdc->lock);
592		mutex_destroy(&vdc->read_lock);
593		mutex_destroy(&vdc->ownership_lock);
594		cv_destroy(&vdc->initwait_cv);
595		cv_destroy(&vdc->dring_free_cv);
596		cv_destroy(&vdc->membind_cv);
597		cv_destroy(&vdc->sync_blocked_cv);
598		cv_destroy(&vdc->read_cv);
599		cv_destroy(&vdc->running_cv);
600		cv_destroy(&vdc->io_pending_cv);
601		cv_destroy(&vdc->ownership_cv);
602		cv_destroy(&vdc->eio_cv);
603	}
604
605	if (vdc->minfo)
606		kmem_free(vdc->minfo, sizeof (struct dk_minfo));
607
608	if (vdc->cinfo)
609		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));
610
611	if (vdc->vtoc)
612		kmem_free(vdc->vtoc, sizeof (struct extvtoc));
613
614	if (vdc->geom)
615		kmem_free(vdc->geom, sizeof (struct dk_geom));
616
617	if (vdc->devid) {
618		ddi_devid_unregister(dip);
619		ddi_devid_free(vdc->devid);
620	}
621
622	if (vdc->initialized & VDC_SOFT_STATE)
623		ddi_soft_state_free(vdc_state, instance);
624
625	DMSG(vdc, 0, "[%d] End %p\n", instance, (void *)vdc);
626
627	return (DDI_SUCCESS);
628}
629
630
631static int
632vdc_do_attach(dev_info_t *dip)
633{
634	int		instance;
635	vdc_t		*vdc = NULL;
636	int		status;
637	md_t		*mdp;
638	mde_cookie_t	vd_node;
639
640	ASSERT(dip != NULL);
641
642	instance = ddi_get_instance(dip);
643	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
644		cmn_err(CE_NOTE, "[%d] Couldn't alloc state structure",
645		    instance);
646		return (DDI_FAILURE);
647	}
648
649	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
650		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
651		return (DDI_FAILURE);
652	}
653
654	/*
655	 * We assign the value to initialized in this case to zero out the
656	 * variable and then set bits in it to indicate what has been done
657	 */
658	vdc->initialized = VDC_SOFT_STATE;
659
660	vdc_hz_min_ldc_delay = drv_usectohz(vdc_min_timeout_ldc);
661	vdc_hz_max_ldc_delay = drv_usectohz(vdc_max_timeout_ldc);
662
663	vdc->dip	= dip;
664	vdc->instance	= instance;
665	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
666	vdc->vdisk_label = VD_DISK_LABEL_UNK;
667	vdc->state	= VDC_STATE_INIT;
668	vdc->lifecycle	= VDC_LC_ATTACHING;
669	vdc->session_id = 0;
670	vdc->vdisk_bsize = DEV_BSIZE;
671	vdc->vio_bmask = 0;
672	vdc->vio_bshift = 0;
673	vdc->max_xfer_sz = maxphys / vdc->vdisk_bsize;
674
675	/*
676	 * We assume, for now, that the vDisk server will export 'read'
677	 * operations to us at a minimum (this is needed because of checks
678	 * in vdc for supported operations early in the handshake process).
679	 * The vDisk server will return ENOTSUP if this is not the case.
680	 * The value will be overwritten during the attribute exchange with
681	 * the bitmask of operations exported by server.
682	 */
683	vdc->operations = VD_OP_MASK_READ;
684
685	vdc->vtoc = NULL;
686	vdc->geom = NULL;
687	vdc->cinfo = NULL;
688	vdc->minfo = NULL;
689
690	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
691	cv_init(&vdc->initwait_cv, NULL, CV_DRIVER, NULL);
692	cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
693	cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
694	cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);
695	cv_init(&vdc->io_pending_cv, NULL, CV_DRIVER, NULL);
696
697	vdc->io_pending = B_FALSE;
698	vdc->threads_pending = 0;
699	vdc->sync_op_blocked = B_FALSE;
700	cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);
701
702	mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
703	cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
704	cv_init(&vdc->eio_cv, NULL, CV_DRIVER, NULL);
705
706	/* init blocking msg read functionality */
707	mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
708	cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL);
709	vdc->read_state = VDC_READ_IDLE;
710
711	vdc->initialized |= VDC_LOCKS;
712
713	/* get device and port MD node for this disk instance */
714	if (vdc_get_md_node(dip, &mdp, &vd_node) != 0) {
715		cmn_err(CE_NOTE, "[%d] Could not get machine description node",
716		    instance);
717		return (DDI_FAILURE);
718	}
719
720	if (vdc_init_ports(vdc, mdp, vd_node) != 0) {
721		cmn_err(CE_NOTE, "[%d] Error initialising ports", instance);
722		return (DDI_FAILURE);
723	}
724
725	(void) md_fini_handle(mdp);
726
727	/* Create the kstats for saving the I/O statistics used by iostat(1M) */
728	vdc_create_io_kstats(vdc);
729	vdc_create_err_kstats(vdc);
730
731	/* Initialize remaining structures before starting the msg thread */
732	vdc->vdisk_label = VD_DISK_LABEL_UNK;
733	vdc->vtoc = kmem_zalloc(sizeof (struct extvtoc), KM_SLEEP);
734	vdc->geom = kmem_zalloc(sizeof (struct dk_geom), KM_SLEEP);
735	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);
736
737	/* initialize the thread responsible for managing state with server */
738	vdc->msg_proc_thr = thread_create(NULL, 0, vdc_process_msg_thread,
739	    vdc, 0, &p0, TS_RUN, minclsyspri);
740	if (vdc->msg_proc_thr == NULL) {
741		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
742		    instance);
743		return (DDI_FAILURE);
744	}
745
746	/*
747	 * If there are multiple servers then start the eio thread.
748	 */
749	if (vdc->num_servers > 1) {
750		vdc->eio_thread = thread_create(NULL, 0, vdc_eio_thread, vdc, 0,
751		    &p0, TS_RUN, v.v_maxsyspri - 2);
752		if (vdc->eio_thread == NULL) {
753			cmn_err(CE_NOTE, "[%d] Failed to create error "
754			    "I/O thread", instance);
755			return (DDI_FAILURE);
756		}
757	}
758
759	vdc->initialized |= VDC_THREAD;
760
761	atomic_inc_32(&vdc_instance_count);
762
763	/*
764	 * Check the disk label. This will send requests and do the handshake.
765	 * We don't really care about the disk label now. What we really need is
766	 * the handshake do be done so that we know the type of the disk (slice
767	 * or full disk) and the appropriate device nodes can be created.
768	 */
769
770	mutex_enter(&vdc->lock);
771	(void) vdc_validate_geometry(vdc);
772	mutex_exit(&vdc->lock);
773
774	/*
775	 * Now that we have the device info we can create the device nodes
776	 */
777	status = vdc_create_device_nodes(vdc);
778	if (status) {
779		DMSG(vdc, 0, "[%d] Failed to create device nodes",
780		    instance);
781		goto return_status;
782	}
783
784	/*
785	 * Fill in the fields of the error statistics kstat that were not
786	 * available when creating the kstat
787	 */
788	vdc_set_err_kstats(vdc);
789	ddi_report_dev(dip);
790	ASSERT(vdc->lifecycle == VDC_LC_ONLINE ||
791	    vdc->lifecycle == VDC_LC_ONLINE_PENDING);
792	DMSG(vdc, 0, "[%d] Attach tasks successful\n", instance);
793
794return_status:
795	DMSG(vdc, 0, "[%d] Attach completed\n", instance);
796	return (status);
797}
798
799static int
800vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
801{
802	int	status;
803
804	switch (cmd) {
805	case DDI_ATTACH:
806		if ((status = vdc_do_attach(dip)) != 0)
807			(void) vdc_detach(dip, DDI_DETACH);
808		return (status);
809	case DDI_RESUME:
810		/* nothing to do for this non-device */
811		return (DDI_SUCCESS);
812	default:
813		return (DDI_FAILURE);
814	}
815}
816
817static int
818vdc_do_ldc_init(vdc_t *vdc, vdc_server_t *srvr)
819{
820	int			status = 0;
821	ldc_status_t		ldc_state;
822	ldc_attr_t		ldc_attr;
823
824	ASSERT(vdc != NULL);
825	ASSERT(srvr != NULL);
826
827	ldc_attr.devclass = LDC_DEV_BLK;
828	ldc_attr.instance = vdc->instance;
829	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
830	ldc_attr.mtu = VD_LDC_MTU;
831
832	if ((srvr->state & VDC_LDC_INIT) == 0) {
833		status = ldc_init(srvr->ldc_id, &ldc_attr,
834		    &srvr->ldc_handle);
835		if (status != 0) {
836			DMSG(vdc, 0, "[%d] ldc_init(chan %ld) returned %d",
837			    vdc->instance, srvr->ldc_id, status);
838			return (status);
839		}
840		srvr->state |= VDC_LDC_INIT;
841	}
842	status = ldc_status(srvr->ldc_handle, &ldc_state);
843	if (status != 0) {
844		DMSG(vdc, 0, "[%d] Cannot discover LDC status [err=%d]",
845		    vdc->instance, status);
846		goto init_exit;
847	}
848	srvr->ldc_state = ldc_state;
849
850	if ((srvr->state & VDC_LDC_CB) == 0) {
851		status = ldc_reg_callback(srvr->ldc_handle, vdc_handle_cb,
852		    (caddr_t)srvr);
853		if (status != 0) {
854			DMSG(vdc, 0, "[%d] LDC callback reg. failed (%d)",
855			    vdc->instance, status);
856			goto init_exit;
857		}
858		srvr->state |= VDC_LDC_CB;
859	}
860
861	/*
862	 * At this stage we have initialised LDC, we will now try and open
863	 * the connection.
864	 */
865	if (srvr->ldc_state == LDC_INIT) {
866		status = ldc_open(srvr->ldc_handle);
867		if (status != 0) {
868			DMSG(vdc, 0, "[%d] ldc_open(chan %ld) returned %d",
869			    vdc->instance, srvr->ldc_id, status);
870			goto init_exit;
871		}
872		srvr->state |= VDC_LDC_OPEN;
873	}
874
875init_exit:
876	if (status) {
877		vdc_terminate_ldc(vdc, srvr);
878	}
879
880	return (status);
881}
882
883static int
884vdc_start_ldc_connection(vdc_t *vdc)
885{
886	int		status = 0;
887
888	ASSERT(vdc != NULL);
889
890	ASSERT(MUTEX_HELD(&vdc->lock));
891
892	status = vdc_do_ldc_up(vdc);
893
894	DMSG(vdc, 0, "[%d] Finished bringing up LDC\n", vdc->instance);
895
896	return (status);
897}
898
899static int
900vdc_stop_ldc_connection(vdc_t *vdcp)
901{
902	int	status;
903
904	ASSERT(vdcp != NULL);
905
906	ASSERT(MUTEX_HELD(&vdcp->lock));
907
908	DMSG(vdcp, 0, ": Resetting connection to vDisk server : state %d\n",
909	    vdcp->state);
910
911	status = ldc_down(vdcp->curr_server->ldc_handle);
912	DMSG(vdcp, 0, "ldc_down() = %d\n", status);
913
914	vdcp->initialized &= ~VDC_HANDSHAKE;
915	DMSG(vdcp, 0, "initialized=%x\n", vdcp->initialized);
916
917	return (status);
918}
919
920static void
921vdc_create_io_kstats(vdc_t *vdc)
922{
923	if (vdc->io_stats != NULL) {
924		DMSG(vdc, 0, "[%d] I/O kstat already exists\n", vdc->instance);
925		return;
926	}
927
928	vdc->io_stats = kstat_create(VDC_DRIVER_NAME, vdc->instance, NULL,
929	    "disk", KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT);
930	if (vdc->io_stats != NULL) {
931		vdc->io_stats->ks_lock = &vdc->lock;
932		kstat_install(vdc->io_stats);
933	} else {
934		cmn_err(CE_NOTE, "[%d] Failed to create kstat: I/O statistics"
935		    " will not be gathered", vdc->instance);
936	}
937}
938
939static void
940vdc_create_err_kstats(vdc_t *vdc)
941{
942	vd_err_stats_t	*stp;
943	char	kstatmodule_err[KSTAT_STRLEN];
944	char	kstatname[KSTAT_STRLEN];
945	int	ndata = (sizeof (vd_err_stats_t) / sizeof (kstat_named_t));
946	int	instance = vdc->instance;
947
948	if (vdc->err_stats != NULL) {
949		DMSG(vdc, 0, "[%d] ERR kstat already exists\n", vdc->instance);
950		return;
951	}
952
953	(void) snprintf(kstatmodule_err, sizeof (kstatmodule_err),
954	    "%serr", VDC_DRIVER_NAME);
955	(void) snprintf(kstatname, sizeof (kstatname),
956	    "%s%d,err", VDC_DRIVER_NAME, instance);
957
958	vdc->err_stats = kstat_create(kstatmodule_err, instance, kstatname,
959	    "device_error", KSTAT_TYPE_NAMED, ndata, KSTAT_FLAG_PERSISTENT);
960
961	if (vdc->err_stats == NULL) {
962		cmn_err(CE_NOTE, "[%d] Failed to create kstat: Error statistics"
963		    " will not be gathered", instance);
964		return;
965	}
966
967	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
968	kstat_named_init(&stp->vd_softerrs,	"Soft Errors",
969	    KSTAT_DATA_UINT32);
970	kstat_named_init(&stp->vd_transerrs,	"Transport Errors",
971	    KSTAT_DATA_UINT32);
972	kstat_named_init(&stp->vd_protoerrs,	"Protocol Errors",
973	    KSTAT_DATA_UINT32);
974	kstat_named_init(&stp->vd_vid,		"Vendor",
975	    KSTAT_DATA_CHAR);
976	kstat_named_init(&stp->vd_pid,		"Product",
977	    KSTAT_DATA_CHAR);
978	kstat_named_init(&stp->vd_capacity,	"Size",
979	    KSTAT_DATA_ULONGLONG);
980
981	vdc->err_stats->ks_update  = nulldev;
982
983	kstat_install(vdc->err_stats);
984}
985
986static void
987vdc_set_err_kstats(vdc_t *vdc)
988{
989	vd_err_stats_t  *stp;
990
991	if (vdc->err_stats == NULL)
992		return;
993
994	mutex_enter(&vdc->lock);
995
996	stp = (vd_err_stats_t *)vdc->err_stats->ks_data;
997	ASSERT(stp != NULL);
998
999	stp->vd_capacity.value.ui64 = vdc->vdisk_size * vdc->vdisk_bsize;
1000	(void) strcpy(stp->vd_vid.value.c, "SUN");
1001	(void) strcpy(stp->vd_pid.value.c, "VDSK");
1002
1003	mutex_exit(&vdc->lock);
1004}
1005
1006static int
1007vdc_create_device_nodes_efi(vdc_t *vdc)
1008{
1009	ddi_remove_minor_node(vdc->dip, "h");
1010	ddi_remove_minor_node(vdc->dip, "h,raw");
1011
1012	if (ddi_create_minor_node(vdc->dip, "wd", S_IFBLK,
1013	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
1014	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
1015		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd'",
1016		    vdc->instance);
1017		return (EIO);
1018	}
1019
1020	/* if any device node is created we set this flag */
1021	vdc->initialized |= VDC_MINOR;
1022
1023	if (ddi_create_minor_node(vdc->dip, "wd,raw", S_IFCHR,
1024	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
1025	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
1026		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'wd,raw'",
1027		    vdc->instance);
1028		return (EIO);
1029	}
1030
1031	return (0);
1032}
1033
1034static int
1035vdc_create_device_nodes_vtoc(vdc_t *vdc)
1036{
1037	ddi_remove_minor_node(vdc->dip, "wd");
1038	ddi_remove_minor_node(vdc->dip, "wd,raw");
1039
1040	if (ddi_create_minor_node(vdc->dip, "h", S_IFBLK,
1041	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
1042	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
1043		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h'",
1044		    vdc->instance);
1045		return (EIO);
1046	}
1047
1048	/* if any device node is created we set this flag */
1049	vdc->initialized |= VDC_MINOR;
1050
1051	if (ddi_create_minor_node(vdc->dip, "h,raw", S_IFCHR,
1052	    VD_MAKE_DEV(vdc->instance, VD_EFI_WD_SLICE),
1053	    DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
1054		cmn_err(CE_NOTE, "[%d] Couldn't add block node 'h,raw'",
1055		    vdc->instance);
1056		return (EIO);
1057	}
1058
1059	return (0);
1060}
1061
1062/*
1063 * Function:
1064 *	vdc_create_device_nodes
1065 *
1066 * Description:
1067 *	This function creates the block and character device nodes under
1068 *	/devices. It is called as part of the attach(9E) of the instance
1069 *	during the handshake with vds after vds has sent the attributes
1070 *	to vdc.
1071 *
1072 *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
1073 *	of 2 is used in keeping with the Solaris convention that slice 2
1074 *	refers to a whole disk. Slices start at 'a'
1075 *
1076 * Parameters:
1077 *	vdc		- soft state pointer
1078 *
1079 * Return Values
1080 *	0		- Success
1081 *	EIO		- Failed to create node
1082 */
1083static int
1084vdc_create_device_nodes(vdc_t *vdc)
1085{
1086	char		name[sizeof ("s,raw")];
1087	dev_info_t	*dip = NULL;
1088	int		instance, status;
1089	int		num_slices = 1;
1090	int		i;
1091
1092	ASSERT(vdc != NULL);
1093
1094	instance = vdc->instance;
1095	dip = vdc->dip;
1096
1097	switch (vdc->vdisk_type) {
1098	case VD_DISK_TYPE_DISK:
1099	case VD_DISK_TYPE_UNK:
1100		num_slices = V_NUMPAR;
1101		break;
1102	case VD_DISK_TYPE_SLICE:
1103		num_slices = 1;
1104		break;
1105	default:
1106		ASSERT(0);
1107	}
1108
1109	/*
1110	 * Minor nodes are different for EFI disks: EFI disks do not have
1111	 * a minor node 'g' for the minor number corresponding to slice
1112	 * VD_EFI_WD_SLICE (slice 7) instead they have a minor node 'wd'
1113	 * representing the whole disk.
1114	 */
1115	for (i = 0; i < num_slices; i++) {
1116
1117		if (i == VD_EFI_WD_SLICE) {
1118			if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
1119				status = vdc_create_device_nodes_efi(vdc);
1120			else
1121				status = vdc_create_device_nodes_vtoc(vdc);
1122			if (status != 0)
1123				return (status);
1124			continue;
1125		}
1126
1127		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
1128		if (ddi_create_minor_node(dip, name, S_IFBLK,
1129		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
1130			cmn_err(CE_NOTE, "[%d] Couldn't add block node '%s'",
1131			    instance, name);
1132			return (EIO);
1133		}
1134
1135		/* if any device node is created we set this flag */
1136		vdc->initialized |= VDC_MINOR;
1137
1138		(void) snprintf(name, sizeof (name), "%c%s", 'a' + i, ",raw");
1139
1140		if (ddi_create_minor_node(dip, name, S_IFCHR,
1141		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
1142			cmn_err(CE_NOTE, "[%d] Couldn't add raw node '%s'",
1143			    instance, name);
1144			return (EIO);
1145		}
1146	}
1147
1148	return (0);
1149}
1150
1151/*
1152 * Driver prop_op(9e) entry point function. Return the number of blocks for
1153 * the partition in question or forward the request to the property facilities.
1154 */
1155static int
1156vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
1157    char *name, caddr_t valuep, int *lengthp)
1158{
1159	int instance = ddi_get_instance(dip);
1160	vdc_t *vdc;
1161	uint64_t nblocks;
1162	uint_t blksize;
1163
1164	vdc = ddi_get_soft_state(vdc_state, instance);
1165
1166	if (dev == DDI_DEV_T_ANY || vdc == NULL) {
1167		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1168		    name, valuep, lengthp));
1169	}
1170
1171	mutex_enter(&vdc->lock);
1172	(void) vdc_validate_geometry(vdc);
1173	if (vdc->vdisk_label == VD_DISK_LABEL_UNK) {
1174		mutex_exit(&vdc->lock);
1175		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
1176		    name, valuep, lengthp));
1177	}
1178	nblocks = vdc->slice[VDCPART(dev)].nblocks;
1179	blksize = vdc->vdisk_bsize;
1180	mutex_exit(&vdc->lock);
1181
1182	return (ddi_prop_op_nblocks_blksize(dev, dip, prop_op, mod_flags,
1183	    name, valuep, lengthp, nblocks, blksize));
1184}
1185
1186/*
1187 * Function:
1188 *	vdc_is_opened
1189 *
1190 * Description:
1191 *	This function checks if any slice of a given virtual disk is
1192 *	currently opened.
1193 *
1194 * Parameters:
1195 *	vdc		- soft state pointer
1196 *
1197 * Return Values
1198 *	B_TRUE		- at least one slice is opened.
1199 *	B_FALSE		- no slice is opened.
1200 */
1201static boolean_t
1202vdc_is_opened(vdc_t *vdc)
1203{
1204	int i;
1205
1206	/* check if there's any layered open */
1207	for (i = 0; i < V_NUMPAR; i++) {
1208		if (vdc->open_lyr[i] > 0)
1209			return (B_TRUE);
1210	}
1211
1212	/* check if there is any other kind of open */
1213	for (i = 0; i < OTYPCNT; i++) {
1214		if (vdc->open[i] != 0)
1215			return (B_TRUE);
1216	}
1217
1218	return (B_FALSE);
1219}
1220
1221static int
1222vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
1223{
1224	uint8_t slicemask;
1225	int i;
1226
1227	ASSERT(otyp < OTYPCNT);
1228	ASSERT(slice < V_NUMPAR);
1229	ASSERT(MUTEX_HELD(&vdc->lock));
1230
1231	slicemask = 1 << slice;
1232
1233	/*
1234	 * If we have a single-slice disk which was unavailable during the
1235	 * attach then a device was created for each 8 slices. Now that
1236	 * the type is known, we prevent opening any slice other than 0
1237	 * even if a device still exists.
1238	 */
1239	if (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0)
1240		return (EIO);
1241
1242	/* check if slice is already exclusively opened */
1243	if (vdc->open_excl & slicemask)
1244		return (EBUSY);
1245
1246	/* if open exclusive, check if slice is already opened */
1247	if (flag & FEXCL) {
1248		if (vdc->open_lyr[slice] > 0)
1249			return (EBUSY);
1250		for (i = 0; i < OTYPCNT; i++) {
1251			if (vdc->open[i] & slicemask)
1252				return (EBUSY);
1253		}
1254		vdc->open_excl |= slicemask;
1255	}
1256
1257	/* mark slice as opened */
1258	if (otyp == OTYP_LYR) {
1259		vdc->open_lyr[slice]++;
1260	} else {
1261		vdc->open[otyp] |= slicemask;
1262	}
1263
1264	return (0);
1265}
1266
1267static void
1268vdc_mark_closed(vdc_t *vdc, int slice, int flag, int otyp)
1269{
1270	uint8_t slicemask;
1271
1272	ASSERT(otyp < OTYPCNT);
1273	ASSERT(slice < V_NUMPAR);
1274	ASSERT(MUTEX_HELD(&vdc->lock));
1275
1276	slicemask = 1 << slice;
1277
1278	if (otyp == OTYP_LYR) {
1279		ASSERT(vdc->open_lyr[slice] > 0);
1280		vdc->open_lyr[slice]--;
1281	} else {
1282		vdc->open[otyp] &= ~slicemask;
1283	}
1284
1285	if (flag & FEXCL)
1286		vdc->open_excl &= ~slicemask;
1287}
1288
1289static int
1290vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
1291{
1292	_NOTE(ARGUNUSED(cred))
1293
1294	int	instance, nodelay;
1295	int	slice, status = 0;
1296	vdc_t	*vdc;
1297
1298	ASSERT(dev != NULL);
1299	instance = VDCUNIT(*dev);
1300
1301	if (otyp >= OTYPCNT)
1302		return (EINVAL);
1303
1304	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
1305		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
1306		return (ENXIO);
1307	}
1308
1309	DMSG(vdc, 0, "minor = %d flag = %x, otyp = %x\n",
1310	    getminor(*dev), flag, otyp);
1311
1312	slice = VDCPART(*dev);
1313
1314	nodelay = flag & (FNDELAY | FNONBLOCK);
1315
1316	if ((flag & FWRITE) && (!nodelay) &&
1317	    !(VD_OP_SUPPORTED(vdc->operations, VD_OP_BWRITE))) {
1318		return (EROFS);
1319	}
1320
1321	mutex_enter(&vdc->lock);
1322
1323	status = vdc_mark_opened(vdc, slice, flag, otyp);
1324
1325	if (status != 0) {
1326		mutex_exit(&vdc->lock);
1327		return (status);
1328	}
1329
1330	/*
1331	 * If the disk type is unknown then we have to wait for the
1332	 * handshake to complete because we don't know if the slice
1333	 * device we are opening effectively exists.
1334	 */
1335	if (vdc->vdisk_type != VD_DISK_TYPE_UNK && nodelay) {
1336
1337		/* don't resubmit a validate request if there's already one */
1338		if (vdc->validate_pending > 0) {
1339			mutex_exit(&vdc->lock);
1340			return (0);
1341		}
1342
1343		/* call vdc_validate() asynchronously to avoid blocking */
1344		if (taskq_dispatch(system_taskq, vdc_validate_task,
1345		    (void *)vdc, TQ_NOSLEEP) == TASKQID_INVALID) {
1346			vdc_mark_closed(vdc, slice, flag, otyp);
1347			mutex_exit(&vdc->lock);
1348			return (ENXIO);
1349		}
1350
1351		vdc->validate_pending++;
1352		mutex_exit(&vdc->lock);
1353		return (0);
1354	}
1355
1356	mutex_exit(&vdc->lock);
1357
1358	vdc_validate(vdc);
1359
1360	mutex_enter(&vdc->lock);
1361
1362	if (vdc->vdisk_type == VD_DISK_TYPE_UNK ||
1363	    (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0) ||
1364	    (!nodelay && (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
1365	    vdc->slice[slice].nblocks == 0))) {
1366		vdc_mark_closed(vdc, slice, flag, otyp);
1367		status = EIO;
1368	}
1369
1370	mutex_exit(&vdc->lock);
1371
1372	return (status);
1373}
1374
1375static int
1376vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
1377{
1378	_NOTE(ARGUNUSED(cred))
1379
1380	int	instance;
1381	int	slice;
1382	int	rv, rval;
1383	vdc_t	*vdc;
1384
1385	instance = VDCUNIT(dev);
1386
1387	if (otyp >= OTYPCNT)
1388		return (EINVAL);
1389
1390	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
1391		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
1392		return (ENXIO);
1393	}
1394
1395	DMSG(vdc, 0, "[%d] flag = %x, otyp = %x\n", instance, flag, otyp);
1396
1397	slice = VDCPART(dev);
1398
1399	/*
1400	 * Attempt to flush the W$ on a close operation. If this is
1401	 * not a supported IOCTL command or the backing device is read-only
1402	 * do not fail the close operation.
1403	 */
1404	rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval);
1405
1406	if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) {
1407		DMSG(vdc, 0, "[%d] flush failed with error %d on close\n",
1408		    instance, rv);
1409		return (EIO);
1410	}
1411
1412	mutex_enter(&vdc->lock);
1413	vdc_mark_closed(vdc, slice, flag, otyp);
1414	mutex_exit(&vdc->lock);
1415
1416	return (0);
1417}
1418
1419static int
1420vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1421{
1422	_NOTE(ARGUNUSED(credp))
1423
1424	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp));
1425}
1426
1427static int
1428vdc_print(dev_t dev, char *str)
1429{
1430	cmn_err(CE_NOTE, "vdc%d:  %s", VDCUNIT(dev), str);
1431	return (0);
1432}
1433
1434static int
1435vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1436{
1437	int	rv, flags;
1438	size_t	nbytes = nblk * DEV_BSIZE;
1439	int	instance = VDCUNIT(dev);
1440	vdc_t	*vdc = NULL;
1441	diskaddr_t vio_blkno;
1442
1443	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
1444		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
1445		return (ENXIO);
1446	}
1447
1448	DMSG(vdc, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
1449	    instance, nbytes, blkno, (void *)addr);
1450
1451	/* convert logical block to vio block */
1452	if ((blkno & vdc->vio_bmask) != 0) {
1453		DMSG(vdc, 0, "Misaligned block number (%lu)\n", blkno);
1454		return (EINVAL);
1455	}
1456	vio_blkno = blkno >> vdc->vio_bshift;
1457
1458	/*
1459	 * If we are panicking, we need the state to be "running" so that we
1460	 * can submit I/Os, but we don't want to check for any backend error.
1461	 */
1462	flags = (ddi_in_panic())? VDC_OP_STATE_RUNNING : VDC_OP_NORMAL;
1463
1464	rv = vdc_do_op(vdc, VD_OP_BWRITE, addr, nbytes, VDCPART(dev),
1465	    vio_blkno, NULL, VIO_write_dir, flags);
1466
1467	if (rv) {
1468		DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
1469		return (rv);
1470	}
1471
1472	DMSG(vdc, 0, "[%d] End\n", instance);
1473
1474	return (0);
1475}
1476
1477/* -------------------------------------------------------------------------- */
1478
1479/*
1480 * Disk access routines
1481 *
1482 */
1483
1484/*
1485 * vdc_strategy()
1486 *
1487 * Return Value:
1488 *	0:	As per strategy(9E), the strategy() function must return 0
1489 *		[ bioerror(9f) sets b_flags to the proper error code ]
1490 */
1491static int
1492vdc_strategy(struct buf *buf)
1493{
1494	diskaddr_t vio_blkno;
1495	vdc_t	*vdc = NULL;
1496	int	instance = VDCUNIT(buf->b_edev);
1497	int	op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
1498	int	slice;
1499
1500	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
1501		cmn_err(CE_NOTE, "[%d] Couldn't get state structure", instance);
1502		bioerror(buf, ENXIO);
1503		biodone(buf);
1504		return (0);
1505	}
1506
1507	DMSG(vdc, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
1508	    instance, (buf->b_flags & B_READ) ? "Read" : "Write",
1509	    buf->b_bcount, buf->b_lblkno, (void *)buf->b_un.b_addr);
1510
1511	bp_mapin(buf);
1512
1513	if ((long)buf->b_private == VD_SLICE_NONE) {
1514		/* I/O using an absolute disk offset */
1515		slice = VD_SLICE_NONE;
1516	} else {
1517		slice = VDCPART(buf->b_edev);
1518	}
1519
1520	/*
1521	 * In the buf structure, b_lblkno represents a logical block number
1522	 * using a block size of 512 bytes. For the VIO request, this block
1523	 * number has to be converted to be represented with the block size
1524	 * used by the VIO protocol.
1525	 */
1526	if ((buf->b_lblkno & vdc->vio_bmask) != 0) {
1527		bioerror(buf, EINVAL);
1528		biodone(buf);
1529		return (0);
1530	}
1531	vio_blkno = buf->b_lblkno >> vdc->vio_bshift;
1532
1533	/* submit the I/O, any error will be reported in the buf structure */
1534	(void) vdc_do_op(vdc, op, (caddr_t)buf->b_un.b_addr,
1535	    buf->b_bcount, slice, vio_blkno,
1536	    buf, (op == VD_OP_BREAD) ? VIO_read_dir : VIO_write_dir,
1537	    VDC_OP_NORMAL);
1538
1539	return (0);
1540}
1541
1542/*
1543 * Function:
1544 *	vdc_min
1545 *
1546 * Description:
1547 *	Routine to limit the size of a data transfer. Used in
1548 *	conjunction with physio(9F).
1549 *
1550 * Arguments:
1551 *	bp - pointer to the indicated buf(9S) struct.
1552 *
1553 */
1554static void
1555vdc_min(struct buf *bufp)
1556{
1557	vdc_t	*vdc = NULL;
1558	int	instance = VDCUNIT(bufp->b_edev);
1559
1560	vdc = ddi_get_soft_state(vdc_state, instance);
1561	VERIFY(vdc != NULL);
1562
1563	if (bufp->b_bcount > (vdc->max_xfer_sz * vdc->vdisk_bsize)) {
1564		bufp->b_bcount = vdc->max_xfer_sz * vdc->vdisk_bsize;
1565	}
1566}
1567
1568static int
1569vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
1570{
1571	_NOTE(ARGUNUSED(cred))
1572
1573	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
1574	return (physio(vdc_strategy, NULL, dev, B_READ, vdc_min, uio));
1575}
1576
1577static int
1578vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
1579{
1580	_NOTE(ARGUNUSED(cred))
1581
1582	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
1583	return (physio(vdc_strategy, NULL, dev, B_WRITE, vdc_min, uio));
1584}
1585
1586static int
1587vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
1588{
1589	_NOTE(ARGUNUSED(cred))
1590
1591	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
1592	return (aphysio(vdc_strategy, anocancel, dev, B_READ, vdc_min, aio));
1593}
1594
1595static int
1596vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
1597{
1598	_NOTE(ARGUNUSED(cred))
1599
1600	DMSGX(1, "[%d] Entered", VDCUNIT(dev));
1601	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, vdc_min, aio));
1602}
1603
1604
1605/* -------------------------------------------------------------------------- */
1606
1607/*
1608 * Handshake support
1609 */
1610
1611
1612/*
1613 * Function:
1614 *	vdc_init_ver_negotiation()
1615 *
1616 * Description:
1617 *
1618 * Arguments:
1619 *	vdc	- soft state pointer for this instance of the device driver.
1620 *
1621 * Return Code:
1622 *	0	- Success
1623 */
1624static int
1625vdc_init_ver_negotiation(vdc_t *vdc, vio_ver_t ver)
1626{
1627	vio_ver_msg_t	pkt;
1628	size_t		msglen = sizeof (pkt);
1629	int		status = -1;
1630
1631	ASSERT(vdc != NULL);
1632	ASSERT(mutex_owned(&vdc->lock));
1633
1634	DMSG(vdc, 0, "[%d] Entered.\n", vdc->instance);
1635
1636	/*
1637	 * set the Session ID to a unique value
1638	 * (the lower 32 bits of the clock tick)
1639	 */
1640	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
1641	DMSG(vdc, 0, "[%d] Set SID to 0x%lx\n", vdc->instance, vdc->session_id);
1642
1643	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
1644	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
1645	pkt.tag.vio_subtype_env = VIO_VER_INFO;
1646	pkt.tag.vio_sid = vdc->session_id;
1647	pkt.dev_class = VDEV_DISK;
1648	pkt.ver_major = ver.major;
1649	pkt.ver_minor = ver.minor;
1650
1651	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
1652	DMSG(vdc, 0, "[%d] Ver info sent (status = %d)\n",
1653	    vdc->instance, status);
1654	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
1655		DMSG(vdc, 0, "[%d] Failed to send Ver negotiation info: "
1656		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
1657		    vdc->curr_server->ldc_handle, status, msglen);
1658		if (msglen != sizeof (vio_ver_msg_t))
1659			status = ENOMSG;
1660	}
1661
1662	return (status);
1663}
1664
1665/*
1666 * Function:
1667 *	vdc_ver_negotiation()
1668 *
1669 * Description:
1670 *
1671 * Arguments:
1672 *	vdcp	- soft state pointer for this instance of the device driver.
1673 *
1674 * Return Code:
1675 *	0	- Success
1676 */
1677static int
1678vdc_ver_negotiation(vdc_t *vdcp)
1679{
1680	vio_msg_t vio_msg;
1681	int status;
1682
1683	if (status = vdc_init_ver_negotiation(vdcp, vdc_version[0]))
1684		return (status);
1685
1686	/* release lock and wait for response */
1687	mutex_exit(&vdcp->lock);
1688	status = vdc_wait_for_response(vdcp, &vio_msg);
1689	mutex_enter(&vdcp->lock);
1690	if (status) {
1691		DMSG(vdcp, 0,
1692		    "[%d] Failed waiting for Ver negotiation response, rv(%d)",
1693		    vdcp->instance, status);
1694		return (status);
1695	}
1696
1697	/* check type and sub_type ... */
1698	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
1699	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
1700		DMSG(vdcp, 0, "[%d] Invalid ver negotiation response\n",
1701		    vdcp->instance);
1702		return (EPROTO);
1703	}
1704
1705	return (vdc_handle_ver_msg(vdcp, (vio_ver_msg_t *)&vio_msg));
1706}
1707
1708/*
1709 * Function:
1710 *	vdc_init_attr_negotiation()
1711 *
1712 * Description:
1713 *
1714 * Arguments:
1715 *	vdc	- soft state pointer for this instance of the device driver.
1716 *
1717 * Return Code:
1718 *	0	- Success
1719 */
1720static int
1721vdc_init_attr_negotiation(vdc_t *vdc)
1722{
1723	vd_attr_msg_t	pkt;
1724	size_t		msglen = sizeof (pkt);
1725	int		status;
1726
1727	ASSERT(vdc != NULL);
1728	ASSERT(mutex_owned(&vdc->lock));
1729
1730	DMSG(vdc, 0, "[%d] entered\n", vdc->instance);
1731
1732	/* fill in tag */
1733	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
1734	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
1735	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
1736	pkt.tag.vio_sid = vdc->session_id;
1737	/* fill in payload */
1738	pkt.max_xfer_sz = vdc->max_xfer_sz;
1739	pkt.vdisk_block_size = vdc->vdisk_bsize;
1740	pkt.xfer_mode = VIO_DRING_MODE_V1_0;
1741	pkt.operations = 0;	/* server will set bits of valid operations */
1742	pkt.vdisk_type = 0;	/* server will set to valid device type */
1743	pkt.vdisk_media = 0;	/* server will set to valid media type */
1744	pkt.vdisk_size = 0;	/* server will set to valid size */
1745
1746	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
1747	DMSG(vdc, 0, "Attr info sent (status = %d)\n", status);
1748
1749	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
1750		DMSG(vdc, 0, "[%d] Failed to send Attr negotiation info: "
1751		    "id(%lx) rv(%d) size(%ld)", vdc->instance,
1752		    vdc->curr_server->ldc_handle, status, msglen);
1753		if (msglen != sizeof (vd_attr_msg_t))
1754			status = ENOMSG;
1755	}
1756
1757	return (status);
1758}
1759
1760/*
1761 * Function:
1762 *	vdc_attr_negotiation()
1763 *
1764 * Description:
1765 *
1766 * Arguments:
1767 *	vdc	- soft state pointer for this instance of the device driver.
1768 *
1769 * Return Code:
1770 *	0	- Success
1771 */
1772static int
1773vdc_attr_negotiation(vdc_t *vdcp)
1774{
1775	int status;
1776	vio_msg_t vio_msg;
1777
1778	if (status = vdc_init_attr_negotiation(vdcp))
1779		return (status);
1780
1781	/* release lock and wait for response */
1782	mutex_exit(&vdcp->lock);
1783	status = vdc_wait_for_response(vdcp, &vio_msg);
1784	mutex_enter(&vdcp->lock);
1785	if (status) {
1786		DMSG(vdcp, 0,
1787		    "[%d] Failed waiting for Attr negotiation response, rv(%d)",
1788		    vdcp->instance, status);
1789		return (status);
1790	}
1791
1792	/* check type and sub_type ... */
1793	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
1794	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
1795		DMSG(vdcp, 0, "[%d] Invalid attr negotiation response\n",
1796		    vdcp->instance);
1797		return (EPROTO);
1798	}
1799
1800	return (vdc_handle_attr_msg(vdcp, (vd_attr_msg_t *)&vio_msg));
1801}
1802
1803
1804/*
1805 * Function:
1806 *	vdc_init_dring_negotiate()
1807 *
1808 * Description:
1809 *
1810 * Arguments:
1811 *	vdc	- soft state pointer for this instance of the device driver.
1812 *
1813 * Return Code:
1814 *	0	- Success
1815 */
1816static int
1817vdc_init_dring_negotiate(vdc_t *vdc)
1818{
1819	vio_dring_reg_msg_t	pkt;
1820	size_t			msglen = sizeof (pkt);
1821	int			status = -1;
1822	int			retry;
1823	int			nretries = 10;
1824
1825	ASSERT(vdc != NULL);
1826	ASSERT(mutex_owned(&vdc->lock));
1827
1828	for (retry = 0; retry < nretries; retry++) {
1829		status = vdc_init_descriptor_ring(vdc);
1830		if (status != EAGAIN)
1831			break;
1832		drv_usecwait(vdc_min_timeout_ldc);
1833	}
1834
1835	if (status != 0) {
1836		DMSG(vdc, 0, "[%d] Failed to init DRing (status = %d)\n",
1837		    vdc->instance, status);
1838		return (status);
1839	}
1840
1841	DMSG(vdc, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
1842	    vdc->instance, status);
1843
1844	/* fill in tag */
1845	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
1846	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
1847	pkt.tag.vio_subtype_env = VIO_DRING_REG;
1848	pkt.tag.vio_sid = vdc->session_id;
1849	/* fill in payload */
1850	pkt.dring_ident = 0;
1851	pkt.num_descriptors = vdc->dring_len;
1852	pkt.descriptor_size = vdc->dring_entry_size;
1853	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
1854	pkt.ncookies = vdc->dring_cookie_count;
1855	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */
1856
1857	status = vdc_send(vdc, (caddr_t)&pkt, &msglen);
1858	if (status != 0) {
1859		DMSG(vdc, 0, "[%d] Failed to register DRing (err = %d)",
1860		    vdc->instance, status);
1861	}
1862
1863	return (status);
1864}
1865
1866
1867/*
1868 * Function:
1869 *	vdc_dring_negotiation()
1870 *
1871 * Description:
1872 *
1873 * Arguments:
1874 *	vdc	- soft state pointer for this instance of the device driver.
1875 *
1876 * Return Code:
1877 *	0	- Success
1878 */
1879static int
1880vdc_dring_negotiation(vdc_t *vdcp)
1881{
1882	int status;
1883	vio_msg_t vio_msg;
1884
1885	if (status = vdc_init_dring_negotiate(vdcp))
1886		return (status);
1887
1888	/* release lock and wait for response */
1889	mutex_exit(&vdcp->lock);
1890	status = vdc_wait_for_response(vdcp, &vio_msg);
1891	mutex_enter(&vdcp->lock);
1892	if (status) {
1893		DMSG(vdcp, 0,
1894		    "[%d] Failed waiting for Dring negotiation response,"
1895		    " rv(%d)", vdcp->instance, status);
1896		return (status);
1897	}
1898
1899	/* check type and sub_type ... */
1900	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
1901	    vio_msg.tag.vio_subtype == VIO_SUBTYPE_INFO) {
1902		DMSG(vdcp, 0, "[%d] Invalid Dring negotiation response\n",
1903		    vdcp->instance);
1904		return (EPROTO);
1905	}
1906
1907	return (vdc_handle_dring_reg_msg(vdcp,
1908	    (vio_dring_reg_msg_t *)&vio_msg));
1909}
1910
1911
1912/*
1913 * Function:
1914 *	vdc_send_rdx()
1915 *
1916 * Description:
1917 *
1918 * Arguments:
1919 *	vdc	- soft state pointer for this instance of the device driver.
1920 *
1921 * Return Code:
1922 *	0	- Success
1923 */
1924static int
1925vdc_send_rdx(vdc_t *vdcp)
1926{
1927	vio_msg_t	msg;
1928	size_t		msglen = sizeof (vio_msg_t);
1929	int		status;
1930
1931	/*
1932	 * Send an RDX message to vds to indicate we are ready
1933	 * to send data
1934	 */
1935	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
1936	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
1937	msg.tag.vio_subtype_env = VIO_RDX;
1938	msg.tag.vio_sid = vdcp->session_id;
1939	status = vdc_send(vdcp, (caddr_t)&msg, &msglen);
1940	if (status != 0) {
1941		DMSG(vdcp, 0, "[%d] Failed to send RDX message (%d)",
1942		    vdcp->instance, status);
1943	}
1944
1945	return (status);
1946}
1947
1948/*
1949 * Function:
1950 *	vdc_handle_rdx()
1951 *
1952 * Description:
1953 *
1954 * Arguments:
1955 *	vdc	- soft state pointer for this instance of the device driver.
1956 *	msgp	- received msg
1957 *
1958 * Return Code:
1959 *	0	- Success
1960 */
1961static int
1962vdc_handle_rdx(vdc_t *vdcp, vio_rdx_msg_t *msgp)
1963{
1964	_NOTE(ARGUNUSED(vdcp))
1965	_NOTE(ARGUNUSED(msgp))
1966
1967	ASSERT(msgp->tag.vio_msgtype == VIO_TYPE_CTRL);
1968	ASSERT(msgp->tag.vio_subtype == VIO_SUBTYPE_ACK);
1969	ASSERT(msgp->tag.vio_subtype_env == VIO_RDX);
1970
1971	DMSG(vdcp, 1, "[%d] Got an RDX msg", vdcp->instance);
1972
1973	return (0);
1974}
1975
1976/*
1977 * Function:
1978 *	vdc_rdx_exchange()
1979 *
1980 * Description:
1981 *
1982 * Arguments:
1983 *	vdc	- soft state pointer for this instance of the device driver.
1984 *
1985 * Return Code:
1986 *	0	- Success
1987 */
1988static int
1989vdc_rdx_exchange(vdc_t *vdcp)
1990{
1991	int status;
1992	vio_msg_t vio_msg;
1993
1994	if (status = vdc_send_rdx(vdcp))
1995		return (status);
1996
1997	/* release lock and wait for response */
1998	mutex_exit(&vdcp->lock);
1999	status = vdc_wait_for_response(vdcp, &vio_msg);
2000	mutex_enter(&vdcp->lock);
2001	if (status) {
2002		DMSG(vdcp, 0, "[%d] Failed waiting for RDX response, rv(%d)",
2003		    vdcp->instance, status);
2004		return (status);
2005	}
2006
2007	/* check type and sub_type ... */
2008	if (vio_msg.tag.vio_msgtype != VIO_TYPE_CTRL ||
2009	    vio_msg.tag.vio_subtype != VIO_SUBTYPE_ACK) {
2010		DMSG(vdcp, 0, "[%d] Invalid RDX response\n", vdcp->instance);
2011		return (EPROTO);
2012	}
2013
2014	return (vdc_handle_rdx(vdcp, (vio_rdx_msg_t *)&vio_msg));
2015}
2016
2017
2018/* -------------------------------------------------------------------------- */
2019
2020/*
2021 * LDC helper routines
2022 */
2023
2024static int
2025vdc_recv(vdc_t *vdc, vio_msg_t *msgp, size_t *nbytesp)
2026{
2027	int		status;
2028	uint64_t	delay_time;
2029	size_t		len;
2030
2031	/*
2032	 * Until we get a blocking ldc read we have to retry until the entire
2033	 * LDC message has arrived before ldc_read() will return that message.
2034	 * If ldc_read() succeed but returns a zero length message then that
2035	 * means that the LDC queue is empty and we have to wait for a
2036	 * notification from the LDC callback which will set the read_state to
2037	 * VDC_READ_PENDING. Note we also bail out if the channel is reset or
2038	 * goes away.
2039	 */
2040	delay_time = vdc_ldc_read_init_delay;
2041
2042	for (;;) {
2043
2044		len = *nbytesp;
2045		/*
2046		 * vdc->curr_server is protected by vdc->lock but to avoid
2047		 * contentions we don't take the lock here. We can do this
2048		 * safely because vdc_recv() is only called from thread
2049		 * process_msg_thread() which is also the only thread that
2050		 * can change vdc->curr_server.
2051		 */
2052		status = ldc_read(vdc->curr_server->ldc_handle,
2053		    (caddr_t)msgp, &len);
2054
2055		if (status == EAGAIN) {
2056			delay_time *= 2;
2057			if (delay_time >= vdc_ldc_read_max_delay)
2058				delay_time = vdc_ldc_read_max_delay;
2059			delay(delay_time);
2060			continue;
2061		}
2062
2063		if (status != 0) {
2064			DMSG(vdc, 0, "ldc_read returned %d\n", status);
2065			break;
2066		}
2067
2068		if (len != 0) {
2069			*nbytesp = len;
2070			break;
2071		}
2072
2073		mutex_enter(&vdc->read_lock);
2074
2075		while (vdc->read_state != VDC_READ_PENDING) {
2076
2077			/* detect if the connection has been reset */
2078			if (vdc->read_state == VDC_READ_RESET) {
2079				mutex_exit(&vdc->read_lock);
2080				return (ECONNRESET);
2081			}
2082
2083			vdc->read_state = VDC_READ_WAITING;
2084			cv_wait(&vdc->read_cv, &vdc->read_lock);
2085		}
2086
2087		vdc->read_state = VDC_READ_IDLE;
2088		mutex_exit(&vdc->read_lock);
2089
2090		delay_time = vdc_ldc_read_init_delay;
2091	}
2092
2093	return (status);
2094}
2095
2096
2097
2098#ifdef DEBUG
2099void
2100vdc_decode_tag(vdc_t *vdcp, vio_msg_t *msg)
2101{
2102	char *ms, *ss, *ses;
2103	switch (msg->tag.vio_msgtype) {
2104#define	Q(_s)	case _s : ms = #_s; break;
2105	Q(VIO_TYPE_CTRL)
2106	Q(VIO_TYPE_DATA)
2107	Q(VIO_TYPE_ERR)
2108#undef Q
2109	default: ms = "unknown"; break;
2110	}
2111
2112	switch (msg->tag.vio_subtype) {
2113#define	Q(_s)	case _s : ss = #_s; break;
2114	Q(VIO_SUBTYPE_INFO)
2115	Q(VIO_SUBTYPE_ACK)
2116	Q(VIO_SUBTYPE_NACK)
2117#undef Q
2118	default: ss = "unknown"; break;
2119	}
2120
2121	switch (msg->tag.vio_subtype_env) {
2122#define	Q(_s)	case _s : ses = #_s; break;
2123	Q(VIO_VER_INFO)
2124	Q(VIO_ATTR_INFO)
2125	Q(VIO_DRING_REG)
2126	Q(VIO_DRING_UNREG)
2127	Q(VIO_RDX)
2128	Q(VIO_PKT_DATA)
2129	Q(VIO_DESC_DATA)
2130	Q(VIO_DRING_DATA)
2131#undef Q
2132	default: ses = "unknown"; break;
2133	}
2134
2135	DMSG(vdcp, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
2136	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
2137	    msg->tag.vio_subtype_env, ms, ss, ses);
2138}
2139#endif
2140
2141/*
2142 * Function:
2143 *	vdc_send()
2144 *
2145 * Description:
2146 *	The function encapsulates the call to write a message using LDC.
2147 *	If LDC indicates that the call failed due to the queue being full,
2148 *	we retry the ldc_write(), otherwise we return the error returned by LDC.
2149 *
2150 * Arguments:
2151 *	ldc_handle	- LDC handle for the channel this instance of vdc uses
2152 *	pkt		- address of LDC message to be sent
2153 *	msglen		- the size of the message being sent. When the function
2154 *			  returns, this contains the number of bytes written.
2155 *
2156 * Return Code:
2157 *	0		- Success.
2158 *	EINVAL		- pkt or msglen were NULL
2159 *	ECONNRESET	- The connection was not up.
2160 *	EWOULDBLOCK	- LDC queue is full
2161 *	xxx		- other error codes returned by ldc_write
2162 */
2163static int
2164vdc_send(vdc_t *vdc, caddr_t pkt, size_t *msglen)
2165{
2166	size_t	size = 0;
2167	int	status = 0;
2168	clock_t delay_ticks;
2169
2170	ASSERT(vdc != NULL);
2171	ASSERT(mutex_owned(&vdc->lock));
2172	ASSERT(msglen != NULL);
2173	ASSERT(*msglen != 0);
2174
2175#ifdef DEBUG
2176	vdc_decode_tag(vdc, (vio_msg_t *)(uintptr_t)pkt);
2177#endif
2178	/*
2179	 * Wait indefinitely to send if channel
2180	 * is busy, but bail out if we succeed or
2181	 * if the channel closes or is reset.
2182	 */
2183	delay_ticks = vdc_hz_min_ldc_delay;
2184	do {
2185		size = *msglen;
2186		status = ldc_write(vdc->curr_server->ldc_handle, pkt, &size);
2187		if (status == EWOULDBLOCK) {
2188			delay(delay_ticks);
2189			/* geometric backoff */
2190			delay_ticks *= 2;
2191			if (delay_ticks > vdc_hz_max_ldc_delay)
2192				delay_ticks = vdc_hz_max_ldc_delay;
2193		}
2194	} while (status == EWOULDBLOCK);
2195
2196	/* if LDC had serious issues --- reset vdc state */
2197	if (status == EIO || status == ECONNRESET) {
2198		/* LDC had serious issues --- reset vdc state */
2199		mutex_enter(&vdc->read_lock);
2200		if ((vdc->read_state == VDC_READ_WAITING) ||
2201		    (vdc->read_state == VDC_READ_RESET))
2202			cv_signal(&vdc->read_cv);
2203		vdc->read_state = VDC_READ_RESET;
2204		mutex_exit(&vdc->read_lock);
2205
2206		/* wake up any waiters in the reset thread */
2207		if (vdc->state == VDC_STATE_INIT_WAITING) {
2208			DMSG(vdc, 0, "[%d] write reset - "
2209			    "vdc is resetting ..\n", vdc->instance);
2210			vdc->state = VDC_STATE_RESETTING;
2211			cv_signal(&vdc->initwait_cv);
2212		}
2213
2214		return (ECONNRESET);
2215	}
2216
2217	/* return the last size written */
2218	*msglen = size;
2219
2220	return (status);
2221}
2222
2223/*
2224 * Function:
2225 *	vdc_get_md_node
2226 *
2227 * Description:
2228 *	Get the MD, the device node for the given disk instance. The
2229 *	caller is responsible for cleaning up the reference to the
2230 *	returned MD (mdpp) by calling md_fini_handle().
2231 *
2232 * Arguments:
2233 *	dip	- dev info pointer for this instance of the device driver.
2234 *	mdpp	- the returned MD.
2235 *	vd_nodep - the returned device node.
2236 *
2237 * Return Code:
2238 *	0	- Success.
2239 *	ENOENT	- Expected node or property did not exist.
2240 *	ENXIO	- Unexpected error communicating with MD framework
2241 */
2242static int
2243vdc_get_md_node(dev_info_t *dip, md_t **mdpp, mde_cookie_t *vd_nodep)
2244{
2245	int		status = ENOENT;
2246	char		*node_name = NULL;
2247	md_t		*mdp = NULL;
2248	int		num_nodes;
2249	int		num_vdevs;
2250	mde_cookie_t	rootnode;
2251	mde_cookie_t	*listp = NULL;
2252	boolean_t	found_inst = B_FALSE;
2253	int		listsz;
2254	int		idx;
2255	uint64_t	md_inst;
2256	int		obp_inst;
2257	int		instance = ddi_get_instance(dip);
2258
2259	/*
2260	 * Get the OBP instance number for comparison with the MD instance
2261	 *
2262	 * The "cfg-handle" property of a vdc node in an MD contains the MD's
2263	 * notion of "instance", or unique identifier, for that node; OBP
2264	 * stores the value of the "cfg-handle" MD property as the value of
2265	 * the "reg" property on the node in the device tree it builds from
2266	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
2267	 * "reg" property value to uniquely identify this device instance.
2268	 * If the "reg" property cannot be found, the device tree state is
2269	 * presumably so broken that there is no point in continuing.
2270	 */
2271	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
2272		cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
2273		return (ENOENT);
2274	}
2275	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2276	    OBP_REG, -1);
2277	DMSGX(1, "[%d] OBP inst=%d\n", instance, obp_inst);
2278
2279	/*
2280	 * We now walk the MD nodes to find the node for this vdisk.
2281	 */
2282	if ((mdp = md_get_handle()) == NULL) {
2283		cmn_err(CE_WARN, "unable to init machine description");
2284		return (ENXIO);
2285	}
2286
2287	num_nodes = md_node_count(mdp);
2288	ASSERT(num_nodes > 0);
2289
2290	listsz = num_nodes * sizeof (mde_cookie_t);
2291
2292	/* allocate memory for nodes */
2293	listp = kmem_zalloc(listsz, KM_SLEEP);
2294
2295	rootnode = md_root_node(mdp);
2296	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
2297
2298	/*
2299	 * Search for all the virtual devices, we will then check to see which
2300	 * ones are disk nodes.
2301	 */
2302	num_vdevs = md_scan_dag(mdp, rootnode,
2303	    md_find_name(mdp, VDC_MD_VDEV_NAME),
2304	    md_find_name(mdp, "fwd"), listp);
2305
2306	if (num_vdevs <= 0) {
2307		cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME);
2308		status = ENOENT;
2309		goto done;
2310	}
2311
2312	DMSGX(1, "[%d] num_vdevs=%d\n", instance, num_vdevs);
2313	for (idx = 0; idx < num_vdevs; idx++) {
2314		status = md_get_prop_str(mdp, listp[idx], "name", &node_name);
2315		if ((status != 0) || (node_name == NULL)) {
2316			cmn_err(CE_NOTE, "Unable to get name of node type '%s'"
2317			    ": err %d", VDC_MD_VDEV_NAME, status);
2318			continue;
2319		}
2320
2321		DMSGX(1, "[%d] Found node '%s'\n", instance, node_name);
2322		if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) {
2323			status = md_get_prop_val(mdp, listp[idx],
2324			    VDC_MD_CFG_HDL, &md_inst);
2325			DMSGX(1, "[%d] vdc inst in MD=%lx\n",
2326			    instance, md_inst);
2327			if ((status == 0) && (md_inst == obp_inst)) {
2328				found_inst = B_TRUE;
2329				break;
2330			}
2331		}
2332	}
2333
2334	if (!found_inst) {
2335		DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME);
2336		status = ENOENT;
2337		goto done;
2338	}
2339	DMSGX(0, "[%d] MD inst=%lx\n", instance, md_inst);
2340
2341	*vd_nodep = listp[idx];
2342	*mdpp = mdp;
2343done:
2344	kmem_free(listp, listsz);
2345	return (status);
2346}
2347
2348/*
2349 * Function:
2350 *	vdc_init_ports
2351 *
2352 * Description:
2353 *	Initialize all the ports for this vdisk instance.
2354 *
2355 * Arguments:
2356 *	vdc	- soft state pointer for this instance of the device driver.
2357 *	mdp	- md pointer
2358 *	vd_nodep - device md node.
2359 *
2360 * Return Code:
2361 *	0	- Success.
2362 *	ENOENT	- Expected node or property did not exist.
2363 */
2364static int
2365vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep)
2366{
2367	int		status = 0;
2368	int		idx;
2369	int		num_nodes;
2370	int		num_vports;
2371	int		num_chans;
2372	int		listsz;
2373	mde_cookie_t	vd_port;
2374	mde_cookie_t	*chanp = NULL;
2375	mde_cookie_t	*portp = NULL;
2376	vdc_server_t	*srvr;
2377	vdc_server_t	*prev_srvr = NULL;
2378
2379	/*
2380	 * We now walk the MD nodes to find the port nodes for this vdisk.
2381	 */
2382	num_nodes = md_node_count(mdp);
2383	ASSERT(num_nodes > 0);
2384
2385	listsz = num_nodes * sizeof (mde_cookie_t);
2386
2387	/* allocate memory for nodes */
2388	portp = kmem_zalloc(listsz, KM_SLEEP);
2389	chanp = kmem_zalloc(listsz, KM_SLEEP);
2390
2391	num_vports = md_scan_dag(mdp, vd_nodep,
2392	    md_find_name(mdp, VDC_MD_PORT_NAME),
2393	    md_find_name(mdp, "fwd"), portp);
2394	if (num_vports == 0) {
2395		DMSGX(0, "Found no '%s' node for '%s' port\n",
2396		    VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME);
2397		status = ENOENT;
2398		goto done;
2399	}
2400
2401	DMSGX(1, "Found %d '%s' node(s) for '%s' port\n",
2402	    num_vports, VDC_MD_PORT_NAME, VDC_MD_VDEV_NAME);
2403
2404	vdc->num_servers = 0;
2405	for (idx = 0; idx < num_vports; idx++) {
2406
2407		/* initialize this port */
2408		vd_port = portp[idx];
2409		srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP);
2410		srvr->vdcp = vdc;
2411		srvr->svc_state = VDC_SERVICE_OFFLINE;
2412		srvr->log_state = VDC_SERVICE_NONE;
2413
2414		/* get port id */
2415		if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) {
2416			cmn_err(CE_NOTE, "vDisk port '%s' property not found",
2417			    VDC_MD_ID);
2418			kmem_free(srvr, sizeof (vdc_server_t));
2419			continue;
2420		}
2421
2422		/* set the connection timeout */
2423		if (md_get_prop_val(mdp, vd_port, VDC_MD_TIMEOUT,
2424		    &srvr->ctimeout) != 0) {
2425			srvr->ctimeout = 0;
2426		}
2427
2428		/* get the ldc id */
2429		num_chans = md_scan_dag(mdp, vd_port,
2430		    md_find_name(mdp, VDC_MD_CHAN_NAME),
2431		    md_find_name(mdp, "fwd"), chanp);
2432
2433		/* expecting at least one channel */
2434		if (num_chans <= 0) {
2435			cmn_err(CE_NOTE, "No '%s' node for '%s' port",
2436			    VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME);
2437			kmem_free(srvr, sizeof (vdc_server_t));
2438			continue;
2439		} else if (num_chans != 1) {
2440			DMSGX(0, "Expected 1 '%s' node for '%s' port, "
2441			    "found %d\n", VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME,
2442			    num_chans);
2443		}
2444
2445		/*
2446		 * We use the first channel found (index 0), irrespective of how
2447		 * many are there in total.
2448		 */
2449		if (md_get_prop_val(mdp, chanp[0], VDC_MD_ID,
2450		    &srvr->ldc_id) != 0) {
2451			cmn_err(CE_NOTE, "Channel '%s' property not found",
2452			    VDC_MD_ID);
2453			kmem_free(srvr, sizeof (vdc_server_t));
2454			continue;
2455		}
2456
2457		/*
2458		 * now initialise LDC channel which will be used to
2459		 * communicate with this server
2460		 */
2461		if (vdc_do_ldc_init(vdc, srvr) != 0) {
2462			kmem_free(srvr, sizeof (vdc_server_t));
2463			continue;
2464		}
2465
2466		/* add server to list */
2467		if (prev_srvr)
2468			prev_srvr->next = srvr;
2469		else
2470			vdc->server_list = srvr;
2471
2472		prev_srvr = srvr;
2473
2474		/* inc numbers of servers */
2475		vdc->num_servers++;
2476	}
2477
2478	/* pick first server as current server */
2479	if (vdc->server_list != NULL) {
2480		vdc->curr_server = vdc->server_list;
2481		status = 0;
2482	} else {
2483		status = ENOENT;
2484	}
2485
2486done:
2487	kmem_free(chanp, listsz);
2488	kmem_free(portp, listsz);
2489	return (status);
2490}
2491
2492
2493/*
2494 * Function:
2495 *	vdc_do_ldc_up
2496 *
2497 * Description:
2498 *	Bring the channel for the current server up.
2499 *
2500 * Arguments:
2501 *	vdc	- soft state pointer for this instance of the device driver.
2502 *
2503 * Return Code:
2504 *	0		- Success.
2505 *	EINVAL		- Driver is detaching / LDC error
2506 *	ECONNREFUSED	- Other end is not listening
2507 */
2508static int
2509vdc_do_ldc_up(vdc_t *vdc)
2510{
2511	int		status;
2512	ldc_status_t	ldc_state;
2513
2514	ASSERT(MUTEX_HELD(&vdc->lock));
2515
2516	DMSG(vdc, 0, "[%d] Bringing up channel %lx\n",
2517	    vdc->instance, vdc->curr_server->ldc_id);
2518
2519	if (vdc->lifecycle == VDC_LC_DETACHING)
2520		return (EINVAL);
2521
2522	if ((status = ldc_up(vdc->curr_server->ldc_handle)) != 0) {
2523		switch (status) {
2524		case ECONNREFUSED:	/* listener not ready at other end */
2525			DMSG(vdc, 0, "[%d] ldc_up(%lx,...) return %d\n",
2526			    vdc->instance, vdc->curr_server->ldc_id, status);
2527			status = 0;
2528			break;
2529		default:
2530			DMSG(vdc, 0, "[%d] Failed to bring up LDC: "
2531			    "channel=%ld, err=%d", vdc->instance,
2532			    vdc->curr_server->ldc_id, status);
2533			break;
2534		}
2535	}
2536
2537	if (ldc_status(vdc->curr_server->ldc_handle, &ldc_state) == 0) {
2538		vdc->curr_server->ldc_state = ldc_state;
2539		if (ldc_state == LDC_UP) {
2540			DMSG(vdc, 0, "[%d] LDC channel already up\n",
2541			    vdc->instance);
2542			vdc->seq_num = 1;
2543			vdc->seq_num_reply = 0;
2544		}
2545	}
2546
2547	return (status);
2548}
2549
2550/*
2551 * Function:
2552 *	vdc_terminate_ldc()
2553 *
2554 * Description:
2555 *
2556 * Arguments:
2557 *	vdc	- soft state pointer for this instance of the device driver.
2558 *	srvr	- vdc per-server info structure
2559 *
2560 * Return Code:
2561 *	None
2562 */
2563static void
2564vdc_terminate_ldc(vdc_t *vdc, vdc_server_t *srvr)
2565{
2566	int	instance = ddi_get_instance(vdc->dip);
2567
2568	if (srvr->state & VDC_LDC_OPEN) {
2569		DMSG(vdc, 0, "[%d] ldc_close()\n", instance);
2570		(void) ldc_close(srvr->ldc_handle);
2571	}
2572	if (srvr->state & VDC_LDC_CB) {
2573		DMSG(vdc, 0, "[%d] ldc_unreg_callback()\n", instance);
2574		(void) ldc_unreg_callback(srvr->ldc_handle);
2575	}
2576	if (srvr->state & VDC_LDC_INIT) {
2577		DMSG(vdc, 0, "[%d] ldc_fini()\n", instance);
2578		(void) ldc_fini(srvr->ldc_handle);
2579		srvr->ldc_handle = 0;
2580	}
2581
2582	srvr->state &= ~(VDC_LDC_INIT | VDC_LDC_CB | VDC_LDC_OPEN);
2583}
2584
2585/*
2586 * Function:
2587 *	vdc_fini_ports()
2588 *
2589 * Description:
2590 *	Finalize all ports by closing the channel associated with each
2591 *	port and also freeing the server structure.
2592 *
2593 * Arguments:
2594 *	vdc	- soft state pointer for this instance of the device driver.
2595 *
2596 * Return Code:
2597 *	None
2598 */
2599static void
2600vdc_fini_ports(vdc_t *vdc)
2601{
2602	int		instance = ddi_get_instance(vdc->dip);
2603	vdc_server_t	*srvr, *prev_srvr;
2604
2605	ASSERT(vdc != NULL);
2606	ASSERT(mutex_owned(&vdc->lock));
2607
2608	DMSG(vdc, 0, "[%d] initialized=%x\n", instance, vdc->initialized);
2609
2610	srvr = vdc->server_list;
2611
2612	while (srvr) {
2613
2614		vdc_terminate_ldc(vdc, srvr);
2615
2616		/* next server */
2617		prev_srvr = srvr;
2618		srvr = srvr->next;
2619
2620		/* free server */
2621		kmem_free(prev_srvr, sizeof (vdc_server_t));
2622	}
2623
2624	vdc->server_list = NULL;
2625	vdc->num_servers = 0;
2626}
2627
2628/* -------------------------------------------------------------------------- */
2629
2630/*
2631 * Descriptor Ring helper routines
2632 */
2633
2634/*
2635 * Function:
2636 *	vdc_init_descriptor_ring()
2637 *
2638 * Description:
2639 *
2640 * Arguments:
2641 *	vdc	- soft state pointer for this instance of the device driver.
2642 *
2643 * Return Code:
2644 *	0	- Success
2645 */
2646static int
2647vdc_init_descriptor_ring(vdc_t *vdc)
2648{
2649	vd_dring_entry_t	*dep = NULL;	/* DRing Entry pointer */
2650	int	status = 0;
2651	int	i;
2652
2653	DMSG(vdc, 0, "[%d] initialized=%x\n", vdc->instance, vdc->initialized);
2654
2655	ASSERT(vdc != NULL);
2656	ASSERT(mutex_owned(&vdc->lock));
2657
2658	/* ensure we have enough room to store max sized block */
2659	ASSERT(maxphys <= VD_MAX_BLOCK_SIZE);
2660
2661	if ((vdc->initialized & VDC_DRING_INIT) == 0) {
2662		DMSG(vdc, 0, "[%d] ldc_mem_dring_create\n", vdc->instance);
2663		/*
2664		 * Calculate the maximum block size we can transmit using one
2665		 * Descriptor Ring entry from the attributes returned by the
2666		 * vDisk server. This is subject to a minimum of 'maxphys'
2667		 * as we do not have the capability to split requests over
2668		 * multiple DRing entries.
2669		 */
2670		if ((vdc->max_xfer_sz * vdc->vdisk_bsize) < maxphys) {
2671			DMSG(vdc, 0, "[%d] using minimum DRing size\n",
2672			    vdc->instance);
2673			vdc->dring_max_cookies = maxphys / PAGESIZE;
2674		} else {
2675			vdc->dring_max_cookies =
2676			    (vdc->max_xfer_sz * vdc->vdisk_bsize) / PAGESIZE;
2677		}
2678		vdc->dring_entry_size = (sizeof (vd_dring_entry_t) +
2679		    (sizeof (ldc_mem_cookie_t) *
2680		    (vdc->dring_max_cookies - 1)));
2681		vdc->dring_len = VD_DRING_LEN;
2682
2683		status = ldc_mem_dring_create(vdc->dring_len,
2684		    vdc->dring_entry_size, &vdc->dring_hdl);
2685		if ((vdc->dring_hdl == 0) || (status != 0)) {
2686			DMSG(vdc, 0, "[%d] Descriptor ring creation failed",
2687			    vdc->instance);
2688			return (status);
2689		}
2690		vdc->initialized |= VDC_DRING_INIT;
2691	}
2692
2693	if ((vdc->initialized & VDC_DRING_BOUND) == 0) {
2694		DMSG(vdc, 0, "[%d] ldc_mem_dring_bind\n", vdc->instance);
2695		vdc->dring_cookie =
2696		    kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP);
2697
2698		status = ldc_mem_dring_bind(vdc->curr_server->ldc_handle,
2699		    vdc->dring_hdl,
2700		    LDC_SHADOW_MAP|LDC_DIRECT_MAP, LDC_MEM_RW,
2701		    &vdc->dring_cookie[0],
2702		    &vdc->dring_cookie_count);
2703		if (status != 0) {
2704			DMSG(vdc, 0, "[%d] Failed to bind descriptor ring "
2705			    "(%lx) to channel (%lx) status=%d\n",
2706			    vdc->instance, vdc->dring_hdl,
2707			    vdc->curr_server->ldc_handle, status);
2708			return (status);
2709		}
2710		ASSERT(vdc->dring_cookie_count == 1);
2711		vdc->initialized |= VDC_DRING_BOUND;
2712	}
2713
2714	status = ldc_mem_dring_info(vdc->dring_hdl, &vdc->dring_mem_info);
2715	if (status != 0) {
2716		DMSG(vdc, 0,
2717		    "[%d] Failed to get info for descriptor ring (%lx)\n",
2718		    vdc->instance, vdc->dring_hdl);
2719		return (status);
2720	}
2721
2722	if ((vdc->initialized & VDC_DRING_LOCAL) == 0) {
2723		DMSG(vdc, 0, "[%d] local dring\n", vdc->instance);
2724
2725		/* Allocate the local copy of this dring */
2726		vdc->local_dring =
2727		    kmem_zalloc(vdc->dring_len * sizeof (vdc_local_desc_t),
2728		    KM_SLEEP);
2729		vdc->initialized |= VDC_DRING_LOCAL;
2730	}
2731
2732	/*
2733	 * Mark all DRing entries as free and initialize the private
2734	 * descriptor's memory handles. If any entry is initialized,
2735	 * we need to free it later so we set the bit in 'initialized'
2736	 * at the start.
2737	 */
2738	vdc->initialized |= VDC_DRING_ENTRY;
2739	for (i = 0; i < vdc->dring_len; i++) {
2740		dep = VDC_GET_DRING_ENTRY_PTR(vdc, i);
2741		dep->hdr.dstate = VIO_DESC_FREE;
2742
2743		status = ldc_mem_alloc_handle(vdc->curr_server->ldc_handle,
2744		    &vdc->local_dring[i].desc_mhdl);
2745		if (status != 0) {
2746			DMSG(vdc, 0, "![%d] Failed to alloc mem handle for"
2747			    " descriptor %d", vdc->instance, i);
2748			return (status);
2749		}
2750		vdc->local_dring[i].is_free = B_TRUE;
2751		vdc->local_dring[i].dep = dep;
2752	}
2753
2754	/* Initialize the starting index */
2755	vdc->dring_curr_idx = VDC_DRING_FIRST_ENTRY;
2756
2757	return (status);
2758}
2759
2760/*
2761 * Function:
2762 *	vdc_destroy_descriptor_ring()
2763 *
2764 * Description:
2765 *
2766 * Arguments:
2767 *	vdc	- soft state pointer for this instance of the device driver.
2768 *
2769 * Return Code:
2770 *	None
2771 */
2772static void
2773vdc_destroy_descriptor_ring(vdc_t *vdc)
2774{
2775	vdc_local_desc_t	*ldep = NULL;	/* Local Dring Entry Pointer */
2776	ldc_mem_handle_t	mhdl = 0;
2777	ldc_mem_info_t		minfo;
2778	int			status = -1;
2779	int			i;	/* loop */
2780
2781	ASSERT(vdc != NULL);
2782	ASSERT(mutex_owned(&vdc->lock));
2783
2784	DMSG(vdc, 0, "[%d] Entered\n", vdc->instance);
2785
2786	if (vdc->initialized & VDC_DRING_ENTRY) {
2787		DMSG(vdc, 0,
2788		    "[%d] Removing Local DRing entries\n", vdc->instance);
2789		for (i = 0; i < vdc->dring_len; i++) {
2790			ldep = &vdc->local_dring[i];
2791			mhdl = ldep->desc_mhdl;
2792
2793			if (mhdl == 0)
2794				continue;
2795
2796			if ((status = ldc_mem_info(mhdl, &minfo)) != 0) {
2797				DMSG(vdc, 0,
2798				    "ldc_mem_info returned an error: %d\n",
2799				    status);
2800
2801				/*
2802				 * This must mean that the mem handle
2803				 * is not valid. Clear it out so that
2804				 * no one tries to use it.
2805				 */
2806				ldep->desc_mhdl = 0;
2807				continue;
2808			}
2809
2810			if (minfo.status == LDC_BOUND) {
2811				(void) ldc_mem_unbind_handle(mhdl);
2812			}
2813
2814			(void) ldc_mem_free_handle(mhdl);
2815
2816			ldep->desc_mhdl = 0;
2817		}
2818		vdc->initialized &= ~VDC_DRING_ENTRY;
2819	}
2820
2821	if (vdc->initialized & VDC_DRING_LOCAL) {
2822		DMSG(vdc, 0, "[%d] Freeing Local DRing\n", vdc->instance);
2823		kmem_free(vdc->local_dring,
2824		    vdc->dring_len * sizeof (vdc_local_desc_t));
2825		vdc->initialized &= ~VDC_DRING_LOCAL;
2826	}
2827
2828	if (vdc->initialized & VDC_DRING_BOUND) {
2829		DMSG(vdc, 0, "[%d] Unbinding DRing\n", vdc->instance);
2830		status = ldc_mem_dring_unbind(vdc->dring_hdl);
2831		if (status == 0) {
2832			vdc->initialized &= ~VDC_DRING_BOUND;
2833		} else {
2834			DMSG(vdc, 0, "[%d] Error %d unbinding DRing %lx",
2835			    vdc->instance, status, vdc->dring_hdl);
2836		}
2837		kmem_free(vdc->dring_cookie, sizeof (ldc_mem_cookie_t));
2838	}
2839
2840	if (vdc->initialized & VDC_DRING_INIT) {
2841		DMSG(vdc, 0, "[%d] Destroying DRing\n", vdc->instance);
2842		status = ldc_mem_dring_destroy(vdc->dring_hdl);
2843		if (status == 0) {
2844			vdc->dring_hdl = 0;
2845			bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t));
2846			vdc->initialized &= ~VDC_DRING_INIT;
2847		} else {
2848			DMSG(vdc, 0, "[%d] Error %d destroying DRing (%lx)",
2849			    vdc->instance, status, vdc->dring_hdl);
2850		}
2851	}
2852}
2853
2854/*
2855 * Function:
2856 *	vdc_map_to_shared_dring()
2857 *
2858 * Description:
2859 *	Copy contents of the local descriptor to the shared
2860 *	memory descriptor.
2861 *
2862 * Arguments:
2863 *	vdcp	- soft state pointer for this instance of the device driver.
2864 *	idx	- descriptor ring index
2865 *
2866 * Return Code:
2867 *	None
2868 */
2869static int
2870vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
2871{
2872	vdc_local_desc_t	*ldep;
2873	vd_dring_entry_t	*dep;
2874	int			rv;
2875
2876	ldep = &(vdcp->local_dring[idx]);
2877
2878	/* for now leave in the old pop_mem_hdl stuff */
2879	if (ldep->nbytes > 0) {
2880		rv = vdc_populate_mem_hdl(vdcp, ldep);
2881		if (rv) {
2882			DMSG(vdcp, 0, "[%d] Cannot populate mem handle\n",
2883			    vdcp->instance);
2884			return (rv);
2885		}
2886	}
2887
2888	/*
2889	 * fill in the data details into the DRing
2890	 */
2891	dep = ldep->dep;
2892	ASSERT(dep != NULL);
2893
2894	dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdcp);
2895	dep->payload.operation = ldep->operation;
2896	dep->payload.addr = ldep->offset;
2897	dep->payload.nbytes = ldep->nbytes;
2898	dep->payload.status = (uint32_t)-1;	/* vds will set valid value */
2899	dep->payload.slice = ldep->slice;
2900	dep->hdr.dstate = VIO_DESC_READY;
2901	dep->hdr.ack = 1;		/* request an ACK for every message */
2902
2903	return (0);
2904}
2905
2906/*
2907 * Function:
2908 *	vdc_send_request
2909 *
2910 * Description:
2911 *	This routine writes the data to be transmitted to vds into the
2912 *	descriptor, notifies vds that the ring has been updated and
2913 *	then waits for the request to be processed.
2914 *
2915 * Arguments:
2916 *	vdcp	  - the soft state pointer
2917 *	operation - operation we want vds to perform (VD_OP_XXX)
2918 *	addr	  - address of data buf to be read/written.
2919 *	nbytes	  - number of bytes to read/write
2920 *	slice	  - the disk slice this request is for
2921 *	offset	  - relative disk offset
2922 *	bufp	  - buf of operation
2923 *	dir	  - direction of operation (READ/WRITE/BOTH)
2924 *
2925 * Return Codes:
2926 *	0
2927 *	ENXIO
2928 */
2929static int
2930vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
2931    size_t nbytes, int slice, diskaddr_t offset, buf_t *bufp,
2932    vio_desc_direction_t dir, int flags)
2933{
2934	int	rv = 0;
2935
2936	ASSERT(vdcp != NULL);
2937	ASSERT(slice == VD_SLICE_NONE || slice < V_NUMPAR);
2938
2939	mutex_enter(&vdcp->lock);
2940
2941	/*
2942	 * If this is a block read/write operation we update the I/O statistics
2943	 * to indicate that the request is being put on the waitq to be
2944	 * serviced. Operations which are resubmitted are already in the waitq.
2945	 *
2946	 * We do it here (a common routine for both synchronous and strategy
2947	 * calls) for performance reasons - we are already holding vdc->lock
2948	 * so there is no extra locking overhead. We would have to explicitly
2949	 * grab the 'lock' mutex to update the stats if we were to do this
2950	 * higher up the stack in vdc_strategy() et. al.
2951	 */
2952	if (((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) &&
2953	    !(flags & VDC_OP_RESUBMIT)) {
2954		DTRACE_IO1(start, buf_t *, bufp);
2955		VD_KSTAT_WAITQ_ENTER(vdcp);
2956	}
2957
2958	/*
2959	 * If the request does not expect the state to be VDC_STATE_RUNNING
2960	 * then we just try to populate the descriptor ring once.
2961	 */
2962	if (!(flags & VDC_OP_STATE_RUNNING)) {
2963		rv = vdc_populate_descriptor(vdcp, operation, addr,
2964		    nbytes, slice, offset, bufp, dir, flags);
2965		goto done;
2966	}
2967
2968	do {
2969		while (vdcp->state != VDC_STATE_RUNNING) {
2970
2971			/* return error if detaching */
2972			if (vdcp->state == VDC_STATE_DETACH) {
2973				rv = ENXIO;
2974				goto done;
2975			}
2976
2977			/*
2978			 * If we are panicking and the disk is not ready then
2979			 * we can't send any request because we can't complete
2980			 * the handshake now.
2981			 */
2982			if (ddi_in_panic()) {
2983				rv = EIO;
2984				goto done;
2985			}
2986
2987			/*
2988			 * If the state is faulted, notify that a new I/O is
2989			 * being submitted to force the system to check if any
2990			 * server has recovered.
2991			 */
2992			if (vdcp->state == VDC_STATE_FAILED) {
2993				vdcp->io_pending = B_TRUE;
2994				cv_signal(&vdcp->io_pending_cv);
2995			}
2996
2997			cv_wait(&vdcp->running_cv, &vdcp->lock);
2998
2999			/* if service is still faulted then fail the request */
3000			if (vdcp->state == VDC_STATE_FAILED) {
3001				rv = EIO;
3002				goto done;
3003			}
3004		}
3005
3006	} while (vdc_populate_descriptor(vdcp, operation, addr,
3007	    nbytes, slice, offset, bufp, dir, flags & ~VDC_OP_RESUBMIT));
3008
3009done:
3010	/*
3011	 * If this is a block read/write we update the I/O statistics kstat
3012	 * to indicate that this request has been placed on the queue for
3013	 * processing (i.e sent to the vDisk server) - iostat(1M) will
3014	 * report the time waiting for the vDisk server under the %b column
3015	 *
3016	 * In the case of an error we take it off the wait queue only if
3017	 * the I/O was not resubmited.
3018	 */
3019	if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
3020		if (rv == 0) {
3021			VD_KSTAT_WAITQ_TO_RUNQ(vdcp);
3022			DTRACE_PROBE1(send, buf_t *, bufp);
3023		} else {
3024			VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
3025			if (!(flags & VDC_OP_RESUBMIT)) {
3026				VD_KSTAT_WAITQ_EXIT(vdcp);
3027				DTRACE_IO1(done, buf_t *, bufp);
3028			}
3029		}
3030	}
3031
3032	mutex_exit(&vdcp->lock);
3033
3034	return (rv);
3035}
3036
3037
3038/*
3039 * Function:
3040 *	vdc_populate_descriptor
3041 *
3042 * Description:
3043 *	This routine writes the data to be transmitted to vds into the
3044 *	descriptor, notifies vds that the ring has been updated and
3045 *	then waits for the request to be processed.
3046 *
3047 * Arguments:
3048 *	vdcp	  - the soft state pointer
3049 *	operation - operation we want vds to perform (VD_OP_XXX)
3050 *	addr	  - address of data buf to be read/written.
3051 *	nbytes	  - number of bytes to read/write
3052 *	slice	  - the disk slice this request is for
3053 *	offset	  - relative disk offset
3054 *	bufp	  - buf of operation
3055 *	dir	  - direction of operation (READ/WRITE/BOTH)
3056 *
3057 * Return Codes:
3058 *	0
3059 *	EAGAIN
3060 *	ECONNRESET
3061 *	ENXIO
3062 */
3063static int
3064vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr,
3065    size_t nbytes, int slice, diskaddr_t offset,
3066    buf_t *bufp, vio_desc_direction_t dir, int flags)
3067{
3068	vdc_local_desc_t	*local_dep = NULL; /* Local Dring Pointer */
3069	int			idx;		/* Index of DRing entry used */
3070	int			next_idx;
3071	vio_dring_msg_t		dmsg;
3072	size_t			msglen;
3073	int			rv;
3074
3075	ASSERT(MUTEX_HELD(&vdcp->lock));
3076	vdcp->threads_pending++;
3077loop:
3078	DMSG(vdcp, 2, ": dring_curr_idx = %d\n", vdcp->dring_curr_idx);
3079
3080	if (flags & VDC_OP_DRING_RESERVED) {
3081		/* use D-Ring reserved entry */
3082		idx = VDC_DRING_FIRST_RESV;
3083		local_dep = &(vdcp->local_dring[idx]);
3084	} else {
3085		/* Get next available D-Ring entry */
3086		idx = vdcp->dring_curr_idx;
3087		local_dep = &(vdcp->local_dring[idx]);
3088
3089		if (!local_dep->is_free) {
3090			DMSG(vdcp, 2, "[%d]: dring full - waiting for space\n",
3091			    vdcp->instance);
3092			cv_wait(&vdcp->dring_free_cv, &vdcp->lock);
3093			if (vdcp->state == VDC_STATE_RUNNING ||
3094			    vdcp->state == VDC_STATE_HANDLE_PENDING) {
3095				goto loop;
3096			}
3097			vdcp->threads_pending--;
3098			return (ECONNRESET);
3099		}
3100
3101		next_idx = idx + 1;
3102		if (next_idx >= vdcp->dring_len)
3103			next_idx = VDC_DRING_FIRST_ENTRY;
3104		vdcp->dring_curr_idx = next_idx;
3105	}
3106
3107	ASSERT(local_dep->is_free);
3108
3109	local_dep->operation = operation;
3110	local_dep->addr = addr;
3111	local_dep->nbytes = nbytes;
3112	local_dep->slice = slice;
3113	local_dep->offset = offset;
3114	local_dep->buf = bufp;
3115	local_dep->dir = dir;
3116	local_dep->flags = flags;
3117
3118	local_dep->is_free = B_FALSE;
3119
3120	rv = vdc_map_to_shared_dring(vdcp, idx);
3121	if (rv) {
3122		if (flags & VDC_OP_DRING_RESERVED) {
3123			DMSG(vdcp, 0, "[%d]: cannot bind memory - error\n",
3124			    vdcp->instance);
3125			/*
3126			 * We can't wait if we are using reserved slot.
3127			 * Free the descriptor and return.
3128			 */
3129			local_dep->is_free = B_TRUE;
3130			vdcp->threads_pending--;
3131			return (rv);
3132		}
3133		DMSG(vdcp, 0, "[%d]: cannot bind memory - waiting ..\n",
3134		    vdcp->instance);
3135		/* free the descriptor */
3136		local_dep->is_free = B_TRUE;
3137		vdcp->dring_curr_idx = idx;
3138		cv_wait(&vdcp->membind_cv, &vdcp->lock);
3139		if (vdcp->state == VDC_STATE_RUNNING ||
3140		    vdcp->state == VDC_STATE_HANDLE_PENDING) {
3141			goto loop;
3142		}
3143		vdcp->threads_pending--;
3144		return (ECONNRESET);
3145	}
3146
3147	/*
3148	 * Send a msg with the DRing details to vds
3149	 */
3150	VIO_INIT_DRING_DATA_TAG(dmsg);
3151	VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdcp);
3152	dmsg.dring_ident = vdcp->dring_ident;
3153	dmsg.start_idx = idx;
3154	dmsg.end_idx = idx;
3155	vdcp->seq_num++;
3156
3157	DTRACE_PROBE2(populate, int, vdcp->instance,
3158	    vdc_local_desc_t *, local_dep);
3159	DMSG(vdcp, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n",
3160	    vdcp->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num);
3161
3162	/*
3163	 * note we're still holding the lock here to
3164	 * make sure the message goes out in order !!!...
3165	 */
3166	msglen = sizeof (dmsg);
3167	rv = vdc_send(vdcp, (caddr_t)&dmsg, &msglen);
3168	switch (rv) {
3169	case ECONNRESET:
3170		/*
3171		 * vdc_send initiates the reset on failure.
3172		 * Since the transaction has already been put
3173		 * on the local dring, it will automatically get
3174		 * retried when the channel is reset. Given that,
3175		 * it is ok to just return success even though the
3176		 * send failed.
3177		 */
3178		rv = 0;
3179		break;
3180
3181	case 0: /* EOK */
3182		DMSG(vdcp, 1, "sent via LDC: rv=%d\n", rv);
3183		break;
3184
3185	default:
3186		DMSG(vdcp, 0, "unexpected error, rv=%d\n", rv);
3187		rv = ENXIO;
3188		break;
3189	}
3190
3191	vdcp->threads_pending--;
3192	return (rv);
3193}
3194
3195/*
3196 * Function:
3197 *	vdc_do_op
3198 *
3199 * Description:
3200 *	Wrapper around vdc_submit_request(). Each request is associated with a
3201 *	buf structure. If a buf structure is provided (bufp != NULL) then the
3202 *	request will be submitted with that buf, and the caller can wait for
3203 *	completion of the request with biowait(). If a buf structure is not
3204 *	provided (bufp == NULL) then a buf structure is created and the function
3205 *	waits for the completion of the request.
3206 *
3207 *	If the flag VD_OP_STATE_RUNNING is set then vdc_submit_request() will
3208 *	submit the request only when the vdisk is in state VD_STATE_RUNNING.
3209 *	If the vdisk is not in that state then the vdc_submit_request() will
3210 *	wait for that state to be reached. After the request is submitted, the
3211 *	reply will be processed asynchronously by the vdc_process_msg_thread()
3212 *	thread.
3213 *
3214 *	If the flag VD_OP_STATE_RUNNING is not set then vdc_submit_request()
3215 *	submit the request whatever the state of the vdisk is. Then vdc_do_op()
3216 *	will wait for a reply message, process the reply and complete the
3217 *	request.
3218 *
3219 * Arguments:
3220 *	vdc	- the soft state pointer
3221 *	op	- operation we want vds to perform (VD_OP_XXX)
3222 *	addr	- address of data buf to be read/written.
3223 *	nbytes	- number of bytes to read/write
3224 *	slice	- the disk slice this request is for
3225 *	offset	- relative disk offset
3226 *	bufp	- buf structure associated with the request (can be NULL).
3227 *	dir	- direction of operation (READ/WRITE/BOTH)
3228 *	flags	- flags for the request.
3229 *
3230 * Return Codes:
3231 *	0	- the request has been succesfully submitted and completed.
3232 *	!= 0	- the request has failed. In that case, if a buf structure
3233 *		  was provided (bufp != NULL) then the B_ERROR flag is set
3234 *		  and the b_error field of the buf structure is set to EIO.
3235 */
3236static int
3237vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes, int slice,
3238    diskaddr_t offset, struct buf *bufp, vio_desc_direction_t dir, int flags)
3239{
3240	vio_msg_t vio_msg;
3241	struct buf buf;
3242	int rv;
3243
3244	if (bufp == NULL) {
3245		/*
3246		 * We use buf just as a convenient way to get a notification
3247		 * that the request is completed, so we initialize buf to the
3248		 * minimum we need.
3249		 */
3250		bioinit(&buf);
3251		buf.b_bcount = nbytes;
3252		buf.b_flags = B_BUSY;
3253		bufp = &buf;
3254	}
3255
3256	rv = vdc_send_request(vdc, op, addr, nbytes, slice, offset, bufp,
3257	    dir, flags);
3258
3259	if (rv != 0)
3260		goto done;
3261
3262	/*
3263	 * If the request should be done in VDC_STATE_RUNNING state then the
3264	 * reply will be received and processed by vdc_process_msg_thread()
3265	 * and we just have to handle the panic case. Otherwise we have to
3266	 * wait for the reply message and process it.
3267	 */
3268	if (flags & VDC_OP_STATE_RUNNING) {
3269
3270		if (ddi_in_panic()) {
3271			rv = vdc_drain_response(vdc, bufp);
3272			goto done;
3273		}
3274
3275	} else {
3276		/* wait for the response message */
3277		rv = vdc_wait_for_response(vdc, &vio_msg);
3278
3279		if (rv == 0)
3280			rv = vdc_process_data_msg(vdc, &vio_msg);
3281
3282		if (rv) {
3283			/*
3284			 * If this is a block read/write we update the I/O
3285			 * statistics kstat to take it off the run queue.
3286			 * If it is a resubmit then it needs to stay in
3287			 * in the waitq, and it will be removed when the
3288			 * I/O is eventually completed or cancelled.
3289			 */
3290			mutex_enter(&vdc->lock);
3291			if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
3292				if (flags & VDC_OP_RESUBMIT) {
3293					VD_KSTAT_RUNQ_BACK_TO_WAITQ(vdc);
3294				} else {
3295					VD_KSTAT_RUNQ_EXIT(vdc);
3296					DTRACE_IO1(done, buf_t *, bufp);
3297				}
3298			}
3299			mutex_exit(&vdc->lock);
3300			goto done;
3301		}
3302
3303	}
3304
3305	if (bufp == &buf)
3306		rv = biowait(bufp);
3307
3308done:
3309	if (bufp == &buf) {
3310		biofini(bufp);
3311	} else if (rv != 0) {
3312		bioerror(bufp, EIO);
3313		biodone(bufp);
3314	}
3315
3316	return (rv);
3317}
3318
3319/*
3320 * Function:
3321 *	vdc_do_sync_op
3322 *
3323 * Description:
3324 *	Wrapper around vdc_do_op that serializes requests.
3325 *
3326 * Arguments:
3327 *	vdcp	  - the soft state pointer
3328 *	operation - operation we want vds to perform (VD_OP_XXX)
3329 *	addr	  - address of data buf to be read/written.
3330 *	nbytes	  - number of bytes to read/write
3331 *	slice	  - the disk slice this request is for
3332 *	offset	  - relative disk offset
3333 *	dir	  - direction of operation (READ/WRITE/BOTH)
3334 *	rconflict - check for reservation conflict in case of failure
3335 *
3336 * rconflict should be set to B_TRUE by most callers. Callers invoking the
3337 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the
3338 * result of a successful operation with vdc_scsi_status().
3339 *
3340 * Return Codes:
3341 *	0
3342 *	EAGAIN
3343 *	EFAULT
3344 *	ENXIO
3345 *	EIO
3346 */
3347static int
3348vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes,
3349    int slice, diskaddr_t offset, vio_desc_direction_t dir, boolean_t rconflict)
3350{
3351	int status;
3352	int flags = VDC_OP_NORMAL;
3353
3354	/*
3355	 * Grab the lock, if blocked wait until the server
3356	 * response causes us to wake up again.
3357	 */
3358	mutex_enter(&vdcp->lock);
3359	vdcp->sync_op_cnt++;
3360	while (vdcp->sync_op_blocked && vdcp->state != VDC_STATE_DETACH) {
3361		if (ddi_in_panic()) {
3362			/* don't block if we are panicking */
3363			vdcp->sync_op_cnt--;
3364			mutex_exit(&vdcp->lock);
3365			return (EIO);
3366		} else {
3367			cv_wait(&vdcp->sync_blocked_cv, &vdcp->lock);
3368		}
3369	}
3370
3371	if (vdcp->state == VDC_STATE_DETACH) {
3372		cv_broadcast(&vdcp->sync_blocked_cv);
3373		vdcp->sync_op_cnt--;
3374		mutex_exit(&vdcp->lock);
3375		return (ENXIO);
3376	}
3377
3378	/* now block anyone other thread entering after us */
3379	vdcp->sync_op_blocked = B_TRUE;
3380
3381	mutex_exit(&vdcp->lock);
3382
3383	if (!rconflict)
3384		flags &= ~VDC_OP_ERRCHK_CONFLICT;
3385
3386	status = vdc_do_op(vdcp, operation, addr, nbytes, slice, offset,
3387	    NULL, dir, flags);
3388
3389	mutex_enter(&vdcp->lock);
3390
3391	DMSG(vdcp, 2, ": operation returned %d\n", status);
3392
3393	if (vdcp->state == VDC_STATE_DETACH) {
3394		status = ENXIO;
3395	}
3396
3397	vdcp->sync_op_blocked = B_FALSE;
3398	vdcp->sync_op_cnt--;
3399
3400	/* signal the next waiting thread */
3401	cv_signal(&vdcp->sync_blocked_cv);
3402
3403	mutex_exit(&vdcp->lock);
3404
3405	return (status);
3406}
3407
3408
3409/*
3410 * Function:
3411 *	vdc_drain_response()
3412 *
3413 * Description:
3414 *	When a guest is panicking, the completion of requests needs to be
3415 *	handled differently because interrupts are disabled and vdc
3416 *	will not get messages. We have to poll for the messages instead.
3417 *
3418 *	Note: since we are panicking we don't implement	the io:::done
3419 *	DTrace probe or update the I/O statistics kstats.
3420 *
3421 * Arguments:
3422 *	vdc	- soft state pointer for this instance of the device driver.
3423 *	buf	- if buf is NULL then we drain all responses, otherwise we
3424 *		  poll until we receive a ACK/NACK for the specific I/O
3425 *		  described by buf.
3426 *
3427 * Return Code:
3428 *	0	- Success. If we were expecting a response to a particular
3429 *		  request then this means that a response has been received.
3430 */
3431static int
3432vdc_drain_response(vdc_t *vdc, struct buf *buf)
3433{
3434	int			rv, idx, retries;
3435	size_t			msglen;
3436	vdc_local_desc_t	*ldep = NULL;	/* Local Dring Entry Pointer */
3437	vio_dring_msg_t		dmsg;
3438	struct buf		*mbuf;
3439	boolean_t		ack;
3440
3441	mutex_enter(&vdc->lock);
3442
3443	retries = 0;
3444	for (;;) {
3445		msglen = sizeof (dmsg);
3446		rv = ldc_read(vdc->curr_server->ldc_handle, (caddr_t)&dmsg,
3447		    &msglen);
3448		if (rv) {
3449			rv = EINVAL;
3450			break;
3451		}
3452
3453		/*
3454		 * if there are no packets wait and check again
3455		 */
3456		if ((rv == 0) && (msglen == 0)) {
3457			if (retries++ > vdc_dump_retries) {
3458				rv = EAGAIN;
3459				break;
3460			}
3461
3462			drv_usecwait(vdc_usec_timeout_dump);
3463			continue;
3464		}
3465
3466		/*
3467		 * Ignore all messages that are not ACKs/NACKs to
3468		 * DRing requests.
3469		 */
3470		if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) ||
3471		    (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) {
3472			DMSG(vdc, 0, "discard pkt: type=%d sub=%d env=%d\n",
3473			    dmsg.tag.vio_msgtype,
3474			    dmsg.tag.vio_subtype,
3475			    dmsg.tag.vio_subtype_env);
3476			continue;
3477		}
3478
3479		/*
3480		 * Record if the packet was ACK'ed or not. If the packet was not
3481		 * ACK'ed then we will just mark the request as failed; we don't
3482		 * want to reset the connection at this point.
3483		 */
3484		switch (dmsg.tag.vio_subtype) {
3485		case VIO_SUBTYPE_ACK:
3486			ack = B_TRUE;
3487			break;
3488		case VIO_SUBTYPE_NACK:
3489			ack = B_FALSE;
3490			break;
3491		default:
3492			continue;
3493		}
3494
3495		idx = dmsg.start_idx;
3496		if (idx >= vdc->dring_len) {
3497			DMSG(vdc, 0, "[%d] Bogus ack data : start %d\n",
3498			    vdc->instance, idx);
3499			continue;
3500		}
3501		ldep = &vdc->local_dring[idx];
3502		if (ldep->dep->hdr.dstate != VIO_DESC_DONE) {
3503			DMSG(vdc, 0, "[%d] Entry @ %d - state !DONE %d\n",
3504			    vdc->instance, idx, ldep->dep->hdr.dstate);
3505			continue;
3506		}
3507
3508		mbuf = ldep->buf;
3509		ASSERT(mbuf != NULL);
3510		mbuf->b_resid = mbuf->b_bcount - ldep->dep->payload.nbytes;
3511		bioerror(mbuf, ack ? ldep->dep->payload.status : EIO);
3512		biodone(mbuf);
3513
3514		rv = vdc_depopulate_descriptor(vdc, idx);
3515		if (buf != NULL && buf == mbuf) {
3516			rv = 0;
3517			goto done;
3518		}
3519
3520		/* if this is the last descriptor - break out of loop */
3521		if ((idx + 1) % vdc->dring_len == vdc->dring_curr_idx) {
3522			/*
3523			 * If we were expecting a response for a particular
3524			 * request then we return with an error otherwise we
3525			 * have successfully completed the drain.
3526			 */
3527			rv = (buf != NULL)? ESRCH: 0;
3528			break;
3529		}
3530	}
3531
3532done:
3533	mutex_exit(&vdc->lock);
3534	DMSG(vdc, 0, "End idx=%d\n", idx);
3535
3536	return (rv);
3537}
3538
3539
3540/*
3541 * Function:
3542 *	vdc_depopulate_descriptor()
3543 *
3544 * Description:
3545 *
3546 * Arguments:
3547 *	vdc	- soft state pointer for this instance of the device driver.
3548 *	idx	- Index of the Descriptor Ring entry being modified
3549 *
3550 * Return Code:
3551 *	0	- Success
3552 */
3553static int
3554vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx)
3555{
3556	vd_dring_entry_t *dep = NULL;		/* Dring Entry Pointer */
3557	vdc_local_desc_t *ldep = NULL;		/* Local Dring Entry Pointer */
3558	int		status = ENXIO;
3559	int		rv = 0;
3560
3561	ASSERT(vdc != NULL);
3562	ASSERT(idx < vdc->dring_len);
3563	ldep = &vdc->local_dring[idx];
3564	ASSERT(ldep != NULL);
3565	ASSERT(MUTEX_HELD(&vdc->lock));
3566
3567	DTRACE_PROBE2(depopulate, int, vdc->instance, vdc_local_desc_t *, ldep);
3568	DMSG(vdc, 2, ": idx = %d\n", idx);
3569
3570	dep = ldep->dep;
3571	ASSERT(dep != NULL);
3572	ASSERT((dep->hdr.dstate == VIO_DESC_DONE) ||
3573	    (dep->payload.status == ECANCELED));
3574
3575	VDC_MARK_DRING_ENTRY_FREE(vdc, idx);
3576
3577	ldep->is_free = B_TRUE;
3578	status = dep->payload.status;
3579	DMSG(vdc, 2, ": is_free = %d : status = %d\n", ldep->is_free, status);
3580
3581	/*
3582	 * If no buffers were used to transfer information to the server when
3583	 * populating the descriptor then no memory handles need to be unbound
3584	 * and we can return now.
3585	 */
3586	if (ldep->nbytes == 0) {
3587		cv_signal(&vdc->dring_free_cv);
3588		return (status);
3589	}
3590
3591	/*
3592	 * If the upper layer passed in a misaligned address we copied the
3593	 * data into an aligned buffer before sending it to LDC - we now
3594	 * copy it back to the original buffer.
3595	 */
3596	if (ldep->align_addr) {
3597		ASSERT(ldep->addr != NULL);
3598
3599		if (dep->payload.nbytes > 0)
3600			bcopy(ldep->align_addr, ldep->addr,
3601			    dep->payload.nbytes);
3602		kmem_free(ldep->align_addr,
3603		    sizeof (caddr_t) * P2ROUNDUP(ldep->nbytes, 8));
3604		ldep->align_addr = NULL;
3605	}
3606
3607	rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
3608	if (rv != 0) {
3609		DMSG(vdc, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
3610		    vdc->instance, ldep->desc_mhdl, idx, rv);
3611		/*
3612		 * The error returned by the vDisk server is more informative
3613		 * and thus has a higher priority but if it isn't set we ensure
3614		 * that this function returns an error.
3615		 */
3616		if (status == 0)
3617			status = EINVAL;
3618	}
3619
3620	cv_signal(&vdc->membind_cv);
3621	cv_signal(&vdc->dring_free_cv);
3622
3623	return (status);
3624}
3625
3626/*
3627 * Function:
3628 *	vdc_populate_mem_hdl()
3629 *
3630 * Description:
3631 *
3632 * Arguments:
3633 *	vdc	- soft state pointer for this instance of the device driver.
3634 *	idx	- Index of the Descriptor Ring entry being modified
3635 *	addr	- virtual address being mapped in
3636 *	nybtes	- number of bytes in 'addr'
3637 *	operation - the vDisk operation being performed (VD_OP_xxx)
3638 *
3639 * Return Code:
3640 *	0	- Success
3641 */
3642static int
3643vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep)
3644{
3645	vd_dring_entry_t	*dep = NULL;
3646	ldc_mem_handle_t	mhdl;
3647	caddr_t			vaddr;
3648	size_t			nbytes;
3649	uint8_t			perm = LDC_MEM_RW;
3650	uint8_t			maptype;
3651	int			rv = 0;
3652	int			i;
3653
3654	ASSERT(vdcp != NULL);
3655
3656	dep = ldep->dep;
3657	mhdl = ldep->desc_mhdl;
3658
3659	switch (ldep->dir) {
3660	case VIO_read_dir:
3661		perm = LDC_MEM_W;
3662		break;
3663
3664	case VIO_write_dir:
3665		perm = LDC_MEM_R;
3666		break;
3667
3668	case VIO_both_dir:
3669		perm = LDC_MEM_RW;
3670		break;
3671
3672	default:
3673		ASSERT(0);	/* catch bad programming in vdc */
3674	}
3675
3676	/*
3677	 * LDC expects any addresses passed in to be 8-byte aligned. We need
3678	 * to copy the contents of any misaligned buffers to a newly allocated
3679	 * buffer and bind it instead (and copy the the contents back to the
3680	 * original buffer passed in when depopulating the descriptor)
3681	 */
3682	vaddr = ldep->addr;
3683	nbytes = ldep->nbytes;
3684	if (((uint64_t)vaddr & 0x7) != 0) {
3685		ASSERT(ldep->align_addr == NULL);
3686		ldep->align_addr =
3687		    kmem_alloc(sizeof (caddr_t) *
3688		    P2ROUNDUP(nbytes, 8), KM_SLEEP);
3689		DMSG(vdcp, 0, "[%d] Misaligned address %p reallocating "
3690		    "(buf=%p nb=%ld op=%d)\n",
3691		    vdcp->instance, (void *)vaddr, (void *)ldep->align_addr,
3692		    nbytes, ldep->operation);
3693		if (perm != LDC_MEM_W)
3694			bcopy(vaddr, ldep->align_addr, nbytes);
3695		vaddr = ldep->align_addr;
3696	}
3697
3698	maptype = LDC_IO_MAP|LDC_SHADOW_MAP;
3699	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
3700	    maptype, perm, &dep->payload.cookie[0], &dep->payload.ncookies);
3701	DMSG(vdcp, 2, "[%d] bound mem handle; ncookies=%d\n",
3702	    vdcp->instance, dep->payload.ncookies);
3703	if (rv != 0) {
3704		DMSG(vdcp, 0, "[%d] Failed to bind LDC memory handle "
3705		    "(mhdl=%p, buf=%p, err=%d)\n",
3706		    vdcp->instance, (void *)mhdl, (void *)vaddr, rv);
3707		if (ldep->align_addr) {
3708			kmem_free(ldep->align_addr,
3709			    sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
3710			ldep->align_addr = NULL;
3711		}
3712		return (EAGAIN);
3713	}
3714
3715	/*
3716	 * Get the other cookies (if any).
3717	 */
3718	for (i = 1; i < dep->payload.ncookies; i++) {
3719		rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
3720		if (rv != 0) {
3721			(void) ldc_mem_unbind_handle(mhdl);
3722			DMSG(vdcp, 0, "?[%d] Failed to get next cookie "
3723			    "(mhdl=%lx cnum=%d), err=%d",
3724			    vdcp->instance, mhdl, i, rv);
3725			if (ldep->align_addr) {
3726				kmem_free(ldep->align_addr,
3727				    sizeof (caddr_t) * ldep->nbytes);
3728				ldep->align_addr = NULL;
3729			}
3730			return (EAGAIN);
3731		}
3732	}
3733
3734	return (rv);
3735}
3736
3737/*
3738 * Interrupt handlers for messages from LDC
3739 */
3740
3741/*
3742 * Function:
3743 *	vdc_handle_cb()
3744 *
3745 * Description:
3746 *
3747 * Arguments:
3748 *	event	- Type of event (LDC_EVT_xxx) that triggered the callback
3749 *	arg	- soft state pointer for this instance of the device driver.
3750 *
3751 * Return Code:
3752 *	0	- Success
3753 */
3754static uint_t
3755vdc_handle_cb(uint64_t event, caddr_t arg)
3756{
3757	ldc_status_t	ldc_state;
3758	int		rv = 0;
3759	vdc_server_t	*srvr = (vdc_server_t *)(void *)arg;
3760	vdc_t		*vdc = srvr->vdcp;
3761
3762	ASSERT(vdc != NULL);
3763
3764	DMSG(vdc, 1, "evt=%lx seqID=%ld\n", event, vdc->seq_num);
3765
3766	/* If callback is not for the current server, ignore it */
3767	mutex_enter(&vdc->lock);
3768
3769	if (vdc->curr_server != srvr) {
3770		DMSG(vdc, 0, "[%d] Ignoring event 0x%lx for port@%ld\n",
3771		    vdc->instance, event, srvr->id);
3772		mutex_exit(&vdc->lock);
3773		return (LDC_SUCCESS);
3774	}
3775
3776	/*
3777	 * Depending on the type of event that triggered this callback,
3778	 * we modify the handshake state or read the data.
3779	 *
3780	 * NOTE: not done as a switch() as event could be triggered by
3781	 * a state change and a read request. Also the ordering	of the
3782	 * check for the event types is deliberate.
3783	 */
3784	if (event & LDC_EVT_UP) {
3785		DMSG(vdc, 0, "[%d] Received LDC_EVT_UP\n", vdc->instance);
3786
3787		/* get LDC state */
3788		rv = ldc_status(srvr->ldc_handle, &ldc_state);
3789		if (rv != 0) {
3790			DMSG(vdc, 0, "[%d] Couldn't get LDC status %d",
3791			    vdc->instance, rv);
3792			mutex_exit(&vdc->lock);
3793			return (LDC_SUCCESS);
3794		}
3795		if (srvr->ldc_state != LDC_UP &&
3796		    ldc_state == LDC_UP) {
3797			/*
3798			 * Reset the transaction sequence numbers when
3799			 * LDC comes up. We then kick off the handshake
3800			 * negotiation with the vDisk server.
3801			 */
3802			vdc->seq_num = 1;
3803			vdc->seq_num_reply = 0;
3804			vdc->io_pending = B_TRUE;
3805			srvr->ldc_state = ldc_state;
3806			cv_signal(&vdc->initwait_cv);
3807			cv_signal(&vdc->io_pending_cv);
3808		}
3809	}
3810
3811	if (event & LDC_EVT_READ) {
3812		DMSG(vdc, 1, "[%d] Received LDC_EVT_READ\n", vdc->instance);
3813		mutex_enter(&vdc->read_lock);
3814		cv_signal(&vdc->read_cv);
3815		vdc->read_state = VDC_READ_PENDING;
3816		mutex_exit(&vdc->read_lock);
3817		mutex_exit(&vdc->lock);
3818
3819		/* that's all we have to do - no need to handle DOWN/RESET */
3820		return (LDC_SUCCESS);
3821	}
3822
3823	if (event & (LDC_EVT_RESET|LDC_EVT_DOWN)) {
3824
3825		DMSG(vdc, 0, "[%d] Received LDC RESET event\n", vdc->instance);
3826
3827		/*
3828		 * Need to wake up any readers so they will
3829		 * detect that a reset has occurred.
3830		 */
3831		mutex_enter(&vdc->read_lock);
3832		if ((vdc->read_state == VDC_READ_WAITING) ||
3833		    (vdc->read_state == VDC_READ_RESET))
3834			cv_signal(&vdc->read_cv);
3835		vdc->read_state = VDC_READ_RESET;
3836		mutex_exit(&vdc->read_lock);
3837
3838		/* wake up any threads waiting for connection to come up */
3839		if (vdc->state == VDC_STATE_INIT_WAITING) {
3840			vdc->state = VDC_STATE_RESETTING;
3841			cv_signal(&vdc->initwait_cv);
3842		} else if (vdc->state == VDC_STATE_FAILED) {
3843			vdc->io_pending = B_TRUE;
3844			cv_signal(&vdc->io_pending_cv);
3845		}
3846
3847	}
3848
3849	mutex_exit(&vdc->lock);
3850
3851	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
3852		DMSG(vdc, 0, "![%d] Unexpected LDC event (%lx) received",
3853		    vdc->instance, event);
3854
3855	return (LDC_SUCCESS);
3856}
3857
3858/*
3859 * Function:
3860 *	vdc_wait_for_response()
3861 *
3862 * Description:
3863 *	Block waiting for a response from the server. If there is
3864 *	no data the thread block on the read_cv that is signalled
3865 *	by the callback when an EVT_READ occurs.
3866 *
3867 * Arguments:
3868 *	vdcp	- soft state pointer for this instance of the device driver.
3869 *
3870 * Return Code:
3871 *	0	- Success
3872 */
3873static int
3874vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp)
3875{
3876	size_t		nbytes = sizeof (*msgp);
3877	int		status;
3878
3879	ASSERT(vdcp != NULL);
3880
3881	DMSG(vdcp, 1, "[%d] Entered\n", vdcp->instance);
3882
3883	status = vdc_recv(vdcp, msgp, &nbytes);
3884	DMSG(vdcp, 3, "vdc_read() done.. status=0x%x size=0x%x\n",
3885	    status, (int)nbytes);
3886	if (status) {
3887		DMSG(vdcp, 0, "?[%d] Error %d reading LDC msg\n",
3888		    vdcp->instance, status);
3889		return (status);
3890	}
3891
3892	if (nbytes < sizeof (vio_msg_tag_t)) {
3893		DMSG(vdcp, 0, "?[%d] Expect %lu bytes; recv'd %lu\n",
3894		    vdcp->instance, sizeof (vio_msg_tag_t), nbytes);
3895		return (ENOMSG);
3896	}
3897
3898	DMSG(vdcp, 2, "[%d] (%x/%x/%x)\n", vdcp->instance,
3899	    msgp->tag.vio_msgtype,
3900	    msgp->tag.vio_subtype,
3901	    msgp->tag.vio_subtype_env);
3902
3903	/*
3904	 * Verify the Session ID of the message
3905	 *
3906	 * Every message after the Version has been negotiated should
3907	 * have the correct session ID set.
3908	 */
3909	if ((msgp->tag.vio_sid != vdcp->session_id) &&
3910	    (msgp->tag.vio_subtype_env != VIO_VER_INFO)) {
3911		DMSG(vdcp, 0, "[%d] Invalid SID: received 0x%x, "
3912		    "expected 0x%lx [seq num %lx @ %d]",
3913		    vdcp->instance, msgp->tag.vio_sid,
3914		    vdcp->session_id,
3915		    ((vio_dring_msg_t *)msgp)->seq_num,
3916		    ((vio_dring_msg_t *)msgp)->start_idx);
3917		return (ENOMSG);
3918	}
3919	return (0);
3920}
3921
3922
3923/*
3924 * Function:
3925 *	vdc_resubmit_backup_dring()
3926 *
3927 * Description:
3928 *	Resubmit each descriptor in the backed up dring to
3929 *	vDisk server. The Dring was backed up during connection
3930 *	reset.
3931 *
3932 * Arguments:
3933 *	vdcp	- soft state pointer for this instance of the device driver.
3934 *
3935 * Return Code:
3936 *	0	- Success
3937 */
3938static int
3939vdc_resubmit_backup_dring(vdc_t *vdcp)
3940{
3941	int		processed = 0;
3942	int		count;
3943	int		b_idx;
3944	int		rv = 0;
3945	int		dring_size;
3946	vdc_local_desc_t	*curr_ldep;
3947
3948	ASSERT(MUTEX_NOT_HELD(&vdcp->lock));
3949	ASSERT(vdcp->state == VDC_STATE_HANDLE_PENDING);
3950
3951	if (vdcp->local_dring_backup == NULL) {
3952		/* the pending requests have already been processed */
3953		return (0);
3954	}
3955
3956	DMSG(vdcp, 1, "restoring pending dring entries (len=%d, tail=%d)\n",
3957	    vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail);
3958
3959	/*
3960	 * Walk the backup copy of the local descriptor ring and
3961	 * resubmit all the outstanding transactions.
3962	 */
3963	b_idx = vdcp->local_dring_backup_tail;
3964	for (count = 0; count < vdcp->local_dring_backup_len; count++) {
3965
3966		curr_ldep = &(vdcp->local_dring_backup[b_idx]);
3967
3968		/* only resubmit outstanding transactions */
3969		if (!curr_ldep->is_free) {
3970
3971			DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx);
3972
3973			rv = vdc_do_op(vdcp, curr_ldep->operation,
3974			    curr_ldep->addr, curr_ldep->nbytes,
3975			    curr_ldep->slice, curr_ldep->offset,
3976			    curr_ldep->buf, curr_ldep->dir,
3977			    (curr_ldep->flags & ~VDC_OP_STATE_RUNNING) |
3978			    VDC_OP_RESUBMIT);
3979
3980			if (rv) {
3981				DMSG(vdcp, 1, "[%d] resubmit entry %d failed\n",
3982				    vdcp->instance, b_idx);
3983				goto done;
3984			}
3985
3986			/*
3987			 * Mark this entry as free so that we will not resubmit
3988			 * this "done" request again, if we were to use the same
3989			 * backup_dring again in future. This could happen when
3990			 * a reset happens while processing the backup_dring.
3991			 */
3992			curr_ldep->is_free = B_TRUE;
3993			processed++;
3994		}
3995
3996		/* get the next element to submit */
3997		if (++b_idx >= vdcp->local_dring_backup_len)
3998			b_idx = 0;
3999	}
4000
4001	/* all done - now clear up pending dring copy */
4002	dring_size = vdcp->local_dring_backup_len *
4003	    sizeof (vdcp->local_dring_backup[0]);
4004
4005	(void) kmem_free(vdcp->local_dring_backup, dring_size);
4006
4007	vdcp->local_dring_backup = NULL;
4008
4009done:
4010	DTRACE_PROBE2(processed, int, processed, vdc_t *, vdcp);
4011
4012	return (rv);
4013}
4014
4015/*
4016 * Function:
4017 *	vdc_cancel_backup_dring
4018 *
4019 * Description:
4020 *	Cancel each descriptor in the backed up dring to vDisk server.
4021 *	The Dring was backed up during connection reset.
4022 *
4023 * Arguments:
4024 *	vdcp	- soft state pointer for this instance of the device driver.
4025 *
4026 * Return Code:
4027 *	None
4028 */
4029void
4030vdc_cancel_backup_dring(vdc_t *vdcp)
4031{
4032	vdc_local_desc_t *ldep;
4033	struct buf	*bufp;
4034	int		count;
4035	int		b_idx;
4036	int		dring_size;
4037	int		cancelled = 0;
4038
4039	ASSERT(MUTEX_HELD(&vdcp->lock));
4040	ASSERT(vdcp->state == VDC_STATE_FAILED);
4041
4042	if (vdcp->local_dring_backup == NULL) {
4043		/* the pending requests have already been processed */
4044		return;
4045	}
4046
4047	DMSG(vdcp, 1, "cancelling pending dring entries (len=%d, tail=%d)\n",
4048	    vdcp->local_dring_backup_len, vdcp->local_dring_backup_tail);
4049
4050	/*
4051	 * Walk the backup copy of the local descriptor ring and
4052	 * cancel all the outstanding transactions.
4053	 */
4054	b_idx = vdcp->local_dring_backup_tail;
4055	for (count = 0; count < vdcp->local_dring_backup_len; count++) {
4056
4057		ldep = &(vdcp->local_dring_backup[b_idx]);
4058
4059		/* only cancel outstanding transactions */
4060		if (!ldep->is_free) {
4061
4062			DMSG(vdcp, 1, "cancelling entry idx=%x\n", b_idx);
4063			cancelled++;
4064
4065			/*
4066			 * All requests have already been cleared from the
4067			 * local descriptor ring and the LDC channel has been
4068			 * reset so we will never get any reply for these
4069			 * requests. Now we just have to notify threads waiting
4070			 * for replies that the request has failed.
4071			 */
4072			bufp = ldep->buf;
4073			ASSERT(bufp != NULL);
4074			bufp->b_resid = bufp->b_bcount;
4075			if (ldep->operation == VD_OP_BREAD ||
4076			    ldep->operation == VD_OP_BWRITE) {
4077				VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4078				VD_KSTAT_WAITQ_EXIT(vdcp);
4079				DTRACE_IO1(done, buf_t *, bufp);
4080			}
4081			bioerror(bufp, EIO);
4082			biodone(bufp);
4083		}
4084
4085		/* get the next element to cancel */
4086		if (++b_idx >= vdcp->local_dring_backup_len)
4087			b_idx = 0;
4088	}
4089
4090	/* all done - now clear up pending dring copy */
4091	dring_size = vdcp->local_dring_backup_len *
4092	    sizeof (vdcp->local_dring_backup[0]);
4093
4094	(void) kmem_free(vdcp->local_dring_backup, dring_size);
4095
4096	vdcp->local_dring_backup = NULL;
4097
4098	DTRACE_PROBE2(cancelled, int, cancelled, vdc_t *, vdcp);
4099}
4100
4101/*
4102 * Function:
4103 *	vdc_connection_timeout
4104 *
4105 * Description:
4106 *	This function is invoked if the timeout set to establish the connection
4107 *	with vds expires. This will happen if we spend too much time in the
4108 *	VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or VDC_STATE_HANDLE_PENDING
4109 *	states.
4110 *
4111 * Arguments:
4112 *	arg	- argument of the timeout function actually a soft state
4113 *		  pointer for the instance of the device driver.
4114 *
4115 * Return Code:
4116 *	None
4117 */
4118void
4119vdc_connection_timeout(void *arg)
4120{
4121	vdc_t		*vdcp = (vdc_t *)arg;
4122
4123	mutex_enter(&vdcp->lock);
4124
4125	vdcp->ctimeout_reached = B_TRUE;
4126
4127	mutex_exit(&vdcp->lock);
4128}
4129
4130/*
4131 * Function:
4132 *	vdc_backup_local_dring()
4133 *
4134 * Description:
4135 *	Backup the current dring in the event of a reset. The Dring
4136 *	transactions will be resubmitted to the server when the
4137 *	connection is restored.
4138 *
4139 * Arguments:
4140 *	vdcp	- soft state pointer for this instance of the device driver.
4141 *
4142 * Return Code:
4143 *	NONE
4144 */
4145static void
4146vdc_backup_local_dring(vdc_t *vdcp)
4147{
4148	int b_idx, count, dring_size;
4149	vdc_local_desc_t *curr_ldep;
4150
4151	ASSERT(MUTEX_HELD(&vdcp->lock));
4152	ASSERT(vdcp->state == VDC_STATE_RESETTING);
4153
4154	/*
4155	 * If the backup dring is stil around, it means
4156	 * that the last restore did not complete. However,
4157	 * since we never got back into the running state,
4158	 * the backup copy we have is still valid.
4159	 */
4160	if (vdcp->local_dring_backup != NULL) {
4161		DMSG(vdcp, 1, "reusing local descriptor ring backup "
4162		    "(len=%d, tail=%d)\n", vdcp->local_dring_backup_len,
4163		    vdcp->local_dring_backup_tail);
4164		return;
4165	}
4166
4167	/*
4168	 * The backup dring can be NULL and the local dring may not be
4169	 * initialized. This can happen if we had a reset while establishing
4170	 * a new connection but after the connection has timed out. In that
4171	 * case the backup dring is NULL because the requests have been
4172	 * cancelled and the request occured before the local dring is
4173	 * initialized.
4174	 */
4175	if (!(vdcp->initialized & VDC_DRING_LOCAL))
4176		return;
4177
4178	DMSG(vdcp, 1, "backing up the local descriptor ring (len=%d, "
4179	    "tail=%d)\n", vdcp->dring_len, vdcp->dring_curr_idx);
4180
4181	dring_size = vdcp->dring_len * sizeof (vdcp->local_dring[0]);
4182
4183	vdcp->local_dring_backup = kmem_alloc(dring_size, KM_SLEEP);
4184	bcopy(vdcp->local_dring, vdcp->local_dring_backup, dring_size);
4185
4186	vdcp->local_dring_backup_tail = vdcp->dring_curr_idx;
4187	vdcp->local_dring_backup_len = vdcp->dring_len;
4188
4189	/*
4190	 * At this point, pending read or write I/Os are recorded in the
4191	 * runq. We update the I/O statistics to indicate that they are now
4192	 * back in the waitq.
4193	 */
4194	b_idx = vdcp->local_dring_backup_tail;
4195	for (count = 0; count < vdcp->local_dring_backup_len; count++) {
4196
4197		curr_ldep = &(vdcp->local_dring_backup[b_idx]);
4198
4199		if (!curr_ldep->is_free &&
4200		    (curr_ldep->operation == VD_OP_BREAD ||
4201		    curr_ldep->operation == VD_OP_BWRITE)) {
4202			VD_KSTAT_RUNQ_BACK_TO_WAITQ(vdcp);
4203		}
4204
4205		/* get the next element */
4206		if (++b_idx >= vdcp->local_dring_backup_len)
4207			b_idx = 0;
4208	}
4209
4210}
4211
4212static void
4213vdc_switch_server(vdc_t *vdcp)
4214{
4215	int		rv;
4216	vdc_server_t	*curr_server, *new_server;
4217
4218	ASSERT(MUTEX_HELD(&vdcp->lock));
4219
4220	/* if there is only one server return back */
4221	if (vdcp->num_servers == 1) {
4222		return;
4223	}
4224
4225	/* Get current and next server */
4226	curr_server = vdcp->curr_server;
4227	new_server =
4228	    (curr_server->next) ? curr_server->next : vdcp->server_list;
4229	ASSERT(curr_server != new_server);
4230
4231	/* bring current server's channel down */
4232	rv = ldc_down(curr_server->ldc_handle);
4233	if (rv) {
4234		DMSG(vdcp, 0, "[%d] Cannot bring channel down, port %ld\n",
4235		    vdcp->instance, curr_server->id);
4236		return;
4237	}
4238
4239	/* switch the server */
4240	vdcp->curr_server = new_server;
4241
4242	DMSG(vdcp, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n",
4243	    vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id);
4244}
4245
4246static void
4247vdc_print_svc_status(vdc_t *vdcp)
4248{
4249	int instance;
4250	uint64_t ldc_id, port_id;
4251	vdc_service_state_t svc_state;
4252
4253	ASSERT(mutex_owned(&vdcp->lock));
4254
4255	svc_state = vdcp->curr_server->svc_state;
4256
4257	if (vdcp->curr_server->log_state == svc_state)
4258		return;
4259
4260	instance = vdcp->instance;
4261	ldc_id = vdcp->curr_server->ldc_id;
4262	port_id = vdcp->curr_server->id;
4263
4264	switch (svc_state) {
4265
4266	case VDC_SERVICE_OFFLINE:
4267		cmn_err(CE_CONT, "?vdisk@%d is offline\n", instance);
4268		break;
4269
4270	case VDC_SERVICE_CONNECTED:
4271		cmn_err(CE_CONT, "?vdisk@%d is connected using ldc@%ld,%ld\n",
4272		    instance, ldc_id, port_id);
4273		break;
4274
4275	case VDC_SERVICE_ONLINE:
4276		cmn_err(CE_CONT, "?vdisk@%d is online using ldc@%ld,%ld\n",
4277		    instance, ldc_id, port_id);
4278		break;
4279
4280	case VDC_SERVICE_FAILED:
4281		cmn_err(CE_CONT, "?vdisk@%d access to service failed "
4282		    "using ldc@%ld,%ld\n", instance, ldc_id, port_id);
4283		break;
4284
4285	case VDC_SERVICE_FAULTED:
4286		cmn_err(CE_CONT, "?vdisk@%d access to backend failed "
4287		    "using ldc@%ld,%ld\n", instance, ldc_id, port_id);
4288		break;
4289
4290	default:
4291		ASSERT(0);
4292		break;
4293	}
4294
4295	vdcp->curr_server->log_state = svc_state;
4296}
4297
4298/*
4299 * Function:
4300 *	vdc_handshake_retry
4301 *
4302 * Description:
4303 *	This function indicates if the handshake should be retried or not.
4304 *	This depends on the lifecycle of the driver:
4305 *
4306 *	VDC_LC_ATTACHING: the handshake is retried until we have tried
4307 *	a handshake with each server. We don't care how far each handshake
4308 *	went, the goal is just to try the handshake. We want to minimize the
4309 *	the time spent doing the attach because this is locking the device
4310 *	tree.
4311 *
4312 *	VDC_LC_ONLINE_PENDING: the handshake is retried while we haven't done
4313 *	consecutive attribute negotiations with each server, and we haven't
4314 *	reached a minimum total of consecutive negotiations (hattr_min). The
4315 *	number of attribution negotiations determines the time spent before
4316 *	failing	pending I/Os if the handshake is not successful.
4317 *
4318 *	VDC_LC_ONLINE: the handshake is always retried, until we have a
4319 *	successful handshake with a server.
4320 *
4321 *	VDC_LC_DETACHING: N/A
4322 *
4323 * Arguments:
4324 *	hshake_cnt	- number of handshake attempts
4325 *	hattr_cnt	- number of attribute negotiation attempts
4326 *
4327 * Return Code:
4328 *	B_TRUE		- handshake should be retried
4329 *	B_FALSE		- handshake should not be retried
4330 */
4331static boolean_t
4332vdc_handshake_retry(vdc_t *vdcp, int hshake_cnt, int hattr_cnt)
4333{
4334	int		hattr_total = 0;
4335	vdc_server_t	*srvr;
4336
4337	ASSERT(vdcp->lifecycle != VDC_LC_DETACHING);
4338
4339	/* update handshake counters */
4340	vdcp->curr_server->hshake_cnt = hshake_cnt;
4341	vdcp->curr_server->hattr_cnt = hattr_cnt;
4342
4343	/*
4344	 * If no attribute negotiation was done then we reset the total
4345	 *  number otherwise we cumulate the number.
4346	 */
4347	if (hattr_cnt == 0)
4348		vdcp->curr_server->hattr_total = 0;
4349	else
4350		vdcp->curr_server->hattr_total += hattr_cnt;
4351
4352	/*
4353	 * If we are online (i.e. at least one handshake was successfully
4354	 * completed) then we always retry the handshake.
4355	 */
4356	if (vdcp->lifecycle == VDC_LC_ONLINE)
4357		return (B_TRUE);
4358
4359	/*
4360	 * If we are attaching then we retry the handshake only if we haven't
4361	 * tried with all servers.
4362	 */
4363	if (vdcp->lifecycle == VDC_LC_ATTACHING) {
4364
4365		for (srvr = vdcp->server_list; srvr != NULL;
4366		    srvr = srvr->next) {
4367			if (srvr->hshake_cnt == 0) {
4368				return (B_TRUE);
4369			}
4370		}
4371
4372		return (B_FALSE);
4373	}
4374
4375	/*
4376	 * Here we are in the case where we haven't completed any handshake
4377	 * successfully yet.
4378	 */
4379	ASSERT(vdcp->lifecycle == VDC_LC_ONLINE_PENDING);
4380
4381	/*
4382	 * We retry the handshake if we haven't done an attribute negotiation
4383	 * with each server. This is to handle the case where one service domain
4384	 * is down.
4385	 */
4386	for (srvr = vdcp->server_list; srvr != NULL; srvr = srvr->next) {
4387		if (srvr->hattr_cnt == 0) {
4388			return (B_TRUE);
4389		}
4390		hattr_total += srvr->hattr_total;
4391	}
4392
4393	/*
4394	 * We retry the handshake if we haven't reached the minimum number of
4395	 * attribute negotiation.
4396	 */
4397	return (hattr_total < vdcp->hattr_min);
4398}
4399
4400/* -------------------------------------------------------------------------- */
4401
4402/*
4403 * The following functions process the incoming messages from vds
4404 */
4405
4406/*
4407 * Function:
4408 *      vdc_process_msg_thread()
4409 *
4410 * Description:
4411 *
4412 *	Main VDC message processing thread. Each vDisk instance
4413 *	consists of a copy of this thread. This thread triggers
4414 *	all the handshakes and data exchange with the server. It
4415 *	also handles all channel resets
4416 *
4417 * Arguments:
4418 *      vdc     - soft state pointer for this instance of the device driver.
4419 *
4420 * Return Code:
4421 *      None
4422 */
4423static void
4424vdc_process_msg_thread(vdc_t *vdcp)
4425{
4426	boolean_t	failure_msg = B_FALSE;
4427	int		status;
4428	int		ctimeout;
4429	timeout_id_t	tmid = 0;
4430	clock_t		ldcup_timeout = 0;
4431	vdc_server_t	*srvr;
4432	vdc_service_state_t svc_state;
4433	int		hshake_cnt = 0;
4434	int		hattr_cnt = 0;
4435
4436	mutex_enter(&vdcp->lock);
4437
4438	ASSERT(vdcp->lifecycle == VDC_LC_ATTACHING);
4439
4440	for (;;) {
4441
4442#define	Q(_s)	(vdcp->state == _s) ? #_s :
4443		DMSG(vdcp, 3, "state = %d (%s)\n", vdcp->state,
4444		    Q(VDC_STATE_INIT)
4445		    Q(VDC_STATE_INIT_WAITING)
4446		    Q(VDC_STATE_NEGOTIATE)
4447		    Q(VDC_STATE_HANDLE_PENDING)
4448		    Q(VDC_STATE_FAULTED)
4449		    Q(VDC_STATE_FAILED)
4450		    Q(VDC_STATE_RUNNING)
4451		    Q(VDC_STATE_RESETTING)
4452		    Q(VDC_STATE_DETACH)
4453		    "UNKNOWN");
4454#undef Q
4455
4456		switch (vdcp->state) {
4457		case VDC_STATE_INIT:
4458
4459			/*
4460			 * If requested, start a timeout to check if the
4461			 * connection with vds is established in the
4462			 * specified delay. If the timeout expires, we
4463			 * will cancel any pending request.
4464			 *
4465			 * If some reset have occurred while establishing
4466			 * the connection, we already have a timeout armed
4467			 * and in that case we don't need to arm a new one.
4468			 *
4469			 * The same rule applies when there are multiple vds'.
4470			 * If either a connection cannot be established or
4471			 * the handshake times out, the connection thread will
4472			 * try another server. The 'ctimeout' will report
4473			 * back an error after it expires irrespective of
4474			 * whether the vdisk is trying to connect to just
4475			 * one or multiple servers.
4476			 */
4477			ctimeout = (vdc_timeout != 0)?
4478			    vdc_timeout : vdcp->curr_server->ctimeout;
4479
4480			if (ctimeout != 0 && tmid == 0) {
4481				tmid = timeout(vdc_connection_timeout, vdcp,
4482				    ctimeout * drv_usectohz(MICROSEC));
4483			}
4484
4485			/* Switch to STATE_DETACH if drv is detaching */
4486			if (vdcp->lifecycle == VDC_LC_DETACHING) {
4487				vdcp->state = VDC_STATE_DETACH;
4488				break;
4489			}
4490
4491			/* Check if the timeout has been reached */
4492			if (vdcp->ctimeout_reached) {
4493				ASSERT(tmid != 0);
4494				tmid = 0;
4495				vdcp->state = VDC_STATE_FAILED;
4496				break;
4497			}
4498
4499			/*
4500			 * Switch to another server when we reach the limit of
4501			 * the number of handshake per server or if we have done
4502			 * an attribute negotiation.
4503			 */
4504			if (hshake_cnt >= vdc_hshake_retries || hattr_cnt > 0) {
4505
4506				if (!vdc_handshake_retry(vdcp, hshake_cnt,
4507				    hattr_cnt)) {
4508					DMSG(vdcp, 0, "[%d] too many "
4509					    "handshakes", vdcp->instance);
4510					vdcp->state = VDC_STATE_FAILED;
4511					break;
4512				}
4513
4514				vdc_switch_server(vdcp);
4515
4516				hshake_cnt = 0;
4517				hattr_cnt = 0;
4518			}
4519
4520			hshake_cnt++;
4521
4522			/* Bring up connection with vds via LDC */
4523			status = vdc_start_ldc_connection(vdcp);
4524			if (status != EINVAL) {
4525				vdcp->state = VDC_STATE_INIT_WAITING;
4526			} else {
4527				vdcp->curr_server->svc_state =
4528				    VDC_SERVICE_FAILED;
4529				vdc_print_svc_status(vdcp);
4530			}
4531			break;
4532
4533		case VDC_STATE_INIT_WAITING:
4534
4535			/* if channel is UP, start negotiation */
4536			if (vdcp->curr_server->ldc_state == LDC_UP) {
4537				vdcp->state = VDC_STATE_NEGOTIATE;
4538				break;
4539			}
4540
4541			/*
4542			 * Wait for LDC_UP. If it times out and we have multiple
4543			 * servers then we will retry using a different server.
4544			 */
4545			ldcup_timeout = ddi_get_lbolt() + (vdc_ldcup_timeout *
4546			    drv_usectohz(MICROSEC));
4547			status = cv_timedwait(&vdcp->initwait_cv, &vdcp->lock,
4548			    ldcup_timeout);
4549			if (status == -1 &&
4550			    vdcp->state == VDC_STATE_INIT_WAITING &&
4551			    vdcp->curr_server->ldc_state != LDC_UP) {
4552				/* timed out & still waiting */
4553				vdcp->curr_server->svc_state =
4554				    VDC_SERVICE_FAILED;
4555				vdc_print_svc_status(vdcp);
4556				vdcp->state = VDC_STATE_INIT;
4557				break;
4558			}
4559
4560			if (vdcp->state != VDC_STATE_INIT_WAITING) {
4561				DMSG(vdcp, 0,
4562				    "state moved to %d out from under us...\n",
4563				    vdcp->state);
4564			}
4565			break;
4566
4567		case VDC_STATE_NEGOTIATE:
4568			switch (status = vdc_ver_negotiation(vdcp)) {
4569			case 0:
4570				break;
4571			default:
4572				DMSG(vdcp, 0, "ver negotiate failed (%d)..\n",
4573				    status);
4574				goto reset;
4575			}
4576
4577			hattr_cnt++;
4578
4579			switch (status = vdc_attr_negotiation(vdcp)) {
4580			case 0:
4581				break;
4582			default:
4583				DMSG(vdcp, 0, "attr negotiate failed (%d)..\n",
4584				    status);
4585				goto reset;
4586			}
4587
4588			switch (status = vdc_dring_negotiation(vdcp)) {
4589			case 0:
4590				break;
4591			default:
4592				DMSG(vdcp, 0, "dring negotiate failed (%d)..\n",
4593				    status);
4594				goto reset;
4595			}
4596
4597			switch (status = vdc_rdx_exchange(vdcp)) {
4598			case 0:
4599				vdcp->state = VDC_STATE_HANDLE_PENDING;
4600				goto done;
4601			default:
4602				DMSG(vdcp, 0, "RDX xchg failed ..(%d)\n",
4603				    status);
4604				goto reset;
4605			}
4606reset:
4607			DMSG(vdcp, 0, "negotiation failed: resetting (%d)\n",
4608			    status);
4609			vdcp->state = VDC_STATE_RESETTING;
4610			vdcp->self_reset = B_TRUE;
4611			vdcp->curr_server->svc_state = VDC_SERVICE_FAILED;
4612			vdc_print_svc_status(vdcp);
4613done:
4614			DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n",
4615			    vdcp->state);
4616			break;
4617
4618		case VDC_STATE_HANDLE_PENDING:
4619
4620			DMSG(vdcp, 0, "[%d] connection to service domain is up",
4621			    vdcp->instance);
4622			vdcp->curr_server->svc_state = VDC_SERVICE_CONNECTED;
4623
4624			mutex_exit(&vdcp->lock);
4625
4626			/*
4627			 * If we have multiple servers, check that the backend
4628			 * is effectively available before resubmitting any IO.
4629			 */
4630			if (vdcp->num_servers > 1 &&
4631			    vdc_eio_check(vdcp, 0) != 0) {
4632				mutex_enter(&vdcp->lock);
4633				vdcp->curr_server->svc_state =
4634				    VDC_SERVICE_FAULTED;
4635				vdcp->state = VDC_STATE_FAULTED;
4636				break;
4637			}
4638
4639			if (tmid != 0) {
4640				(void) untimeout(tmid);
4641				tmid = 0;
4642				vdcp->ctimeout_reached = B_FALSE;
4643			}
4644
4645			/*
4646			 * Setup devid
4647			 */
4648			(void) vdc_setup_devid(vdcp);
4649
4650			status = vdc_resubmit_backup_dring(vdcp);
4651
4652			mutex_enter(&vdcp->lock);
4653
4654			if (status) {
4655				vdcp->state = VDC_STATE_RESETTING;
4656				vdcp->self_reset = B_TRUE;
4657				vdcp->curr_server->svc_state =
4658				    VDC_SERVICE_FAILED;
4659				vdc_print_svc_status(vdcp);
4660			} else {
4661				vdcp->state = VDC_STATE_RUNNING;
4662			}
4663			break;
4664
4665		case VDC_STATE_FAULTED:
4666			/*
4667			 * Server is faulted because the backend is unavailable.
4668			 * If all servers are faulted then we mark the service
4669			 * as failed, otherwise we reset to switch to another
4670			 * server.
4671			 */
4672			vdc_print_svc_status(vdcp);
4673
4674			/* check if all servers are faulted */
4675			for (srvr = vdcp->server_list; srvr != NULL;
4676			    srvr = srvr->next) {
4677				svc_state = srvr->svc_state;
4678				if (svc_state != VDC_SERVICE_FAULTED)
4679					break;
4680			}
4681
4682			if (srvr != NULL) {
4683				vdcp->state = VDC_STATE_RESETTING;
4684				vdcp->self_reset = B_TRUE;
4685			} else {
4686				vdcp->state = VDC_STATE_FAILED;
4687			}
4688			break;
4689
4690		case VDC_STATE_FAILED:
4691			/*
4692			 * We reach this state when we are unable to access the
4693			 * backend from any server, either because of a maximum
4694			 * connection retries or timeout, or because the backend
4695			 * is unavailable.
4696			 *
4697			 * Then we cancel the backup DRing so that errors get
4698			 * reported and we wait for a new I/O before attempting
4699			 * another connection.
4700			 */
4701
4702			cmn_err(CE_NOTE, "vdisk@%d disk access failed",
4703			    vdcp->instance);
4704			failure_msg = B_TRUE;
4705
4706			if (vdcp->lifecycle == VDC_LC_ATTACHING) {
4707				vdcp->lifecycle = VDC_LC_ONLINE_PENDING;
4708				vdcp->hattr_min = vdc_hattr_min_initial;
4709			} else {
4710				vdcp->hattr_min = vdc_hattr_min;
4711			}
4712
4713			/* cancel any timeout */
4714			if (tmid != 0) {
4715				(void) untimeout(tmid);
4716				tmid = 0;
4717			}
4718
4719			/* cancel pending I/Os */
4720			cv_broadcast(&vdcp->running_cv);
4721			vdc_cancel_backup_dring(vdcp);
4722
4723			/* wait for new I/O */
4724			while (!vdcp->io_pending)
4725				cv_wait(&vdcp->io_pending_cv, &vdcp->lock);
4726
4727			/*
4728			 * There's a new IO pending. Try to re-establish a
4729			 * connection. Mark all services as offline, so that
4730			 * we don't stop again before having retried all
4731			 * servers.
4732			 */
4733			for (srvr = vdcp->server_list; srvr != NULL;
4734			    srvr = srvr->next) {
4735				srvr->svc_state = VDC_SERVICE_OFFLINE;
4736				srvr->hshake_cnt = 0;
4737				srvr->hattr_cnt = 0;
4738				srvr->hattr_total = 0;
4739			}
4740
4741			/* reset variables */
4742			hshake_cnt = 0;
4743			hattr_cnt = 0;
4744			vdcp->ctimeout_reached = B_FALSE;
4745
4746			vdcp->state = VDC_STATE_RESETTING;
4747			vdcp->self_reset = B_TRUE;
4748			break;
4749
4750		/* enter running state */
4751		case VDC_STATE_RUNNING:
4752
4753			if (vdcp->lifecycle == VDC_LC_DETACHING) {
4754				vdcp->state = VDC_STATE_DETACH;
4755				break;
4756			}
4757
4758			vdcp->lifecycle = VDC_LC_ONLINE;
4759
4760			if (failure_msg) {
4761				cmn_err(CE_NOTE, "vdisk@%d disk access "
4762				    "recovered", vdcp->instance);
4763				failure_msg = B_FALSE;
4764			}
4765
4766			/*
4767			 * Signal anyone waiting for the connection
4768			 * to come on line.
4769			 */
4770			cv_broadcast(&vdcp->running_cv);
4771
4772			/* backend has to be checked after reset */
4773			if (vdcp->failfast_interval != 0 ||
4774			    vdcp->num_servers > 1)
4775				cv_signal(&vdcp->eio_cv);
4776
4777			/* ownership is lost during reset */
4778			if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
4779				vdcp->ownership |= VDC_OWNERSHIP_RESET;
4780			cv_signal(&vdcp->ownership_cv);
4781
4782			vdcp->curr_server->svc_state = VDC_SERVICE_ONLINE;
4783			vdc_print_svc_status(vdcp);
4784
4785			mutex_exit(&vdcp->lock);
4786
4787			for (;;) {
4788				vio_msg_t msg;
4789				status = vdc_wait_for_response(vdcp, &msg);
4790				if (status) break;
4791
4792				DMSG(vdcp, 1, "[%d] new pkt(s) available\n",
4793				    vdcp->instance);
4794				status = vdc_process_data_msg(vdcp, &msg);
4795				if (status) {
4796					DMSG(vdcp, 1, "[%d] process_data_msg "
4797					    "returned err=%d\n", vdcp->instance,
4798					    status);
4799					break;
4800				}
4801
4802			}
4803
4804			mutex_enter(&vdcp->lock);
4805
4806			/* all servers are now offline */
4807			for (srvr = vdcp->server_list; srvr != NULL;
4808			    srvr = srvr->next) {
4809				srvr->svc_state = VDC_SERVICE_OFFLINE;
4810				srvr->log_state = VDC_SERVICE_NONE;
4811				srvr->hshake_cnt = 0;
4812				srvr->hattr_cnt = 0;
4813				srvr->hattr_total = 0;
4814			}
4815
4816			hshake_cnt = 0;
4817			hattr_cnt = 0;
4818
4819			vdc_print_svc_status(vdcp);
4820
4821			vdcp->state = VDC_STATE_RESETTING;
4822			vdcp->self_reset = B_TRUE;
4823			break;
4824
4825		case VDC_STATE_RESETTING:
4826			/*
4827			 * When we reach this state, we either come from the
4828			 * VDC_STATE_RUNNING state and we can have pending
4829			 * request but no timeout is armed; or we come from
4830			 * the VDC_STATE_INIT_WAITING, VDC_NEGOTIATE or
4831			 * VDC_HANDLE_PENDING state and there is no pending
4832			 * request or pending requests have already been copied
4833			 * into the backup dring. So we can safely keep the
4834			 * connection timeout armed while we are in this state.
4835			 */
4836
4837			DMSG(vdcp, 0, "Initiating channel reset "
4838			    "(pending = %d)\n", (int)vdcp->threads_pending);
4839
4840			if (vdcp->self_reset) {
4841				DMSG(vdcp, 0,
4842				    "[%d] calling stop_ldc_connection.\n",
4843				    vdcp->instance);
4844				status = vdc_stop_ldc_connection(vdcp);
4845				vdcp->self_reset = B_FALSE;
4846			}
4847
4848			/*
4849			 * Wait for all threads currently waiting
4850			 * for a free dring entry to use.
4851			 */
4852			while (vdcp->threads_pending) {
4853				cv_broadcast(&vdcp->membind_cv);
4854				cv_broadcast(&vdcp->dring_free_cv);
4855				mutex_exit(&vdcp->lock);
4856				/* give the waiters enough time to wake up */
4857				delay(vdc_hz_min_ldc_delay);
4858				mutex_enter(&vdcp->lock);
4859			}
4860
4861			ASSERT(vdcp->threads_pending == 0);
4862
4863			/* Sanity check that no thread is receiving */
4864			ASSERT(vdcp->read_state != VDC_READ_WAITING);
4865
4866			vdcp->read_state = VDC_READ_IDLE;
4867			vdcp->io_pending = B_FALSE;
4868
4869			/*
4870			 * Cleanup any pending eio. These I/Os are going to
4871			 * be resubmitted.
4872			 */
4873			vdc_eio_unqueue(vdcp, 0, B_FALSE);
4874
4875			vdc_backup_local_dring(vdcp);
4876
4877			/* cleanup the old d-ring */
4878			vdc_destroy_descriptor_ring(vdcp);
4879
4880			/* go and start again */
4881			vdcp->state = VDC_STATE_INIT;
4882
4883			break;
4884
4885		case VDC_STATE_DETACH:
4886			DMSG(vdcp, 0, "[%d] Reset thread exit cleanup ..\n",
4887			    vdcp->instance);
4888
4889			/* cancel any pending timeout */
4890			mutex_exit(&vdcp->lock);
4891			if (tmid != 0) {
4892				(void) untimeout(tmid);
4893				tmid = 0;
4894			}
4895			mutex_enter(&vdcp->lock);
4896
4897			/*
4898			 * Signal anyone waiting for connection
4899			 * to come online
4900			 */
4901			cv_broadcast(&vdcp->running_cv);
4902
4903			while (vdcp->sync_op_cnt > 0) {
4904				cv_broadcast(&vdcp->sync_blocked_cv);
4905				mutex_exit(&vdcp->lock);
4906				/* give the waiters enough time to wake up */
4907				delay(vdc_hz_min_ldc_delay);
4908				mutex_enter(&vdcp->lock);
4909			}
4910
4911			mutex_exit(&vdcp->lock);
4912
4913			DMSG(vdcp, 0, "[%d] Msg processing thread exiting ..\n",
4914			    vdcp->instance);
4915			thread_exit();
4916			break;
4917		}
4918	}
4919}
4920
4921
4922/*
4923 * Function:
4924 *	vdc_process_data_msg()
4925 *
4926 * Description:
4927 *	This function is called by the message processing thread each time
4928 *	a message with a msgtype of VIO_TYPE_DATA is received. It will either
4929 *	be an ACK or NACK from vds[1] which vdc handles as follows.
4930 *		ACK	- wake up the waiting thread
4931 *		NACK	- resend any messages necessary
4932 *
4933 *	[1] Although the message format allows it, vds should not send a
4934 *	    VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
4935 *	    some bizarre reason it does, vdc will reset the connection.
4936 *
4937 * Arguments:
4938 *	vdc	- soft state pointer for this instance of the device driver.
4939 *	msg	- the LDC message sent by vds
4940 *
4941 * Return Code:
4942 *	0	- Success.
4943 *	> 0	- error value returned by LDC
4944 */
4945static int
4946vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
4947{
4948	int			status = 0;
4949	vio_dring_msg_t		*dring_msg;
4950	vdc_local_desc_t	*ldep = NULL;
4951	int			start, end;
4952	int			idx;
4953	int			op;
4954
4955	dring_msg = (vio_dring_msg_t *)msg;
4956
4957	ASSERT(msg->tag.vio_msgtype == VIO_TYPE_DATA);
4958	ASSERT(vdcp != NULL);
4959
4960	mutex_enter(&vdcp->lock);
4961
4962	/*
4963	 * Check to see if the message has bogus data
4964	 */
4965	idx = start = dring_msg->start_idx;
4966	end = dring_msg->end_idx;
4967	if ((start >= vdcp->dring_len) ||
4968	    (end >= vdcp->dring_len) || (end < -1)) {
4969		/*
4970		 * Update the I/O statistics to indicate that an error ocurred.
4971		 * No need to update the wait/run queues as no specific read or
4972		 * write request is being completed in response to this 'msg'.
4973		 */
4974		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4975		DMSG(vdcp, 0, "[%d] Bogus ACK data : start %d, end %d\n",
4976		    vdcp->instance, start, end);
4977		mutex_exit(&vdcp->lock);
4978		return (EINVAL);
4979	}
4980
4981	/*
4982	 * Verify that the sequence number is what vdc expects.
4983	 */
4984	switch (vdc_verify_seq_num(vdcp, dring_msg)) {
4985	case VDC_SEQ_NUM_TODO:
4986		break;	/* keep processing this message */
4987	case VDC_SEQ_NUM_SKIP:
4988		mutex_exit(&vdcp->lock);
4989		return (0);
4990	case VDC_SEQ_NUM_INVALID:
4991		/*
4992		 * Update the I/O statistics to indicate that an error ocurred.
4993		 * No need to update the wait/run queues as no specific read or
4994		 * write request is being completed in response to this 'msg'.
4995		 */
4996		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
4997		DMSG(vdcp, 0, "[%d] invalid seqno\n", vdcp->instance);
4998		mutex_exit(&vdcp->lock);
4999		return (ENXIO);
5000	}
5001
5002	if (msg->tag.vio_subtype == VIO_SUBTYPE_NACK) {
5003		/*
5004		 * Update the I/O statistics to indicate that an error ocurred.
5005		 * No need to update the wait/run queues, this will be done by
5006		 * the thread calling this function.
5007		 */
5008		VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
5009		VDC_DUMP_DRING_MSG(dring_msg);
5010		DMSG(vdcp, 0, "[%d] DATA NACK\n", vdcp->instance);
5011		mutex_exit(&vdcp->lock);
5012		return (EIO);
5013
5014	} else if (msg->tag.vio_subtype == VIO_SUBTYPE_INFO) {
5015		/*
5016		 * Update the I/O statistics to indicate that an error occurred.
5017		 * No need to update the wait/run queues as no specific read or
5018		 * write request is being completed in response to this 'msg'.
5019		 */
5020		VD_UPDATE_ERR_STATS(vdcp, vd_protoerrs);
5021		mutex_exit(&vdcp->lock);
5022		return (EPROTO);
5023	}
5024
5025	DMSG(vdcp, 1, ": start %d end %d\n", start, end);
5026	ASSERT(start == end);
5027
5028	ldep = &vdcp->local_dring[idx];
5029
5030	DMSG(vdcp, 1, ": state 0x%x\n", ldep->dep->hdr.dstate);
5031
5032	if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
5033		struct buf *bufp;
5034
5035		status = ldep->dep->payload.status;
5036
5037		bufp = ldep->buf;
5038		ASSERT(bufp != NULL);
5039
5040		bufp->b_resid = bufp->b_bcount - ldep->dep->payload.nbytes;
5041		bioerror(bufp, status);
5042
5043		if (status != 0) {
5044			DMSG(vdcp, 1, "I/O status=%d\n", status);
5045		}
5046
5047		DMSG(vdcp, 1,
5048		    "I/O complete req=%ld bytes resp=%ld bytes\n",
5049		    bufp->b_bcount, ldep->dep->payload.nbytes);
5050
5051		/*
5052		 * If the request has failed and we have multiple servers or
5053		 * failfast is enabled then we will have to defer the completion
5054		 * of the request until we have checked that the vdisk backend
5055		 * is effectively available (if multiple server) or that there
5056		 * is no reservation conflict (if failfast).
5057		 */
5058		if (status != 0 &&
5059		    ((vdcp->num_servers > 1 &&
5060		    (ldep->flags & VDC_OP_ERRCHK_BACKEND)) ||
5061		    (vdcp->failfast_interval != 0 &&
5062		    (ldep->flags & VDC_OP_ERRCHK_CONFLICT)))) {
5063			/*
5064			 * The I/O has failed and we need to check the error.
5065			 */
5066			(void) vdc_eio_queue(vdcp, idx);
5067		} else {
5068			op = ldep->operation;
5069			if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
5070				if (status == 0) {
5071					VD_UPDATE_IO_STATS(vdcp, op,
5072					    ldep->dep->payload.nbytes);
5073				} else {
5074					VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
5075				}
5076				VD_KSTAT_RUNQ_EXIT(vdcp);
5077				DTRACE_IO1(done, buf_t *, bufp);
5078			}
5079			(void) vdc_depopulate_descriptor(vdcp, idx);
5080			biodone(bufp);
5081		}
5082	}
5083
5084	/* let the arrival signal propogate */
5085	mutex_exit(&vdcp->lock);
5086
5087	/* probe gives the count of how many entries were processed */
5088	DTRACE_PROBE2(processed, int, 1, vdc_t *, vdcp);
5089
5090	return (0);
5091}
5092
5093
5094/*
5095 * Function:
5096 *	vdc_handle_ver_msg()
5097 *
5098 * Description:
5099 *
5100 * Arguments:
5101 *	vdc	- soft state pointer for this instance of the device driver.
5102 *	ver_msg	- LDC message sent by vDisk server
5103 *
5104 * Return Code:
5105 *	0	- Success
5106 */
5107static int
5108vdc_handle_ver_msg(vdc_t *vdc, vio_ver_msg_t *ver_msg)
5109{
5110	int status = 0;
5111
5112	ASSERT(vdc != NULL);
5113	ASSERT(mutex_owned(&vdc->lock));
5114
5115	if (ver_msg->tag.vio_subtype_env != VIO_VER_INFO) {
5116		return (EPROTO);
5117	}
5118
5119	if (ver_msg->dev_class != VDEV_DISK_SERVER) {
5120		return (EINVAL);
5121	}
5122
5123	switch (ver_msg->tag.vio_subtype) {
5124	case VIO_SUBTYPE_ACK:
5125		/*
5126		 * We check to see if the version returned is indeed supported
5127		 * (The server may have also adjusted the minor number downwards
5128		 * and if so 'ver_msg' will contain the actual version agreed)
5129		 */
5130		if (vdc_is_supported_version(ver_msg)) {
5131			vdc->ver.major = ver_msg->ver_major;
5132			vdc->ver.minor = ver_msg->ver_minor;
5133			ASSERT(vdc->ver.major > 0);
5134		} else {
5135			status = EPROTO;
5136		}
5137		break;
5138
5139	case VIO_SUBTYPE_NACK:
5140		/*
5141		 * call vdc_is_supported_version() which will return the next
5142		 * supported version (if any) in 'ver_msg'
5143		 */
5144		(void) vdc_is_supported_version(ver_msg);
5145		if (ver_msg->ver_major > 0) {
5146			size_t len = sizeof (*ver_msg);
5147
5148			ASSERT(vdc->ver.major > 0);
5149
5150			/* reset the necessary fields and resend */
5151			ver_msg->tag.vio_subtype = VIO_SUBTYPE_INFO;
5152			ver_msg->dev_class = VDEV_DISK;
5153
5154			status = vdc_send(vdc, (caddr_t)ver_msg, &len);
5155			DMSG(vdc, 0, "[%d] Resend VER info (LDC status = %d)\n",
5156			    vdc->instance, status);
5157			if (len != sizeof (*ver_msg))
5158				status = EBADMSG;
5159		} else {
5160			DMSG(vdc, 0, "[%d] No common version with vDisk server",
5161			    vdc->instance);
5162			status = ENOTSUP;
5163		}
5164
5165		break;
5166	case VIO_SUBTYPE_INFO:
5167		/*
5168		 * Handle the case where vds starts handshake
5169		 * (for now only vdc is the instigator)
5170		 */
5171		status = ENOTSUP;
5172		break;
5173
5174	default:
5175		status = EINVAL;
5176		break;
5177	}
5178
5179	return (status);
5180}
5181
5182/*
5183 * Function:
5184 *	vdc_handle_attr_msg()
5185 *
5186 * Description:
5187 *
5188 * Arguments:
5189 *	vdc	- soft state pointer for this instance of the device driver.
5190 *	attr_msg	- LDC message sent by vDisk server
5191 *
5192 * Return Code:
5193 *	0	- Success
5194 */
5195static int
5196vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
5197{
5198	int status = 0;
5199	vd_disk_type_t old_type;
5200
5201	ASSERT(vdc != NULL);
5202	ASSERT(mutex_owned(&vdc->lock));
5203
5204	if (attr_msg->tag.vio_subtype_env != VIO_ATTR_INFO) {
5205		return (EPROTO);
5206	}
5207
5208	switch (attr_msg->tag.vio_subtype) {
5209	case VIO_SUBTYPE_ACK:
5210		/*
5211		 * We now verify the attributes sent by vds.
5212		 */
5213		if (attr_msg->vdisk_size == 0) {
5214			DMSG(vdc, 0, "[%d] Invalid disk size from vds",
5215			    vdc->instance);
5216			status = EINVAL;
5217			break;
5218		}
5219
5220		if (attr_msg->max_xfer_sz == 0) {
5221			DMSG(vdc, 0, "[%d] Invalid transfer size from vds",
5222			    vdc->instance);
5223			status = EINVAL;
5224			break;
5225		}
5226
5227		if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) {
5228			DMSG(vdc, 0, "[%d] Unknown disk size from vds",
5229			    vdc->instance);
5230			attr_msg->vdisk_size = 0;
5231		}
5232
5233		/* update the VIO block size */
5234		if (attr_msg->vdisk_block_size > 0 &&
5235		    vdc_update_vio_bsize(vdc,
5236		    attr_msg->vdisk_block_size) != 0) {
5237			DMSG(vdc, 0, "[%d] Invalid block size (%u) from vds",
5238			    vdc->instance, attr_msg->vdisk_block_size);
5239			status = EINVAL;
5240			break;
5241		}
5242
5243		/* update disk, block and transfer sizes */
5244		old_type = vdc->vdisk_type;
5245		vdc_update_size(vdc, attr_msg->vdisk_size,
5246		    attr_msg->vdisk_block_size, attr_msg->max_xfer_sz);
5247		vdc->vdisk_type = attr_msg->vdisk_type;
5248		vdc->operations = attr_msg->operations;
5249		if (vio_ver_is_supported(vdc->ver, 1, 1))
5250			vdc->vdisk_media = attr_msg->vdisk_media;
5251		else
5252			vdc->vdisk_media = 0;
5253
5254		DMSG(vdc, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n",
5255		    vdc->instance, vdc->max_xfer_sz, attr_msg->max_xfer_sz);
5256		DMSG(vdc, 0, "[%d] vdisk_block_size: sent %lx acked %x\n",
5257		    vdc->instance, vdc->vdisk_bsize,
5258		    attr_msg->vdisk_block_size);
5259
5260		if ((attr_msg->xfer_mode != VIO_DRING_MODE_V1_0) ||
5261		    (attr_msg->vdisk_size > INT64_MAX) ||
5262		    (attr_msg->operations == 0) ||
5263		    (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) {
5264			DMSG(vdc, 0, "[%d] Invalid attributes from vds",
5265			    vdc->instance);
5266			status = EINVAL;
5267			break;
5268		}
5269
5270		/*
5271		 * Now that we have received all attributes we can create a
5272		 * fake geometry for the disk.
5273		 */
5274		vdc_create_fake_geometry(vdc);
5275
5276		/*
5277		 * If the disk type was previously unknown and device nodes
5278		 * were created then the driver would have created 8 device
5279		 * nodes. If we now find out that this is a single-slice disk
5280		 * then we need to re-create the appropriate device nodes.
5281		 */
5282		if (old_type == VD_DISK_TYPE_UNK &&
5283		    (vdc->initialized & VDC_MINOR) &&
5284		    vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
5285			ddi_remove_minor_node(vdc->dip, NULL);
5286			(void) devfs_clean(ddi_get_parent(vdc->dip),
5287			    NULL, DV_CLEAN_FORCE);
5288			if (vdc_create_device_nodes(vdc) != 0) {
5289				DMSG(vdc, 0, "![%d] Failed to update "
5290				    "device nodes", vdc->instance);
5291			}
5292		}
5293
5294		break;
5295
5296	case VIO_SUBTYPE_NACK:
5297		/*
5298		 * vds could not handle the attributes we sent so we
5299		 * stop negotiating.
5300		 */
5301		status = EPROTO;
5302		break;
5303
5304	case VIO_SUBTYPE_INFO:
5305		/*
5306		 * Handle the case where vds starts the handshake
5307		 * (for now; vdc is the only supported instigatior)
5308		 */
5309		status = ENOTSUP;
5310		break;
5311
5312	default:
5313		status = ENOTSUP;
5314		break;
5315	}
5316
5317	return (status);
5318}
5319
5320/*
5321 * Function:
5322 *	vdc_handle_dring_reg_msg()
5323 *
5324 * Description:
5325 *
5326 * Arguments:
5327 *	vdc		- soft state pointer for this instance of the driver.
5328 *	dring_msg	- LDC message sent by vDisk server
5329 *
5330 * Return Code:
5331 *	0	- Success
5332 */
5333static int
5334vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *dring_msg)
5335{
5336	int		status = 0;
5337
5338	ASSERT(vdc != NULL);
5339	ASSERT(mutex_owned(&vdc->lock));
5340
5341	if (dring_msg->tag.vio_subtype_env != VIO_DRING_REG) {
5342		return (EPROTO);
5343	}
5344
5345	switch (dring_msg->tag.vio_subtype) {
5346	case VIO_SUBTYPE_ACK:
5347		/* save the received dring_ident */
5348		vdc->dring_ident = dring_msg->dring_ident;
5349		DMSG(vdc, 0, "[%d] Received dring ident=0x%lx\n",
5350		    vdc->instance, vdc->dring_ident);
5351		break;
5352
5353	case VIO_SUBTYPE_NACK:
5354		/*
5355		 * vds could not handle the DRing info we sent so we
5356		 * stop negotiating.
5357		 */
5358		DMSG(vdc, 0, "[%d] server could not register DRing\n",
5359		    vdc->instance);
5360		status = EPROTO;
5361		break;
5362
5363	case VIO_SUBTYPE_INFO:
5364		/*
5365		 * Handle the case where vds starts handshake
5366		 * (for now only vdc is the instigatior)
5367		 */
5368		status = ENOTSUP;
5369		break;
5370	default:
5371		status = ENOTSUP;
5372	}
5373
5374	return (status);
5375}
5376
5377/*
5378 * Function:
5379 *	vdc_verify_seq_num()
5380 *
5381 * Description:
5382 *	This functions verifies that the sequence number sent back by the vDisk
5383 *	server with the latest message is what is expected (i.e. it is greater
5384 *	than the last seq num sent by the vDisk server and less than or equal
5385 *	to the last seq num generated by vdc).
5386 *
5387 *	It then checks the request ID to see if any requests need processing
5388 *	in the DRing.
5389 *
5390 * Arguments:
5391 *	vdc		- soft state pointer for this instance of the driver.
5392 *	dring_msg	- pointer to the LDC message sent by vds
5393 *
5394 * Return Code:
5395 *	VDC_SEQ_NUM_TODO	- Message needs to be processed
5396 *	VDC_SEQ_NUM_SKIP	- Message has already been processed
5397 *	VDC_SEQ_NUM_INVALID	- The seq numbers are so out of sync,
5398 *				  vdc cannot deal with them
5399 */
5400static int
5401vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg)
5402{
5403	ASSERT(vdc != NULL);
5404	ASSERT(dring_msg != NULL);
5405	ASSERT(mutex_owned(&vdc->lock));
5406
5407	/*
5408	 * Check to see if the messages were responded to in the correct
5409	 * order by vds.
5410	 */
5411	if ((dring_msg->seq_num <= vdc->seq_num_reply) ||
5412	    (dring_msg->seq_num > vdc->seq_num)) {
5413		DMSG(vdc, 0, "?[%d] Bogus sequence_number %lu: "
5414		    "%lu > expected <= %lu (last proc req %lu sent %lu)\n",
5415		    vdc->instance, dring_msg->seq_num,
5416		    vdc->seq_num_reply, vdc->seq_num,
5417		    vdc->req_id_proc, vdc->req_id);
5418		return (VDC_SEQ_NUM_INVALID);
5419	}
5420	vdc->seq_num_reply = dring_msg->seq_num;
5421
5422	if (vdc->req_id_proc < vdc->req_id)
5423		return (VDC_SEQ_NUM_TODO);
5424	else
5425		return (VDC_SEQ_NUM_SKIP);
5426}
5427
5428
5429/*
5430 * Function:
5431 *	vdc_is_supported_version()
5432 *
5433 * Description:
5434 *	This routine checks if the major/minor version numbers specified in
5435 *	'ver_msg' are supported. If not it finds the next version that is
5436 *	in the supported version list 'vdc_version[]' and sets the fields in
5437 *	'ver_msg' to those values
5438 *
5439 * Arguments:
5440 *	ver_msg	- LDC message sent by vDisk server
5441 *
5442 * Return Code:
5443 *	B_TRUE	- Success
5444 *	B_FALSE	- Version not supported
5445 */
5446static boolean_t
5447vdc_is_supported_version(vio_ver_msg_t *ver_msg)
5448{
5449	int vdc_num_versions = sizeof (vdc_version) / sizeof (vdc_version[0]);
5450
5451	for (int i = 0; i < vdc_num_versions; i++) {
5452		ASSERT(vdc_version[i].major > 0);
5453		ASSERT((i == 0) ||
5454		    (vdc_version[i].major < vdc_version[i-1].major));
5455
5456		/*
5457		 * If the major versions match, adjust the minor version, if
5458		 * necessary, down to the highest value supported by this
5459		 * client. The server should support all minor versions lower
5460		 * than the value it sent
5461		 */
5462		if (ver_msg->ver_major == vdc_version[i].major) {
5463			if (ver_msg->ver_minor > vdc_version[i].minor) {
5464				DMSGX(0,
5465				    "Adjusting minor version from %u to %u",
5466				    ver_msg->ver_minor, vdc_version[i].minor);
5467				ver_msg->ver_minor = vdc_version[i].minor;
5468			}
5469			return (B_TRUE);
5470		}
5471
5472		/*
5473		 * If the message contains a higher major version number, set
5474		 * the message's major/minor versions to the current values
5475		 * and return false, so this message will get resent with
5476		 * these values, and the server will potentially try again
5477		 * with the same or a lower version
5478		 */
5479		if (ver_msg->ver_major > vdc_version[i].major) {
5480			ver_msg->ver_major = vdc_version[i].major;
5481			ver_msg->ver_minor = vdc_version[i].minor;
5482			DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n",
5483			    ver_msg->ver_major, ver_msg->ver_minor);
5484
5485			return (B_FALSE);
5486		}
5487
5488		/*
5489		 * Otherwise, the message's major version is less than the
5490		 * current major version, so continue the loop to the next
5491		 * (lower) supported version
5492		 */
5493	}
5494
5495	/*
5496	 * No common version was found; "ground" the version pair in the
5497	 * message to terminate negotiation
5498	 */
5499	ver_msg->ver_major = 0;
5500	ver_msg->ver_minor = 0;
5501
5502	return (B_FALSE);
5503}
5504/* -------------------------------------------------------------------------- */
5505
5506/*
5507 * DKIO(7) support
5508 */
5509
5510typedef struct vdc_dk_arg {
5511	struct dk_callback	dkc;
5512	int			mode;
5513	dev_t			dev;
5514	vdc_t			*vdc;
5515} vdc_dk_arg_t;
5516
5517/*
5518 * Function:
5519 *	vdc_dkio_flush_cb()
5520 *
5521 * Description:
5522 *	This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
5523 *	by kernel code.
5524 *
5525 * Arguments:
5526 *	arg	- a pointer to a vdc_dk_arg_t structure.
5527 */
5528void
5529vdc_dkio_flush_cb(void *arg)
5530{
5531	struct vdc_dk_arg	*dk_arg = (struct vdc_dk_arg *)arg;
5532	struct dk_callback	*dkc = NULL;
5533	vdc_t			*vdc = NULL;
5534	int			rv;
5535
5536	if (dk_arg == NULL) {
5537		cmn_err(CE_NOTE, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n");
5538		return;
5539	}
5540	dkc = &dk_arg->dkc;
5541	vdc = dk_arg->vdc;
5542	ASSERT(vdc != NULL);
5543
5544	rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
5545	    VDCPART(dk_arg->dev), 0, VIO_both_dir, B_TRUE);
5546	if (rv != 0) {
5547		DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
5548		    vdc->instance, rv,
5549		    ddi_model_convert_from(dk_arg->mode & FMODELS));
5550	}
5551
5552	/*
5553	 * Trigger the call back to notify the caller the the ioctl call has
5554	 * been completed.
5555	 */
5556	if ((dk_arg->mode & FKIOCTL) &&
5557	    (dkc != NULL) &&
5558	    (dkc->dkc_callback != NULL)) {
5559		ASSERT(dkc->dkc_cookie != NULL);
5560		(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
5561	}
5562
5563	/* Indicate that one less DKIO write flush is outstanding */
5564	mutex_enter(&vdc->lock);
5565	vdc->dkio_flush_pending--;
5566	ASSERT(vdc->dkio_flush_pending >= 0);
5567	mutex_exit(&vdc->lock);
5568
5569	/* free the mem that was allocated when the callback was dispatched */
5570	kmem_free(arg, sizeof (vdc_dk_arg_t));
5571}
5572
5573/*
5574 * Function:
5575 *	vdc_dkio_gapart()
5576 *
5577 * Description:
5578 *	This function implements the DKIOCGAPART ioctl.
5579 *
5580 * Arguments:
5581 *	vdc	- soft state pointer
5582 *	arg	- a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure
5583 *	flag	- ioctl flags
5584 */
5585static int
5586vdc_dkio_gapart(vdc_t *vdc, caddr_t arg, int flag)
5587{
5588	struct dk_geom *geom;
5589	struct extvtoc *vtoc;
5590	union {
5591		struct dk_map map[NDKMAP];
5592		struct dk_map32 map32[NDKMAP];
5593	} data;
5594	int i, rv, size;
5595
5596	mutex_enter(&vdc->lock);
5597
5598	if ((rv = vdc_validate_geometry(vdc)) != 0) {
5599		mutex_exit(&vdc->lock);
5600		return (rv);
5601	}
5602
5603	if (vdc->vdisk_size > VD_OLDVTOC_LIMIT) {
5604		mutex_exit(&vdc->lock);
5605		return (EOVERFLOW);
5606	}
5607
5608	vtoc = vdc->vtoc;
5609	geom = vdc->geom;
5610
5611	if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) {
5612
5613		for (i = 0; i < vtoc->v_nparts; i++) {
5614			data.map32[i].dkl_cylno = vtoc->v_part[i].p_start /
5615			    (geom->dkg_nhead * geom->dkg_nsect);
5616			data.map32[i].dkl_nblk = vtoc->v_part[i].p_size;
5617		}
5618		size = NDKMAP * sizeof (struct dk_map32);
5619
5620	} else {
5621
5622		for (i = 0; i < vtoc->v_nparts; i++) {
5623			data.map[i].dkl_cylno = vtoc->v_part[i].p_start /
5624			    (geom->dkg_nhead * geom->dkg_nsect);
5625			data.map[i].dkl_nblk = vtoc->v_part[i].p_size;
5626		}
5627		size = NDKMAP * sizeof (struct dk_map);
5628
5629	}
5630
5631	mutex_exit(&vdc->lock);
5632
5633	if (ddi_copyout(&data, arg, size, flag) != 0)
5634		return (EFAULT);
5635
5636	return (0);
5637}
5638
5639/*
5640 * Function:
5641 *	vdc_dkio_partition()
5642 *
5643 * Description:
5644 *	This function implements the DKIOCPARTITION ioctl.
5645 *
5646 * Arguments:
5647 *	vdc	- soft state pointer
5648 *	arg	- a pointer to a struct partition64 structure
5649 *	flag	- ioctl flags
5650 */
5651static int
5652vdc_dkio_partition(vdc_t *vdc, caddr_t arg, int flag)
5653{
5654	struct partition64 p64;
5655	efi_gpt_t *gpt;
5656	efi_gpe_t *gpe;
5657	vd_efi_dev_t edev;
5658	uint_t partno;
5659	int rv;
5660
5661	if (ddi_copyin(arg, &p64, sizeof (struct partition64), flag)) {
5662		return (EFAULT);
5663	}
5664
5665	VDC_EFI_DEV_SET(edev, vdc, vd_process_efi_ioctl);
5666
5667	if ((rv = vd_efi_alloc_and_read(&edev, &gpt, &gpe)) != 0) {
5668		return (rv);
5669	}
5670
5671	partno = p64.p_partno;
5672
5673	if (partno >= gpt->efi_gpt_NumberOfPartitionEntries) {
5674		vd_efi_free(&edev, gpt, gpe);
5675		return (ESRCH);
5676	}
5677
5678	bcopy(&gpe[partno].efi_gpe_PartitionTypeGUID, &p64.p_type,
5679	    sizeof (struct uuid));
5680	p64.p_start = gpe[partno].efi_gpe_StartingLBA;
5681	p64.p_size = gpe[partno].efi_gpe_EndingLBA - p64.p_start + 1;
5682
5683	if (ddi_copyout(&p64, arg, sizeof (struct partition64), flag)) {
5684		vd_efi_free(&edev, gpt, gpe);
5685		return (EFAULT);
5686	}
5687
5688	vd_efi_free(&edev, gpt, gpe);
5689	return (0);
5690}
5691
5692/*
5693 * Function:
5694 *	vdc_dioctl_rwcmd()
5695 *
5696 * Description:
5697 *	This function implements the DIOCTL_RWCMD ioctl. This ioctl is used
5698 *	for DKC_DIRECT disks to read or write at an absolute disk offset.
5699 *
5700 * Arguments:
5701 *	dev	- device
5702 *	arg	- a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure
5703 *	flag	- ioctl flags
5704 */
5705static int
5706vdc_dioctl_rwcmd(vdc_t *vdc, caddr_t arg, int flag)
5707{
5708	struct dadkio_rwcmd32 rwcmd32;
5709	struct dadkio_rwcmd rwcmd;
5710	struct iovec aiov;
5711	struct uio auio;
5712	int rw, status;
5713	struct buf *buf;
5714
5715	if (ddi_model_convert_from(flag & FMODELS) == DDI_MODEL_ILP32) {
5716		if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd32,
5717		    sizeof (struct dadkio_rwcmd32), flag)) {
5718			return (EFAULT);
5719		}
5720		rwcmd.cmd = rwcmd32.cmd;
5721		rwcmd.flags = rwcmd32.flags;
5722		rwcmd.blkaddr = (daddr_t)rwcmd32.blkaddr;
5723		rwcmd.buflen = rwcmd32.buflen;
5724		rwcmd.bufaddr = (caddr_t)(uintptr_t)rwcmd32.bufaddr;
5725	} else {
5726		if (ddi_copyin((caddr_t)arg, (caddr_t)&rwcmd,
5727		    sizeof (struct dadkio_rwcmd), flag)) {
5728			return (EFAULT);
5729		}
5730	}
5731
5732	switch (rwcmd.cmd) {
5733	case DADKIO_RWCMD_READ:
5734		rw = B_READ;
5735		break;
5736	case DADKIO_RWCMD_WRITE:
5737		rw = B_WRITE;
5738		break;
5739	default:
5740		return (EINVAL);
5741	}
5742
5743	bzero((caddr_t)&aiov, sizeof (struct iovec));
5744	aiov.iov_base   = rwcmd.bufaddr;
5745	aiov.iov_len    = rwcmd.buflen;
5746
5747	bzero((caddr_t)&auio, sizeof (struct uio));
5748	auio.uio_iov    = &aiov;
5749	auio.uio_iovcnt = 1;
5750	auio.uio_loffset = rwcmd.blkaddr * vdc->vdisk_bsize;
5751	auio.uio_resid  = rwcmd.buflen;
5752	auio.uio_segflg = flag & FKIOCTL ? UIO_SYSSPACE : UIO_USERSPACE;
5753
5754	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
5755	bioinit(buf);
5756	/*
5757	 * We use the private field of buf to specify that this is an
5758	 * I/O using an absolute offset.
5759	 */
5760	buf->b_private = (void *)VD_SLICE_NONE;
5761
5762	status = physio(vdc_strategy, buf, VD_MAKE_DEV(vdc->instance, 0),
5763	    rw, vdc_min, &auio);
5764
5765	biofini(buf);
5766	kmem_free(buf, sizeof (buf_t));
5767
5768	return (status);
5769}
5770
5771/*
5772 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated
5773 * buffer is returned in alloc_len.
5774 */
5775static vd_scsi_t *
5776vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len,
5777    int *alloc_len)
5778{
5779	vd_scsi_t *vd_scsi;
5780	int vd_scsi_len = VD_SCSI_SIZE;
5781
5782	vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t));
5783	vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t));
5784	vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t));
5785	vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t));
5786
5787	ASSERT(vd_scsi_len % sizeof (uint64_t) == 0);
5788
5789	vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP);
5790
5791	vd_scsi->cdb_len = cdb_len;
5792	vd_scsi->sense_len = sense_len;
5793	vd_scsi->datain_len = datain_len;
5794	vd_scsi->dataout_len = dataout_len;
5795
5796	*alloc_len = vd_scsi_len;
5797
5798	return (vd_scsi);
5799}
5800
5801/*
5802 * Convert the status of a SCSI command to a Solaris return code.
5803 *
5804 * Arguments:
5805 *	vd_scsi		- The SCSI operation buffer.
5806 *	log_error	- indicate if an error message should be logged.
5807 *
5808 * Note that our SCSI error messages are rather primitive for the moment
5809 * and could be improved by decoding some data like the SCSI command and
5810 * the sense key.
5811 *
5812 * Return value:
5813 *	0		- Status is good.
5814 *	EACCES		- Status reports a reservation conflict.
5815 *	ENOTSUP		- Status reports a check condition and sense key
5816 *			  reports an illegal request.
5817 *	EIO		- Any other status.
5818 */
5819static int
5820vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error)
5821{
5822	int rv;
5823	char path_str[MAXPATHLEN];
5824	char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN];
5825	union scsi_cdb *cdb;
5826	struct scsi_extended_sense *sense;
5827
5828	if (vd_scsi->cmd_status == STATUS_GOOD)
5829		/* no error */
5830		return (0);
5831
5832	/* when the tunable vdc_scsi_log_error is true we log all errors */
5833	if (vdc_scsi_log_error)
5834		log_error = B_TRUE;
5835
5836	if (log_error) {
5837		cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x)\n",
5838		    ddi_pathname(vdc->dip, path_str), vdc->instance,
5839		    GETCMD(VD_SCSI_DATA_CDB(vd_scsi)));
5840	}
5841
5842	/* default returned value */
5843	rv = EIO;
5844
5845	switch (vd_scsi->cmd_status) {
5846
5847	case STATUS_CHECK:
5848	case STATUS_TERMINATED:
5849		if (log_error)
5850			cmn_err(CE_CONT, "\tCheck Condition Error\n");
5851
5852		/* check sense buffer */
5853		if (vd_scsi->sense_len == 0 ||
5854		    vd_scsi->sense_status != STATUS_GOOD) {
5855			if (log_error)
5856				cmn_err(CE_CONT, "\tNo Sense Data Available\n");
5857			break;
5858		}
5859
5860		sense = VD_SCSI_DATA_SENSE(vd_scsi);
5861
5862		if (log_error) {
5863			cmn_err(CE_CONT, "\tSense Key:  0x%x\n"
5864			    "\tASC: 0x%x, ASCQ: 0x%x\n",
5865			    scsi_sense_key((uint8_t *)sense),
5866			    scsi_sense_asc((uint8_t *)sense),
5867			    scsi_sense_ascq((uint8_t *)sense));
5868		}
5869
5870		if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST)
5871			rv = ENOTSUP;
5872		break;
5873
5874	case STATUS_BUSY:
5875		if (log_error)
5876			cmn_err(CE_NOTE, "\tDevice Busy\n");
5877		break;
5878
5879	case STATUS_RESERVATION_CONFLICT:
5880		/*
5881		 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then
5882		 * reservation conflict could be due to various reasons like
5883		 * incorrect keys, not registered or not reserved etc. So,
5884		 * we should not panic in that case.
5885		 */
5886		cdb = VD_SCSI_DATA_CDB(vd_scsi);
5887		if (vdc->failfast_interval != 0 &&
5888		    cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN &&
5889		    cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) {
5890			/* failfast is enabled so we have to panic */
5891			(void) snprintf(panic_str, sizeof (panic_str),
5892			    VDC_RESV_CONFLICT_FMT_STR "%s",
5893			    ddi_pathname(vdc->dip, path_str));
5894			panic(panic_str);
5895		}
5896		if (log_error)
5897			cmn_err(CE_NOTE, "\tReservation Conflict\n");
5898		rv = EACCES;
5899		break;
5900
5901	case STATUS_QFULL:
5902		if (log_error)
5903			cmn_err(CE_NOTE, "\tQueue Full\n");
5904		break;
5905
5906	case STATUS_MET:
5907	case STATUS_INTERMEDIATE:
5908	case STATUS_SCSI2:
5909	case STATUS_INTERMEDIATE_MET:
5910	case STATUS_ACA_ACTIVE:
5911		if (log_error)
5912			cmn_err(CE_CONT,
5913			    "\tUnexpected SCSI status received: 0x%x\n",
5914			    vd_scsi->cmd_status);
5915		break;
5916
5917	default:
5918		if (log_error)
5919			cmn_err(CE_CONT,
5920			    "\tInvalid SCSI status received: 0x%x\n",
5921			    vd_scsi->cmd_status);
5922		break;
5923	}
5924
5925	return (rv);
5926}
5927
5928/*
5929 * Implemented the USCSICMD uscsi(7I) ioctl. This ioctl is converted to
5930 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI
5931 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is
5932 * converted to a VD_OP_RESET operation.
5933 */
5934static int
5935vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode)
5936{
5937	struct uscsi_cmd	uscsi;
5938	struct uscsi_cmd32	uscsi32;
5939	vd_scsi_t		*vd_scsi;
5940	int			vd_scsi_len;
5941	union scsi_cdb		*cdb;
5942	struct scsi_extended_sense *sense;
5943	char			*datain, *dataout;
5944	size_t			cdb_len, datain_len, dataout_len, sense_len;
5945	int			rv;
5946
5947	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
5948		if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32),
5949		    mode) != 0)
5950			return (EFAULT);
5951		uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi));
5952	} else {
5953		if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd),
5954		    mode) != 0)
5955			return (EFAULT);
5956	}
5957
5958	/* a uscsi reset is converted to a VD_OP_RESET operation */
5959	if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN |
5960	    USCSI_RESET_ALL)) {
5961		rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0,
5962		    VIO_both_dir, B_TRUE);
5963		return (rv);
5964	}
5965
5966	/* cdb buffer length */
5967	cdb_len = uscsi.uscsi_cdblen;
5968
5969	/* data in and out buffers length */
5970	if (uscsi.uscsi_flags & USCSI_READ) {
5971		datain_len = uscsi.uscsi_buflen;
5972		dataout_len = 0;
5973	} else {
5974		datain_len = 0;
5975		dataout_len = uscsi.uscsi_buflen;
5976	}
5977
5978	/* sense buffer length */
5979	if (uscsi.uscsi_flags & USCSI_RQENABLE)
5980		sense_len = uscsi.uscsi_rqlen;
5981	else
5982		sense_len = 0;
5983
5984	/* allocate buffer for the VD_SCSICMD_OP operation */
5985	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
5986	    &vd_scsi_len);
5987
5988	/*
5989	 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague,
5990	 * but basically they prevent a SCSI command from being retried in case
5991	 * of an error.
5992	 */
5993	if ((uscsi.uscsi_flags & USCSI_ISOLATE) ||
5994	    (uscsi.uscsi_flags & USCSI_DIAGNOSE))
5995		vd_scsi->options |= VD_SCSI_OPT_NORETRY;
5996
5997	/* set task attribute */
5998	if (uscsi.uscsi_flags & USCSI_NOTAG) {
5999		vd_scsi->task_attribute = 0;
6000	} else {
6001		if (uscsi.uscsi_flags & USCSI_HEAD)
6002			vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
6003		else if (uscsi.uscsi_flags & USCSI_HTAG)
6004			vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE;
6005		else if (uscsi.uscsi_flags & USCSI_OTAG)
6006			vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED;
6007		else
6008			vd_scsi->task_attribute = 0;
6009	}
6010
6011	/* set timeout */
6012	vd_scsi->timeout = uscsi.uscsi_timeout;
6013
6014	/* copy-in cdb data */
6015	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6016	if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) {
6017		rv = EFAULT;
6018		goto done;
6019	}
6020
6021	/* keep a pointer to the sense buffer */
6022	sense = VD_SCSI_DATA_SENSE(vd_scsi);
6023
6024	/* keep a pointer to the data-in buffer */
6025	datain = (char *)VD_SCSI_DATA_IN(vd_scsi);
6026
6027	/* copy-in request data to the data-out buffer */
6028	dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi);
6029	if (!(uscsi.uscsi_flags & USCSI_READ)) {
6030		if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len,
6031		    mode)) {
6032			rv = EFAULT;
6033			goto done;
6034		}
6035	}
6036
6037	/* submit the request */
6038	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6039	    0, 0, VIO_both_dir, B_FALSE);
6040
6041	if (rv != 0)
6042		goto done;
6043
6044	/* update scsi status */
6045	uscsi.uscsi_status = vd_scsi->cmd_status;
6046
6047	/* update sense data */
6048	if ((uscsi.uscsi_flags & USCSI_RQENABLE) &&
6049	    (uscsi.uscsi_status == STATUS_CHECK ||
6050	    uscsi.uscsi_status == STATUS_TERMINATED)) {
6051
6052		uscsi.uscsi_rqstatus = vd_scsi->sense_status;
6053
6054		if (uscsi.uscsi_rqstatus == STATUS_GOOD) {
6055			uscsi.uscsi_rqresid = uscsi.uscsi_rqlen -
6056			    vd_scsi->sense_len;
6057			if (ddi_copyout(sense, uscsi.uscsi_rqbuf,
6058			    vd_scsi->sense_len, mode) != 0) {
6059				rv = EFAULT;
6060				goto done;
6061			}
6062		}
6063	}
6064
6065	/* update request data */
6066	if (uscsi.uscsi_status == STATUS_GOOD) {
6067		if (uscsi.uscsi_flags & USCSI_READ) {
6068			uscsi.uscsi_resid = uscsi.uscsi_buflen -
6069			    vd_scsi->datain_len;
6070			if (ddi_copyout(datain, uscsi.uscsi_bufaddr,
6071			    vd_scsi->datain_len, mode) != 0) {
6072				rv = EFAULT;
6073				goto done;
6074			}
6075		} else {
6076			uscsi.uscsi_resid = uscsi.uscsi_buflen -
6077			    vd_scsi->dataout_len;
6078		}
6079	}
6080
6081	/* copy-out result */
6082	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
6083		uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32));
6084		if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32),
6085		    mode) != 0) {
6086			rv = EFAULT;
6087			goto done;
6088		}
6089	} else {
6090		if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd),
6091		    mode) != 0) {
6092			rv = EFAULT;
6093			goto done;
6094		}
6095	}
6096
6097	/* get the return code from the SCSI command status */
6098	rv = vdc_scsi_status(vdc, vd_scsi,
6099	    !(uscsi.uscsi_flags & USCSI_SILENT));
6100
6101done:
6102	kmem_free(vd_scsi, vd_scsi_len);
6103	return (rv);
6104}
6105
6106/*
6107 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command.
6108 *
6109 * Arguments:
6110 *	cmd		- SCSI PERSISTENT IN command
6111 *	len		- length of the SCSI input buffer
6112 *	vd_scsi_len	- return the length of the allocated buffer
6113 *
6114 * Returned Value:
6115 *	a pointer to the allocated VD_OP_SCSICMD buffer.
6116 */
6117static vd_scsi_t *
6118vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len)
6119{
6120	int cdb_len, sense_len, datain_len, dataout_len;
6121	vd_scsi_t *vd_scsi;
6122	union scsi_cdb *cdb;
6123
6124	cdb_len = CDB_GROUP1;
6125	sense_len = sizeof (struct scsi_extended_sense);
6126	datain_len = len;
6127	dataout_len = 0;
6128
6129	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
6130	    vd_scsi_len);
6131
6132	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6133
6134	/* set cdb */
6135	cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN;
6136	cdb->cdb_opaque[1] = cmd;
6137	FORMG1COUNT(cdb, datain_len);
6138
6139	vd_scsi->timeout = vdc_scsi_timeout;
6140
6141	return (vd_scsi);
6142}
6143
6144/*
6145 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command.
6146 *
6147 * Arguments:
6148 *	cmd		- SCSI PERSISTENT OUT command
6149 *	len		- length of the SCSI output buffer
6150 *	vd_scsi_len	- return the length of the allocated buffer
6151 *
6152 * Returned Code:
6153 *	a pointer to the allocated VD_OP_SCSICMD buffer.
6154 */
6155static vd_scsi_t *
6156vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len)
6157{
6158	int cdb_len, sense_len, datain_len, dataout_len;
6159	vd_scsi_t *vd_scsi;
6160	union scsi_cdb *cdb;
6161
6162	cdb_len = CDB_GROUP1;
6163	sense_len = sizeof (struct scsi_extended_sense);
6164	datain_len = 0;
6165	dataout_len = len;
6166
6167	vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len,
6168	    vd_scsi_len);
6169
6170	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6171
6172	/* set cdb */
6173	cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT;
6174	cdb->cdb_opaque[1] = cmd;
6175	FORMG1COUNT(cdb, dataout_len);
6176
6177	vd_scsi->timeout = vdc_scsi_timeout;
6178
6179	return (vd_scsi);
6180}
6181
6182/*
6183 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted
6184 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk
6185 * server with a VD_OP_SCSICMD operation.
6186 */
6187static int
6188vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode)
6189{
6190	vd_scsi_t *vd_scsi;
6191	mhioc_inkeys_t inkeys;
6192	mhioc_key_list_t klist;
6193	struct mhioc_inkeys32 inkeys32;
6194	struct mhioc_key_list32 klist32;
6195	sd_prin_readkeys_t *scsi_keys;
6196	void *user_keys;
6197	int vd_scsi_len;
6198	int listsize, listlen, rv;
6199
6200	/* copyin arguments */
6201	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
6202		rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode);
6203		if (rv != 0)
6204			return (EFAULT);
6205
6206		rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32,
6207		    sizeof (klist32), mode);
6208		if (rv != 0)
6209			return (EFAULT);
6210
6211		listsize = klist32.listsize;
6212	} else {
6213		rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode);
6214		if (rv != 0)
6215			return (EFAULT);
6216
6217		rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode);
6218		if (rv != 0)
6219			return (EFAULT);
6220
6221		listsize = klist.listsize;
6222	}
6223
6224	/* build SCSI VD_OP request */
6225	vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS,
6226	    sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) +
6227	    (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len);
6228
6229	scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi);
6230
6231	/* submit the request */
6232	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6233	    0, 0, VIO_both_dir, B_FALSE);
6234
6235	if (rv != 0)
6236		goto done;
6237
6238	listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE;
6239
6240	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
6241		inkeys32.generation = scsi_keys->generation;
6242		rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode);
6243		if (rv != 0) {
6244			rv = EFAULT;
6245			goto done;
6246		}
6247
6248		klist32.listlen = listlen;
6249		rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li,
6250		    sizeof (klist32), mode);
6251		if (rv != 0) {
6252			rv = EFAULT;
6253			goto done;
6254		}
6255
6256		user_keys = (caddr_t)(uintptr_t)klist32.list;
6257	} else {
6258		inkeys.generation = scsi_keys->generation;
6259		rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode);
6260		if (rv != 0) {
6261			rv = EFAULT;
6262			goto done;
6263		}
6264
6265		klist.listlen = listlen;
6266		rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode);
6267		if (rv != 0) {
6268			rv = EFAULT;
6269			goto done;
6270		}
6271
6272		user_keys = klist.list;
6273	}
6274
6275	/* copy out keys */
6276	if (listlen > 0 && listsize > 0) {
6277		if (listsize < listlen)
6278			listlen = listsize;
6279		rv = ddi_copyout(&scsi_keys->keylist, user_keys,
6280		    listlen * MHIOC_RESV_KEY_SIZE, mode);
6281		if (rv != 0)
6282			rv = EFAULT;
6283	}
6284
6285	if (rv == 0)
6286		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6287
6288done:
6289	kmem_free(vd_scsi, vd_scsi_len);
6290
6291	return (rv);
6292}
6293
6294/*
6295 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted
6296 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to
6297 * the vdisk server with a VD_OP_SCSICMD operation.
6298 */
6299static int
6300vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode)
6301{
6302	vd_scsi_t *vd_scsi;
6303	mhioc_inresvs_t inresv;
6304	mhioc_resv_desc_list_t rlist;
6305	struct mhioc_inresvs32 inresv32;
6306	struct mhioc_resv_desc_list32 rlist32;
6307	mhioc_resv_desc_t mhd_resv;
6308	sd_prin_readresv_t *scsi_resv;
6309	sd_readresv_desc_t *resv;
6310	mhioc_resv_desc_t *user_resv;
6311	int vd_scsi_len;
6312	int listsize, listlen, i, rv;
6313
6314	/* copyin arguments */
6315	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
6316		rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode);
6317		if (rv != 0)
6318			return (EFAULT);
6319
6320		rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32,
6321		    sizeof (rlist32), mode);
6322		if (rv != 0)
6323			return (EFAULT);
6324
6325		listsize = rlist32.listsize;
6326	} else {
6327		rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode);
6328		if (rv != 0)
6329			return (EFAULT);
6330
6331		rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode);
6332		if (rv != 0)
6333			return (EFAULT);
6334
6335		listsize = rlist.listsize;
6336	}
6337
6338	/* build SCSI VD_OP request */
6339	vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV,
6340	    sizeof (sd_prin_readresv_t) - sizeof (caddr_t) +
6341	    (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len);
6342
6343	scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi);
6344
6345	/* submit the request */
6346	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6347	    0, 0, VIO_both_dir, B_FALSE);
6348
6349	if (rv != 0)
6350		goto done;
6351
6352	listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN;
6353
6354	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
6355		inresv32.generation = scsi_resv->generation;
6356		rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode);
6357		if (rv != 0) {
6358			rv = EFAULT;
6359			goto done;
6360		}
6361
6362		rlist32.listlen = listlen;
6363		rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li,
6364		    sizeof (rlist32), mode);
6365		if (rv != 0) {
6366			rv = EFAULT;
6367			goto done;
6368		}
6369
6370		user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list;
6371	} else {
6372		inresv.generation = scsi_resv->generation;
6373		rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode);
6374		if (rv != 0) {
6375			rv = EFAULT;
6376			goto done;
6377		}
6378
6379		rlist.listlen = listlen;
6380		rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode);
6381		if (rv != 0) {
6382			rv = EFAULT;
6383			goto done;
6384		}
6385
6386		user_resv = rlist.list;
6387	}
6388
6389	/* copy out reservations */
6390	if (listsize > 0 && listlen > 0) {
6391		if (listsize < listlen)
6392			listlen = listsize;
6393		resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc;
6394
6395		for (i = 0; i < listlen; i++) {
6396			mhd_resv.type = resv->type;
6397			mhd_resv.scope = resv->scope;
6398			mhd_resv.scope_specific_addr =
6399			    BE_32(resv->scope_specific_addr);
6400			bcopy(&resv->resvkey, &mhd_resv.key,
6401			    MHIOC_RESV_KEY_SIZE);
6402
6403			rv = ddi_copyout(&mhd_resv, user_resv,
6404			    sizeof (mhd_resv), mode);
6405			if (rv != 0) {
6406				rv = EFAULT;
6407				goto done;
6408			}
6409			resv++;
6410			user_resv++;
6411		}
6412	}
6413
6414	if (rv == 0)
6415		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6416
6417done:
6418	kmem_free(vd_scsi, vd_scsi_len);
6419	return (rv);
6420}
6421
6422/*
6423 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted
6424 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk
6425 * server with a VD_OP_SCSICMD operation.
6426 */
6427static int
6428vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode)
6429{
6430	vd_scsi_t *vd_scsi;
6431	sd_prout_t *scsi_prout;
6432	mhioc_register_t mhd_reg;
6433	int vd_scsi_len, rv;
6434
6435	/* copyin arguments */
6436	rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode);
6437	if (rv != 0)
6438		return (EFAULT);
6439
6440	/* build SCSI VD_OP request */
6441	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER,
6442	    sizeof (sd_prout_t), &vd_scsi_len);
6443
6444	/* set parameters */
6445	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
6446	bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE);
6447	bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE);
6448	scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl;
6449
6450	/* submit the request */
6451	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6452	    0, 0, VIO_both_dir, B_FALSE);
6453
6454	if (rv == 0)
6455		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6456
6457	kmem_free(vd_scsi, vd_scsi_len);
6458	return (rv);
6459}
6460
6461/*
6462 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted
6463 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk
6464 * server with a VD_OP_SCSICMD operation.
6465 */
6466static int
6467vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode)
6468{
6469	union scsi_cdb *cdb;
6470	vd_scsi_t *vd_scsi;
6471	sd_prout_t *scsi_prout;
6472	mhioc_resv_desc_t mhd_resv;
6473	int vd_scsi_len, rv;
6474
6475	/* copyin arguments */
6476	rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode);
6477	if (rv != 0)
6478		return (EFAULT);
6479
6480	/* build SCSI VD_OP request */
6481	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE,
6482	    sizeof (sd_prout_t), &vd_scsi_len);
6483
6484	/* set parameters */
6485	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6486	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
6487	bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE);
6488	scsi_prout->scope_address = mhd_resv.scope_specific_addr;
6489	cdb->cdb_opaque[2] = mhd_resv.type;
6490
6491	/* submit the request */
6492	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6493	    0, 0, VIO_both_dir, B_FALSE);
6494
6495	if (rv == 0)
6496		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6497
6498	kmem_free(vd_scsi, vd_scsi_len);
6499	return (rv);
6500}
6501
6502/*
6503 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is
6504 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which
6505 * is sent to the vdisk server with a VD_OP_SCSICMD operation.
6506 */
6507static int
6508vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode)
6509{
6510	union scsi_cdb *cdb;
6511	vd_scsi_t *vd_scsi;
6512	sd_prout_t *scsi_prout;
6513	mhioc_preemptandabort_t mhd_preempt;
6514	int vd_scsi_len, rv;
6515
6516	/* copyin arguments */
6517	rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode);
6518	if (rv != 0)
6519		return (EFAULT);
6520
6521	/* build SCSI VD_OP request */
6522	vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT,
6523	    sizeof (sd_prout_t), &vd_scsi_len);
6524
6525	/* set parameters */
6526	vd_scsi->task_attribute = VD_SCSI_TASK_ACA;
6527	cdb = VD_SCSI_DATA_CDB(vd_scsi);
6528	scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi);
6529	bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key,
6530	    MHIOC_RESV_KEY_SIZE);
6531	bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key,
6532	    MHIOC_RESV_KEY_SIZE);
6533	scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr;
6534	cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type;
6535
6536	/* submit the request */
6537	rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
6538	    0, 0, VIO_both_dir, B_FALSE);
6539
6540	if (rv == 0)
6541		rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
6542
6543	kmem_free(vd_scsi, vd_scsi_len);
6544	return (rv);
6545}
6546
6547/*
6548 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl
6549 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY
6550 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation.
6551 */
6552static int
6553vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode)
6554{
6555	vd_scsi_t *vd_scsi;
6556	sd_prout_t *scsi_prout;
6557	mhioc_registerandignorekey_t mhd_regi;
6558	int vd_scsi_len, rv;
6559
6560	/* copyin arguments */
6561	rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode);
6562	if (rv != 0)
6563		return (EFAULT);
6564
6565	/* build SCSI VD_OP request */
6566	vd_scsi = vdc_scsi_alloc_persistent_out(