1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2014 Nexenta Systems Inc. All rights reserved.
24 * Copyright (c) 2018, Joyent, Inc.
25 */
26
27/*
28 * Multipath driver interface (MDI) implementation; see mdi_impldefs.h for a
29 * more detailed discussion of the overall mpxio architecture.
30 *
31 * Default locking order:
32 *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
37 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
38 * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
39 * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
40 */
41
42#include <sys/note.h>
43#include <sys/types.h>
44#include <sys/varargs.h>
45#include <sys/param.h>
46#include <sys/errno.h>
47#include <sys/uio.h>
48#include <sys/buf.h>
49#include <sys/modctl.h>
50#include <sys/open.h>
51#include <sys/kmem.h>
52#include <sys/poll.h>
53#include <sys/conf.h>
54#include <sys/bootconf.h>
55#include <sys/cmn_err.h>
56#include <sys/stat.h>
57#include <sys/ddi.h>
58#include <sys/sunddi.h>
59#include <sys/ddipropdefs.h>
60#include <sys/sunndi.h>
61#include <sys/ndi_impldefs.h>
62#include <sys/promif.h>
63#include <sys/sunmdi.h>
64#include <sys/mdi_impldefs.h>
65#include <sys/taskq.h>
66#include <sys/epm.h>
67#include <sys/sunpm.h>
68#include <sys/modhash.h>
69#include <sys/disp.h>
70#include <sys/autoconf.h>
71#include <sys/sysmacros.h>
72
#ifdef	DEBUG
#include <sys/debug.h>
int	mdi_debug = 1;		/* log messages whose level is <= this */
int	mdi_debug_logonly = 0;
/*
 * MDI_DEBUG(level, (MDI_WARN or MDI_NOTE or MDI_CONT, dip, fmt, ...));
 *
 * 'pargs' is a complete, parenthesized argument list for i_mdi_log(); the
 * MDI_WARN/MDI_NOTE/MDI_CONT shorthands expand to the cmn_err level followed
 * by the name of the calling function.  The expansion is wrapped in
 * do { } while (0) so that the macro is always exactly one statement: a bare
 * "if" would silently capture the "else" of an enclosing unbraced if/else.
 */
#define	MDI_DEBUG(dbglevel, pargs)			\
	do {						\
		if (mdi_debug >= (dbglevel))		\
			i_mdi_log pargs;		\
		_NOTE(CONSTCOND)			\
	} while (0)
#define	MDI_WARN	CE_WARN, __func__
#define	MDI_NOTE	CE_NOTE, __func__
#define	MDI_CONT	CE_CONT, __func__
static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
#else	/* !DEBUG */
/* Debug logging compiles away entirely in non-DEBUG builds. */
#define	MDI_DEBUG(dbglevel, pargs)
#endif	/* DEBUG */
int	mdi_debug_consoleonly = 0;
/* Argument handed to delay_random() by the lock/enter retry loops */
int	mdi_delay = 3;
87
/* Declared here, defined outside this file. */
extern pri_t	minclsyspri;
extern int	modrootloaded;

/*
 * Global mutex:
 * Protects vHCI list and structure members.
 * Initialized by i_mdi_init() when the first vHCI registers.
 */
kmutex_t	mdi_mutex;

/*
 * Registered vHCI class driver lists
 *
 * The vHCIs form a singly linked list chained through vh_next;
 * mdi_vhci_register() appends at the tail and mdi_vhci_unregister() unlinks,
 * both while holding mdi_mutex.
 */
int		mdi_vhci_count;
mdi_vhci_t	*mdi_vhci_head;
mdi_vhci_t	*mdi_vhci_tail;

/*
 * Client Hash Table size
 * (number of struct client_hash entries allocated for each vHCI's
 * vh_client_table)
 */
static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;

/*
 * taskq interface definitions
 *
 * These are the thread count, priority and min/max allocation arguments
 * passed to taskq_create() in i_mdi_init().
 */
#define	MDI_TASKQ_N_THREADS	8
#define	MDI_TASKQ_PRI		minclsyspri
#define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
#define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)

taskq_t				*mdi_taskq;
static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
119
/* One second (1000000 microseconds) expressed in clock ticks */
#define	TICKS_PER_SECOND	(drv_usectohz(1000000))

/*
 * The data should be "quiet" for this interval (in seconds) before the
 * vhci cached data is flushed to the disk.
 */
static int mdi_vhcache_flush_delay = 10;

/* number of seconds the vhcache flush daemon will sleep idle before exiting */
static int mdi_vhcache_flush_daemon_idle_time = 60;

/*
 * MDI falls back to discovery of all paths when a bus_config_one fails.
 * The following parameters can be used to tune this operation.
 *
 * mdi_path_discovery_boot
 *	Number of times path discovery will be attempted during early boot.
 *	Probably there is no reason to ever set this value to greater than one.
 *
 * mdi_path_discovery_postboot
 *	Number of times path discovery will be attempted after early boot.
 *	Set it to a minimum of two to allow for discovery of iscsi paths which
 *	may happen very late during booting.
 *
 * mdi_path_discovery_interval
 *	Minimum number of seconds MDI will wait between successive discovery
 *	of all paths. Set it to -1 to disable discovery of all paths.
 */
static int mdi_path_discovery_boot = 1;
static int mdi_path_discovery_postboot = 2;
static int mdi_path_discovery_interval = 10;

/*
 * number of seconds the asynchronous configuration thread will sleep idle
 * before exiting.
 */
static int mdi_async_config_idle_time = 600;

static int mdi_bus_config_cache_hash_size = 256;

/* turns off multithreaded configuration for certain operations */
static int mdi_mtc_off = 0;

/*
 * The "path" to a pathinfo node is identical to the /devices path to a
 * devinfo node had the device been enumerated under a pHCI instead of
 * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
 * This association persists across create/delete of the pathinfo nodes,
 * but not across reboot.
 *
 * The mutex and the three hashes below are created by i_mdi_init(); each
 * hash is sized by mdi_pathmap_hash_size.
 */
static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
static int		mdi_pathmap_hash_size = 256;
static kmutex_t		mdi_pathmap_mutex;
static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */
176
/*
 * MDI component property name/value string definitions
 */
const char 		*mdi_component_prop = "mpxio-component";
const char		*mdi_component_prop_vhci = "vhci";
const char		*mdi_component_prop_phci = "phci";
const char		*mdi_component_prop_client = "client";

/*
 * MDI client global unique identifier property name
 */
const char		*mdi_client_guid_prop = "client-guid";

/*
 * MDI client load balancing property name/value string definitions
 */
const char		*mdi_load_balance = "load-balance";
const char		*mdi_load_balance_none = "none";
const char		*mdi_load_balance_rr = "round-robin";
const char		*mdi_load_balance_lba = "logical-block";

/*
 * Obsolete vHCI class definition; to be removed after Leadville update
 */
const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;

/*
 * cmn_err() format used by mdi_vhci_register() when a second vHCI driver
 * tries to register for a class that already has one; %s is the class name.
 */
static char vhci_greeting[] =
	"\tThere already exists one vHCI driver for class %s\n"
	"\tOnly one vHCI driver for each class is allowed\n";
206
/*
 * Static function prototypes
 *
 * Forward declarations for the file-local helpers, so that they can be
 * referenced before their definitions appear.
 */
static int		i_mdi_phci_offline(dev_info_t *, uint_t);
static int		i_mdi_client_offline(dev_info_t *, uint_t);
static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
static void		i_mdi_phci_post_detach(dev_info_t *,
			    ddi_detach_cmd_t, int);
static int		i_mdi_client_pre_detach(dev_info_t *,
			    ddi_detach_cmd_t);
static void		i_mdi_client_post_detach(dev_info_t *,
			    ddi_detach_cmd_t, int);
static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
static int 		i_mdi_lba_lb(mdi_client_t *ct,
			    mdi_pathinfo_t **ret_pip, struct buf *buf);
static void		i_mdi_pm_hold_client(mdi_client_t *, int);
static void		i_mdi_pm_rele_client(mdi_client_t *, int);
static void		i_mdi_pm_reset_client(mdi_client_t *);
static int		i_mdi_power_all_phci(mdi_client_t *);
static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);


/*
 * Internal mdi_pathinfo node functions
 */
static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);

/* vHCI/pHCI lookup and locking, pathinfo allocation and list linkage */
static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
static void		i_mdi_phci_unlock(mdi_phci_t *);
static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
			    mdi_client_t *);
static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
static void		i_mdi_client_remove_path(mdi_client_t *,
			    mdi_pathinfo_t *);

/* pathinfo state changes, client devinfo nodes and client bookkeeping */
static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
			    mdi_pathinfo_state_t, int);
static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
			    char **, int);
static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
static void		i_mdi_client_update_state(mdi_client_t *);
static int		i_mdi_client_compute_state(mdi_client_t *,
			    mdi_phci_t *);
static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
static void		i_mdi_client_unlock(mdi_client_t *);
static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
static mdi_client_t	*i_devi_get_client(dev_info_t *);
/*
 * NOTE: this will be removed once the NWS files are changed to use the new
 * mdi_{enable,disable}_path interfaces
 */
static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
				int, int);
static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
				mdi_vhci_t *vh, int flags, int op);
/*
 * Failover related function prototypes
 */
static int		i_mdi_failover(void *);

/*
 * misc internal functions
 */
static int		i_mdi_get_hash_key(char *);
static int		i_map_nvlist_error_to_mdi(int);
static void		i_mdi_report_path_state(mdi_client_t *,
			    mdi_pathinfo_t *);

/* vHCI cache (vhcache) setup, teardown, update and flush helpers */
static void		setup_vhci_cache(mdi_vhci_t *);
static int		destroy_vhci_cache(mdi_vhci_t *);
static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
static boolean_t	stop_vhcache_flush_thread(void *, int);
static void		free_string_array(char **, int);
static void		free_vhcache_phci(mdi_vhcache_phci_t *);
static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
static void		free_vhcache_client(mdi_vhcache_client_t *);
static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
static void		vhcache_pi_add(mdi_vhci_config_t *,
			    struct mdi_pathinfo *);
static void		vhcache_pi_remove(mdi_vhci_config_t *,
			    struct mdi_pathinfo *);
static void		free_phclient_path_list(mdi_phys_path_t *);
static void		sort_vhcache_paths(mdi_vhcache_client_t *);
static int		flush_vhcache(mdi_vhci_config_t *, int);
static void		vhcache_dirty(mdi_vhci_config_t *);
static void		free_async_client_config(mdi_async_client_config_t *);
static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
static nvlist_t		*read_on_disk_vhci_cache(char *);
/* The two nvlist file helpers below are declared extern, not static. */
extern int		fread_nvlist(char *, nvlist_t **);
extern int		fwrite_nvlist(char *, nvlist_t *);
315
316/* called once when first vhci registers with mdi */
317static void
318i_mdi_init()
319{
320	static int initialized = 0;
321
322	if (initialized)
323		return;
324	initialized = 1;
325
326	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
327
328	/* Create our taskq resources */
329	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
330	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
331	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
332	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
333
334	/* Allocate ['path_instance' <-> "path"] maps */
335	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
336	mdi_pathmap_bypath = mod_hash_create_strhash(
337	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
338	    mod_hash_null_valdtor);
339	mdi_pathmap_byinstance = mod_hash_create_idhash(
340	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
341	    mod_hash_null_valdtor);
342	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
343	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
344	    mod_hash_null_valdtor);
345}
346
347/*
348 * mdi_get_component_type():
349 *		Return mpxio component type
350 * Return Values:
351 *		MDI_COMPONENT_NONE
352 *		MDI_COMPONENT_VHCI
353 *		MDI_COMPONENT_PHCI
354 *		MDI_COMPONENT_CLIENT
355 * XXX This doesn't work under multi-level MPxIO and should be
356 *	removed when clients migrate mdi_component_is_*() interfaces.
357 */
358int
359mdi_get_component_type(dev_info_t *dip)
360{
361	return (DEVI(dip)->devi_mdi_component);
362}
363
364/*
365 * mdi_vhci_register():
366 *		Register a vHCI module with the mpxio framework
367 *		mdi_vhci_register() is called by vHCI drivers to register the
368 *		'class_driver' vHCI driver and its MDI entrypoints with the
369 *		mpxio framework.  The vHCI driver must call this interface as
370 *		part of its attach(9e) handler.
371 *		Competing threads may try to attach mdi_vhci_register() as
372 *		the vHCI drivers are loaded and attached as a result of pHCI
373 *		driver instance registration (mdi_phci_register()) with the
374 *		framework.
375 * Return Values:
376 *		MDI_SUCCESS
377 *		MDI_FAILURE
378 */
379/*ARGSUSED*/
380int
381mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
382    int flags)
383{
384	mdi_vhci_t		*vh = NULL;
385
386	/* Registrant can't be older */
387	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);
388
389#ifdef DEBUG
390	/*
391	 * IB nexus driver is loaded only when IB hardware is present.
392	 * In order to be able to do this there is a need to drive the loading
393	 * and attaching of the IB nexus driver (especially when an IB hardware
394	 * is dynamically plugged in) when an IB HCA driver (PHCI)
395	 * is being attached. Unfortunately this gets into the limitations
396	 * of devfs as there seems to be no clean way to drive configuration
397	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
398	 * for IB.
399	 */
400	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
401		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
402#endif
403
404	i_mdi_init();
405
406	mutex_enter(&mdi_mutex);
407	/*
408	 * Scan for already registered vhci
409	 */
410	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
411		if (strcmp(vh->vh_class, class) == 0) {
412			/*
413			 * vHCI has already been created.  Check for valid
414			 * vHCI ops registration.  We only support one vHCI
415			 * module per class
416			 */
417			if (vh->vh_ops != NULL) {
418				mutex_exit(&mdi_mutex);
419				cmn_err(CE_NOTE, vhci_greeting, class);
420				return (MDI_FAILURE);
421			}
422			break;
423		}
424	}
425
426	/*
427	 * if not yet created, create the vHCI component
428	 */
429	if (vh == NULL) {
430		struct client_hash	*hash = NULL;
431		char			*load_balance;
432
433		/*
434		 * Allocate and initialize the mdi extensions
435		 */
436		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
437		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
438		    KM_SLEEP);
439		vh->vh_client_table = hash;
440		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
441		(void) strcpy(vh->vh_class, class);
442		vh->vh_lb = LOAD_BALANCE_RR;
443		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
444		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
445			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
446				vh->vh_lb = LOAD_BALANCE_NONE;
447			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
448				    == 0) {
449				vh->vh_lb = LOAD_BALANCE_LBA;
450			}
451			ddi_prop_free(load_balance);
452		}
453
454		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
455		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
456
457		/*
458		 * Store the vHCI ops vectors
459		 */
460		vh->vh_dip = vdip;
461		vh->vh_ops = vops;
462
463		setup_vhci_cache(vh);
464
465		if (mdi_vhci_head == NULL) {
466			mdi_vhci_head = vh;
467		}
468		if (mdi_vhci_tail) {
469			mdi_vhci_tail->vh_next = vh;
470		}
471		mdi_vhci_tail = vh;
472		mdi_vhci_count++;
473	}
474
475	/*
476	 * Claim the devfs node as a vhci component
477	 */
478	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
479
480	/*
481	 * Initialize our back reference from dev_info node
482	 */
483	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
484	mutex_exit(&mdi_mutex);
485	return (MDI_SUCCESS);
486}
487
488/*
489 * mdi_vhci_unregister():
490 *		Unregister a vHCI module from mpxio framework
491 *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
492 * 		of a vhci to unregister it from the framework.
493 * Return Values:
494 *		MDI_SUCCESS
495 *		MDI_FAILURE
496 */
497/*ARGSUSED*/
498int
499mdi_vhci_unregister(dev_info_t *vdip, int flags)
500{
501	mdi_vhci_t	*found, *vh, *prev = NULL;
502
503	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
504
505	/*
506	 * Check for invalid VHCI
507	 */
508	if ((vh = i_devi_get_vhci(vdip)) == NULL)
509		return (MDI_FAILURE);
510
511	/*
512	 * Scan the list of registered vHCIs for a match
513	 */
514	mutex_enter(&mdi_mutex);
515	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
516		if (found == vh)
517			break;
518		prev = found;
519	}
520
521	if (found == NULL) {
522		mutex_exit(&mdi_mutex);
523		return (MDI_FAILURE);
524	}
525
526	/*
527	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
528	 * should have been unregistered, before a vHCI can be
529	 * unregistered.
530	 */
531	MDI_VHCI_PHCI_LOCK(vh);
532	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
533		MDI_VHCI_PHCI_UNLOCK(vh);
534		mutex_exit(&mdi_mutex);
535		return (MDI_FAILURE);
536	}
537	MDI_VHCI_PHCI_UNLOCK(vh);
538
539	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
540		mutex_exit(&mdi_mutex);
541		return (MDI_FAILURE);
542	}
543
544	/*
545	 * Remove the vHCI from the global list
546	 */
547	if (vh == mdi_vhci_head) {
548		mdi_vhci_head = vh->vh_next;
549	} else {
550		prev->vh_next = vh->vh_next;
551	}
552	if (vh == mdi_vhci_tail) {
553		mdi_vhci_tail = prev;
554	}
555	mdi_vhci_count--;
556	mutex_exit(&mdi_mutex);
557
558	vh->vh_ops = NULL;
559	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
560	DEVI(vdip)->devi_mdi_xhci = NULL;
561	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
562	kmem_free(vh->vh_client_table,
563	    mdi_client_table_size * sizeof (struct client_hash));
564	mutex_destroy(&vh->vh_phci_mutex);
565	mutex_destroy(&vh->vh_client_mutex);
566
567	kmem_free(vh, sizeof (mdi_vhci_t));
568	return (MDI_SUCCESS);
569}
570
571/*
572 * i_mdi_vhci_class2vhci():
573 *		Look for a matching vHCI module given a vHCI class name
574 * Return Values:
575 *		Handle to a vHCI component
576 *		NULL
577 */
578static mdi_vhci_t *
579i_mdi_vhci_class2vhci(char *class)
580{
581	mdi_vhci_t	*vh = NULL;
582
583	ASSERT(!MUTEX_HELD(&mdi_mutex));
584
585	mutex_enter(&mdi_mutex);
586	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
587		if (strcmp(vh->vh_class, class) == 0) {
588			break;
589		}
590	}
591	mutex_exit(&mdi_mutex);
592	return (vh);
593}
594
595/*
596 * i_devi_get_vhci():
597 *		Utility function to get the handle to a vHCI component
598 * Return Values:
599 *		Handle to a vHCI component
600 *		NULL
601 */
602mdi_vhci_t *
603i_devi_get_vhci(dev_info_t *vdip)
604{
605	mdi_vhci_t	*vh = NULL;
606	if (MDI_VHCI(vdip)) {
607		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
608	}
609	return (vh);
610}
611
612/*
613 * mdi_phci_register():
614 *		Register a pHCI module with mpxio framework
615 *		mdi_phci_register() is called by pHCI drivers to register with
616 *		the mpxio framework and a specific 'class_driver' vHCI.  The
617 *		pHCI driver must call this interface as part of its attach(9e)
618 *		handler.
619 * Return Values:
620 *		MDI_SUCCESS
621 *		MDI_FAILURE
622 */
623/*ARGSUSED*/
624int
625mdi_phci_register(char *class, dev_info_t *pdip, int flags)
626{
627	mdi_phci_t		*ph;
628	mdi_vhci_t		*vh;
629	char			*data;
630
631	/*
632	 * Some subsystems, like fcp, perform pHCI registration from a
633	 * different thread than the one doing the pHCI attach(9E) - the
634	 * driver attach code is waiting for this other thread to complete.
635	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
636	 * (indicating that some thread has done an ndi_devi_enter of parent)
637	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
638	 */
639	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
640
641	/*
642	 * Check for mpxio-disable property. Enable mpxio if the property is
643	 * missing or not set to "yes".
644	 * If the property is set to "yes" then emit a brief message.
645	 */
646	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
647	    &data) == DDI_SUCCESS)) {
648		if (strcmp(data, "yes") == 0) {
649			MDI_DEBUG(1, (MDI_CONT, pdip,
650			    "?multipath capabilities disabled via %s.conf.",
651			    ddi_driver_name(pdip)));
652			ddi_prop_free(data);
653			return (MDI_FAILURE);
654		}
655		ddi_prop_free(data);
656	}
657
658	/*
659	 * Search for a matching vHCI
660	 */
661	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
662	if (vh == NULL) {
663		return (MDI_FAILURE);
664	}
665
666	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
667	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
668	ph->ph_dip = pdip;
669	ph->ph_vhci = vh;
670	ph->ph_next = NULL;
671	ph->ph_unstable = 0;
672	ph->ph_vprivate = 0;
673	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
674
675	MDI_PHCI_LOCK(ph);
676	MDI_PHCI_SET_POWER_UP(ph);
677	MDI_PHCI_UNLOCK(ph);
678	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
679	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
680
681	vhcache_phci_add(vh->vh_config, ph);
682
683	MDI_VHCI_PHCI_LOCK(vh);
684	if (vh->vh_phci_head == NULL) {
685		vh->vh_phci_head = ph;
686	}
687	if (vh->vh_phci_tail) {
688		vh->vh_phci_tail->ph_next = ph;
689	}
690	vh->vh_phci_tail = ph;
691	vh->vh_phci_count++;
692	MDI_VHCI_PHCI_UNLOCK(vh);
693
694	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
695	return (MDI_SUCCESS);
696}
697
698/*
699 * mdi_phci_unregister():
700 *		Unregister a pHCI module from mpxio framework
701 *		mdi_phci_unregister() is called by the pHCI drivers from their
702 *		detach(9E) handler to unregister their instances from the
703 *		framework.
704 * Return Values:
705 *		MDI_SUCCESS
706 *		MDI_FAILURE
707 */
708/*ARGSUSED*/
709int
710mdi_phci_unregister(dev_info_t *pdip, int flags)
711{
712	mdi_vhci_t		*vh;
713	mdi_phci_t		*ph;
714	mdi_phci_t		*tmp;
715	mdi_phci_t		*prev = NULL;
716	mdi_pathinfo_t		*pip;
717
718	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
719
720	ph = i_devi_get_phci(pdip);
721	if (ph == NULL) {
722		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
723		return (MDI_FAILURE);
724	}
725
726	vh = ph->ph_vhci;
727	ASSERT(vh != NULL);
728	if (vh == NULL) {
729		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
730		return (MDI_FAILURE);
731	}
732
733	MDI_VHCI_PHCI_LOCK(vh);
734	tmp = vh->vh_phci_head;
735	while (tmp) {
736		if (tmp == ph) {
737			break;
738		}
739		prev = tmp;
740		tmp = tmp->ph_next;
741	}
742
743	if (ph == vh->vh_phci_head) {
744		vh->vh_phci_head = ph->ph_next;
745	} else {
746		prev->ph_next = ph->ph_next;
747	}
748
749	if (ph == vh->vh_phci_tail) {
750		vh->vh_phci_tail = prev;
751	}
752
753	vh->vh_phci_count--;
754	MDI_VHCI_PHCI_UNLOCK(vh);
755
756	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
757	MDI_PHCI_LOCK(ph);
758	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
759	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
760		MDI_PI(pip)->pi_phci = NULL;
761	MDI_PHCI_UNLOCK(ph);
762
763	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
764	    ESC_DDI_INITIATOR_UNREGISTER);
765	vhcache_phci_remove(vh->vh_config, ph);
766	cv_destroy(&ph->ph_unstable_cv);
767	mutex_destroy(&ph->ph_mutex);
768	kmem_free(ph, sizeof (mdi_phci_t));
769	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
770	DEVI(pdip)->devi_mdi_xhci = NULL;
771	return (MDI_SUCCESS);
772}
773
774/*
775 * i_devi_get_phci():
776 * 		Utility function to return the phci extensions.
777 */
778static mdi_phci_t *
779i_devi_get_phci(dev_info_t *pdip)
780{
781	mdi_phci_t	*ph = NULL;
782
783	if (MDI_PHCI(pdip)) {
784		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
785	}
786	return (ph);
787}
788
789/*
790 * Single thread mdi entry into devinfo node for modifying its children.
791 * If necessary we perform an ndi_devi_enter of the vHCI before doing
792 * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
793 * for the vHCI and one for the pHCI.
794 */
795void
796mdi_devi_enter(dev_info_t *phci_dip, int *circular)
797{
798	dev_info_t	*vdip;
799	int		vcircular, pcircular;
800
801	/* Verify calling context */
802	ASSERT(MDI_PHCI(phci_dip));
803	vdip = mdi_devi_get_vdip(phci_dip);
804	ASSERT(vdip);			/* A pHCI always has a vHCI */
805
806	/*
807	 * If pHCI is detaching then the framework has already entered the
808	 * vHCI on a threads that went down the code path leading to
809	 * detach_node().  This framework enter of the vHCI during pHCI
810	 * detach is done to avoid deadlock with vHCI power management
811	 * operations which enter the vHCI and the enter down the path
812	 * to the pHCI. If pHCI is detaching then we piggyback this calls
813	 * enter of the vHCI on frameworks vHCI enter that has already
814	 * occurred - this is OK because we know that the framework thread
815	 * doing detach is waiting for our completion.
816	 *
817	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
818	 * race with detach - but we can't do that because the framework has
819	 * already entered the parent, so we have some complexity instead.
820	 */
821	for (;;) {
822		if (ndi_devi_tryenter(vdip, &vcircular)) {
823			ASSERT(vcircular != -1);
824			if (DEVI_IS_DETACHING(phci_dip)) {
825				ndi_devi_exit(vdip, vcircular);
826				vcircular = -1;
827			}
828			break;
829		} else if (DEVI_IS_DETACHING(phci_dip)) {
830			vcircular = -1;
831			break;
832		} else if (servicing_interrupt()) {
833			/*
834			 * Don't delay an interrupt (and ensure adaptive
835			 * mutex inversion support).
836			 */
837			ndi_devi_enter(vdip, &vcircular);
838			break;
839		} else {
840			delay_random(mdi_delay);
841		}
842	}
843
844	ndi_devi_enter(phci_dip, &pcircular);
845	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
846}
847
848/*
849 * Attempt to mdi_devi_enter.
850 */
851int
852mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
853{
854	dev_info_t	*vdip;
855	int		vcircular, pcircular;
856
857	/* Verify calling context */
858	ASSERT(MDI_PHCI(phci_dip));
859	vdip = mdi_devi_get_vdip(phci_dip);
860	ASSERT(vdip);			/* A pHCI always has a vHCI */
861
862	if (ndi_devi_tryenter(vdip, &vcircular)) {
863		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
864			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
865			return (1);	/* locked */
866		}
867		ndi_devi_exit(vdip, vcircular);
868	}
869	return (0);			/* busy */
870}
871
872/*
873 * Release mdi_devi_enter or successful mdi_devi_tryenter.
874 */
875void
876mdi_devi_exit(dev_info_t *phci_dip, int circular)
877{
878	dev_info_t	*vdip;
879	int		vcircular, pcircular;
880
881	/* Verify calling context */
882	ASSERT(MDI_PHCI(phci_dip));
883	vdip = mdi_devi_get_vdip(phci_dip);
884	ASSERT(vdip);			/* A pHCI always has a vHCI */
885
886	/* extract two circular recursion values from single int */
887	pcircular = (short)(circular & 0xFFFF);
888	vcircular = (short)((circular >> 16) & 0xFFFF);
889
890	ndi_devi_exit(phci_dip, pcircular);
891	if (vcircular != -1)
892		ndi_devi_exit(vdip, vcircular);
893}
894
895/*
896 * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
897 * around a pHCI drivers calls to mdi_pi_online/offline, after holding
898 * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
899 * with vHCI power management code during path online/offline.  Each
900 * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
901 * occur within the scope of an active mdi_devi_enter that establishes the
902 * circular value.
903 */
904void
905mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
906{
907	int		pcircular;
908
909	/* Verify calling context */
910	ASSERT(MDI_PHCI(phci_dip));
911
912	/* Keep hold on pHCI until we reenter in mdi_devi_enter_phci */
913	ndi_hold_devi(phci_dip);
914
915	pcircular = (short)(circular & 0xFFFF);
916	ndi_devi_exit(phci_dip, pcircular);
917}
918
919void
920mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
921{
922	int		pcircular;
923
924	/* Verify calling context */
925	ASSERT(MDI_PHCI(phci_dip));
926
927	ndi_devi_enter(phci_dip, &pcircular);
928
929	/* Drop hold from mdi_devi_exit_phci. */
930	ndi_rele_devi(phci_dip);
931
932	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
933	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
934}
935
936/*
937 * mdi_devi_get_vdip():
938 *		given a pHCI dip return vHCI dip
939 */
940dev_info_t *
941mdi_devi_get_vdip(dev_info_t *pdip)
942{
943	mdi_phci_t	*ph;
944
945	ph = i_devi_get_phci(pdip);
946	if (ph && ph->ph_vhci)
947		return (ph->ph_vhci->vh_dip);
948	return (NULL);
949}
950
951/*
952 * mdi_devi_pdip_entered():
953 *		Return 1 if we are vHCI and have done an ndi_devi_enter
954 *		of a pHCI
955 */
956int
957mdi_devi_pdip_entered(dev_info_t *vdip)
958{
959	mdi_vhci_t	*vh;
960	mdi_phci_t	*ph;
961
962	vh = i_devi_get_vhci(vdip);
963	if (vh == NULL)
964		return (0);
965
966	MDI_VHCI_PHCI_LOCK(vh);
967	ph = vh->vh_phci_head;
968	while (ph) {
969		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
970			MDI_VHCI_PHCI_UNLOCK(vh);
971			return (1);
972		}
973		ph = ph->ph_next;
974	}
975	MDI_VHCI_PHCI_UNLOCK(vh);
976	return (0);
977}
978
979/*
980 * mdi_phci_path2devinfo():
981 * 		Utility function to search for a valid phci device given
982 *		the devfs pathname.
983 */
984dev_info_t *
985mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
986{
987	char		*temp_pathname;
988	mdi_vhci_t	*vh;
989	mdi_phci_t	*ph;
990	dev_info_t 	*pdip = NULL;
991
992	vh = i_devi_get_vhci(vdip);
993	ASSERT(vh != NULL);
994
995	if (vh == NULL) {
996		/*
997		 * Invalid vHCI component, return failure
998		 */
999		return (NULL);
1000	}
1001
1002	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1003	MDI_VHCI_PHCI_LOCK(vh);
1004	ph = vh->vh_phci_head;
1005	while (ph != NULL) {
1006		pdip = ph->ph_dip;
1007		ASSERT(pdip != NULL);
1008		*temp_pathname = '\0';
1009		(void) ddi_pathname(pdip, temp_pathname);
1010		if (strcmp(temp_pathname, pathname) == 0) {
1011			break;
1012		}
1013		ph = ph->ph_next;
1014	}
1015	if (ph == NULL) {
1016		pdip = NULL;
1017	}
1018	MDI_VHCI_PHCI_UNLOCK(vh);
1019	kmem_free(temp_pathname, MAXPATHLEN);
1020	return (pdip);
1021}
1022
1023/*
1024 * mdi_phci_get_path_count():
1025 * 		get number of path information nodes associated with a given
1026 *		pHCI device.
1027 */
1028int
1029mdi_phci_get_path_count(dev_info_t *pdip)
1030{
1031	mdi_phci_t	*ph;
1032	int		count = 0;
1033
1034	ph = i_devi_get_phci(pdip);
1035	if (ph != NULL) {
1036		count = ph->ph_path_count;
1037	}
1038	return (count);
1039}
1040
/*
 * i_mdi_phci_lock():
 *		Lock a pHCI device
 * Arguments:
 *		ph	pHCI whose lock is acquired
 *		pip	NULL to simply block on the pHCI lock; non-NULL to
 *			request the reverse-order acquisition described below
 * Return Values:
 *		None
 * Note:
 *		The default locking order is:
 *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
 *		But there are a number of situations where locks need to be
 *		grabbed in reverse order.  When pip is non-NULL this routine
 *		loops on MDI_PHCI_TRYLOCK(): each failed attempt unlocks pip
 *		(bracketed by MDI_PI_HOLD/MDI_PI_RELE), and then either blocks
 *		on the pHCI lock (when servicing_interrupt() is true) or backs
 *		off with delay_random(mdi_delay) and tries again.
 *		NOTE(review): the MDI_PI_UNLOCK() calls imply the caller
 *		enters with pip's lock already held -- confirm against callers.
 */
static void
i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
{
	if (pip) {
		/* Reverse locking is requested. */
		while (MDI_PHCI_TRYLOCK(ph) == 0) {
			if (servicing_interrupt()) {
				/*
				 * No delay on this path: drop the pathinfo
				 * lock, block on the pHCI lock, then retake
				 * the pathinfo lock (i.e. the default order)
				 * and leave the loop holding both.
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				MDI_PHCI_LOCK(ph);
				MDI_PI_LOCK(pip);
				MDI_PI_RELE(pip);
				break;
			} else {
				/*
				 * tryenter failed. Try to grab again
				 * after a small delay.  The pathinfo lock is
				 * released for the duration of the delay and
				 * retaken before the next attempt.
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				delay_random(mdi_delay);
				MDI_PI_LOCK(pip);
				MDI_PI_RELE(pip);
			}
		}
	} else {
		/* Default order: plain blocking acquire. */
		MDI_PHCI_LOCK(ph);
	}
}
1082
/*
 * i_mdi_phci_unlock():
 *		Unlock the pHCI component
 * Arguments:
 *		ph	pHCI whose lock is released via MDI_PHCI_UNLOCK()
 * Return Values:
 *		None
 */
static void
i_mdi_phci_unlock(mdi_phci_t *ph)
{
	MDI_PHCI_UNLOCK(ph);
}
1092
1093/*
1094 * i_mdi_devinfo_create():
1095 *		create client device's devinfo node
1096 * Return Values:
1097 *		dev_info
1098 *		NULL
1099 * Notes:
1100 */
1101static dev_info_t *
1102i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1103	char **compatible, int ncompatible)
1104{
1105	dev_info_t *cdip = NULL;
1106
1107	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1108
1109	/* Verify for duplicate entry */
1110	cdip = i_mdi_devinfo_find(vh, name, guid);
1111	ASSERT(cdip == NULL);
1112	if (cdip) {
1113		cmn_err(CE_WARN,
1114		    "i_mdi_devinfo_create: client %s@%s already exists",
1115			name ? name : "", guid ? guid : "");
1116	}
1117
1118	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1119	if (cdip == NULL)
1120		goto fail;
1121
1122	/*
1123	 * Create component type and Global unique identifier
1124	 * properties
1125	 */
1126	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1127	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1128		goto fail;
1129	}
1130
1131	/* Decorate the node with compatible property */
1132	if (compatible &&
1133	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1134	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1135		goto fail;
1136	}
1137
1138	return (cdip);
1139
1140fail:
1141	if (cdip) {
1142		(void) ndi_prop_remove_all(cdip);
1143		(void) ndi_devi_free(cdip);
1144	}
1145	return (NULL);
1146}
1147
1148/*
1149 * i_mdi_devinfo_find():
1150 *		Find a matching devinfo node for given client node name
1151 *		and its guid.
1152 * Return Values:
1153 *		Handle to a dev_info node or NULL
1154 */
1155static dev_info_t *
1156i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1157{
1158	char			*data;
1159	dev_info_t 		*cdip = NULL;
1160	dev_info_t 		*ndip = NULL;
1161	int			circular;
1162
1163	ndi_devi_enter(vh->vh_dip, &circular);
1164	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1165	while ((cdip = ndip) != NULL) {
1166		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1167
1168		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1169			continue;
1170		}
1171
1172		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1173		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1174		    &data) != DDI_PROP_SUCCESS) {
1175			continue;
1176		}
1177
1178		if (strcmp(data, guid) != 0) {
1179			ddi_prop_free(data);
1180			continue;
1181		}
1182		ddi_prop_free(data);
1183		break;
1184	}
1185	ndi_devi_exit(vh->vh_dip, circular);
1186	return (cdip);
1187}
1188
1189/*
1190 * i_mdi_devinfo_remove():
1191 *		Remove a client device node
1192 */
1193static int
1194i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1195{
1196	int	rv = MDI_SUCCESS;
1197
1198	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1199	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1200		rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
1201		if (rv != NDI_SUCCESS) {
1202			MDI_DEBUG(1, (MDI_NOTE, cdip,
1203			    "!failed: cdip %p", (void *)cdip));
1204		}
1205		/*
1206		 * Convert to MDI error code
1207		 */
1208		switch (rv) {
1209		case NDI_SUCCESS:
1210			rv = MDI_SUCCESS;
1211			break;
1212		case NDI_BUSY:
1213			rv = MDI_BUSY;
1214			break;
1215		default:
1216			rv = MDI_FAILURE;
1217			break;
1218		}
1219	}
1220	return (rv);
1221}
1222
1223/*
1224 * i_devi_get_client()
1225 *		Utility function to get mpxio component extensions
1226 */
1227static mdi_client_t *
1228i_devi_get_client(dev_info_t *cdip)
1229{
1230	mdi_client_t	*ct = NULL;
1231
1232	if (MDI_CLIENT(cdip)) {
1233		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1234	}
1235	return (ct);
1236}
1237
1238/*
1239 * i_mdi_is_child_present():
1240 *		Search for the presence of client device dev_info node
1241 */
1242static int
1243i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1244{
1245	int		rv = MDI_FAILURE;
1246	struct dev_info	*dip;
1247	int		circular;
1248
1249	ndi_devi_enter(vdip, &circular);
1250	dip = DEVI(vdip)->devi_child;
1251	while (dip) {
1252		if (dip == DEVI(cdip)) {
1253			rv = MDI_SUCCESS;
1254			break;
1255		}
1256		dip = dip->devi_sibling;
1257	}
1258	ndi_devi_exit(vdip, circular);
1259	return (rv);
1260}
1261
1262
/*
 * i_mdi_client_lock():
 *		Grab client component lock
 * Arguments:
 *		ct	client whose lock is acquired
 *		pip	NULL to simply block on the client lock; non-NULL to
 *			request the reverse-order acquisition described below
 * Return Values:
 *		None
 * Note:
 *		The default locking order is:
 *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
 *		But there are a number of situations where locks need to be
 *		grabbed in reverse order.  When pip is non-NULL this routine
 *		loops on MDI_CLIENT_TRYLOCK(): each failed attempt unlocks pip
 *		(bracketed by MDI_PI_HOLD/MDI_PI_RELE), and then either blocks
 *		on the client lock (when servicing_interrupt() is true) or
 *		backs off with delay_random(mdi_delay) and tries again.
 *		NOTE(review): the MDI_PI_UNLOCK() calls imply the caller
 *		enters with pip's lock already held -- confirm against callers.
 */
static void
i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
{
	if (pip) {
		/*
		 * Reverse locking is requested.
		 */
		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
			if (servicing_interrupt()) {
				/*
				 * No delay on this path: drop the pathinfo
				 * lock, block on the client lock, then retake
				 * the pathinfo lock (i.e. the default order)
				 * and leave the loop holding both.
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				MDI_CLIENT_LOCK(ct);
				MDI_PI_LOCK(pip);
				MDI_PI_RELE(pip);
				break;
			} else {
				/*
				 * tryenter failed. Try to grab again
				 * after a small delay.  The pathinfo lock is
				 * released for the duration of the delay and
				 * retaken before the next attempt.
				 */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				delay_random(mdi_delay);
				MDI_PI_LOCK(pip);
				MDI_PI_RELE(pip);
			}
		}
	} else {
		/* Default order: plain blocking acquire. */
		MDI_CLIENT_LOCK(ct);
	}
}
1306
/*
 * i_mdi_client_unlock():
 *		Unlock a client component
 * Arguments:
 *		ct	client whose lock is released via MDI_CLIENT_UNLOCK()
 * Return Values:
 *		None
 */
static void
i_mdi_client_unlock(mdi_client_t *ct)
{
	MDI_CLIENT_UNLOCK(ct);
}
1316
1317/*
1318 * i_mdi_client_alloc():
1319 * 		Allocate and initialize a client structure.  Caller should
1320 *		hold the vhci client lock.
1321 * Return Values:
1322 *		Handle to a client component
1323 */
1324/*ARGSUSED*/
1325static mdi_client_t *
1326i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1327{
1328	mdi_client_t	*ct;
1329
1330	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1331
1332	/*
1333	 * Allocate and initialize a component structure.
1334	 */
1335	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1336	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1337	ct->ct_hnext = NULL;
1338	ct->ct_hprev = NULL;
1339	ct->ct_dip = NULL;
1340	ct->ct_vhci = vh;
1341	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1342	(void) strcpy(ct->ct_drvname, name);
1343	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1344	(void) strcpy(ct->ct_guid, lguid);
1345	ct->ct_cprivate = NULL;
1346	ct->ct_vprivate = NULL;
1347	ct->ct_flags = 0;
1348	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1349	MDI_CLIENT_LOCK(ct);
1350	MDI_CLIENT_SET_OFFLINE(ct);
1351	MDI_CLIENT_SET_DETACH(ct);
1352	MDI_CLIENT_SET_POWER_UP(ct);
1353	MDI_CLIENT_UNLOCK(ct);
1354	ct->ct_failover_flags = 0;
1355	ct->ct_failover_status = 0;
1356	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1357	ct->ct_unstable = 0;
1358	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1359	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1360	ct->ct_lb = vh->vh_lb;
1361	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1362	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1363	ct->ct_path_count = 0;
1364	ct->ct_path_head = NULL;
1365	ct->ct_path_tail = NULL;
1366	ct->ct_path_last = NULL;
1367
1368	/*
1369	 * Add this client component to our client hash queue
1370	 */
1371	i_mdi_client_enlist_table(vh, ct);
1372	return (ct);
1373}
1374
1375/*
1376 * i_mdi_client_enlist_table():
1377 *		Attach the client device to the client hash table. Caller
1378 *		should hold the vhci client lock.
1379 */
1380static void
1381i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1382{
1383	int 			index;
1384	struct client_hash	*head;
1385
1386	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1387
1388	index = i_mdi_get_hash_key(ct->ct_guid);
1389	head = &vh->vh_client_table[index];
1390	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1391	head->ct_hash_head = ct;
1392	head->ct_hash_count++;
1393	vh->vh_client_count++;
1394}
1395
1396/*
1397 * i_mdi_client_delist_table():
1398 *		Attach the client device to the client hash table.
1399 *		Caller should hold the vhci client lock.
1400 */
1401static void
1402i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1403{
1404	int			index;
1405	char			*guid;
1406	struct client_hash 	*head;
1407	mdi_client_t		*next;
1408	mdi_client_t		*last;
1409
1410	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1411
1412	guid = ct->ct_guid;
1413	index = i_mdi_get_hash_key(guid);
1414	head = &vh->vh_client_table[index];
1415
1416	last = NULL;
1417	next = (mdi_client_t *)head->ct_hash_head;
1418	while (next != NULL) {
1419		if (next == ct) {
1420			break;
1421		}
1422		last = next;
1423		next = next->ct_hnext;
1424	}
1425
1426	if (next) {
1427		head->ct_hash_count--;
1428		if (last == NULL) {
1429			head->ct_hash_head = ct->ct_hnext;
1430		} else {
1431			last->ct_hnext = ct->ct_hnext;
1432		}
1433		ct->ct_hnext = NULL;
1434		vh->vh_client_count--;
1435	}
1436}
1437
1438
/*
 * i_mdi_client_free():
 *		Free a client component
 * Arguments:
 *		vh	vHCI the client belongs to; its client lock must be
 *			held on entry (asserted)
 *		ct	client to tear down; its memory is freed here
 * Return Values:
 *		MDI_SUCCESS (rv is never changed after initialization; the
 *		result of i_mdi_devinfo_remove() is discarded)
 * Notes:
 *		The vHCI client lock is dropped around the call to
 *		i_mdi_devinfo_remove() and reacquired before returning.
 */
static int
i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
{
	int		rv = MDI_SUCCESS;
	/* Captured now: ct is freed before the flags are used below. */
	int		flags = ct->ct_flags;
	dev_info_t	*cdip;
	dev_info_t	*vdip;

	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));

	/* Likewise cached so they remain usable after ct is freed. */
	vdip = vh->vh_dip;
	cdip = ct->ct_dip;

	/* Strip the client decorations from the devinfo node. */
	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
	DEVI(cdip)->devi_mdi_client = NULL;

	/*
	 * Clear out back ref. to dev_info_t node
	 */
	ct->ct_dip = NULL;

	/*
	 * Remove this client from our hash queue
	 */
	i_mdi_client_delist_table(vh, ct);

	/*
	 * Uninitialize and free the component.  The string buffers are
	 * released with strlen() + 1, matching how i_mdi_client_alloc()
	 * sized them.
	 */
	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
	cv_destroy(&ct->ct_failover_cv);
	cv_destroy(&ct->ct_unstable_cv);
	cv_destroy(&ct->ct_powerchange_cv);
	mutex_destroy(&ct->ct_mutex);
	kmem_free(ct, sizeof (*ct));

	/*
	 * Remove the devinfo node without the vHCI client lock held, using
	 * only the values cached above (ct no longer exists).
	 */
	MDI_VHCI_CLIENT_UNLOCK(vh);
	(void) i_mdi_devinfo_remove(vdip, cdip, flags);
	MDI_VHCI_CLIENT_LOCK(vh);

	return (rv);
}
1488
1489/*
1490 * i_mdi_client_find():
1491 * 		Find the client structure corresponding to a given guid
1492 *		Caller should hold the vhci client lock.
1493 */
1494static mdi_client_t *
1495i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1496{
1497	int			index;
1498	struct client_hash	*head;
1499	mdi_client_t		*ct;
1500
1501	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1502
1503	index = i_mdi_get_hash_key(guid);
1504	head = &vh->vh_client_table[index];
1505
1506	ct = head->ct_hash_head;
1507	while (ct != NULL) {
1508		if (strcmp(ct->ct_guid, guid) == 0 &&
1509		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1510			break;
1511		}
1512		ct = ct->ct_hnext;
1513	}
1514	return (ct);
1515}
1516
1517/*
1518 * i_mdi_client_update_state():
1519 *		Compute and update client device state
1520 * Notes:
1521 *		A client device can be in any of three possible states:
1522 *
1523 *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1524 *		one online/standby paths. Can tolerate failures.
1525 *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1526 *		no alternate paths available as standby. A failure on the online
1527 *		would result in loss of access to device data.
1528 *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1529 *		no paths available to access the device.
1530 */
1531static void
1532i_mdi_client_update_state(mdi_client_t *ct)
1533{
1534	int state;
1535
1536	ASSERT(MDI_CLIENT_LOCKED(ct));
1537	state = i_mdi_client_compute_state(ct, NULL);
1538	MDI_CLIENT_SET_STATE(ct, state);
1539}
1540
1541/*
1542 * i_mdi_client_compute_state():
1543 *		Compute client device state
1544 *
1545 *		mdi_phci_t *	Pointer to pHCI structure which should
1546 *				while computing the new value.  Used by
1547 *				i_mdi_phci_offline() to find the new
1548 *				client state after DR of a pHCI.
1549 */
1550static int
1551i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1552{
1553	int		state;
1554	int		online_count = 0;
1555	int		standby_count = 0;
1556	mdi_pathinfo_t	*pip, *next;
1557
1558	ASSERT(MDI_CLIENT_LOCKED(ct));
1559	pip = ct->ct_path_head;
1560	while (pip != NULL) {
1561		MDI_PI_LOCK(pip);
1562		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1563		if (MDI_PI(pip)->pi_phci == ph) {
1564			MDI_PI_UNLOCK(pip);
1565			pip = next;
1566			continue;
1567		}
1568
1569		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1570				== MDI_PATHINFO_STATE_ONLINE)
1571			online_count++;
1572		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1573				== MDI_PATHINFO_STATE_STANDBY)
1574			standby_count++;
1575		MDI_PI_UNLOCK(pip);
1576		pip = next;
1577	}
1578
1579	if (online_count == 0) {
1580		if (standby_count == 0) {
1581			state = MDI_CLIENT_STATE_FAILED;
1582			MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
1583			    "client state failed: ct = %p", (void *)ct));
1584		} else if (standby_count == 1) {
1585			state = MDI_CLIENT_STATE_DEGRADED;
1586		} else {
1587			state = MDI_CLIENT_STATE_OPTIMAL;
1588		}
1589	} else if (online_count == 1) {
1590		if (standby_count == 0) {
1591			state = MDI_CLIENT_STATE_DEGRADED;
1592		} else {
1593			state = MDI_CLIENT_STATE_OPTIMAL;
1594		}
1595	} else {
1596		state = MDI_CLIENT_STATE_OPTIMAL;
1597	}
1598	return (state);
1599}
1600
/*
 * i_mdi_client2devinfo():
 *		Utility function returning the dev_info node recorded in the
 *		client structure (ct->ct_dip).  ct is dereferenced
 *		unconditionally, so it must not be NULL; no lock is taken
 *		here.
 */
dev_info_t *
i_mdi_client2devinfo(mdi_client_t *ct)
{
	return (ct->ct_dip);
}
1610
1611/*
1612 * mdi_client_path2_devinfo():
1613 * 		Given the parent devinfo and child devfs pathname, search for
1614 *		a valid devfs node handle.
1615 */
1616dev_info_t *
1617mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1618{
1619	dev_info_t 	*cdip = NULL;
1620	dev_info_t 	*ndip = NULL;
1621	char		*temp_pathname;
1622	int		circular;
1623
1624	/*
1625	 * Allocate temp buffer
1626	 */
1627	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1628
1629	/*
1630	 * Lock parent against changes
1631	 */
1632	ndi_devi_enter(vdip, &circular);
1633	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1634	while ((cdip = ndip) != NULL) {
1635		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1636
1637		*temp_pathname = '\0';
1638		(void) ddi_pathname(cdip, temp_pathname);
1639		if (strcmp(temp_pathname, pathname) == 0) {
1640			break;
1641		}
1642	}
1643	/*
1644	 * Release devinfo lock
1645	 */
1646	ndi_devi_exit(vdip, circular);
1647
1648	/*
1649	 * Free the temp buffer
1650	 */
1651	kmem_free(temp_pathname, MAXPATHLEN);
1652	return (cdip);
1653}
1654
1655/*
1656 * mdi_client_get_path_count():
1657 * 		Utility function to get number of path information nodes
1658 *		associated with a given client device.
1659 */
1660int
1661mdi_client_get_path_count(dev_info_t *cdip)
1662{
1663	mdi_client_t	*ct;
1664	int		count = 0;
1665
1666	ct = i_devi_get_client(cdip);
1667	if (ct != NULL) {
1668		count = ct->ct_path_count;
1669	}
1670	return (count);
1671}
1672
1673
/*
 * i_mdi_get_hash_key():
 *		Hash a NUL-terminated string: the character values are summed
 *		into a 32-bit accumulator and the sum is reduced modulo
 *		(CLIENT_HASH_TABLE_SIZE - 1).
 */
static int
i_mdi_get_hash_key(char *str)
{
	uint32_t	sum = 0;
	char		*cp = str;

	while (*cp != '\0') {
		sum += (uint32_t)*cp;
		cp++;
	}
	return (sum % (CLIENT_HASH_TABLE_SIZE - 1));
}
1691
1692/*
1693 * mdi_get_lb_policy():
1694 * 		Get current load balancing policy for a given client device
1695 */
1696client_lb_t
1697mdi_get_lb_policy(dev_info_t *cdip)
1698{
1699	client_lb_t	lb = LOAD_BALANCE_NONE;
1700	mdi_client_t	*ct;
1701
1702	ct = i_devi_get_client(cdip);
1703	if (ct != NULL) {
1704		lb = ct->ct_lb;
1705	}
1706	return (lb);
1707}
1708
1709/*
1710 * mdi_set_lb_region_size():
1711 * 		Set current region size for the load-balance
1712 */
1713int
1714mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1715{
1716	mdi_client_t	*ct;
1717	int		rv = MDI_FAILURE;
1718
1719	ct = i_devi_get_client(cdip);
1720	if (ct != NULL && ct->ct_lb_args != NULL) {
1721		ct->ct_lb_args->region_size = region_size;
1722		rv = MDI_SUCCESS;
1723	}
1724	return (rv);
1725}
1726
1727/*
1728 * mdi_Set_lb_policy():
1729 * 		Set current load balancing policy for a given client device
1730 */
1731int
1732mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1733{
1734	mdi_client_t	*ct;
1735	int		rv = MDI_FAILURE;
1736
1737	ct = i_devi_get_client(cdip);
1738	if (ct != NULL) {
1739		ct->ct_lb = lb;
1740		rv = MDI_SUCCESS;
1741	}
1742	return (rv);
1743}
1744
/*
 * mdi_failover_cb():
 *		void-returning wrapper around i_mdi_failover(), used as the
 *		callback handed to taskq_dispatch() by mdi_failover().  The
 *		failover status returned by i_mdi_failover() is discarded here.
 */
static void
mdi_failover_cb(void *arg)
{
	(void)i_mdi_failover(arg);
}
1750
/*
 * mdi_failover():
 *		failover function called by the vHCI drivers to initiate
 *		a failover operation.  This is typically due to non-availability
 *		of online paths to route I/O requests.  Failover can be
 *		triggered through user application also.
 *
 *		The vHCI driver calls mdi_failover() to initiate a failover
 *		operation. mdi_failover() calls back into the vHCI driver's
 *		vo_failover() entry point to perform the actual failover
 *		operation.  The reason for requiring the vHCI driver to
 *		initiate failover by calling mdi_failover(), instead of directly
 *		executing vo_failover() itself, is to ensure that the mdi
 *		framework can keep track of the client state properly.
 *		Additionally, mdi_failover() provides as a convenience the
 *		option of performing the failover operation synchronously or
 *		asynchronously
 *
 *		Upon successful completion of the failover operation, the
 *		paths that were previously ONLINE will be in the STANDBY state,
 *		and the newly activated paths will be in the ONLINE state.
 *
 *		The flags modifier determines whether the activation is done
 *		synchronously: MDI_FAILOVER_SYNC.  The code compares flags
 *		for equality with MDI_FAILOVER_ASYNC; every other value takes
 *		the synchronous path.
 * Arguments:
 *		vdip	vHCI dip (not referenced by this routine)
 *		cdip	client device to fail over
 *		flags	MDI_FAILOVER_ASYNC or MDI_FAILOVER_SYNC
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE	- cdip is not a client device, or the client is
 *				  failed, detached or not attached
 *		MDI_BUSY	- a path is being freed, a failover is already
 *				  in progress, or (asynchronous request only)
 *				  the client is unstable
 *		MDI_ACCEPT	- asynchronous request was handed to mdi_taskq
 *		In synchronous mode the value returned by i_mdi_failover() is
 *		passed back to the caller.
 */
/*ARGSUSED*/
int
mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
{
	int			rv;
	mdi_client_t		*ct;

	ct = i_devi_get_client(cdip);
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/* cdip is not a valid client device. Nothing more to do. */
		return (MDI_FAILURE);
	}

	MDI_CLIENT_LOCK(ct);

	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
		/* A path to the client is being freed */
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_BUSY);
	}


	if (MDI_CLIENT_IS_FAILED(ct)) {
		/*
		 * Client is in failed state. Nothing more to do.
		 */
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_FAILURE);
	}

	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
		/*
		 * Failover is already in progress; return BUSY
		 */
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_BUSY);
	}
	/*
	 * Make sure that mdi_pathinfo node state changes are processed.
	 * We do not allow failovers to progress while client path state
	 * changes are in progress.  An asynchronous request is refused with
	 * MDI_BUSY; a synchronous one waits on ct_unstable_cv (which releases
	 * ct_mutex while waiting) until ct_unstable clears.
	 */
	if (ct->ct_unstable) {
		if (flags == MDI_FAILOVER_ASYNC) {
			MDI_CLIENT_UNLOCK(ct);
			return (MDI_BUSY);
		} else {
			while (ct->ct_unstable)
				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
		}
	}

	/*
	 * Client device is in stable state. Before proceeding, perform sanity
	 * checks again.
	 */
	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
	    (!i_ddi_devi_attached(cdip))) {
		/*
		 * Client is detached or failed, or its devinfo node is not
		 * attached. Nothing more to do.
		 */
		MDI_CLIENT_UNLOCK(ct);
		return (MDI_FAILURE);
	}

	/*
	 * Set the client state as failover in progress.  The flags are
	 * recorded in the client for i_mdi_failover() to pass on to the
	 * vHCI driver.
	 */
	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
	ct->ct_failover_flags = flags;
	MDI_CLIENT_UNLOCK(ct);

	if (flags == MDI_FAILOVER_ASYNC) {
		/*
		 * Submit the initiate failover request via CPR safe
		 * taskq threads.  The dispatch result is not checked.
		 */
		(void) taskq_dispatch(mdi_taskq, mdi_failover_cb, ct, KM_SLEEP);
		return (MDI_ACCEPT);
	} else {
		/*
		 * Synchronous failover mode.  Typically invoked from the user
		 * land.
		 */
		rv = i_mdi_failover(ct);
	}
	return (rv);
}
1869
/*
 * i_mdi_failover():
 *		internal failover function. Invokes vHCI drivers failover
 *		callback function and process the failover status
 * Arguments:
 *		arg	the mdi_client_t to fail over, passed as void * so the
 *			routine can also be reached through mdi_failover_cb()
 * Return Values:
 *		The value returned by the vHCI driver's vo_failover() entry
 *		point, or MDI_SUCCESS when the driver does not provide one.
 *		The same value is saved in ct_failover_status.
 *
 * Note: A client device in failover state can not be detached or freed.
 *	 The client lock must not be held on entry (asserted).
 */
static int
i_mdi_failover(void *arg)
{
	int		rv = MDI_SUCCESS;
	mdi_client_t	*ct = (mdi_client_t *)arg;
	mdi_vhci_t	*vh = ct->ct_vhci;

	ASSERT(!MDI_CLIENT_LOCKED(ct));

	if (vh->vh_ops->vo_failover != NULL) {
		/*
		 * Call vHCI drivers callback routine, without the client
		 * lock held.
		 */
		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
		    ct->ct_failover_flags);
	}

	MDI_CLIENT_LOCK(ct);
	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);

	/*
	 * Save the failover return status
	 */
	ct->ct_failover_status = rv;

	/*
	 * As a result of failover, client status would have been changed.
	 * Update the client state and wake up anyone waiting on this client
	 * device.
	 */
	i_mdi_client_update_state(ct);

	cv_broadcast(&ct->ct_failover_cv);
	MDI_CLIENT_UNLOCK(ct);
	return (rv);
}
1915
/*
 * Load balancing is logical block.
 * IOs within the range described by region_size
 * would go on the same path. This would improve the
 * performance by cache-hit on some of the RAID devices.
 * Search only for online paths(At some point we
 * may want to balance across target ports).
 * If no paths are found then default to round-robin.
 *
 * Arguments:
 *	ct	client whose path list is searched
 *		(NOTE(review): no lock is taken on ct here; presumably the
 *		caller holds the client lock -- confirm)
 *	ret_pip	on MDI_SUCCESS, receives the selected path, on which
 *		MDI_PI_HOLD() has been applied
 *	bp	buf whose b_blkno selects the path
 * Return Values:
 *	MDI_SUCCESS	a path was selected and stored in *ret_pip
 *	MDI_FAILURE	no path was selected; *ret_pip is not written
 */
static int
i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
{
	int		path_index = -1;
	int		online_path_count = 0;
	int		online_nonpref_path_count = 0;
	int 		region_size = ct->ct_lb_args->region_size;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	int		preferred, path_cnt;

	/*
	 * First pass: count the ONLINE paths, separately for preferred and
	 * non-preferred ones.
	 */
	pip = ct->ct_path_head;
	while (pip) {
		MDI_PI_LOCK(pip);
		if (MDI_PI(pip)->pi_state ==
		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
			online_path_count++;
		} else if (MDI_PI(pip)->pi_state ==
		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
			online_nonpref_path_count++;
		}
		next = (mdi_pathinfo_t *)
		    MDI_PI(pip)->pi_client_link;
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	/*
	 * if found any online/preferred then use this type.  'preferred' is
	 * only assigned when path_cnt ends up non-zero, and is only read
	 * inside the "if (path_cnt)" block below.
	 */
	if (online_path_count > 0) {
		path_cnt = online_path_count;
		preferred = 1;
	} else if (online_nonpref_path_count > 0) {
		path_cnt = online_nonpref_path_count;
		preferred = 0;
	} else {
		path_cnt = 0;
	}
	if (path_cnt) {
		/* The region number of the block picks an index in the set. */
		path_index = (bp->b_blkno >> region_size) % path_cnt;
		pip = ct->ct_path_head;
		/*
		 * Second pass.  path_index is decremented for every node
		 * visited, whatever its state, so only the node at list
		 * position path_index is ever tested: if that node is not
		 * ONLINE with the wanted pi_preferred value, path_index
		 * drops to -1, the loop ends and MDI_FAILURE is returned.
		 */
		while (pip && path_index != -1) {
			MDI_PI_LOCK(pip);
			if (path_index == 0 &&
			    (MDI_PI(pip)->pi_state ==
			    MDI_PATHINFO_STATE_ONLINE) &&
				MDI_PI(pip)->pi_preferred == preferred) {
				/* Hand the path back with a hold applied. */
				MDI_PI_HOLD(pip);
				MDI_PI_UNLOCK(pip);
				*ret_pip = pip;
				return (MDI_SUCCESS);
			}
			path_index --;
			next = (mdi_pathinfo_t *)
			    MDI_PI(pip)->pi_client_link;
			MDI_PI_UNLOCK(pip);
			pip = next;
		}
		/*
		 * Reached only when no path was selected; pip is wherever
		 * the walk stopped and may be NULL.
		 */
		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
		    "lba %llx: path %s %p",
		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
	}
	return (MDI_FAILURE);
}
1987
1988/*
1989 * mdi_select_path():
1990 *		select a path to access a client device.
1991 *
1992 *		mdi_select_path() function is called by the vHCI drivers to
1993 *		select a path to route the I/O request to.  The caller passes
1994 *		the block I/O data transfer structure ("buf") as one of the
1995 *		parameters.  The mpxio framework uses the buf structure
1996 *		contents to maintain per path statistics (total I/O size /
1997 *		count pending).  If more than one online paths are available to
1998 *		select, the framework automatically selects a suitable path
1999 *		for routing I/O request. If a failover operation is active for
2000 *		this client device the call shall be failed with MDI_BUSY error
2001 *		code.
2002 *
2003 *		By default this function returns a suitable path in online
2004 *		state based on the current load balancing policy.  Currently
2005 *		we support LOAD_BALANCE_NONE (Previously selected online path
2006 *		will continue to be used till the path is usable) and
2007 *		LOAD_BALANCE_RR (Online paths will be selected in a round
2008 *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
 *		based on the logical block).  The load balancing policy is
 *		configured through the vHCI driver's configuration file
 *		(driver.conf).
2011 *
2012 *		vHCI drivers may override this default behavior by specifying
 *		appropriate flags.  The meaning of the "arg" argument depends
2014 *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
2015 *		then the argument is the "path instance" of the path to select.
2016 *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
2017 *		"start_pip". A non NULL "start_pip" is the starting point to
2018 *		walk and find the next appropriate path.  The following values
2019 *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
2020 *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
2021 *		STANDBY path).
2022 *
2023 *		The non-standard behavior is used by the scsi_vhci driver,
2024 *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
2025 *		attach of client devices (to avoid an unnecessary failover
2026 *		when the STANDBY path comes up first), during failover
2027 *		(to activate a STANDBY path as ONLINE).
2028 *
 *		The selected path is returned in a mdi_hold_path() state
2030 *		(pi_ref_cnt). Caller should release the hold by calling
2031 *		mdi_rele_path().
2032 *
2033 * Return Values:
2034 *		MDI_SUCCESS	- Completed successfully
2035 *		MDI_BUSY 	- Client device is busy failing over
2036 *		MDI_NOPATH	- Client device is online, but no valid path are
2037 *				  available to access this client device
2038 *		MDI_FAILURE	- Invalid client device or state
2039 *		MDI_DEVI_ONLINING
2040 *				- Client device (struct dev_info state) is in
2041 *				  onlining state.
2042 */
2043
2044/*ARGSUSED*/
2045int
2046mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
2047    void *arg, mdi_pathinfo_t **ret_pip)
2048{
2049	mdi_client_t	*ct;
2050	mdi_pathinfo_t	*pip;
2051	mdi_pathinfo_t	*next;
2052	mdi_pathinfo_t	*head;
2053	mdi_pathinfo_t	*start;
2054	client_lb_t	lbp;	/* load balancing policy */
2055	int		sb = 1;	/* standard behavior */
2056	int		preferred = 1;	/* preferred path */
2057	int		cond, cont = 1;
2058	int		retry = 0;
2059	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
2060	int		path_instance;	/* request specific path instance */
2061
2062	/* determine type of arg based on flags */
2063	if (flags & MDI_SELECT_PATH_INSTANCE) {
2064		path_instance = (int)(intptr_t)arg;
2065		start_pip = NULL;
2066	} else {
2067		path_instance = 0;
2068		start_pip = (mdi_pathinfo_t *)arg;
2069	}
2070
2071	if (flags != 0) {
2072		/*
2073		 * disable default behavior
2074		 */
2075		sb = 0;
2076	}
2077
2078	*ret_pip = NULL;
2079	ct = i_devi_get_client(cdip);
2080	if (ct == NULL) {
2081		/* mdi extensions are NULL, Nothing more to do */
2082		return (MDI_FAILURE);
2083	}
2084
2085	MDI_CLIENT_LOCK(ct);
2086
2087	if (sb) {
2088		if (MDI_CLIENT_IS_FAILED(ct)) {
2089			/*
2090			 * Client is not ready to accept any I/O requests.
2091			 * Fail this request.
2092			 */
2093			MDI_DEBUG(2, (MDI_NOTE, cdip,
2094			    "client state offline ct = %p", (void *)ct));
2095			MDI_CLIENT_UNLOCK(ct);
2096			return (MDI_FAILURE);
2097		}
2098
2099		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
2100			/*
2101			 * Check for Failover is in progress. If so tell the
2102			 * caller that this device is busy.
2103			 */
2104			MDI_DEBUG(2, (MDI_NOTE, cdip,
2105			    "client failover in progress ct = %p",
2106			    (void *)ct));
2107			MDI_CLIENT_UNLOCK(ct);
2108			return (MDI_BUSY);
2109		}
2110
2111		/*
2112		 * Check to see whether the client device is attached.
2113		 * If not so, let the vHCI driver manually select a path
2114		 * (standby) and let the probe/attach process to continue.
2115		 */
2116		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2117			MDI_DEBUG(4, (MDI_NOTE, cdip,
2118			    "devi is onlining ct = %p", (void *)ct));
2119			MDI_CLIENT_UNLOCK(ct);
2120			return (MDI_DEVI_ONLINING);
2121		}
2122	}
2123
2124	/*
2125	 * Cache in the client list head.  If head of the list is NULL
2126	 * return MDI_NOPATH
2127	 */
2128	head = ct->ct_path_head;
2129	if (head == NULL) {
2130		MDI_CLIENT_UNLOCK(ct);
2131		return (MDI_NOPATH);
2132	}
2133
2134	/* Caller is specifying a specific pathinfo path by path_instance */
2135	if (path_instance) {
2136		/* search for pathinfo with correct path_instance */
2137		for (pip = head;
2138		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
2139		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
2140			;
2141
2142		/* If path can't be selected then MDI_NOPATH is returned. */
2143		if (pip == NULL) {
2144			MDI_CLIENT_UNLOCK(ct);
2145			return (MDI_NOPATH);
2146		}
2147
2148		/*
2149		 * Verify state of path. When asked to select a specific
2150		 * path_instance, we select the requested path in any
2151		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
2152		 * We don't however select paths where the pHCI has detached.
2153		 * NOTE: last pathinfo node of an opened client device may
2154		 * exist in an OFFLINE state after the pHCI associated with
2155		 * that path has detached (but pi_phci will be NULL if that
2156		 * has occurred).
2157		 */
2158		MDI_PI_LOCK(pip);
2159		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
2160		    (MDI_PI(pip)->pi_phci == NULL)) {
2161			MDI_PI_UNLOCK(pip);
2162			MDI_CLIENT_UNLOCK(ct);
2163			return (MDI_FAILURE);
2164		}
2165
2166		/* Return MDI_BUSY if we have a transient condition */
2167		if (MDI_PI_IS_TRANSIENT(pip)) {
2168			MDI_PI_UNLOCK(pip);
2169			MDI_CLIENT_UNLOCK(ct);
2170			return (MDI_BUSY);
2171		}
2172
2173		/*
2174		 * Return the path in hold state. Caller should release the
2175		 * lock by calling mdi_rele_path()
2176		 */
2177		MDI_PI_HOLD(pip);
2178		MDI_PI_UNLOCK(pip);
2179		*ret_pip = pip;
2180		MDI_CLIENT_UNLOCK(ct);
2181		return (MDI_SUCCESS);
2182	}
2183
2184	/*
2185	 * for non default behavior, bypass current
2186	 * load balancing policy and always use LOAD_BALANCE_RR
2187	 * except that the start point will be adjusted based
2188	 * on the provided start_pip
2189	 */
2190	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2191
2192	switch (lbp) {
2193	case LOAD_BALANCE_NONE:
2194		/*
2195		 * Load balancing is None  or Alternate path mode
2196		 * Start looking for a online mdi_pathinfo node starting from
2197		 * last known selected path
2198		 */
2199		preferred = 1;
2200		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2201		if (pip == NULL) {
2202			pip = head;
2203		}
2204		start = pip;
2205		do {
2206			MDI_PI_LOCK(pip);
2207			/*
2208			 * No need to explicitly check if the path is disabled.
2209			 * Since we are checking for state == ONLINE and the
2210			 * same variable is used for DISABLE/ENABLE information.
2211			 */
2212			if ((MDI_PI(pip)->pi_state  ==
2213				MDI_PATHINFO_STATE_ONLINE) &&
2214				preferred == MDI_PI(pip)->pi_preferred) {
2215				/*
2216				 * Return the path in hold state. Caller should
2217				 * release the lock by calling mdi_rele_path()
2218				 */
2219				MDI_PI_HOLD(pip);
2220				MDI_PI_UNLOCK(pip);
2221				ct->ct_path_last = pip;
2222				*ret_pip = pip;
2223				MDI_CLIENT_UNLOCK(ct);
2224				return (MDI_SUCCESS);
2225			}
2226
2227			/*
2228			 * Path is busy.
2229			 */
2230			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2231			    MDI_PI_IS_TRANSIENT(pip))
2232				retry = 1;
2233			/*
2234			 * Keep looking for a next available online path
2235			 */
2236			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2237			if (next == NULL) {
2238				next = head;
2239			}
2240			MDI_PI_UNLOCK(pip);
2241			pip = next;
2242			if (start == pip && preferred) {
2243				preferred = 0;
2244			} else if (start == pip && !preferred) {
2245				cont = 0;
2246			}
2247		} while (cont);
2248		break;
2249
2250	case LOAD_BALANCE_LBA:
2251		/*
2252		 * Make sure we are looking
2253		 * for an online path. Otherwise, if it is for a STANDBY
2254		 * path request, it will go through and fetch an ONLINE
2255		 * path which is not desirable.
2256		 */
2257		if ((ct->ct_lb_args != NULL) &&
2258			    (ct->ct_lb_args->region_size) && bp &&
2259				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2260			if (i_mdi_lba_lb(ct, ret_pip, bp)
2261				    == MDI_SUCCESS) {
2262				MDI_CLIENT_UNLOCK(ct);
2263				return (MDI_SUCCESS);
2264			}
2265		}
2266		/* FALLTHROUGH */
2267	case LOAD_BALANCE_RR:
2268		/*
2269		 * Load balancing is Round Robin. Start looking for a online
2270		 * mdi_pathinfo node starting from last known selected path
2271		 * as the start point.  If override flags are specified,
2272		 * process accordingly.
2273		 * If the search is already in effect(start_pip not null),
2274		 * then lets just use the same path preference to continue the
2275		 * traversal.
2276		 */
2277
2278		if (start_pip != NULL) {
2279			preferred = MDI_PI(start_pip)->pi_preferred;
2280		} else {
2281			preferred = 1;
2282		}
2283
2284		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2285		if (start == NULL) {
2286			pip = head;
2287		} else {
2288			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2289			if (pip == NULL) {
2290				if ( flags & MDI_SELECT_NO_PREFERRED) {
2291					/*
2292					 * Return since we hit the end of list
2293					 */
2294					MDI_CLIENT_UNLOCK(ct);
2295					return (MDI_NOPATH);
2296				}
2297
2298				if (!sb) {
2299					if (preferred == 0) {
2300						/*
2301						 * Looks like we have completed
2302						 * the traversal as preferred
2303						 * value is 0. Time to bail out.
2304						 */
2305						*ret_pip = NULL;
2306						MDI_CLIENT_UNLOCK(ct);
2307						return (MDI_NOPATH);
2308					} else {
2309						/*
2310						 * Looks like we reached the
2311						 * end of the list. Lets enable
2312						 * traversal of non preferred
2313						 * paths.
2314						 */
2315						preferred = 0;
2316					}
2317				}
2318				pip = head;
2319			}
2320		}
2321		start = pip;
2322		do {
2323			MDI_PI_LOCK(pip);
2324			if (sb) {
2325				cond = ((MDI_PI(pip)->pi_state ==
2326				    MDI_PATHINFO_STATE_ONLINE &&
2327					MDI_PI(pip)->pi_preferred ==
2328						preferred) ? 1 : 0);
2329			} else {
2330				if (flags == MDI_SELECT_ONLINE_PATH) {
2331					cond = ((MDI_PI(pip)->pi_state ==
2332					    MDI_PATHINFO_STATE_ONLINE &&
2333						MDI_PI(pip)->pi_preferred ==
2334						preferred) ? 1 : 0);
2335				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2336					cond = ((MDI_PI(pip)->pi_state ==
2337					    MDI_PATHINFO_STATE_STANDBY &&
2338						MDI_PI(pip)->pi_preferred ==
2339						preferred) ? 1 : 0);
2340				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2341				    MDI_SELECT_STANDBY_PATH)) {
2342					cond = (((MDI_PI(pip)->pi_state ==
2343					    MDI_PATHINFO_STATE_ONLINE ||
2344					    (MDI_PI(pip)->pi_state ==
2345					    MDI_PATHINFO_STATE_STANDBY)) &&
2346						MDI_PI(pip)->pi_preferred ==
2347						preferred) ? 1 : 0);
2348				} else if (flags ==
2349					(MDI_SELECT_STANDBY_PATH |
2350					MDI_SELECT_ONLINE_PATH |
2351					MDI_SELECT_USER_DISABLE_PATH)) {
2352					cond = (((MDI_PI(pip)->pi_state ==
2353					    MDI_PATHINFO_STATE_ONLINE ||
2354					    (MDI_PI(pip)->pi_state ==
2355					    MDI_PATHINFO_STATE_STANDBY) ||
2356						(MDI_PI(pip)->pi_state ==
2357					    (MDI_PATHINFO_STATE_ONLINE|
2358					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2359						(MDI_PI(pip)->pi_state ==
2360					    (MDI_PATHINFO_STATE_STANDBY |
2361					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2362						MDI_PI(pip)->pi_preferred ==
2363						preferred) ? 1 : 0);
2364				} else if (flags ==
2365				    (MDI_SELECT_STANDBY_PATH |
2366				    MDI_SELECT_ONLINE_PATH |
2367				    MDI_SELECT_NO_PREFERRED)) {
2368					cond = (((MDI_PI(pip)->pi_state ==
2369					    MDI_PATHINFO_STATE_ONLINE) ||
2370					    (MDI_PI(pip)->pi_state ==
2371					    MDI_PATHINFO_STATE_STANDBY))
2372					    ? 1 : 0);
2373				} else {
2374					cond = 0;
2375				}
2376			}
2377			/*
2378			 * No need to explicitly check if the path is disabled.
2379			 * Since we are checking for state == ONLINE and the
2380			 * same variable is used for DISABLE/ENABLE information.
2381			 */
2382			if (cond) {
2383				/*
2384				 * Return the path in hold state. Caller should
2385				 * release the lock by calling mdi_rele_path()
2386				 */
2387				MDI_PI_HOLD(pip);
2388				MDI_PI_UNLOCK(pip);
2389				if (sb)
2390					ct->ct_path_last = pip;
2391				*ret_pip = pip;
2392				MDI_CLIENT_UNLOCK(ct);
2393				return (MDI_SUCCESS);
2394			}
2395			/*
2396			 * Path is busy.
2397			 */
2398			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2399			    MDI_PI_IS_TRANSIENT(pip))
2400				retry = 1;
2401
2402			/*
2403			 * Keep looking for a next available online path
2404			 */
2405do_again:
2406			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2407			if (next == NULL) {
2408				if ( flags & MDI_SELECT_NO_PREFERRED) {
2409					/*
2410					 * Bail out since we hit the end of list
2411					 */
2412					MDI_PI_UNLOCK(pip);
2413					break;
2414				}
2415
2416				if (!sb) {
2417					if (preferred == 1) {
2418						/*
2419						 * Looks like we reached the
2420						 * end of the list. Lets enable
2421						 * traversal of non preferred
2422						 * paths.
2423						 */
2424						preferred = 0;
2425						next = head;
2426					} else {
2427						/*
2428						 * We have done both the passes
2429						 * Preferred as well as for
2430						 * Non-preferred. Bail out now.
2431						 */
2432						cont = 0;
2433					}
2434				} else {
2435					/*
2436					 * Standard behavior case.
2437					 */
2438					next = head;
2439				}
2440			}
2441			MDI_PI_UNLOCK(pip);
2442			if (cont == 0) {
2443				break;
2444			}
2445			pip = next;
2446
2447			if (!sb) {
2448				/*
2449				 * We need to handle the selection of
2450				 * non-preferred path in the following
2451				 * case:
2452				 *
2453				 * +------+   +------+   +------+   +-----+
2454				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2455				 * +------+   +------+   +------+   +-----+
2456				 *
2457				 * If we start the search with B, we need to
2458				 * skip beyond B to pick C which is non -
2459				 * preferred in the second pass. The following
2460				 * test, if true, will allow us to skip over
2461				 * the 'start'(B in the example) to select
2462				 * other non preferred elements.
2463				 */
2464				if ((start_pip != NULL) && (start_pip == pip) &&
2465				    (MDI_PI(start_pip)->pi_preferred
2466				    != preferred)) {
2467					/*
2468					 * try again after going past the start
2469					 * pip
2470					 */
2471					MDI_PI_LOCK(pip);
2472					goto do_again;
2473				}
2474			} else {
2475				/*
2476				 * Standard behavior case
2477				 */
2478				if (start == pip && preferred) {
2479					/* look for nonpreferred paths */
2480					preferred = 0;
2481				} else if (start == pip && !preferred) {
2482					/*
2483					 * Exit condition
2484					 */
2485					cont = 0;
2486				}
2487			}
2488		} while (cont);
2489		break;
2490	}
2491
2492	MDI_CLIENT_UNLOCK(ct);
2493	if (retry == 1) {
2494		return (MDI_BUSY);
2495	} else {
2496		return (MDI_NOPATH);
2497	}
2498}
2499
2500/*
2501 * For a client, return the next available path to any phci
2502 *
2503 * Note:
2504 *		Caller should hold the branch's devinfo node to get a consistent
2505 *		snap shot of the mdi_pathinfo nodes.
2506 *
2507 *		Please note that even the list is stable the mdi_pathinfo
2508 *		node state and properties are volatile.  The caller should lock
2509 *		and unlock the nodes by calling mdi_pi_lock() and
2510 *		mdi_pi_unlock() functions to get a stable properties.
2511 *
2512 *		If there is a need to use the nodes beyond the hold of the
2513 *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2514 *		need to be held against unexpected removal by calling
2515 *		mdi_hold_path() and should be released by calling
2516 *		mdi_rele_path() on completion.
2517 */
2518mdi_pathinfo_t *
2519mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2520{
2521	mdi_client_t *ct;
2522
2523	if (!MDI_CLIENT(ct_dip))
2524		return (NULL);
2525
2526	/*
2527	 * Walk through client link
2528	 */
2529	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2530	ASSERT(ct != NULL);
2531
2532	if (pip == NULL)
2533		return ((mdi_pathinfo_t *)ct->ct_path_head);
2534
2535	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2536}
2537
2538/*
2539 * For a phci, return the next available path to any client
2540 * Note: ditto mdi_get_next_phci_path()
2541 */
2542mdi_pathinfo_t *
2543mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2544{
2545	mdi_phci_t *ph;
2546
2547	if (!MDI_PHCI(ph_dip))
2548		return (NULL);
2549
2550	/*
2551	 * Walk through pHCI link
2552	 */
2553	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2554	ASSERT(ph != NULL);
2555
2556	if (pip == NULL)
2557		return ((mdi_pathinfo_t *)ph->ph_path_head);
2558
2559	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2560}
2561
2562/*
2563 * mdi_hold_path():
2564 *		Hold the mdi_pathinfo node against unwanted unexpected free.
2565 * Return Values:
2566 *		None
2567 */
2568void
2569mdi_hold_path(mdi_pathinfo_t *pip)
2570{
2571	if (pip) {
2572		MDI_PI_LOCK(pip);
2573		MDI_PI_HOLD(pip);
2574		MDI_PI_UNLOCK(pip);
2575	}
2576}
2577
2578
2579/*
2580 * mdi_rele_path():
2581 *		Release the mdi_pathinfo node which was selected
2582 *		through mdi_select_path() mechanism or manually held by
2583 *		calling mdi_hold_path().
2584 * Return Values:
2585 *		None
2586 */
2587void
2588mdi_rele_path(mdi_pathinfo_t *pip)
2589{
2590	if (pip) {
2591		MDI_PI_LOCK(pip);
2592		MDI_PI_RELE(pip);
2593		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2594			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2595		}
2596		MDI_PI_UNLOCK(pip);
2597	}
2598}
2599
2600/*
2601 * mdi_pi_lock():
2602 * 		Lock the mdi_pathinfo node.
2603 * Note:
2604 *		The caller should release the lock by calling mdi_pi_unlock()
2605 */
2606void
2607mdi_pi_lock(mdi_pathinfo_t *pip)
2608{
2609	ASSERT(pip != NULL);
2610	if (pip) {
2611		MDI_PI_LOCK(pip);
2612	}
2613}
2614
2615
2616/*
2617 * mdi_pi_unlock():
2618 * 		Unlock the mdi_pathinfo node.
2619 * Note:
2620 *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2621 */
2622void
2623mdi_pi_unlock(mdi_pathinfo_t *pip)
2624{
2625	ASSERT(pip != NULL);
2626	if (pip) {
2627		MDI_PI_UNLOCK(pip);
2628	}
2629}
2630
2631/*
2632 * mdi_pi_find():
2633 *		Search the list of mdi_pathinfo nodes attached to the
2634 *		pHCI/Client device node whose path address matches "paddr".
2635 *		Returns a pointer to the mdi_pathinfo node if a matching node is
2636 *		found.
2637 * Return Values:
2638 *		mdi_pathinfo node handle
2639 *		NULL
2640 * Notes:
2641 *		Caller need not hold any locks to call this function.
2642 */
2643mdi_pathinfo_t *
2644mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2645{
2646	mdi_phci_t		*ph;
2647	mdi_vhci_t		*vh;
2648	mdi_client_t		*ct;
2649	mdi_pathinfo_t		*pip = NULL;
2650
2651	MDI_DEBUG(2, (MDI_NOTE, pdip,
2652	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
2653	if ((pdip == NULL) || (paddr == NULL)) {
2654		return (NULL);
2655	}
2656	ph = i_devi_get_phci(pdip);
2657	if (ph == NULL) {
2658		/*
2659		 * Invalid pHCI device, Nothing more to do.
2660		 */
2661		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
2662		return (NULL);
2663	}
2664
2665	vh = ph->ph_vhci;
2666	if (vh == NULL) {
2667		/*
2668		 * Invalid vHCI device, Nothing more to do.
2669		 */
2670		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
2671		return (NULL);
2672	}
2673
2674	/*
2675	 * Look for pathinfo node identified by paddr.
2676	 */
2677	if (caddr == NULL) {
2678		/*
2679		 * Find a mdi_pathinfo node under pHCI list for a matching
2680		 * unit address.
2681		 */
2682		MDI_PHCI_LOCK(ph);
2683		if (MDI_PHCI_IS_OFFLINE(ph)) {
2684			MDI_DEBUG(2, (MDI_WARN, pdip,
2685			    "offline phci %p", (void *)ph));
2686			MDI_PHCI_UNLOCK(ph);
2687			return (NULL);
2688		}
2689		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2690
2691		while (pip != NULL) {
2692			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2693				break;
2694			}
2695			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2696		}
2697		MDI_PHCI_UNLOCK(ph);
2698		MDI_DEBUG(2, (MDI_NOTE, pdip,
2699		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
2700		return (pip);
2701	}
2702
2703	/*
2704	 * XXX - Is the rest of the code in this function really necessary?
2705	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2706	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2707	 * whether the search is based on the pathinfo nodes attached to
2708	 * the pHCI or the client node, the result will be the same.
2709	 */
2710
2711	/*
2712	 * Find the client device corresponding to 'caddr'
2713	 */
2714	MDI_VHCI_CLIENT_LOCK(vh);
2715
2716	/*
2717	 * XXX - Passing NULL to the following function works as long as the
2718	 * the client addresses (caddr) are unique per vhci basis.
2719	 */
2720	ct = i_mdi_client_find(vh, NULL, caddr);
2721	if (ct == NULL) {
2722		/*
2723		 * Client not found, Obviously mdi_pathinfo node has not been
2724		 * created yet.
2725		 */
2726		MDI_VHCI_CLIENT_UNLOCK(vh);
2727		MDI_DEBUG(2, (MDI_NOTE, pdip,
2728		    "client not found for caddr @%s", caddr ? caddr : ""));
2729		return (NULL);
2730	}
2731
2732	/*
2733	 * Hold the client lock and look for a mdi_pathinfo node with matching
2734	 * pHCI and paddr
2735	 */
2736	MDI_CLIENT_LOCK(ct);
2737
2738	/*
2739	 * Release the global mutex as it is no more needed. Note: We always
2740	 * respect the locking order while acquiring.
2741	 */
2742	MDI_VHCI_CLIENT_UNLOCK(vh);
2743
2744	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2745	while (pip != NULL) {
2746		/*
2747		 * Compare the unit address
2748		 */
2749		if ((MDI_PI(pip)->pi_phci == ph) &&
2750		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2751			break;
2752		}
2753		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2754	}
2755	MDI_CLIENT_UNLOCK(ct);
2756	MDI_DEBUG(2, (MDI_NOTE, pdip,
2757	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
2758	return (pip);
2759}
2760
2761/*
2762 * mdi_pi_alloc():
2763 *		Allocate and initialize a new instance of a mdi_pathinfo node.
2764 *		The mdi_pathinfo node returned by this function identifies a
2765 *		unique device path is capable of having properties attached
2766 *		and passed to mdi_pi_online() to fully attach and online the
2767 *		path and client device node.
2768 *		The mdi_pathinfo node returned by this function must be
2769 *		destroyed using mdi_pi_free() if the path is no longer
2770 *		operational or if the caller fails to attach a client device
2771 *		node when calling mdi_pi_online(). The framework will not free
2772 *		the resources allocated.
2773 *		This function can be called from both interrupt and kernel
2774 *		contexts.  DDI_NOSLEEP flag should be used while calling
2775 *		from interrupt contexts.
2776 * Return Values:
2777 *		MDI_SUCCESS
2778 *		MDI_FAILURE
2779 *		MDI_NOMEM
2780 */
2781/*ARGSUSED*/
2782int
2783mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2784    char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2785{
2786	mdi_vhci_t	*vh;
2787	mdi_phci_t	*ph;
2788	mdi_client_t	*ct;
2789	mdi_pathinfo_t	*pip = NULL;
2790	dev_info_t	*cdip;
2791	int		rv = MDI_NOMEM;
2792	int		path_allocated = 0;
2793
2794	MDI_DEBUG(2, (MDI_NOTE, pdip,
2795	    "cname %s: caddr@%s paddr@%s",
2796	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));
2797
2798	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2799	    ret_pip == NULL) {
2800		/* Nothing more to do */
2801		return (MDI_FAILURE);
2802	}
2803
2804	*ret_pip = NULL;
2805
2806	/* No allocations on detaching pHCI */
2807	if (DEVI_IS_DETACHING(pdip)) {
2808		/* Invalid pHCI device, return failure */
2809		MDI_DEBUG(1, (MDI_WARN, pdip,
2810		    "!detaching pHCI=%p", (void *)pdip));
2811		return (MDI_FAILURE);
2812	}
2813
2814	ph = i_devi_get_phci(pdip);
2815	ASSERT(ph != NULL);
2816	if (ph == NULL) {
2817		/* Invalid pHCI device, return failure */
2818		MDI_DEBUG(1, (MDI_WARN, pdip,
2819		    "!invalid pHCI=%p", (void *)pdip));
2820		return (MDI_FAILURE);
2821	}
2822
2823	MDI_PHCI_LOCK(ph);
2824	vh = ph->ph_vhci;
2825	if (vh == NULL) {
2826		/* Invalid vHCI device, return failure */
2827		MDI_DEBUG(1, (MDI_WARN, pdip,
2828		    "!invalid vHCI=%p", (void *)pdip));
2829		MDI_PHCI_UNLOCK(ph);
2830		return (MDI_FAILURE);
2831	}
2832
2833	if (MDI_PHCI_IS_READY(ph) == 0) {
2834		/*
2835		 * Do not allow new node creation when pHCI is in
2836		 * offline/suspended states
2837		 */
2838		MDI_DEBUG(1, (MDI_WARN, pdip,
2839		    "pHCI=%p is not ready", (void *)ph));
2840		MDI_PHCI_UNLOCK(ph);
2841		return (MDI_BUSY);
2842	}
2843	MDI_PHCI_UNSTABLE(ph);
2844	MDI_PHCI_UNLOCK(ph);
2845
2846	/* look for a matching client, create one if not found */
2847	MDI_VHCI_CLIENT_LOCK(vh);
2848	ct = i_mdi_client_find(vh, cname, caddr);
2849	if (ct == NULL) {
2850		ct = i_mdi_client_alloc(vh, cname, caddr);
2851		ASSERT(ct != NULL);
2852	}
2853
2854	if (ct->ct_dip == NULL) {
2855		/*
2856		 * Allocate a devinfo node
2857		 */
2858		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2859		    compatible, ncompatible);
2860		if (ct->ct_dip == NULL) {
2861			(void) i_mdi_client_free(vh, ct);
2862			goto fail;
2863		}
2864	}
2865	cdip = ct->ct_dip;
2866
2867	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2868	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2869
2870	MDI_CLIENT_LOCK(ct);
2871	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2872	while (pip != NULL) {
2873		/*
2874		 * Compare the unit address
2875		 */
2876		if ((MDI_PI(pip)->pi_phci == ph) &&
2877		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2878			break;
2879		}
2880		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2881	}
2882	MDI_CLIENT_UNLOCK(ct);
2883
2884	if (pip == NULL) {
2885		/*
2886		 * This is a new path for this client device.  Allocate and
2887		 * initialize a new pathinfo node
2888		 */
2889		pip = i_mdi_pi_alloc(ph, paddr, ct);
2890		ASSERT(pip != NULL);
2891		path_allocated = 1;
2892	}
2893	rv = MDI_SUCCESS;
2894
2895fail:
2896	/*
2897	 * Release the global mutex.
2898	 */
2899	MDI_VHCI_CLIENT_UNLOCK(vh);
2900
2901	/*
2902	 * Mark the pHCI as stable
2903	 */
2904	MDI_PHCI_LOCK(ph);
2905	MDI_PHCI_STABLE(ph);
2906	MDI_PHCI_UNLOCK(ph);
2907	*ret_pip = pip;
2908
2909	MDI_DEBUG(2, (MDI_NOTE, pdip,
2910	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));
2911
2912	if (path_allocated)
2913		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2914
2915	return (rv);
2916}
2917
2918/*ARGSUSED*/
2919int
2920mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2921    int flags, mdi_pathinfo_t **ret_pip)
2922{
2923	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2924	    flags, ret_pip));
2925}
2926
2927/*
2928 * i_mdi_pi_alloc():
2929 *		Allocate a mdi_pathinfo node and add to the pHCI path list
2930 * Return Values:
2931 *		mdi_pathinfo
2932 */
2933/*ARGSUSED*/
2934static mdi_pathinfo_t *
2935i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2936{
2937	mdi_pathinfo_t	*pip;
2938	int		ct_circular;
2939	int		ph_circular;
2940	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
2941	char		*path_persistent;
2942	int		path_instance;
2943	mod_hash_val_t	hv;
2944
2945	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2946
2947	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2948	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2949	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2950	    MDI_PATHINFO_STATE_TRANSIENT;
2951
2952	if (MDI_PHCI_IS_USER_DISABLED(ph))
2953		MDI_PI_SET_USER_DISABLE(pip);
2954
2955	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2956		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2957
2958	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2959		MDI_PI_SET_DRV_DISABLE(pip);
2960
2961	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2962	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2963	MDI_PI(pip)->pi_client = ct;
2964	MDI_PI(pip)->pi_phci = ph;
2965	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2966	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2967
2968        /*
2969	 * We form the "path" to the pathinfo node, and see if we have
2970	 * already allocated a 'path_instance' for that "path".  If so,
2971	 * we use the already allocated 'path_instance'.  If not, we
2972	 * allocate a new 'path_instance' and associate it with a copy of
2973	 * the "path" string (which is never freed). The association
2974	 * between a 'path_instance' this "path" string persists until
2975	 * reboot.
2976	 */
2977        mutex_enter(&mdi_pathmap_mutex);
2978	(void) ddi_pathname(ph->ph_dip, path);
2979	(void) sprintf(path + strlen(path), "/%s@%s",
2980	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2981        if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2982                path_instance = (uint_t)(intptr_t)hv;
2983        } else {
2984		/* allocate a new 'path_instance' and persistent "path" */
2985		path_instance = mdi_pathmap_instance++;
2986		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2987                (void) mod_hash_insert(mdi_pathmap_bypath,
2988                    (mod_hash_key_t)path_persistent,
2989                    (mod_hash_val_t)(intptr_t)path_instance);
2990		(void) mod_hash_insert(mdi_pathmap_byinstance,
2991		    (mod_hash_key_t)(intptr_t)path_instance,
2992		    (mod_hash_val_t)path_persistent);
2993
2994		/* create shortpath name */
2995		(void) snprintf(path, sizeof(path), "%s%d/%s@%s",
2996		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
2997		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2998		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2999		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
3000		    (mod_hash_key_t)(intptr_t)path_instance,
3001		    (mod_hash_val_t)path_persistent);
3002        }
3003        mutex_exit(&mdi_pathmap_mutex);
3004	MDI_PI(pip)->pi_path_instance = path_instance;
3005
3006	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
3007	ASSERT(MDI_PI(pip)->pi_prop != NULL);
3008	MDI_PI(pip)->pi_pprivate = NULL;
3009	MDI_PI(pip)->pi_cprivate = NULL;
3010	MDI_PI(pip)->pi_vprivate = NULL;
3011	MDI_PI(pip)->pi_client_link = NULL;
3012	MDI_PI(pip)->pi_phci_link = NULL;
3013	MDI_PI(pip)->pi_ref_cnt = 0;
3014	MDI_PI(pip)->pi_kstats = NULL;
3015	MDI_PI(pip)->pi_preferred = 1;
3016	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
3017
3018	/*
3019	 * Lock both dev_info nodes against changes in parallel.
3020	 *
3021	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
3022	 * This atypical operation is done to synchronize pathinfo nodes
3023	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
3024	 * the pathinfo nodes are children of the Client.
3025	 */
3026	ndi_devi_enter(ct->ct_dip, &ct_circular);
3027	ndi_devi_enter(ph->ph_dip, &ph_circular);
3028
3029	i_mdi_phci_add_path(ph, pip);
3030	i_mdi_client_add_path(ct, pip);
3031
3032	ndi_devi_exit(ph->ph_dip, ph_circular);
3033	ndi_devi_exit(ct->ct_dip, ct_circular);
3034
3035	return (pip);
3036}
3037
3038/*
3039 * mdi_pi_pathname_by_instance():
3040 *	Lookup of "path" by 'path_instance'. Return "path".
3041 *	NOTE: returned "path" remains valid forever (until reboot).
3042 */
3043char *
3044mdi_pi_pathname_by_instance(int path_instance)
3045{
3046	char		*path;
3047	mod_hash_val_t	hv;
3048
3049	/* mdi_pathmap lookup of "path" by 'path_instance' */
3050	mutex_enter(&mdi_pathmap_mutex);
3051	if (mod_hash_find(mdi_pathmap_byinstance,
3052	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3053		path = (char *)hv;
3054	else
3055		path = NULL;
3056	mutex_exit(&mdi_pathmap_mutex);
3057	return (path);
3058}
3059
3060/*
3061 * mdi_pi_spathname_by_instance():
3062 *	Lookup of "shortpath" by 'path_instance'. Return "shortpath".
3063 *	NOTE: returned "shortpath" remains valid forever (until reboot).
3064 */
3065char *
3066mdi_pi_spathname_by_instance(int path_instance)
3067{
3068	char		*path;
3069	mod_hash_val_t	hv;
3070
3071	/* mdi_pathmap lookup of "path" by 'path_instance' */
3072	mutex_enter(&mdi_pathmap_mutex);
3073	if (mod_hash_find(mdi_pathmap_sbyinstance,
3074	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3075		path = (char *)hv;
3076	else
3077		path = NULL;
3078	mutex_exit(&mdi_pathmap_mutex);
3079	return (path);
3080}
3081
3082
3083/*
3084 * i_mdi_phci_add_path():
3085 * 		Add a mdi_pathinfo node to pHCI list.
3086 * Notes:
3087 *		Caller should per-pHCI mutex
3088 */
3089static void
3090i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3091{
3092	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3093
3094	MDI_PHCI_LOCK(ph);
3095	if (ph->ph_path_head == NULL) {
3096		ph->ph_path_head = pip;
3097	} else {
3098		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
3099	}
3100	ph->ph_path_tail = pip;
3101	ph->ph_path_count++;
3102	MDI_PHCI_UNLOCK(ph);
3103}
3104
3105/*
3106 * i_mdi_client_add_path():
3107 *		Add mdi_pathinfo node to client list
3108 */
3109static void
3110i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3111{
3112	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3113
3114	MDI_CLIENT_LOCK(ct);
3115	if (ct->ct_path_head == NULL) {
3116		ct->ct_path_head = pip;
3117	} else {
3118		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3119	}
3120	ct->ct_path_tail = pip;
3121	ct->ct_path_count++;
3122	MDI_CLIENT_UNLOCK(ct);
3123}
3124
3125/*
3126 * mdi_pi_free():
3127 *		Free the mdi_pathinfo node and also client device node if this
3128 *		is the last path to the device
3129 * Return Values:
3130 *		MDI_SUCCESS
3131 *		MDI_FAILURE
3132 *		MDI_BUSY
3133 */
3134/*ARGSUSED*/
3135int
3136mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3137{
3138	int		rv;
3139	mdi_vhci_t	*vh;
3140	mdi_phci_t	*ph;
3141	mdi_client_t	*ct;
3142	int		(*f)();
3143	int		client_held = 0;
3144
3145	MDI_PI_LOCK(pip);
3146	ph = MDI_PI(pip)->pi_phci;
3147	ASSERT(ph != NULL);
3148	if (ph == NULL) {
3149		/*
3150		 * Invalid pHCI device, return failure
3151		 */
3152		MDI_DEBUG(1, (MDI_WARN, NULL,
3153		    "!invalid pHCI: pip %s %p",
3154		    mdi_pi_spathname(pip), (void *)pip));
3155		MDI_PI_UNLOCK(pip);
3156		return (MDI_FAILURE);
3157	}
3158
3159	vh = ph->ph_vhci;
3160	ASSERT(vh != NULL);
3161	if (vh == NULL) {
3162		/* Invalid pHCI device, return failure */
3163		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3164		    "!invalid vHCI: pip %s %p",
3165		    mdi_pi_spathname(pip), (void *)pip));
3166		MDI_PI_UNLOCK(pip);
3167		return (MDI_FAILURE);
3168	}
3169
3170	ct = MDI_PI(pip)->pi_client;
3171	ASSERT(ct != NULL);
3172	if (ct == NULL) {
3173		/*
3174		 * Invalid Client device, return failure
3175		 */
3176		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3177		    "!invalid client: pip %s %p",
3178		    mdi_pi_spathname(pip), (void *)pip));
3179		MDI_PI_UNLOCK(pip);
3180		return (MDI_FAILURE);
3181	}
3182
3183	/*
3184	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3185	 * if the node state is either offline or init and the reference count
3186	 * is zero.
3187	 */
3188	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3189	    MDI_PI_IS_INITING(pip))) {
3190		/*
3191		 * Node is busy
3192		 */
3193		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3194		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
3195		MDI_PI_UNLOCK(pip);
3196		return (MDI_BUSY);
3197	}
3198
3199	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3200		/*
3201		 * Give a chance for pending I/Os to complete.
3202		 */
3203		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3204		    "!%d cmds still pending on path: %s %p",
3205		    MDI_PI(pip)->pi_ref_cnt,
3206		    mdi_pi_spathname(pip), (void *)pip));
3207		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
3208		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
3209		    TR_CLOCK_TICK) == -1) {
3210			/*
3211			 * The timeout time reached without ref_cnt being zero
3212			 * being signaled.
3213			 */
3214			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3215			    "!Timeout reached on path %s %p without the cond",
3216			    mdi_pi_spathname(pip), (void *)pip));
3217			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3218			    "!%d cmds still pending on path %s %p",
3219			    MDI_PI(pip)->pi_ref_cnt,
3220			    mdi_pi_spathname(pip), (void *)pip));
3221			MDI_PI_UNLOCK(pip);
3222			return (MDI_BUSY);
3223		}
3224	}
3225	if (MDI_PI(pip)->pi_pm_held) {
3226		client_held = 1;
3227	}
3228	MDI_PI_UNLOCK(pip);
3229
3230	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3231
3232	MDI_CLIENT_LOCK(ct);
3233
3234	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3235	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3236
3237	/*
3238	 * Wait till failover is complete before removing this node.
3239	 */
3240	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3241		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3242
3243	MDI_CLIENT_UNLOCK(ct);
3244	MDI_VHCI_CLIENT_LOCK(vh);
3245	MDI_CLIENT_LOCK(ct);
3246	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3247
3248	rv = MDI_SUCCESS;
3249	if (!MDI_PI_IS_INITING(pip)) {
3250		f = vh->vh_ops->vo_pi_uninit;
3251		if (f != NULL) {
3252			rv = (*f)(vh->vh_dip, pip, 0);
3253		}
3254	}
3255
3256	/*
3257	 * If vo_pi_uninit() completed successfully.
3258	 */
3259	if (rv == MDI_SUCCESS) {
3260		if (client_held) {
3261			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3262			    "i_mdi_pm_rele_client\n"));
3263			i_mdi_pm_rele_client(ct, 1);
3264		}
3265		i_mdi_pi_free(ph, pip, ct);
3266		if (ct->ct_path_count == 0) {
3267			/*
3268			 * Client lost its last path.
3269			 * Clean up the client device
3270			 */
3271			MDI_CLIENT_UNLOCK(ct);
3272			(void) i_mdi_client_free(ct->ct_vhci, ct);
3273			MDI_VHCI_CLIENT_UNLOCK(vh);
3274			return (rv);
3275		}
3276	}
3277	MDI_CLIENT_UNLOCK(ct);
3278	MDI_VHCI_CLIENT_UNLOCK(vh);
3279
3280	if (rv == MDI_FAILURE)
3281		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3282
3283	return (rv);
3284}
3285
3286/*
3287 * i_mdi_pi_free():
3288 *		Free the mdi_pathinfo node
3289 */
3290static void
3291i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3292{
3293	int	ct_circular;
3294	int	ph_circular;
3295
3296	ASSERT(MDI_CLIENT_LOCKED(ct));
3297
3298	/*
3299	 * remove any per-path kstats
3300	 */
3301	i_mdi_pi_kstat_destroy(pip);
3302
3303	/* See comments in i_mdi_pi_alloc() */
3304	ndi_devi_enter(ct->ct_dip, &ct_circular);
3305	ndi_devi_enter(ph->ph_dip, &ph_circular);
3306
3307	i_mdi_client_remove_path(ct, pip);
3308	i_mdi_phci_remove_path(ph, pip);
3309
3310	ndi_devi_exit(ph->ph_dip, ph_circular);
3311	ndi_devi_exit(ct->ct_dip, ct_circular);
3312
3313	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3314	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3315	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3316	if (MDI_PI(pip)->pi_addr) {
3317		kmem_free(MDI_PI(pip)->pi_addr,
3318		    strlen(MDI_PI(pip)->pi_addr) + 1);
3319		MDI_PI(pip)->pi_addr = NULL;
3320	}
3321
3322	if (MDI_PI(pip)->pi_prop) {
3323		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3324		MDI_PI(pip)->pi_prop = NULL;
3325	}
3326	kmem_free(pip, sizeof (struct mdi_pathinfo));
3327}
3328
3329
3330/*
3331 * i_mdi_phci_remove_path():
3332 * 		Remove a mdi_pathinfo node from pHCI list.
3333 * Notes:
3334 *		Caller should hold per-pHCI mutex
3335 */
3336static void
3337i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3338{
3339	mdi_pathinfo_t	*prev = NULL;
3340	mdi_pathinfo_t	*path = NULL;
3341
3342	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3343
3344	MDI_PHCI_LOCK(ph);
3345	path = ph->ph_path_head;
3346	while (path != NULL) {
3347		if (path == pip) {
3348			break;
3349		}
3350		prev = path;
3351		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3352	}
3353
3354	if (path) {
3355		ph->ph_path_count--;
3356		if (prev) {
3357			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3358		} else {
3359			ph->ph_path_head =
3360			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3361		}
3362		if (ph->ph_path_tail == path) {
3363			ph->ph_path_tail = prev;
3364		}
3365	}
3366
3367	/*
3368	 * Clear the pHCI link
3369	 */
3370	MDI_PI(pip)->pi_phci_link = NULL;
3371	MDI_PI(pip)->pi_phci = NULL;
3372	MDI_PHCI_UNLOCK(ph);
3373}
3374
3375/*
3376 * i_mdi_client_remove_path():
3377 * 		Remove a mdi_pathinfo node from client path list.
3378 */
3379static void
3380i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3381{
3382	mdi_pathinfo_t	*prev = NULL;
3383	mdi_pathinfo_t	*path;
3384
3385	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3386
3387	ASSERT(MDI_CLIENT_LOCKED(ct));
3388	path = ct->ct_path_head;
3389	while (path != NULL) {
3390		if (path == pip) {
3391			break;
3392		}
3393		prev = path;
3394		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3395	}
3396
3397	if (path) {
3398		ct->ct_path_count--;
3399		if (prev) {
3400			MDI_PI(prev)->pi_client_link =
3401			    MDI_PI(path)->pi_client_link;
3402		} else {
3403			ct->ct_path_head =
3404			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3405		}
3406		if (ct->ct_path_tail == path) {
3407			ct->ct_path_tail = prev;
3408		}
3409		if (ct->ct_path_last == path) {
3410			ct->ct_path_last = ct->ct_path_head;
3411		}
3412	}
3413	MDI_PI(pip)->pi_client_link = NULL;
3414	MDI_PI(pip)->pi_client = NULL;
3415}
3416
/*
 * i_mdi_pi_state_change():
 *		Drive a mdi_pathinfo node to the requested state (ONLINE,
 *		STANDBY, FAULT or OFFLINE).  Calls the vHCI driver's
 *		vo_pi_init entry point if the node is still initializing,
 *		then its vo_pi_state_change entry point, and finally brings
 *		the client state and the client devinfo node in line with
 *		the result.
 *
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		MDI_BUSY
 *		Any other non-success value returned by vo_pi_state_change
 */
/*ARGSUSED*/
static int
i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
{
	int		rv = MDI_SUCCESS;
	mdi_vhci_t	*vh;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	int		(*f)();
	dev_info_t	*cdip;

	MDI_PI_LOCK(pip);

	ph = MDI_PI(pip)->pi_phci;
	ASSERT(ph);
	if (ph == NULL) {
		/*
		 * Invalid pHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, NULL,
		    "!invalid phci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	vh = ph->ph_vhci;
	ASSERT(vh);
	if (vh == NULL) {
		/*
		 * Invalid vHCI device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid vhci: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);
	if (ct == NULL) {
		/*
		 * Invalid client device, fail the request
		 */
		MDI_PI_UNLOCK(pip);
		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
		    "!invalid client: pip %s %p",
		    mdi_pi_spathname(pip), (void *)pip));
		return (MDI_FAILURE);
	}

	/*
	 * If this path has not been initialized yet, Callback vHCI driver's
	 * pathinfo node initialize entry point
	 */

	if (MDI_PI_IS_INITING(pip)) {
		/* The pip lock is dropped across the vHCI callback. */
		MDI_PI_UNLOCK(pip);
		f = vh->vh_ops->vo_pi_init;
		if (f != NULL) {
			rv = (*f)(vh->vh_dip, pip, 0);
			if (rv != MDI_SUCCESS) {
				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
				    "!vo_pi_init failed: vHCI %p, pip %s %p",
				    (void *)vh, mdi_pi_spathname(pip),
				    (void *)pip));
				/* No lock is held at this point. */
				return (MDI_FAILURE);
			}
		}
		MDI_PI_LOCK(pip);
		MDI_PI_CLEAR_TRANSIENT(pip);
	}

	/*
	 * Do not allow state transition when pHCI is in offline/suspended
	 * states
	 */
	i_mdi_phci_lock(ph, pip);
	if (MDI_PHCI_IS_READY(ph) == 0) {
		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
		    "!pHCI not ready, pHCI=%p", (void *)ph));
		MDI_PI_UNLOCK(pip);
		i_mdi_phci_unlock(ph);
		return (MDI_BUSY);
	}
	/*
	 * Mark the pHCI unstable for the duration of the transition; the
	 * matching MDI_PHCI_STABLE() is at state_change_exit.
	 */
	MDI_PHCI_UNSTABLE(ph);
	i_mdi_phci_unlock(ph);

	/*
	 * Check if mdi_pathinfo state is in transient state.
	 * If yes, offlining is in progress and wait till transient state is
	 * cleared.
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		while (MDI_PI_IS_TRANSIENT(pip)) {
			cv_wait(&MDI_PI(pip)->pi_state_cv,
			    &MDI_PI(pip)->pi_mutex);
		}
	}

	/*
	 * Grab the client lock in reverse order sequence and release the
	 * mdi_pathinfo mutex.
	 */
	i_mdi_client_lock(ct, pip);
	MDI_PI_UNLOCK(pip);

	/*
	 * Wait till failover state is cleared
	 */
	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);

	/*
	 * Mark the mdi_pathinfo node state as transient
	 */
	MDI_PI_LOCK(pip);
	switch (state) {
	case MDI_PATHINFO_STATE_ONLINE:
		MDI_PI_SET_ONLINING(pip);
		break;

	case MDI_PATHINFO_STATE_STANDBY:
		MDI_PI_SET_STANDBYING(pip);
		break;

	case MDI_PATHINFO_STATE_FAULT:
		/*
		 * Mark the pathinfo state as FAULTED
		 */
		MDI_PI_SET_FAULTING(pip);
		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
		break;

	case MDI_PATHINFO_STATE_OFFLINE:
		/*
		 * ndi_devi_offline() cannot hold pip or ct locks.
		 */
		MDI_PI_UNLOCK(pip);

		/*
		 * If this is a user initiated path online->offline operation
		 * whose success would transition a client from DEGRADED to
		 * FAILED then only proceed if we can offline the client first.
		 */
		cdip = ct->ct_dip;
		if ((flag & NDI_USER_REQ) &&
		    MDI_PI_IS_ONLINE(pip) &&
		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
			i_mdi_client_unlock(ct);
			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
			if (rv != NDI_SUCCESS) {
				/*
				 * Convert to MDI error code
				 */
				switch (rv) {
				case NDI_BUSY:
					rv = MDI_BUSY;
					break;
				default:
					rv = MDI_FAILURE;
					break;
				}
				/*
				 * Both the pip and the client lock have
				 * already been released above.
				 */
				goto state_change_exit;
			} else {
				i_mdi_client_lock(ct, NULL);
			}
		}
		/*
		 * Mark the mdi_pathinfo node state as transient
		 */
		MDI_PI_LOCK(pip);
		MDI_PI_SET_OFFLINING(pip);
		break;
	}
	MDI_PI_UNLOCK(pip);
	MDI_CLIENT_UNSTABLE(ct);
	i_mdi_client_unlock(ct);

	/* Call the vHCI driver's state change entry point with no lock held */
	f = vh->vh_ops->vo_pi_state_change;
	if (f != NULL)
		rv = (*f)(vh->vh_dip, pip, state, 0, flag);

	MDI_CLIENT_LOCK(ct);
	MDI_PI_LOCK(pip);
	if (rv == MDI_NOT_SUPPORTED) {
		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
	}
	if (rv != MDI_SUCCESS) {
		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
		    "vo_pi_state_change failed: rv %x", rv));
	}
	/*
	 * If the node is still marked transient, either commit the new
	 * state (success) or fall back to the previous state (failure).
	 */
	if (MDI_PI_IS_TRANSIENT(pip)) {
		if (rv == MDI_SUCCESS) {
			MDI_PI_CLEAR_TRANSIENT(pip);
		} else {
			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
		}
	}

	/*
	 * Wake anyone waiting for this mdi_pathinfo node
	 */
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	/*
	 * Mark the client device as stable
	 */
	MDI_CLIENT_STABLE(ct);
	if (rv == MDI_SUCCESS) {
		if (ct->ct_unstable == 0) {
			cdip = ct->ct_dip;

			/*
			 * Changing the mdi_pathinfo node state will impact
			 * the client state.  Update the client and dev_info
			 * node state accordingly.
			 *
			 * From here until the conversion switch below, rv
			 * carries NDI_* codes rather than MDI_* codes.
			 */
			rv = NDI_SUCCESS;
			i_mdi_client_update_state(ct);
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				if (cdip && !i_ddi_devi_attached(cdip) &&
				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
				    (state == MDI_PATHINFO_STATE_STANDBY))) {

					/*
					 * Must do ndi_devi_online() through
					 * hotplug thread for deferred
					 * attach mechanism to work
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_online(cdip, 0);
					MDI_CLIENT_LOCK(ct);
					if ((rv != NDI_SUCCESS) &&
					    (MDI_CLIENT_STATE(ct) ==
					    MDI_CLIENT_STATE_DEGRADED)) {
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_online failed "
						    "error %x", rv));
					}
					/*
					 * A failure to online the client is
					 * only logged above; it does not fail
					 * the path state change.
					 */
					rv = NDI_SUCCESS;
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/*
				 * This is the last path case for
				 * non-user initiated events.
				 */
				if (((flag & NDI_USER_REQ) == 0) &&
				    cdip && (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip,
					    NDI_DEVFS_CLEAN);
					MDI_CLIENT_LOCK(ct);

					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online as the path could not
						 * be offlined.
						 */
						MDI_DEBUG(1, (MDI_WARN, cdip,
						    "!ndi_devi_offline failed: "
						    "error %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
				break;
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
				i_mdi_report_path_state(ct, pip);
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
	}
	MDI_CLIENT_UNLOCK(ct);

state_change_exit:
	/*
	 * Mark the pHCI as stable again.
	 */
	MDI_PHCI_LOCK(ph);
	MDI_PHCI_STABLE(ph);
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
3730
3731/*
3732 * mdi_pi_online():
3733 *		Place the path_info node in the online state.  The path is
3734 *		now available to be selected by mdi_select_path() for
3735 *		transporting I/O requests to client devices.
3736 * Return Values:
3737 *		MDI_SUCCESS
3738 *		MDI_FAILURE
3739 */
3740int
3741mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3742{
3743	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3744	int		client_held = 0;
3745	int		rv;
3746
3747	ASSERT(ct != NULL);
3748	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3749	if (rv != MDI_SUCCESS)
3750		return (rv);
3751
3752	MDI_PI_LOCK(pip);
3753	if (MDI_PI(pip)->pi_pm_held == 0) {
3754		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3755		    "i_mdi_pm_hold_pip %p", (void *)pip));
3756		i_mdi_pm_hold_pip(pip);
3757		client_held = 1;
3758	}
3759	MDI_PI_UNLOCK(pip);
3760
3761	if (client_held) {
3762		MDI_CLIENT_LOCK(ct);
3763		if (ct->ct_power_cnt == 0) {
3764			rv = i_mdi_power_all_phci(ct);
3765		}
3766
3767		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3768		    "i_mdi_pm_hold_client %p", (void *)ct));
3769		i_mdi_pm_hold_client(ct, 1);
3770		MDI_CLIENT_UNLOCK(ct);
3771	}
3772
3773	return (rv);
3774}
3775
3776/*
3777 * mdi_pi_standby():
3778 *		Place the mdi_pathinfo node in standby state
3779 *
3780 * Return Values:
3781 *		MDI_SUCCESS
3782 *		MDI_FAILURE
3783 */
3784int
3785mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3786{
3787	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3788}
3789
3790/*
3791 * mdi_pi_fault():
3792 *		Place the mdi_pathinfo node in fault'ed state
3793 * Return Values:
3794 *		MDI_SUCCESS
3795 *		MDI_FAILURE
3796 */
3797int
3798mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3799{
3800	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3801}
3802
3803/*
3804 * mdi_pi_offline():
3805 *		Offline a mdi_pathinfo node.
3806 * Return Values:
3807 *		MDI_SUCCESS
3808 *		MDI_FAILURE
3809 */
3810int
3811mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3812{
3813	int	ret, client_held = 0;
3814	mdi_client_t	*ct;
3815
3816	/*
3817	 * Original code overloaded NDI_DEVI_REMOVE to this interface, and
3818	 * used it to mean "user initiated operation" (i.e. devctl). Callers
3819	 * should now just use NDI_USER_REQ.
3820	 */
3821	if (flags & NDI_DEVI_REMOVE) {
3822		flags &= ~NDI_DEVI_REMOVE;
3823		flags |= NDI_USER_REQ;
3824	}
3825
3826	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3827
3828	if (ret == MDI_SUCCESS) {
3829		MDI_PI_LOCK(pip);
3830		if (MDI_PI(pip)->pi_pm_held) {
3831			client_held = 1;
3832		}
3833		MDI_PI_UNLOCK(pip);
3834
3835		if (client_held) {
3836			ct = MDI_PI(pip)->pi_client;
3837			MDI_CLIENT_LOCK(ct);
3838			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3839			    "i_mdi_pm_rele_client\n"));
3840			i_mdi_pm_rele_client(ct, 1);
3841			MDI_CLIENT_UNLOCK(ct);
3842		}
3843	}
3844
3845	return (ret);
3846}
3847
/*
 * i_mdi_pi_offline():
 *		Offline a mdi_pathinfo node and call the vHCI driver's callback.
 *
 *		Waits for the path's pi_ref_cnt to drain to zero, calls the
 *		vHCI's vo_pi_state_change entry point (if any) with
 *		MDI_PATHINFO_STATE_OFFLINE, and then marks the path OFFLINE
 *		regardless of the callback's result.  When the callback
 *		succeeded (or is absent) the client state is re-evaluated and
 *		the client devinfo node is offlined if the client has become
 *		FAILED.
 *
 * Return Values:
 *		MDI_SUCCESS, MDI_BUSY or MDI_FAILURE from the client update
 *		path; otherwise the non-success value returned by the vHCI
 *		callback.
 */
static int
i_mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
{
	dev_info_t	*vdip = NULL;
	mdi_vhci_t	*vh = NULL;
	mdi_client_t	*ct = NULL;
	int		(*f)();
	int		rv;

	MDI_PI_LOCK(pip);
	ct = MDI_PI(pip)->pi_client;
	ASSERT(ct != NULL);

	/*
	 * Unlike a bounded wait, this loop does not give up: a timeout is
	 * only logged and the wait is re-armed until pi_ref_cnt is zero.
	 */
	while (MDI_PI(pip)->pi_ref_cnt != 0) {
		/*
		 * Give a chance for pending I/Os to complete.
		 */
		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
		    "!%d cmds still pending on path %s %p",
		    MDI_PI(pip)->pi_ref_cnt, mdi_pi_spathname(pip),
		    (void *)pip));
		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
		    TR_CLOCK_TICK) == -1) {
			/*
			 * The timeout time reached without ref_cnt being zero
			 * being signaled.
			 */
			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
			    "!Timeout reached on path %s %p without the cond",
			    mdi_pi_spathname(pip), (void *)pip));
			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
			    "!%d cmds still pending on path %s %p",
			    MDI_PI(pip)->pi_ref_cnt,
			    mdi_pi_spathname(pip), (void *)pip));
		}
	}
	vh = ct->ct_vhci;
	vdip = vh->vh_dip;

	/*
	 * Notify vHCI that has registered this event
	 */
	ASSERT(vh->vh_ops);
	f = vh->vh_ops->vo_pi_state_change;

	rv = MDI_SUCCESS;
	if (f != NULL) {
		/* The pip lock is dropped across the vHCI callback. */
		MDI_PI_UNLOCK(pip);
		if ((rv = (*f)(vdip, pip, MDI_PATHINFO_STATE_OFFLINE, 0,
		    flags)) != MDI_SUCCESS) {
			MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
			    "!vo_path_offline failed: vdip %s%d %p: path %s %p",
			    ddi_driver_name(vdip), ddi_get_instance(vdip),
			    (void *)vdip, mdi_pi_spathname(pip), (void *)pip));
		}
		MDI_PI_LOCK(pip);
	}

	/*
	 * Set the mdi_pathinfo node state and clear the transient condition
	 */
	MDI_PI_SET_OFFLINE(pip);
	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
	MDI_PI_UNLOCK(pip);

	MDI_CLIENT_LOCK(ct);
	if (rv == MDI_SUCCESS) {
		if (ct->ct_unstable == 0) {
			dev_info_t	*cdip = ct->ct_dip;

			/*
			 * Offlining the mdi_pathinfo node will impact the
			 * client state.  Update the client and dev_info node
			 * state accordingly.
			 *
			 * From here until the conversion switch below, rv
			 * carries NDI_* codes rather than MDI_* codes.
			 */
			i_mdi_client_update_state(ct);
			rv = NDI_SUCCESS;
			if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
				if (cdip &&
				    (i_ddi_node_state(cdip) >=
				    DS_INITIALIZED)) {
					/*
					 * The client lock is dropped across
					 * ndi_devi_offline().
					 */
					MDI_CLIENT_UNLOCK(ct);
					rv = ndi_devi_offline(cdip,
					    NDI_DEVFS_CLEAN);
					MDI_CLIENT_LOCK(ct);
					if (rv != NDI_SUCCESS) {
						/*
						 * ndi_devi_offline failed.
						 * Reset client flags to
						 * online.
						 */
						MDI_DEBUG(4, (MDI_WARN, cdip,
						    "ndi_devi_offline failed: "
						    "error %x", rv));
						MDI_CLIENT_SET_ONLINE(ct);
					}
				}
			}
			/*
			 * Convert to MDI error code
			 */
			switch (rv) {
			case NDI_SUCCESS:
				rv = MDI_SUCCESS;
				break;
			case NDI_BUSY:
				rv = MDI_BUSY;
				break;
			default:
				rv = MDI_FAILURE;
				break;
			}
		}
		/*
		 * The path state is reported whenever the vHCI callback
		 * succeeded, even if the client offline above failed.
		 */
		MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
		i_mdi_report_path_state(ct, pip);
	}

	MDI_CLIENT_UNLOCK(ct);

	/*
	 * Change in the mdi_pathinfo node state will impact the client state
	 */
	MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
	    "ct = %p pip = %p", (void *)ct, (void *)pip));
	return (rv);
}
3979
3980/*
3981 * i_mdi_pi_online():
3982 *		Online a mdi_pathinfo node and call the vHCI driver's callback
3983 */
3984static int
3985i_mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3986{
3987	mdi_vhci_t	*vh = NULL;
3988	mdi_client_t	*ct = NULL;
3989	mdi_phci_t	*ph;
3990	int		(*f)();
3991	int		rv;
3992
3993	MDI_PI_LOCK(pip);
3994	ph = MDI_PI(pip)->pi_phci;
3995	vh = ph->ph_vhci;
3996	ct = MDI_PI(pip)->pi_client;
3997	MDI_PI_SET_ONLINING(pip)
3998	MDI_PI_UNLOCK(pip);
3999	f = vh->vh_ops->vo_pi_state_change;
4000	rv = MDI_SUCCESS;
4001	if (f != NULL)
4002		rv = (*f)(vh->vh_dip, pip, MDI_PATHINFO_STATE_ONLINE, 0, flags);
4003	MDI_CLIENT_LOCK(ct);
4004	MDI_PI_LOCK(pip);
4005	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
4006	MDI_PI_UNLOCK(pip);
4007	if (rv == MDI_SUCCESS) {
4008		dev_info_t	*cdip = ct->ct_dip;
4009
4010		i_mdi_client_update_state(ct);
4011		if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL ||
4012		    MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4013			if (cdip && !i_ddi_devi_attached(cdip)) {
4014				MDI_CLIENT_UNLOCK(ct);
4015				rv = ndi_devi_online(cdip, 0);
4016				MDI_CLIENT_LOCK(ct);
4017				if ((rv != NDI_SUCCESS) &&
4018				    (MDI_CLIENT_STATE(ct) ==
4019				    MDI_CLIENT_STATE_DEGRADED)) {
4020					MDI_CLIENT_SET_OFFLINE(ct);
4021				}
4022				if (rv != NDI_SUCCESS) {
4023					/* Reset the path state */
4024					MDI_PI_LOCK(pip);
4025					MDI_PI(pip)->pi_state =
4026					    MDI_PI_OLD_STATE(pip);
4027					MDI_PI_UNLOCK(pip);
4028				}
4029			}
4030		}
4031		switch (rv) {
4032		case NDI_SUCCESS:
4033			MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
4034			i_mdi_report_path_state(ct, pip);
4035			rv = MDI_SUCCESS;
4036			break;
4037		case NDI_BUSY:
4038			rv = MDI_BUSY;
4039			break;
4040		default:
4041			rv = MDI_FAILURE;
4042			break;
4043		}
4044	} else {
4045		/* Reset the path state */
4046		MDI_PI_LOCK(pip);
4047		MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
4048		MDI_PI_UNLOCK(pip);
4049	}
4050	MDI_CLIENT_UNLOCK(ct);
4051	return (rv);
4052}
4053
4054/*
4055 * mdi_pi_get_node_name():
4056 *              Get the name associated with a mdi_pathinfo node.
4057 *              Since pathinfo nodes are not directly named, we
4058 *              return the node_name of the client.
4059 *
4060 * Return Values:
4061 *              char *
4062 */
4063char *
4064mdi_pi_get_node_name(mdi_pathinfo_t *pip)
4065{
4066	mdi_client_t    *ct;
4067
4068	if (pip == NULL)
4069		return (NULL);
4070	ct = MDI_PI(pip)->pi_client;
4071	if ((ct == NULL) || (ct->ct_dip == NULL))
4072		return (NULL);
4073	return (ddi_node_name(ct->ct_dip));
4074}
4075
4076/*
4077 * mdi_pi_get_addr():
4078 *		Get the unit address associated with a mdi_pathinfo node
4079 *
4080 * Return Values:
4081 *		char *
4082 */
4083char *
4084mdi_pi_get_addr(mdi_pathinfo_t *pip)
4085{
4086	if (pip == NULL)
4087		return (NULL);
4088
4089	return (MDI_PI(pip)->pi_addr);
4090}
4091
4092/*
4093 * mdi_pi_get_path_instance():
4094 *		Get the 'path_instance' of a mdi_pathinfo node
4095 *
4096 * Return Values:
4097 *		path_instance
4098 */
4099int
4100mdi_pi_get_path_instance(mdi_pathinfo_t *pip)
4101{
4102	if (pip == NULL)
4103		return (0);
4104
4105	return (MDI_PI(pip)->pi_path_instance);
4106}
4107
4108/*
4109 * mdi_pi_pathname():
4110 *		Return pointer to path to pathinfo node.
4111 */
4112char *
4113mdi_pi_pathname(mdi_pathinfo_t *pip)
4114{
4115	if (pip == NULL)
4116		return (NULL);
4117	return (mdi_pi_pathname_by_instance(mdi_pi_get_path_instance(pip)));
4118}
4119
4120/*
4121 * mdi_pi_spathname():
4122 *		Return pointer to shortpath to pathinfo node. Used for debug
4123 *		messages, so return "" instead of NULL when unknown.
4124 */
4125char *
4126mdi_pi_spathname(mdi_pathinfo_t *pip)
4127{
4128	char	*spath = "";
4129
4130	if (pip) {
4131		spath = mdi_pi_spathname_by_instance(
4132		    mdi_pi_get_path_instance(pip));
4133		if (spath == NULL)
4134			spath = "";
4135	}
4136	return (spath);
4137}
4138
4139char *
4140mdi_pi_pathname_obp(mdi_pathinfo_t *pip, char *path)
4141{
4142	char *obp_path = NULL;
4143	if ((pip == NULL) || (path == NULL))
4144		return (NULL);
4145
4146	if (mdi_prop_lookup_string(pip, "obp-path", &obp_path) == MDI_SUCCESS) {
4147		(void) strcpy(path, obp_path);
4148		(void) mdi_prop_free(obp_path);
4149	} else {
4150		path = NULL;
4151	}
4152	return (path);
4153}
4154
4155int
4156mdi_pi_pathname_obp_set(mdi_pathinfo_t *pip, char *component)
4157{
4158	dev_info_t *pdip;
4159	char *obp_path = NULL;
4160	int rc = MDI_FAILURE;
4161
4162	if (pip == NULL)
4163		return (MDI_FAILURE);
4164
4165	pdip = mdi_pi_get_phci(pip);
4166	if (pdip == NULL)
4167		return (MDI_FAILURE);
4168
4169	obp_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4170
4171	if (ddi_pathname_obp(pdip, obp_path) == NULL) {
4172		(void) ddi_pathname(pdip, obp_path);
4173	}
4174
4175	if (component) {
4176		(void) strncat(obp_path, "/", MAXPATHLEN);
4177		(void) strncat(obp_path, component, MAXPATHLEN);
4178	}
4179	rc = mdi_prop_update_string(pip, "obp-path", obp_path);
4180
4181	if (obp_path)
4182		kmem_free(obp_path, MAXPATHLEN);
4183	return (rc);
4184}
4185
4186/*
4187 * mdi_pi_get_client():
4188 *		Get the client devinfo associated with a mdi_pathinfo node
4189 *
4190 * Return Values:
4191 *		Handle to client device dev_info node
4192 */
4193dev_info_t *
4194mdi_pi_get_client(mdi_pathinfo_t *pip)
4195{
4196	dev_info_t	*dip = NULL;
4197	if (pip) {
4198		dip = MDI_PI(pip)->pi_client->ct_dip;
4199	}
4200	return (dip);
4201}
4202
4203/*
4204 * mdi_pi_get_phci():
4205 *		Get the pHCI devinfo associated with the mdi_pathinfo node
4206 * Return Values:
4207 *		Handle to dev_info node
4208 */
4209dev_info_t *
4210mdi_pi_get_phci(mdi_pathinfo_t *pip)
4211{
4212	dev_info_t	*dip = NULL;
4213	mdi_phci_t	*ph;
4214
4215	if (pip) {
4216		ph = MDI_PI(pip)->pi_phci;
4217		if (ph)
4218			dip = ph->ph_dip;
4219	}
4220	return (dip);
4221}
4222
4223/*
4224 * mdi_pi_get_client_private():
4225 *		Get the client private information associated with the
4226 *		mdi_pathinfo node
4227 */
4228void *
4229mdi_pi_get_client_private(mdi_pathinfo_t *pip)
4230{
4231	void *cprivate = NULL;
4232	if (pip) {
4233		cprivate = MDI_PI(pip)->pi_cprivate;
4234	}
4235	return (cprivate);
4236}
4237
4238/*
4239 * mdi_pi_set_client_private():
4240 *		Set the client private information in the mdi_pathinfo node
4241 */
4242void
4243mdi_pi_set_client_private(mdi_pathinfo_t *pip, void *priv)
4244{
4245	if (pip) {
4246		MDI_PI(pip)->pi_cprivate = priv;
4247	}
4248}
4249
4250/*
4251 * mdi_pi_get_phci_private():
4252 *		Get the pHCI private information associated with the
4253 *		mdi_pathinfo node
4254 */
4255caddr_t
4256mdi_pi_get_phci_private(mdi_pathinfo_t *pip)
4257{
4258	caddr_t	pprivate = NULL;
4259
4260	if (pip) {
4261		pprivate = MDI_PI(pip)->pi_pprivate;
4262	}
4263	return (pprivate);
4264}
4265
4266/*
4267 * mdi_pi_set_phci_private():
4268 *		Set the pHCI private information in the mdi_pathinfo node
4269 */
4270void
4271mdi_pi_set_phci_private(mdi_pathinfo_t *pip, caddr_t priv)
4272{
4273	if (pip) {
4274		MDI_PI(pip)->pi_pprivate = priv;
4275	}
4276}
4277
4278/*
4279 * mdi_pi_get_state():
4280 *		Get the mdi_pathinfo node state. Transient states are internal
4281 *		and not provided to the users
4282 */
4283mdi_pathinfo_state_t
4284mdi_pi_get_state(mdi_pathinfo_t *pip)
4285{
4286	mdi_pathinfo_state_t    state = MDI_PATHINFO_STATE_INIT;
4287
4288	if (pip) {
4289		if (MDI_PI_IS_TRANSIENT(pip)) {
4290			/*
4291			 * mdi_pathinfo is in state transition.  Return the
4292			 * last good state.
4293			 */
4294			state = MDI_PI_OLD_STATE(pip);
4295		} else {
4296			state = MDI_PI_STATE(pip);
4297		}
4298	}
4299	return (state);
4300}
4301
4302/*
4303 * mdi_pi_get_flags():
4304 *		Get the mdi_pathinfo node flags.
4305 */
4306uint_t
4307mdi_pi_get_flags(mdi_pathinfo_t *pip)
4308{
4309	return (pip ? MDI_PI(pip)->pi_flags : 0);
4310}
4311
4312/*
4313 * Note that the following function needs to be the new interface for
4314 * mdi_pi_get_state when mpxio gets integrated to ON.
4315 */
4316int
4317mdi_pi_get_state2(mdi_pathinfo_t *pip, mdi_pathinfo_state_t *state,
4318		uint32_t *ext_state)
4319{
4320	*state = MDI_PATHINFO_STATE_INIT;
4321
4322	if (pip) {
4323		if (MDI_PI_IS_TRANSIENT(pip)) {
4324			/*
4325			 * mdi_pathinfo is in state transition.  Return the
4326			 * last good state.
4327			 */
4328			*state = MDI_PI_OLD_STATE(pip);
4329			*ext_state = MDI_PI_OLD_EXT_STATE(pip);
4330		} else {
4331			*state = MDI_PI_STATE(pip);
4332			*ext_state = MDI_PI_EXT_STATE(pip);
4333		}
4334	}
4335	return (MDI_SUCCESS);
4336}
4337
4338/*
4339 * mdi_pi_get_preferred:
4340 *	Get the preferred path flag
4341 */
4342int
4343mdi_pi_get_preferred(mdi_pathinfo_t *pip)
4344{
4345	if (pip) {
4346		return (MDI_PI(pip)->pi_preferred);
4347	}
4348	return (0);
4349}
4350
4351/*
4352 * mdi_pi_set_preferred:
4353 *	Set the preferred path flag
4354 */
4355void
4356mdi_pi_set_preferred(mdi_pathinfo_t *pip, int preferred)
4357{
4358	if (pip) {
4359		MDI_PI(pip)->pi_preferred = preferred;
4360	}
4361}
4362
4363/*
4364 * mdi_pi_set_state():
4365 *		Set the mdi_pathinfo node state
4366 */
4367void
4368mdi_pi_set_state(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state)
4369{
4370	uint32_t	ext_state;
4371
4372	if (pip) {
4373		ext_state = MDI_PI(pip)->pi_state & MDI_PATHINFO_EXT_STATE_MASK;
4374		MDI_PI(pip)->pi_state = state;
4375		MDI_PI(pip)->pi_state |= ext_state;
4376
4377		/* Path has changed state, invalidate DINFOCACHE snap shot. */
4378		i_ddi_di_cache_invalidate();
4379	}
4380}
4381
4382/*
4383 * Property functions:
4384 */
4385int
4386i_map_nvlist_error_to_mdi(int val)
4387{
4388	int rv;
4389
4390	switch (val) {
4391	case 0:
4392		rv = DDI_PROP_SUCCESS;
4393		break;
4394	case EINVAL:
4395	case ENOTSUP:
4396		rv = DDI_PROP_INVAL_ARG;
4397		break;
4398	case ENOMEM:
4399		rv = DDI_PROP_NO_MEMORY;
4400		break;
4401	default:
4402		rv = DDI_PROP_NOT_FOUND;
4403		break;
4404	}
4405	return (rv);
4406}
4407
4408/*
4409 * mdi_pi_get_next_prop():
4410 * 		Property walk function.  The caller should hold mdi_pi_lock()
4411 *		and release by calling mdi_pi_unlock() at the end of walk to
4412 *		get a consistent value.
4413 */
4414nvpair_t *
4415mdi_pi_get_next_prop(mdi_pathinfo_t *pip, nvpair_t *prev)
4416{
4417	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4418		return (NULL);
4419	}
4420	ASSERT(MDI_PI_LOCKED(pip));
4421	return (nvlist_next_nvpair(MDI_PI(pip)->pi_prop, prev));
4422}
4423
4424/*
4425 * mdi_prop_remove():
4426 * 		Remove the named property from the named list.
4427 */
4428int
4429mdi_prop_remove(mdi_pathinfo_t *pip, char *name)
4430{
4431	if (pip == NULL) {
4432		return (DDI_PROP_NOT_FOUND);
4433	}
4434	ASSERT(!MDI_PI_LOCKED(pip));
4435	MDI_PI_LOCK(pip);
4436	if (MDI_PI(pip)->pi_prop == NULL) {
4437		MDI_PI_UNLOCK(pip);
4438		return (DDI_PROP_NOT_FOUND);
4439	}
4440	if (name) {
4441		(void) nvlist_remove_all(MDI_PI(pip)->pi_prop, name);
4442	} else {
4443		char		nvp_name[MAXNAMELEN];
4444		nvpair_t	*nvp;
4445		nvp = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, NULL);
4446		while (nvp) {
4447			nvpair_t	*next;
4448			next = nvlist_next_nvpair(MDI_PI(pip)->pi_prop, nvp);
4449			(void) snprintf(nvp_name, sizeof(nvp_name), "%s",
4450			    nvpair_name(nvp));
4451			(void) nvlist_remove_all(MDI_PI(pip)->pi_prop,
4452			    nvp_name);
4453			nvp = next;
4454		}
4455	}
4456	MDI_PI_UNLOCK(pip);
4457	return (DDI_PROP_SUCCESS);
4458}
4459
4460/*
4461 * mdi_prop_size():
4462 * 		Get buffer size needed to pack the property data.
4463 * 		Caller should hold the mdi_pathinfo_t lock to get a consistent
4464 *		buffer size.
4465 */
4466int
4467mdi_prop_size(mdi_pathinfo_t *pip, size_t *buflenp)
4468{
4469	int	rv;
4470	size_t	bufsize;
4471
4472	*buflenp = 0;
4473	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4474		return (DDI_PROP_NOT_FOUND);
4475	}
4476	ASSERT(MDI_PI_LOCKED(pip));
4477	rv = nvlist_size(MDI_PI(pip)->pi_prop,
4478	    &bufsize, NV_ENCODE_NATIVE);
4479	*buflenp = bufsize;
4480	return (i_map_nvlist_error_to_mdi(rv));
4481}
4482
4483/*
4484 * mdi_prop_pack():
4485 * 		pack the property list.  The caller should hold the
4486 *		mdi_pathinfo_t node to get a consistent data
4487 */
4488int
4489mdi_prop_pack(mdi_pathinfo_t *pip, char **bufp, uint_t buflen)
4490{
4491	int	rv;
4492	size_t	bufsize;
4493
4494	if ((pip == NULL) || MDI_PI(pip)->pi_prop == NULL) {
4495		return (DDI_PROP_NOT_FOUND);
4496	}
4497
4498	ASSERT(MDI_PI_LOCKED(pip));
4499
4500	bufsize = buflen;
4501	rv = nvlist_pack(MDI_PI(pip)->pi_prop, bufp, (size_t *)&bufsize,
4502	    NV_ENCODE_NATIVE, KM_SLEEP);
4503
4504	return (i_map_nvlist_error_to_mdi(rv));
4505}
4506
4507/*
4508 * mdi_prop_update_byte():
4509 *		Create/Update a byte property
4510 */
4511int
4512mdi_prop_update_byte(mdi_pathinfo_t *pip, char *name, uchar_t data)
4513{
4514	int rv;
4515
4516	if (pip == NULL) {
4517		return (DDI_PROP_INVAL_ARG);
4518	}
4519	ASSERT(!MDI_PI_LOCKED(pip));
4520	MDI_PI_LOCK(pip);
4521	if (MDI_PI(pip)->pi_prop == NULL) {
4522		MDI_PI_UNLOCK(pip);
4523		return (DDI_PROP_NOT_FOUND);
4524	}
4525	rv = nvlist_add_byte(MDI_PI(pip)->pi_prop, name, data);
4526	MDI_PI_UNLOCK(pip);
4527	return (i_map_nvlist_error_to_mdi(rv));
4528}
4529
4530/*
4531 * mdi_prop_update_byte_array():
4532 *		Create/Update a byte array property
4533 */
4534int
4535mdi_prop_update_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t *data,
4536    uint_t nelements)
4537{
4538	int rv;
4539
4540	if (pip == NULL) {
4541		return (DDI_PROP_INVAL_ARG);
4542	}
4543	ASSERT(!MDI_PI_LOCKED(pip));
4544	MDI_PI_LOCK(pip);
4545	if (MDI_PI(pip)->pi_prop == NULL) {
4546		MDI_PI_UNLOCK(pip);
4547		return (DDI_PROP_NOT_FOUND);
4548	}
4549	rv = nvlist_add_byte_array(MDI_PI(pip)->pi_prop, name, data, nelements);
4550	MDI_PI_UNLOCK(pip);
4551	return (i_map_nvlist_error_to_mdi(rv));
4552}
4553
4554/*
4555 * mdi_prop_update_int():
4556 *		Create/Update a 32 bit integer property
4557 */
4558int
4559mdi_prop_update_int(mdi_pathinfo_t *pip, char *name, int data)
4560{
4561	int rv;
4562
4563	if (pip == NULL) {
4564		return (DDI_PROP_INVAL_ARG);
4565	}
4566	ASSERT(!MDI_PI_LOCKED(pip));
4567	MDI_PI_LOCK(pip);
4568	if (MDI_PI(pip)->pi_prop == NULL) {
4569		MDI_PI_UNLOCK(pip);
4570		return (DDI_PROP_NOT_FOUND);
4571	}
4572	rv = nvlist_add_int32(MDI_PI(pip)->pi_prop, name, (int32_t)data);
4573	MDI_PI_UNLOCK(pip);
4574	return (i_map_nvlist_error_to_mdi(rv));
4575}
4576
4577/*
4578 * mdi_prop_update_int64():
4579 *		Create/Update a 64 bit integer property
4580 */
4581int
4582mdi_prop_update_int64(mdi_pathinfo_t *pip, char *name, int64_t data)
4583{
4584	int rv;
4585
4586	if (pip == NULL) {
4587		return (DDI_PROP_INVAL_ARG);
4588	}
4589	ASSERT(!MDI_PI_LOCKED(pip));
4590	MDI_PI_LOCK(pip);
4591	if (MDI_PI(pip)->pi_prop == NULL) {
4592		MDI_PI_UNLOCK(pip);
4593		return (DDI_PROP_NOT_FOUND);
4594	}
4595	rv = nvlist_add_int64(MDI_PI(pip)->pi_prop, name, data);
4596	MDI_PI_UNLOCK(pip);
4597	return (i_map_nvlist_error_to_mdi(rv));
4598}
4599
4600/*
4601 * mdi_prop_update_int_array():
4602 *		Create/Update a int array property
4603 */
4604int
4605mdi_prop_update_int_array(mdi_pathinfo_t *pip, char *name, int *data,
4606	    uint_t nelements)
4607{
4608	int rv;
4609
4610	if (pip == NULL) {
4611		return (DDI_PROP_INVAL_ARG);
4612	}
4613	ASSERT(!MDI_PI_LOCKED(pip));
4614	MDI_PI_LOCK(pip);
4615	if (MDI_PI(pip)->pi_prop == NULL) {
4616		MDI_PI_UNLOCK(pip);
4617		return (DDI_PROP_NOT_FOUND);
4618	}
4619	rv = nvlist_add_int32_array(MDI_PI(pip)->pi_prop, name, (int32_t *)data,
4620	    nelements);
4621	MDI_PI_UNLOCK(pip);
4622	return (i_map_nvlist_error_to_mdi(rv));
4623}
4624
4625/*
4626 * mdi_prop_update_string():
4627 *		Create/Update a string property
4628 */
4629int
4630mdi_prop_update_string(mdi_pathinfo_t *pip, char *name, char *data)
4631{
4632	int rv;
4633
4634	if (pip == NULL) {
4635		return (DDI_PROP_INVAL_ARG);
4636	}
4637	ASSERT(!MDI_PI_LOCKED(pip));
4638	MDI_PI_LOCK(pip);
4639	if (MDI_PI(pip)->pi_prop == NULL) {
4640		MDI_PI_UNLOCK(pip);
4641		return (DDI_PROP_NOT_FOUND);
4642	}
4643	rv = nvlist_add_string(MDI_PI(pip)->pi_prop, name, data);
4644	MDI_PI_UNLOCK(pip);
4645	return (i_map_nvlist_error_to_mdi(rv));
4646}
4647
4648/*
4649 * mdi_prop_update_string_array():
4650 *		Create/Update a string array property
4651 */
4652int
4653mdi_prop_update_string_array(mdi_pathinfo_t *pip, char *name, char **data,
4654    uint_t nelements)
4655{
4656	int rv;
4657
4658	if (pip == NULL) {
4659		return (DDI_PROP_INVAL_ARG);
4660	}
4661	ASSERT(!MDI_PI_LOCKED(pip));
4662	MDI_PI_LOCK(pip);
4663	if (MDI_PI(pip)->pi_prop == NULL) {
4664		MDI_PI_UNLOCK(pip);
4665		return (DDI_PROP_NOT_FOUND);
4666	}
4667	rv = nvlist_add_string_array(MDI_PI(pip)->pi_prop, name, data,
4668	    nelements);
4669	MDI_PI_UNLOCK(pip);
4670	return (i_map_nvlist_error_to_mdi(rv));
4671}
4672
4673/*
4674 * mdi_prop_lookup_byte():
4675 * 		Look for byte property identified by name.  The data returned
4676 *		is the actual property and valid as long as mdi_pathinfo_t node
4677 *		is alive.
4678 */
4679int
4680mdi_prop_lookup_byte(mdi_pathinfo_t *pip, char *name, uchar_t *data)
4681{
4682	int rv;
4683
4684	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4685		return (DDI_PROP_NOT_FOUND);
4686	}
4687	rv = nvlist_lookup_byte(MDI_PI(pip)->pi_prop, name, data);
4688	return (i_map_nvlist_error_to_mdi(rv));
4689}
4690
4691
4692/*
4693 * mdi_prop_lookup_byte_array():
4694 * 		Look for byte array property identified by name.  The data
4695 *		returned is the actual property and valid as long as
4696 *		mdi_pathinfo_t node is alive.
4697 */
4698int
4699mdi_prop_lookup_byte_array(mdi_pathinfo_t *pip, char *name, uchar_t **data,
4700    uint_t *nelements)
4701{
4702	int rv;
4703
4704	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4705		return (DDI_PROP_NOT_FOUND);
4706	}
4707	rv = nvlist_lookup_byte_array(MDI_PI(pip)->pi_prop, name, data,
4708	    nelements);
4709	return (i_map_nvlist_error_to_mdi(rv));
4710}
4711
4712/*
4713 * mdi_prop_lookup_int():
4714 * 		Look for int property identified by name.  The data returned
4715 *		is the actual property and valid as long as mdi_pathinfo_t
4716 *		node is alive.
4717 */
4718int
4719mdi_prop_lookup_int(mdi_pathinfo_t *pip, char *name, int *data)
4720{
4721	int rv;
4722
4723	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4724		return (DDI_PROP_NOT_FOUND);
4725	}
4726	rv = nvlist_lookup_int32(MDI_PI(pip)->pi_prop, name, (int32_t *)data);
4727	return (i_map_nvlist_error_to_mdi(rv));
4728}
4729
4730/*
4731 * mdi_prop_lookup_int64():
4732 * 		Look for int64 property identified by name.  The data returned
4733 *		is the actual property and valid as long as mdi_pathinfo_t node
4734 *		is alive.
4735 */
4736int
4737mdi_prop_lookup_int64(mdi_pathinfo_t *pip, char *name, int64_t *data)
4738{
4739	int rv;
4740	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4741		return (DDI_PROP_NOT_FOUND);
4742	}
4743	rv = nvlist_lookup_int64(MDI_PI(pip)->pi_prop, name, data);
4744	return (i_map_nvlist_error_to_mdi(rv));
4745}
4746
4747/*
4748 * mdi_prop_lookup_int_array():
4749 * 		Look for int array property identified by name.  The data
4750 *		returned is the actual property and valid as long as
4751 *		mdi_pathinfo_t node is alive.
4752 */
4753int
4754mdi_prop_lookup_int_array(mdi_pathinfo_t *pip, char *name, int **data,
4755    uint_t *nelements)
4756{
4757	int rv;
4758
4759	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4760		return (DDI_PROP_NOT_FOUND);
4761	}
4762	rv = nvlist_lookup_int32_array(MDI_PI(pip)->pi_prop, name,
4763	    (int32_t **)data, nelements);
4764	return (i_map_nvlist_error_to_mdi(rv));
4765}
4766
4767/*
4768 * mdi_prop_lookup_string():
4769 * 		Look for string property identified by name.  The data
4770 *		returned is the actual property and valid as long as
4771 *		mdi_pathinfo_t node is alive.
4772 */
4773int
4774mdi_prop_lookup_string(mdi_pathinfo_t *pip, char *name, char **data)
4775{
4776	int rv;
4777
4778	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4779		return (DDI_PROP_NOT_FOUND);
4780	}
4781	rv = nvlist_lookup_string(MDI_PI(pip)->pi_prop, name, data);
4782	return (i_map_nvlist_error_to_mdi(rv));
4783}
4784
4785/*
4786 * mdi_prop_lookup_string_array():
4787 * 		Look for string array property identified by name.  The data
4788 *		returned is the actual property and valid as long as
4789 *		mdi_pathinfo_t node is alive.
4790 */
4791int
4792mdi_prop_lookup_string_array(mdi_pathinfo_t *pip, char *name, char ***data,
4793    uint_t *nelements)
4794{
4795	int rv;
4796
4797	if ((pip == NULL) || (MDI_PI(pip)->pi_prop == NULL)) {
4798		return (DDI_PROP_NOT_FOUND);
4799	}
4800	rv = nvlist_lookup_string_array(MDI_PI(pip)->pi_prop, name, data,
4801	    nelements);
4802	return (i_map_nvlist_error_to_mdi(rv));
4803}
4804
/*
 * mdi_prop_free():
 * 		Symmetrical function to ddi_prop_free(). nvlist_lookup_xx()
 *		functions return the pointer to actual property data and not a
 *		copy of it.  So the data returned is valid as long as
 *		mdi_pathinfo_t node is valid.
 *
 *		Consequently there is nothing to release here: the argument
 *		is ignored and DDI_PROP_SUCCESS is always returned.
 */
/*ARGSUSED*/
int
mdi_prop_free(void *data)
{
	return (DDI_PROP_SUCCESS);
}
4818
4819/*ARGSUSED*/
4820static void
4821i_mdi_report_path_state(mdi_client_t *ct, mdi_pathinfo_t *pip)
4822{
4823	char		*ct_path;
4824	char		*ct_status;
4825	char		*status;
4826	dev_info_t	*cdip = ct->ct_dip;
4827	char		lb_buf[64];
4828	int		report_lb_c = 0, report_lb_p = 0;
4829
4830	ASSERT(MDI_CLIENT_LOCKED(ct));
4831	if ((cdip == NULL) || (ddi_get_instance(cdip) == -1) ||
4832	    (MDI_CLIENT_IS_REPORT_DEV_NEEDED(ct) == 0)) {
4833		return;
4834	}
4835	if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_OPTIMAL) {
4836		ct_status = "optimal";
4837		report_lb_c = 1;
4838	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED) {
4839		ct_status = "degraded";
4840	} else if (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_FAILED) {
4841		ct_status = "failed";
4842	} else {
4843		ct_status = "unknown";
4844	}
4845
4846	lb_buf[0] = 0;		/* not interested in load balancing config */
4847
4848	if (MDI_PI_FLAGS_IS_DEVICE_REMOVED(pip)) {
4849		status = "removed";
4850	} else if (MDI_PI_IS_OFFLINE(pip)) {
4851		status = "offline";
4852	} else if (MDI_PI_IS_ONLINE(pip)) {
4853		status = "online";
4854		report_lb_p = 1;
4855	} else if (MDI_PI_IS_STANDBY(pip)) {
4856		status = "standby";
4857	} else if (MDI_PI_IS_FAULT(pip)) {
4858		status = "faulted";
4859	} else {
4860		status = "unknown";
4861	}
4862
4863	if (cdip) {
4864		ct_path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
4865
4866		/*
4867		 * NOTE: Keeping "multipath status: %s" and
4868		 * "Load balancing: %s" format unchanged in case someone
4869		 * scrubs /var/adm/messages looking for these messages.
4870		 */
4871		if (report_lb_c && report_lb_p) {
4872			if (ct->ct_lb == LOAD_BALANCE_LBA) {
4873				(void) snprintf(lb_buf, sizeof (lb_buf),
4874				    "%s, region-size: %d", mdi_load_balance_lba,
4875				    ct->ct_lb_args->region_size);
4876			} else if (ct->ct_lb == LOAD_BALANCE_NONE) {
4877				(void) snprintf(lb_buf, sizeof (lb_buf),
4878				    "%s", mdi_load_balance_none);
4879			} else {
4880				(void) snprintf(lb_buf, sizeof (lb_buf), "%s",
4881				    mdi_load_balance_rr);
4882			}
4883
4884			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4885			    "?%s (%s%d) multipath status: %s: "
4886			    "path %d %s is %s: Load balancing: %s\n",
4887			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4888			    ddi_get_instance(cdip), ct_status,
4889			    mdi_pi_get_path_instance(pip),
4890			    mdi_pi_spathname(pip), status, lb_buf);
4891		} else {
4892			cmn_err(mdi_debug_consoleonly ? CE_NOTE : CE_CONT,
4893			    "?%s (%s%d) multipath status: %s: "
4894			    "path %d %s is %s\n",
4895			    ddi_pathname(cdip, ct_path), ddi_driver_name(cdip),
4896			    ddi_get_instance(cdip), ct_status,
4897			    mdi_pi_get_path_instance(pip),
4898			    mdi_pi_spathname(pip), status);
4899		}
4900
4901		kmem_free(ct_path, MAXPATHLEN);
4902		MDI_CLIENT_CLEAR_REPORT_DEV_NEEDED(ct);
4903	}
4904}
4905
#ifdef	DEBUG
/*
 * i_mdi_log():
 *		Utility function for error message management
 *
 *		The message is formatted into a local buffer, an optional
 *		leading '!', '?' or '^' is stripped from it, and the result
 *		is passed to cmn_err() prefixed with "mdi: ", the driver
 *		name/instance of dip (when dip is not NULL) and func.
 *
 *		NOTE: Implementation takes care of trailing \n for cmn_err,
 *		MDI_DEBUG should not terminate fmt strings with \n.
 *
 *		NOTE: If the level is >= 2, and there is no leading !?^
 *		then a leading ! is implied (but can be overriden via
 *		mdi_debug_consoleonly). If you are using kmdb on the console,
 *		consider setting mdi_debug_consoleonly to 1 as an aid.
 */
/*PRINTFLIKE4*/
static void
i_mdi_log(int level, const char *func, dev_info_t *dip, const char *fmt, ...)
{
	char		name[MAXNAMELEN];
	char		buf[512];
	char		*bp;
	va_list		ap;
	int		log_only = 0;
	int		boot_only = 0;
	int		console_only = 0;

	/* Build the "<driver><instance>: " prefix, or an empty one */
	if (dip) {
		(void) snprintf(name, sizeof(name), "%s%d: ",
		    ddi_driver_name(dip), ddi_get_instance(dip));
	} else {
		name[0] = 0;
	}

	/* Expand the caller's format; output longer than buf is truncated */
	va_start(ap, fmt);
	(void) vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	/*
	 * A leading '!', '?' or '^' in the expanded message selects the
	 * destination and is skipped; bp points at the text to print.
	 */
	switch (buf[0]) {
	case '!':
		bp = &buf[1];
		log_only = 1;
		break;
	case '?':
		bp = &buf[1];
		boot_only = 1;
		break;
	case '^':
		bp = &buf[1];
		console_only = 1;
		break;
	default:
		if (level >= 2)
			log_only = 1;		/* ! implied */
		bp = buf;
		break;
	}
	/* The global overrides win over whatever the message asked for */
	if (mdi_debug_logonly) {
		log_only = 1;
		boot_only = 0;
		console_only = 0;
	}
	if (mdi_debug_consoleonly) {
		log_only = 0;
		boot_only = 0;
		console_only = 1;
		level = CE_NOTE;
		/*
		 * Jump straight to the "console" label inside the switch
		 * below, bypassing the CE_NOTE -> CE_CONT conversion.
		 */
		goto console;
	}

	switch (level) {
	case CE_NOTE:
		level = CE_CONT;
		/* FALLTHROUGH */
	case CE_CONT:
		/* These formats carry an explicit trailing "\n" */
		if (boot_only) {
			cmn_err(level, "?mdi: %s%s: %s\n", name, func, bp);
		} else if (console_only) {
			cmn_err(level, "^mdi: %s%s: %s\n", name, func, bp);
		} else if (log_only) {
			cmn_err(level, "!mdi: %s%s: %s\n", name, func, bp);
		} else {
			cmn_err(level, "mdi: %s%s: %s\n", name, func, bp);
		}
		break;

	case CE_WARN:
	case CE_PANIC:
	console:
		/* Same messages as above, but without a trailing "\n" */
		if (boot_only) {
			cmn_err(level, "?mdi: %s%s: %s", name, func, bp);
		} else if (console_only) {
			cmn_err(level, "^mdi: %s%s: %s", name, func, bp);
		} else if (log_only) {
			cmn_err(level, "!mdi: %s%s: %s", name, func, bp);
		} else {
			cmn_err(level, "mdi: %s%s: %s", name, func, bp);
		}
		break;
	default:
		/* Any other level: func is not included in the output */
		cmn_err(level, "mdi: %s%s", name, bp);
		break;
	}
}
#endif	/* DEBUG */
5009
5010void
5011i_mdi_client_online(dev_info_t *ct_dip)
5012{
5013	mdi_client_t	*ct;
5014
5015	/*
5016	 * Client online notification. Mark client state as online
5017	 * restore our binding with dev_info node
5018	 */
5019	ct = i_devi_get_client(ct_dip);
5020	ASSERT(ct != NULL);
5021	MDI_CLIENT_LOCK(ct);
5022	MDI_CLIENT_SET_ONLINE(ct);
5023	/* catch for any memory leaks */
5024	ASSERT((ct->ct_dip == NULL) || (ct->ct_dip == ct_dip));
5025	ct->ct_dip = ct_dip;
5026
5027	if (ct->ct_power_cnt == 0)
5028		(void) i_mdi_power_all_phci(ct);
5029
5030	MDI_DEBUG(4, (MDI_NOTE, ct_dip,
5031	    "i_mdi_pm_hold_client %p", (void *)ct));
5032	i_mdi_pm_hold_client(ct, 1);
5033
5034	MDI_CLIENT_UNLOCK(ct);
5035}
5036
5037void
5038i_mdi_phci_online(dev_info_t *ph_dip)
5039{
5040	mdi_phci_t	*ph;
5041
5042	/* pHCI online notification. Mark state accordingly */
5043	ph = i_devi_get_phci(ph_dip);
5044	ASSERT(ph != NULL);
5045	MDI_PHCI_LOCK(ph);
5046	MDI_PHCI_SET_ONLINE(ph);
5047	MDI_PHCI_UNLOCK(ph);
5048}
5049
5050/*
5051 * mdi_devi_online():
5052 * 		Online notification from NDI framework on pHCI/client
5053 *		device online.
5054 * Return Values:
5055 *		NDI_SUCCESS
5056 *		MDI_FAILURE
5057 */
5058/*ARGSUSED*/
5059int
5060mdi_devi_online(dev_info_t *dip, uint_t flags)
5061{
5062	if (MDI_PHCI(dip)) {
5063		i_mdi_phci_online(dip);
5064	}
5065
5066	if (MDI_CLIENT(dip)) {
5067		i_mdi_client_online(dip);
5068	}
5069	return (NDI_SUCCESS);
5070}
5071
5072/*
5073 * mdi_devi_offline():
5074 * 		Offline notification from NDI framework on pHCI/Client device
5075 *		offline.
5076 *
5077 * Return Values:
5078 *		NDI_SUCCESS
5079 *		NDI_FAILURE
5080 */
5081/*ARGSUSED*/
5082int
5083mdi_devi_offline(dev_info_t *dip, uint_t flags)
5084{
5085	int		rv = NDI_SUCCESS;
5086
5087	if (MDI_CLIENT(dip)) {
5088		rv = i_mdi_client_offline(dip, flags);
5089		if (rv != NDI_SUCCESS)
5090			return (rv);
5091	}
5092
5093	if (MDI_PHCI(dip)) {
5094		rv = i_mdi_phci_offline(dip, flags);
5095
5096		if ((rv != NDI_SUCCESS) && MDI_CLIENT(dip)) {
5097			/* set client back online */
5098			i_mdi_client_online(dip);
5099		}
5100	}
5101
5102	return (rv);
5103}
5104
/*
 * i_mdi_phci_offline():
 *		Offline the pHCI bound to dip together with all of its child
 *		mdi_pathinfo nodes.
 *
 *		The work proceeds in stages:
 *		1. Refuse (NDI_BUSY) if the pHCI or any client reached
 *		   through it is unstable or failing over.
 *		2. For each path whose client would end up in the FAILED
 *		   state, offline the client dev_info node; if that fails,
 *		   walk the paths visited so far, online or offline their
 *		   clients according to client state, and return NDI_BUSY.
 *		3. Mark the pHCI offline and every path as offlining, delay
 *		   to let pending commands run, then offline each path.  If
 *		   a path does not reach the offline state the pHCI is
 *		   marked online again and NDI_BUSY is returned.
 *
 * Return Values:
 *		NDI_SUCCESS	pHCI offlined, was already offline, or dip
 *				has no pHCI
 *		NDI_BUSY	the offline could not be completed
 */
/*ARGSUSED*/
static int
i_mdi_phci_offline(dev_info_t *dip, uint_t flags)
{
	int		rv = NDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	mdi_pathinfo_t	*failed_pip = NULL;
	dev_info_t	*cdip;

	/*
	 * pHCI component offline notification
	 * Make sure that this pHCI instance is free to be offlined.
	 * If it is OK to proceed, Offline and remove all the child
	 * mdi_pathinfo nodes.  This process automatically offlines
	 * corresponding client devices, for which this pHCI provides
	 * critical services.
	 */
	ph = i_devi_get_phci(dip);
	MDI_DEBUG(2, (MDI_NOTE, dip,
	    "called %p %p", (void *)dip, (void *)ph));
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!pHCI already offlined: %p", (void *)dip));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_SUCCESS);
	}

	/*
	 * Check to see if the pHCI can be offlined
	 */
	if (ph->ph_unstable) {
		MDI_DEBUG(1, (MDI_WARN, dip,
		    "!One or more target devices are in transient state. "
		    "This device can not be removed at this moment. "
		    "Please try again later."));
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		/* Remember the successor before any lock is dropped */
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		/*
		 * The mdi_pathinfo state is OK. Check the client state.
		 * If failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			/*
			 * Failover is in progress, Fail the DR
			 */
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!pHCI device is busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are removing the last path of this
		 * client device...
		 */
		cdip = ct->ct_dip;
		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			/*
			 * Both the client and pHCI locks are dropped around
			 * the ndi_devi_offline() call and the pHCI lock is
			 * retaken afterwards.
			 */
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			if (ndi_devi_offline(cdip,
			    NDI_DEVFS_CLEAN) != NDI_SUCCESS) {
				/*
				 * ndi_devi_offline() failed.
				 * This pHCI provides the critical path
				 * to one or more client devices.
				 * Return busy.
				 */
				MDI_PHCI_LOCK(ph);
				MDI_DEBUG(1, (MDI_WARN, dip,
				    "!pHCI device is busy. "
				    "This device can not be removed at this "
				    "moment. Please try again later."));
				failed_pip = pip;
				break;
			} else {
				MDI_PHCI_LOCK(ph);
				pip = next;
			}
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	if (failed_pip) {
		/*
		 * Recovery pass: revisit every path that precedes the one
		 * whose client could not be offlined.
		 */
		pip = ph->ph_path_head;
		while (pip != failed_pip) {
			MDI_PI_LOCK(pip);
			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			switch (MDI_CLIENT_STATE(ct)) {
			case MDI_CLIENT_STATE_OPTIMAL:
			case MDI_CLIENT_STATE_DEGRADED:
				/* Client still has usable paths: online it */
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_online(cdip, 0);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;

			case MDI_CLIENT_STATE_FAILED:
				/* Client has no usable path: offline it */
				if (cdip) {
					MDI_PI_UNLOCK(pip);
					i_mdi_client_unlock(ct);
					MDI_PHCI_UNLOCK(ph);
					(void) ndi_devi_offline(cdip,
						NDI_DEVFS_CLEAN);
					MDI_PHCI_LOCK(ph);
					pip = next;
					continue;
				}
				break;
			}
			/* No dev_info node or other state: just move on */
			MDI_PI_UNLOCK(pip);
			i_mdi_client_unlock(ct);
			pip = next;
		}
		MDI_PHCI_UNLOCK(ph);
		return (NDI_BUSY);
	}

	/*
	 * Mark the pHCI as offline
	 */
	MDI_PHCI_SET_OFFLINE(ph);

	/*
	 * Mark the child mdi_pathinfo nodes as transient
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_SET_OFFLINING(pip);
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);
	/*
	 * Give a chance for any pending commands to execute
	 */
	delay_random(mdi_delay);
	MDI_PHCI_LOCK(ph);
	pip = ph->ph_path_head;
	while (pip != NULL) {
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		(void) i_mdi_pi_offline(pip, flags);
		MDI_PI_LOCK(pip);
		/* NOTE(review): ct is assigned here but not read afterwards */
		ct = MDI_PI(pip)->pi_client;
		if (!MDI_PI_IS_OFFLINE(pip)) {
			MDI_DEBUG(1, (MDI_WARN, dip,
			    "!pHCI device is busy. "
			    "This device can not be removed at this moment. "
			    "Please try again later."));
			MDI_PI_UNLOCK(pip);
			/*
			 * The pHCI is put back online.  NOTE(review): paths
			 * already processed by this loop are not visibly
			 * restored here -- confirm that is intended.
			 */
			MDI_PHCI_SET_ONLINE(ph);
			MDI_PHCI_UNLOCK(ph);
			return (NDI_BUSY);
		}
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);

	return (rv);
}
5303
5304void
5305mdi_phci_mark_retiring(dev_info_t *dip, char **cons_array)
5306{
5307	mdi_phci_t	*ph;
5308	mdi_client_t	*ct;
5309	mdi_pathinfo_t	*pip;
5310	mdi_pathinfo_t	*next;
5311	dev_info_t	*cdip;
5312
5313	if (!MDI_PHCI(dip))
5314		return;
5315
5316	ph = i_devi_get_phci(dip);
5317	if (ph == NULL) {
5318		return;
5319	}
5320
5321	MDI_PHCI_LOCK(ph);
5322
5323	if (MDI_PHCI_IS_OFFLINE(ph)) {
5324		/* has no last path */
5325		MDI_PHCI_UNLOCK(ph);
5326		return;
5327	}
5328
5329	pip = ph->ph_path_head;
5330	while (pip != NULL) {
5331		MDI_PI_LOCK(pip);
5332		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5333
5334		ct = MDI_PI(pip)->pi_client;
5335		i_mdi_client_lock(ct, pip);
5336		MDI_PI_UNLOCK(pip);
5337
5338		cdip = ct->ct_dip;
5339		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5340		    (i_mdi_client_compute_state(ct, ph) ==
5341		    MDI_CLIENT_STATE_FAILED)) {
5342			/* Last path. Mark client dip as retiring */
5343			i_mdi_client_unlock(ct);
5344			MDI_PHCI_UNLOCK(ph);
5345			(void) e_ddi_mark_retiring(cdip, cons_array);
5346			MDI_PHCI_LOCK(ph);
5347			pip = next;
5348		} else {
5349			i_mdi_client_unlock(ct);
5350			pip = next;
5351		}
5352	}
5353
5354	MDI_PHCI_UNLOCK(ph);
5355
5356	return;
5357}
5358
5359void
5360mdi_phci_retire_notify(dev_info_t *dip, int *constraint)
5361{
5362	mdi_phci_t	*ph;
5363	mdi_client_t	*ct;
5364	mdi_pathinfo_t	*pip;
5365	mdi_pathinfo_t	*next;
5366	dev_info_t	*cdip;
5367
5368	if (!MDI_PHCI(dip))
5369		return;
5370
5371	ph = i_devi_get_phci(dip);
5372	if (ph == NULL)
5373		return;
5374
5375	MDI_PHCI_LOCK(ph);
5376
5377	if (MDI_PHCI_IS_OFFLINE(ph)) {
5378		MDI_PHCI_UNLOCK(ph);
5379		/* not last path */
5380		return;
5381	}
5382
5383	if (ph->ph_unstable) {
5384		MDI_PHCI_UNLOCK(ph);
5385		/* can't check for constraints */
5386		*constraint = 0;
5387		return;
5388	}
5389
5390	pip = ph->ph_path_head;
5391	while (pip != NULL) {
5392		MDI_PI_LOCK(pip);
5393		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5394
5395		/*
5396		 * The mdi_pathinfo state is OK. Check the client state.
5397		 * If failover in progress fail the pHCI from offlining
5398		 */
5399		ct = MDI_PI(pip)->pi_client;
5400		i_mdi_client_lock(ct, pip);
5401		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
5402		    (ct->ct_unstable)) {
5403			/*
5404			 * Failover is in progress, can't check for constraints
5405			 */
5406			MDI_PI_UNLOCK(pip);
5407			i_mdi_client_unlock(ct);
5408			MDI_PHCI_UNLOCK(ph);
5409			*constraint = 0;
5410			return;
5411		}
5412		MDI_PI_UNLOCK(pip);
5413
5414		/*
5415		 * Check to see of we are retiring the last path of this
5416		 * client device...
5417		 */
5418		cdip = ct->ct_dip;
5419		if (cdip && (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
5420		    (i_mdi_client_compute_state(ct, ph) ==
5421		    MDI_CLIENT_STATE_FAILED)) {
5422			i_mdi_client_unlock(ct);
5423			MDI_PHCI_UNLOCK(ph);
5424			(void) e_ddi_retire_notify(cdip, constraint);
5425			MDI_PHCI_LOCK(ph);
5426			pip = next;
5427		} else {
5428			i_mdi_client_unlock(ct);
5429			pip = next;
5430		}
5431	}
5432
5433	MDI_PHCI_UNLOCK(ph);
5434
5435	return;
5436}
5437
/*
 * offline the path(s) hanging off the pHCI. If the
 * last path to any client, check that constraints
 * have been applied.
 *
 * If constraint is 0, we aren't going to retire the
 * pHCI. However we still need to go through the paths
 * calling e_ddi_retire_finalize() to clear their
 * contract barriers.
 *
 * Arguments:
 *	dip		pHCI dev_info node; ignored unless it is a pHCI
 *	phci_only	when non-zero, clients are not finalized and the
 *			constraint value is not consulted
 *	constraint	read as an int; a value of 0 (with phci_only clear)
 *			stops the function before any path is offlined
 *
 * Failures are reported with cmn_err(CE_WARN); nothing is returned.
 */
void
mdi_phci_retire_finalize(dev_info_t *dip, int phci_only, void *constraint)
{
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*next;
	dev_info_t	*cdip;
	int		unstable = 0;
	int		tmp_constraint;

	if (!MDI_PHCI(dip))
		return;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		/* no last path and no pips */
		return;
	}

	MDI_PHCI_LOCK(ph);

	if (MDI_PHCI_IS_OFFLINE(ph)) {
		MDI_PHCI_UNLOCK(ph);
		/* no last path and no pips */
		return;
	}

	/*
	 * Check to see if the pHCI can be offlined
	 */
	if (ph->ph_unstable) {
		unstable = 1;
	}

	/*
	 * First pass: record instability and finalize the clients for which
	 * this pHCI is the last path.  Unlike the offline path, instability
	 * does not stop the walk; it is only acted on after the loop.
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

		/*
		 * if failover in progress fail the pHCI from offlining
		 */
		ct = MDI_PI(pip)->pi_client;
		i_mdi_client_lock(ct, pip);
		if ((MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) ||
		    (ct->ct_unstable)) {
			unstable = 1;
		}
		MDI_PI_UNLOCK(pip);

		/*
		 * Check to see of we are removing the last path of this
		 * client device...
		 */
		cdip = ct->ct_dip;
		if (!phci_only && cdip &&
		    (i_ddi_node_state(cdip) >= DS_INITIALIZED) &&
		    (i_mdi_client_compute_state(ct, ph) ==
		    MDI_CLIENT_STATE_FAILED)) {
			i_mdi_client_unlock(ct);
			MDI_PHCI_UNLOCK(ph);
			/*
			 * This is the last path to this client.
			 *
			 * Constraint will only be set to 1 if this client can
			 * be retired (as already determined by
			 * mdi_phci_retire_notify). However we don't actually
			 * need to retire the client (we just retire the last
			 * path - MPXIO will then fail all I/Os to the client).
			 * But we still need to call e_ddi_retire_finalize so
			 * the contract barriers can be cleared. Therefore we
			 * temporarily set constraint = 0 so that the client
			 * dip is not retired.
			 */
			tmp_constraint = 0;
			(void) e_ddi_retire_finalize(cdip, &tmp_constraint);
			MDI_PHCI_LOCK(ph);
			pip = next;
		} else {
			i_mdi_client_unlock(ct);
			pip = next;
		}
	}

	/* Constraint not satisfied: leave the pHCI and its paths alone */
	if (!phci_only && *((int *)constraint) == 0) {
		MDI_PHCI_UNLOCK(ph);
		return;
	}

	/*
	 * Cannot offline pip(s)
	 */
	if (unstable) {
		cmn_err(CE_WARN, "%s%d: mdi_phci_retire_finalize: "
		    "pHCI in transient state, cannot retire",
		    ddi_driver_name(dip), ddi_get_instance(dip));
		MDI_PHCI_UNLOCK(ph);
		return;
	}

	/*
	 * Mark the pHCI as offline
	 */
	MDI_PHCI_SET_OFFLINE(ph);

	/*
	 * Mark the child mdi_pathinfo nodes as transient
	 */
	pip = ph->ph_path_head;
	while (pip != NULL) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_SET_OFFLINING(pip);
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);
	/*
	 * Give a chance for any pending commands to execute
	 */
	delay_random(mdi_delay);
	MDI_PHCI_LOCK(ph);
	pip = ph->ph_path_head;
	while (pip != NULL) {
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		(void) i_mdi_pi_offline(pip, 0);
		MDI_PI_LOCK(pip);
		/* NOTE(review): ct is assigned here but not read afterwards */
		ct = MDI_PI(pip)->pi_client;
		if (!MDI_PI_IS_OFFLINE(pip)) {
			cmn_err(CE_WARN, "mdi_phci_retire_finalize: "
			    "path %d %s busy, cannot offline",
			    mdi_pi_get_path_instance(pip),
			    mdi_pi_spathname(pip));
			MDI_PI_UNLOCK(pip);
			/* Give up: put the pHCI back online and stop */
			MDI_PHCI_SET_ONLINE(ph);
			MDI_PHCI_UNLOCK(ph);
			return;
		}
		MDI_PI_UNLOCK(pip);
		pip = next;
	}
	MDI_PHCI_UNLOCK(ph);

	return;
}
5594
5595void
5596mdi_phci_unretire(dev_info_t *dip)
5597{
5598	mdi_phci_t	*ph;
5599	mdi_pathinfo_t	*pip;
5600	mdi_pathinfo_t	*next;
5601
5602	ASSERT(MDI_PHCI(dip));
5603
5604	/*
5605	 * Online the phci
5606	 */
5607	i_mdi_phci_online(dip);
5608
5609	ph = i_devi_get_phci(dip);
5610	MDI_PHCI_LOCK(ph);
5611	pip = ph->ph_path_head;
5612	while (pip != NULL) {
5613		MDI_PI_LOCK(pip);
5614		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
5615		MDI_PI_UNLOCK(pip);
5616		(void) i_mdi_pi_online(pip, 0);
5617		pip = next;
5618	}
5619	MDI_PHCI_UNLOCK(ph);
5620}
5621
5622/*ARGSUSED*/
5623static int
5624i_mdi_client_offline(dev_info_t *dip, uint_t flags)
5625{
5626	int		rv = NDI_SUCCESS;
5627	mdi_client_t	*ct;
5628
5629	/*
5630	 * Client component to go offline.  Make sure that we are
5631	 * not in failing over state and update client state
5632	 * accordingly
5633	 */
5634	ct = i_devi_get_client(dip);
5635	MDI_DEBUG(2, (MDI_NOTE, dip,
5636	    "called %p %p", (void *)dip, (void *)ct));
5637	if (ct != NULL) {
5638		MDI_CLIENT_LOCK(ct);
5639		if (ct->ct_unstable) {
5640			/*
5641			 * One or more paths are in transient state,
5642			 * Dont allow offline of a client device
5643			 */
5644			MDI_DEBUG(1, (MDI_WARN, dip,
5645			    "!One or more paths to "
5646			    "this device are in transient state. "
5647			    "This device can not be removed at this moment. "
5648			    "Please try again later."));
5649			MDI_CLIENT_UNLOCK(ct);
5650			return (NDI_BUSY);
5651		}
5652		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
5653			/*
5654			 * Failover is in progress, Dont allow DR of
5655			 * a client device
5656			 */
5657			MDI_DEBUG(1, (MDI_WARN, dip,
5658			    "!Client device is Busy. "
5659			    "This device can not be removed at this moment. "
5660			    "Please try again later."));
5661			MDI_CLIENT_UNLOCK(ct);
5662			return (NDI_BUSY);
5663		}
5664		MDI_CLIENT_SET_OFFLINE(ct);
5665
5666		/*
5667		 * Unbind our relationship with the dev_info node
5668		 */
5669		if (flags & NDI_DEVI_REMOVE) {
5670			ct->ct_dip = NULL;
5671		}
5672		MDI_CLIENT_UNLOCK(ct);
5673	}
5674	return (rv);
5675}
5676
5677/*
5678 * mdi_pre_attach():
5679 *		Pre attach() notification handler
5680 */
5681/*ARGSUSED*/
5682int
5683mdi_pre_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5684{
5685	/* don't support old DDI_PM_RESUME */
5686	if ((DEVI(dip)->devi_mdi_component != MDI_COMPONENT_NONE) &&
5687	    (cmd == DDI_PM_RESUME))
5688		return (DDI_FAILURE);
5689
5690	return (DDI_SUCCESS);
5691}
5692
5693/*
5694 * mdi_post_attach():
5695 *		Post attach() notification handler
5696 */
5697/*ARGSUSED*/
5698void
5699mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error)
5700{
5701	mdi_phci_t	*ph;
5702	mdi_client_t	*ct;
5703	mdi_vhci_t	*vh;
5704
5705	if (MDI_PHCI(dip)) {
5706		ph = i_devi_get_phci(dip);
5707		ASSERT(ph != NULL);
5708
5709		MDI_PHCI_LOCK(ph);
5710		switch (cmd) {
5711		case DDI_ATTACH:
5712			MDI_DEBUG(2, (MDI_NOTE, dip,
5713			    "phci post_attach called %p", (void *)ph));
5714			if (error == DDI_SUCCESS) {
5715				MDI_PHCI_SET_ATTACH(ph);
5716			} else {
5717				MDI_DEBUG(1, (MDI_NOTE, dip,
5718				    "!pHCI post_attach failed: error %d",
5719				    error));
5720				MDI_PHCI_SET_DETACH(ph);
5721			}
5722			break;
5723
5724		case DDI_RESUME:
5725			MDI_DEBUG(2, (MDI_NOTE, dip,
5726			    "pHCI post_resume: called %p", (void *)ph));
5727			if (error == DDI_SUCCESS) {
5728				MDI_PHCI_SET_RESUME(ph);
5729			} else {
5730				MDI_DEBUG(1, (MDI_NOTE, dip,
5731				    "!pHCI post_resume failed: error %d",
5732				    error));
5733				MDI_PHCI_SET_SUSPEND(ph);
5734			}
5735			break;
5736		}
5737		MDI_PHCI_UNLOCK(ph);
5738	}
5739
5740	if (MDI_CLIENT(dip)) {
5741		ct = i_devi_get_client(dip);
5742		ASSERT(ct != NULL);
5743
5744		MDI_CLIENT_LOCK(ct);
5745		switch (cmd) {
5746		case DDI_ATTACH:
5747			MDI_DEBUG(2, (MDI_NOTE, dip,
5748			    "client post_attach called %p", (void *)ct));
5749			if (error != DDI_SUCCESS) {
5750				MDI_DEBUG(1, (MDI_NOTE, dip,
5751				    "!client post_attach failed: error %d",
5752				    error));
5753				MDI_CLIENT_SET_DETACH(ct);
5754				MDI_DEBUG(4, (MDI_WARN, dip,
5755				    "i_mdi_pm_reset_client"));
5756				i_mdi_pm_reset_client(ct);
5757				break;
5758			}
5759
5760			/*
5761			 * Client device has successfully attached, inform
5762			 * the vhci.
5763			 */
5764			vh = ct->ct_vhci;
5765			if (vh->vh_ops->vo_client_attached)
5766				(*vh->vh_ops->vo_client_attached)(dip);
5767
5768			MDI_CLIENT_SET_ATTACH(ct);
5769			break;
5770
5771		case DDI_RESUME:
5772			MDI_DEBUG(2, (MDI_NOTE, dip,
5773			    "client post_attach: called %p", (void *)ct));
5774			if (error == DDI_SUCCESS) {
5775				MDI_CLIENT_SET_RESUME(ct);
5776			} else {
5777				MDI_DEBUG(1, (MDI_NOTE, dip,
5778				    "!client post_resume failed: error %d",
5779				    error));
5780				MDI_CLIENT_SET_SUSPEND(ct);
5781			}
5782			break;
5783		}
5784		MDI_CLIENT_UNLOCK(ct);
5785	}
5786}
5787
5788/*
5789 * mdi_pre_detach():
5790 *		Pre detach notification handler
5791 */
5792/*ARGSUSED*/
5793int
5794mdi_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5795{
5796	int rv = DDI_SUCCESS;
5797
5798	if (MDI_CLIENT(dip)) {
5799		(void) i_mdi_client_pre_detach(dip, cmd);
5800	}
5801
5802	if (MDI_PHCI(dip)) {
5803		rv = i_mdi_phci_pre_detach(dip, cmd);
5804	}
5805
5806	return (rv);
5807}
5808
/*
 * i_mdi_phci_pre_detach():
 *		pHCI half of mdi_pre_detach().
 *
 *		DDI_DETACH:  refused (DDI_FAILURE) unless the pHCI is already
 *		marked offline; otherwise the pHCI is marked detached.
 *
 *		DDI_SUSPEND: every client reachable through this pHCI's path
 *		list that is neither detached nor already suspended is
 *		suspended first via devi_detach(DDI_SUSPEND).  If one of
 *		them fails, the paths preceding the failing one are walked
 *		again and every client found in the suspended state is
 *		resumed; the pHCI is then left un-suspended.
 *
 *		Any other command yields DDI_FAILURE.
 *
 *		The pHCI lock is held for the whole operation.  A node with
 *		no pHCI structure is treated as success.
 */
/*ARGSUSED*/
static int
i_mdi_phci_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int		rv = DDI_SUCCESS;
	mdi_phci_t	*ph;
	mdi_client_t	*ct;
	mdi_pathinfo_t	*pip;
	mdi_pathinfo_t	*failed_pip = NULL;
	mdi_pathinfo_t	*next;

	ph = i_devi_get_phci(dip);
	if (ph == NULL) {
		return (rv);
	}

	MDI_PHCI_LOCK(ph);
	switch (cmd) {
	case DDI_DETACH:
		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI pre_detach: called %p", (void *)ph));
		if (!MDI_PHCI_IS_OFFLINE(ph)) {
			/*
			 * mdi_pathinfo nodes are still attached to
			 * this pHCI. Fail the detach for this pHCI.
			 */
			MDI_DEBUG(2, (MDI_WARN, dip,
			    "pHCI pre_detach: paths are still attached %p",
			    (void *)ph));
			rv = DDI_FAILURE;
			break;
		}
		MDI_PHCI_SET_DETACH(ph);
		break;

	case DDI_SUSPEND:
		/*
		 * pHCI is getting suspended.  Since mpxio client
		 * devices may not be suspended at this point, to avoid
		 * a potential stack overflow, it is important to suspend
		 * client devices before pHCI can be suspended.
		 */

		MDI_DEBUG(2, (MDI_NOTE, dip,
		    "pHCI pre_suspend: called %p", (void *)ph));
		/*
		 * Suspend all the client devices accessible through this pHCI
		 */
		pip = ph->ph_path_head;
		while (pip != NULL && rv == DDI_SUCCESS) {
			dev_info_t *cdip;
			/*
			 * Capture the successor and the client while the
			 * path lock is held, then switch over to the client
			 * lock before letting go of the path lock.
			 */
			MDI_PI_LOCK(pip);
			next =
			    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
			ct = MDI_PI(pip)->pi_client;
			i_mdi_client_lock(ct, pip);
			cdip = ct->ct_dip;
			MDI_PI_UNLOCK(pip);
			if ((MDI_CLIENT_IS_DETACHED(ct) == 0) &&
			    MDI_CLIENT_IS_SUSPENDED(ct) == 0) {
				/* The client lock is dropped before the call. */
				i_mdi_client_unlock(ct);
				if ((rv = devi_detach(cdip, DDI_SUSPEND)) !=
				    DDI_SUCCESS) {
					/*
					 * Suspend of one of the client
					 * device has failed.
					 */
					MDI_DEBUG(1, (MDI_WARN, dip,
					    "!suspend of device (%s%d) failed.",
					    ddi_driver_name(cdip),
					    ddi_get_instance(cdip)));
					/* Remember where the roll-back stops. */
					failed_pip = pip;
					break;
				}
			} else {
				i_mdi_client_unlock(ct);
			}
			pip = next;
		}

		if (rv == DDI_SUCCESS) {
			/*
			 * Suspend of client devices is complete. Proceed
			 * with pHCI suspend.
			 */
			MDI_PHCI_SET_SUSPEND(ph);
		} else {
			/*
			 * Undo the partial suspend: walk the paths that
			 * precede the failing one and resume every client
			 * that is currently marked suspended.
			 */
			pip = ph->ph_path_head;
			while (pip != failed_pip) {
				dev_info_t *cdip;
				MDI_PI_LOCK(pip);
				next =
				    (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
				ct = MDI_PI(pip)->pi_client;
				i_mdi_client_lock(ct, pip);
				cdip = ct->ct_dip;
				MDI_PI_UNLOCK(pip);
				if (MDI_CLIENT_IS_SUSPENDED(ct)) {
					i_mdi_client_unlock(ct);
					/* Best effort: the result is ignored. */
					(void) devi_attach(cdip, DDI_RESUME);
				} else {
					i_mdi_client_unlock(ct);
				}
				pip = next;
			}
		}
		break;

	default:
		rv = DDI_FAILURE;
		break;
	}
	MDI_PHCI_UNLOCK(ph);
	return (rv);
}
5928
5929/*ARGSUSED*/
5930static int
5931i_mdi_client_pre_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5932{
5933	int		rv = DDI_SUCCESS;
5934	mdi_client_t	*ct;
5935
5936	ct = i_devi_get_client(dip);
5937	if (ct == NULL) {
5938		return (rv);
5939	}
5940
5941	MDI_CLIENT_LOCK(ct);
5942	switch (cmd) {
5943	case DDI_DETACH:
5944		MDI_DEBUG(2, (MDI_NOTE, dip,
5945		    "client pre_detach: called %p",
5946		     (void *)ct));
5947		MDI_CLIENT_SET_DETACH(ct);
5948		break;
5949
5950	case DDI_SUSPEND:
5951		MDI_DEBUG(2, (MDI_NOTE, dip,
5952		    "client pre_suspend: called %p",
5953		    (void *)ct));
5954		MDI_CLIENT_SET_SUSPEND(ct);
5955		break;
5956
5957	default:
5958		rv = DDI_FAILURE;
5959		break;
5960	}
5961	MDI_CLIENT_UNLOCK(ct);
5962	return (rv);
5963}
5964
5965/*
5966 * mdi_post_detach():
5967 *		Post detach notification handler
5968 */
5969/*ARGSUSED*/
5970void
5971mdi_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5972{
5973	/*
5974	 * Detach/Suspend of mpxio component failed. Update our state
5975	 * too
5976	 */
5977	if (MDI_PHCI(dip))
5978		i_mdi_phci_post_detach(dip, cmd, error);
5979
5980	if (MDI_CLIENT(dip))
5981		i_mdi_client_post_detach(dip, cmd, error);
5982}
5983
5984/*ARGSUSED*/
5985static void
5986i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
5987{
5988	mdi_phci_t	*ph;
5989
5990	/*
5991	 * Detach/Suspend of phci component failed. Update our state
5992	 * too
5993	 */
5994	ph = i_devi_get_phci(dip);
5995	if (ph == NULL) {
5996		return;
5997	}
5998
5999	MDI_PHCI_LOCK(ph);
6000	/*
6001	 * Detach of pHCI failed. Restore back converse
6002	 * state
6003	 */
6004	switch (cmd) {
6005	case DDI_DETACH:
6006		MDI_DEBUG(2, (MDI_NOTE, dip,
6007		    "pHCI post_detach: called %p",
6008		    (void *)ph));
6009		if (error != DDI_SUCCESS)
6010			MDI_PHCI_SET_ATTACH(ph);
6011		break;
6012
6013	case DDI_SUSPEND:
6014		MDI_DEBUG(2, (MDI_NOTE, dip,
6015		    "pHCI post_suspend: called %p",
6016		    (void *)ph));
6017		if (error != DDI_SUCCESS)
6018			MDI_PHCI_SET_RESUME(ph);
6019		break;
6020	}
6021	MDI_PHCI_UNLOCK(ph);
6022}
6023
6024/*ARGSUSED*/
6025static void
6026i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error)
6027{
6028	mdi_client_t	*ct;
6029
6030	ct = i_devi_get_client(dip);
6031	if (ct == NULL) {
6032		return;
6033	}
6034	MDI_CLIENT_LOCK(ct);
6035	/*
6036	 * Detach of Client failed. Restore back converse
6037	 * state
6038	 */
6039	switch (cmd) {
6040	case DDI_DETACH:
6041		MDI_DEBUG(2, (MDI_NOTE, dip,
6042		    "client post_detach: called %p", (void *)ct));
6043		if (DEVI_IS_ATTACHING(dip)) {
6044			MDI_DEBUG(4, (MDI_NOTE, dip,
6045			    "i_mdi_pm_rele_client\n"));
6046			i_mdi_pm_rele_client(ct, ct->ct_path_count);
6047		} else {
6048			MDI_DEBUG(4, (MDI_NOTE, dip,
6049			    "i_mdi_pm_reset_client\n"));
6050			i_mdi_pm_reset_client(ct);
6051		}
6052		if (error != DDI_SUCCESS)
6053			MDI_CLIENT_SET_ATTACH(ct);
6054		break;
6055
6056	case DDI_SUSPEND:
6057		MDI_DEBUG(2, (MDI_NOTE, dip,
6058		    "called %p", (void *)ct));
6059		if (error != DDI_SUCCESS)
6060			MDI_CLIENT_SET_RESUME(ct);
6061		break;
6062	}
6063	MDI_CLIENT_UNLOCK(ct);
6064}
6065
6066int
6067mdi_pi_kstat_exists(mdi_pathinfo_t *pip)
6068{
6069	return (MDI_PI(pip)->pi_kstats ? 1 : 0);
6070}
6071
6072/*
6073 * create and install per-path (client - pHCI) statistics
6074 * I/O stats supported: nread, nwritten, reads, and writes
6075 * Error stats - hard errors, soft errors, & transport errors
6076 */
6077int
6078mdi_pi_kstat_create(mdi_pathinfo_t *pip, char *ksname)
6079{
6080	kstat_t			*kiosp, *kerrsp;
6081	struct pi_errs		*nsp;
6082	struct mdi_pi_kstats	*mdi_statp;
6083
6084	if (MDI_PI(pip)->pi_kstats != NULL)
6085		return (MDI_SUCCESS);
6086
6087	if ((kiosp = kstat_create("mdi", 0, ksname, "iopath",
6088	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
6089		return (MDI_FAILURE);
6090	}
6091
6092	(void) strcat(ksname, ",err");
6093	kerrsp = kstat_create("mdi", 0, ksname, "iopath_errors",
6094	    KSTAT_TYPE_NAMED,
6095	    sizeof (struct pi_errs) / sizeof (kstat_named_t), 0);
6096	if (kerrsp == NULL) {
6097		kstat_delete(kiosp);
6098		return (MDI_FAILURE);
6099	}
6100
6101	nsp = (struct pi_errs *)kerrsp->ks_data;
6102	kstat_named_init(&nsp->pi_softerrs, "Soft Errors", KSTAT_DATA_UINT32);
6103	kstat_named_init(&nsp->pi_harderrs, "Hard Errors", KSTAT_DATA_UINT32);
6104	kstat_named_init(&nsp->pi_transerrs, "Transport Errors",
6105	    KSTAT_DATA_UINT32);
6106	kstat_named_init(&nsp->pi_icnt_busy, "Interconnect Busy",
6107	    KSTAT_DATA_UINT32);
6108	kstat_named_init(&nsp->pi_icnt_errors, "Interconnect Errors",
6109	    KSTAT_DATA_UINT32);
6110	kstat_named_init(&nsp->pi_phci_rsrc, "pHCI No Resources",
6111	    KSTAT_DATA_UINT32);
6112	kstat_named_init(&nsp->pi_phci_localerr, "pHCI Local Errors",
6113	    KSTAT_DATA_UINT32);
6114	kstat_named_init(&nsp->pi_phci_invstate, "pHCI Invalid State",
6115	    KSTAT_DATA_UINT32);
6116	kstat_named_init(&nsp->pi_failedfrom, "Failed From",
6117	    KSTAT_DATA_UINT32);
6118	kstat_named_init(&nsp->pi_failedto, "Failed To", KSTAT_DATA_UINT32);
6119
6120	mdi_statp = kmem_alloc(sizeof (*mdi_statp), KM_SLEEP);
6121	mdi_statp->pi_kstat_ref = 1;
6122	mdi_statp->pi_kstat_iostats = kiosp;
6123	mdi_statp->pi_kstat_errstats = kerrsp;
6124	kstat_install(kiosp);
6125	kstat_install(kerrsp);
6126	MDI_PI(pip)->pi_kstats = mdi_statp;
6127	return (MDI_SUCCESS);
6128}
6129
6130/*
6131 * destroy per-path properties
6132 */
6133static void
6134i_mdi_pi_kstat_destroy(mdi_pathinfo_t *pip)
6135{
6136
6137	struct mdi_pi_kstats *mdi_statp;
6138
6139	if (MDI_PI(pip)->pi_kstats == NULL)
6140		return;
6141	if ((mdi_statp = MDI_PI(pip)->pi_kstats) == NULL)
6142		return;
6143
6144	MDI_PI(pip)->pi_kstats = NULL;
6145
6146	/*
6147	 * the kstat may be shared between multiple pathinfo nodes
6148	 * decrement this pathinfo's usage, removing the kstats
6149	 * themselves when the last pathinfo reference is removed.
6150	 */
6151	ASSERT(mdi_statp->pi_kstat_ref > 0);
6152	if (--mdi_statp->pi_kstat_ref != 0)
6153		return;
6154
6155	kstat_delete(mdi_statp->pi_kstat_iostats);
6156	kstat_delete(mdi_statp->pi_kstat_errstats);
6157	kmem_free(mdi_statp, sizeof (*mdi_statp));
6158}
6159
6160/*
6161 * update I/O paths KSTATS
6162 */
6163void
6164mdi_pi_kstat_iosupdate(mdi_pathinfo_t *pip, struct buf *bp)
6165{
6166	kstat_t *iostatp;
6167	size_t xfer_cnt;
6168
6169	ASSERT(pip != NULL);
6170
6171	/*
6172	 * I/O can be driven across a path prior to having path
6173	 * statistics available, i.e. probe(9e).
6174	 */
6175	if (bp != NULL && MDI_PI(pip)->pi_kstats != NULL) {
6176		iostatp = MDI_PI(pip)->pi_kstats->pi_kstat_iostats;
6177		xfer_cnt = bp->b_bcount - bp->b_resid;
6178		if (bp->b_flags & B_READ) {
6179			KSTAT_IO_PTR(iostatp)->reads++;
6180			KSTAT_IO_PTR(iostatp)->nread += xfer_cnt;
6181		} else {
6182			KSTAT_IO_PTR(iostatp)->writes++;
6183			KSTAT_IO_PTR(iostatp)->nwritten += xfer_cnt;
6184		}
6185	}
6186}
6187
6188/*
6189 * Enable the path(specific client/target/initiator)
6190 * Enabling a path means that MPxIO may select the enabled path for routing
6191 * future I/O requests, subject to other path state constraints.
6192 */
6193int
6194mdi_pi_enable_path(mdi_pathinfo_t *pip, int flags)
6195{
6196	mdi_phci_t	*ph;
6197
6198	ph = MDI_PI(pip)->pi_phci;
6199	if (ph == NULL) {
6200		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6201		    "!failed: path %s %p: NULL ph",
6202		    mdi_pi_spathname(pip), (void *)pip));
6203		return (MDI_FAILURE);
6204	}
6205
6206	(void) i_mdi_enable_disable_path(pip, ph->ph_vhci, flags,
6207		MDI_ENABLE_OP);
6208	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6209	    "!returning success pip = %p. ph = %p",
6210	    (void *)pip, (void *)ph));
6211	return (MDI_SUCCESS);
6212
6213}
6214
6215/*
6216 * Disable the path (specific client/target/initiator)
6217 * Disabling a path means that MPxIO will not select the disabled path for
6218 * routing any new I/O requests.
6219 */
6220int
6221mdi_pi_disable_path(mdi_pathinfo_t *pip, int flags)
6222{
6223	mdi_phci_t	*ph;
6224
6225	ph = MDI_PI(pip)->pi_phci;
6226	if (ph == NULL) {
6227		MDI_DEBUG(1, (MDI_NOTE, mdi_pi_get_phci(pip),
6228		    "!failed: path %s %p: NULL ph",
6229		    mdi_pi_spathname(pip), (void *)pip));
6230		return (MDI_FAILURE);
6231	}
6232
6233	(void) i_mdi_enable_disable_path(pip,
6234	    ph->ph_vhci, flags, MDI_DISABLE_OP);
6235	MDI_DEBUG(5, (MDI_NOTE, ph->ph_dip,
6236	    "!returning success pip = %p. ph = %p",
6237	    (void *)pip, (void *)ph));
6238	return (MDI_SUCCESS);
6239}
6240
6241/*
6242 * disable the path to a particular pHCI (pHCI specified in the phci_path
6243 * argument) for a particular client (specified in the client_path argument).
6244 * Disabling a path means that MPxIO will not select the disabled path for
6245 * routing any new I/O requests.
6246 * NOTE: this will be removed once the NWS files are changed to use the new
6247 * mdi_{enable,disable}_path interfaces
6248 */
6249int
6250mdi_pi_disable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6251{
6252	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_DISABLE_OP));
6253}
6254
6255/*
6256 * Enable the path to a particular pHCI (pHCI specified in the phci_path
6257 * argument) for a particular client (specified in the client_path argument).
6258 * Enabling a path means that MPxIO may select the enabled path for routing
6259 * future I/O requests, subject to other path state constraints.
6260 * NOTE: this will be removed once the NWS files are changed to use the new
6261 * mdi_{enable,disable}_path interfaces
6262 */
6263
6264int
6265mdi_pi_enable(dev_info_t *cdip, dev_info_t *pdip, int flags)
6266{
6267	return (i_mdi_pi_enable_disable(cdip, pdip, flags, MDI_ENABLE_OP));
6268}
6269
/*
 * Common routine for doing enable/disable of a single path.
 *
 *	pip	- path to operate on
 *	vh	- vHCI whose vo_pi_state_change callback (if any) is notified
 *	flags	- USER_DISABLE, DRIVER_DISABLE or DRIVER_DISABLE_TRANSIENT;
 *		  any other value leaves the path state untouched, although
 *		  the callbacks are still made
 *	op	- MDI_ENABLE_OP or MDI_DISABLE_OP
 *
 * The vHCI callback is invoked once before and once after the state is
 * changed.  A failing callback is only logged, with one exception: for
 * DRIVER_DISABLE_TRANSIENT a disable request whose "before" callback failed
 * takes the enable branch instead.
 *
 * Returns the pi_phci_link of pip, i.e. the following path on the same
 * pHCI's list, so that a caller can use the return value to iterate.
 */
static mdi_pathinfo_t *
i_mdi_enable_disable_path(mdi_pathinfo_t *pip, mdi_vhci_t *vh, int flags,
		int op)
{
	int		sync_flag = 0;
	int		rv;
	mdi_pathinfo_t 	*next;
	int		(*f)() = NULL;

	/*
	 * Check to make sure the path is not already in the
	 * requested state. If it is just return the next path
	 * as we have nothing to do here.
	 * NOTE(review): the state test itself is made before the path lock
	 * is taken; only the link read is done under the lock.
	 */
	if ((MDI_PI_IS_DISABLE(pip) && op == MDI_DISABLE_OP) ||
	    (!MDI_PI_IS_DISABLE(pip) && op == MDI_ENABLE_OP)) {
		MDI_PI_LOCK(pip);
		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
		MDI_PI_UNLOCK(pip);
		return (next);
	}

	f = vh->vh_ops->vo_pi_state_change;

	/* Carry the low four bits of flags in bits 8-11 of the callback arg. */
	sync_flag = (flags << 8) & 0xf00;

	/*
	 * Do a callback into the mdi consumer to let it
	 * know that path is about to get enabled/disabled.
	 */
	rv = MDI_SUCCESS;
	if (f != NULL) {
		rv = (*f)(vh->vh_dip, pip, 0,
			MDI_PI_EXT_STATE(pip),
			MDI_EXT_STATE_CHANGE | sync_flag |
			op | MDI_BEFORE_STATE_CHANGE);
		if (rv != MDI_SUCCESS) {
			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
			    "vo_pi_state_change: failed rv = %x", rv));
		}
	}
	MDI_PI_LOCK(pip);
	next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;

	switch (flags) {
		case USER_DISABLE:
			if (op == MDI_DISABLE_OP) {
				MDI_PI_SET_USER_DISABLE(pip);
			} else {
				MDI_PI_SET_USER_ENABLE(pip);
			}
			break;
		case DRIVER_DISABLE:
			if (op == MDI_DISABLE_OP) {
				MDI_PI_SET_DRV_DISABLE(pip);
			} else {
				MDI_PI_SET_DRV_ENABLE(pip);
			}
			break;
		case DRIVER_DISABLE_TRANSIENT:
			/*
			 * rv still holds the result of the "before"
			 * callback here (MDI_SUCCESS when there is none).
			 */
			if (op == MDI_DISABLE_OP && rv == MDI_SUCCESS) {
				MDI_PI_SET_DRV_DISABLE_TRANS(pip);
			} else {
				MDI_PI_SET_DRV_ENABLE_TRANS(pip);
			}
			break;
	}
	MDI_PI_UNLOCK(pip);
	/*
	 * Do a callback into the mdi consumer to let it
	 * know that path is now enabled/disabled.
	 */
	if (f != NULL) {
		rv = (*f)(vh->vh_dip, pip, 0,
			MDI_PI_EXT_STATE(pip),
			MDI_EXT_STATE_CHANGE | sync_flag |
			op | MDI_AFTER_STATE_CHANGE);
		if (rv != MDI_SUCCESS) {
			MDI_DEBUG(2, (MDI_WARN, vh->vh_dip,
			    "vo_pi_state_change failed: rv = %x", rv));
		}
	}
	return (next);
}
6357
6358/*
6359 * Common routine for doing enable/disable.
6360 * NOTE: this will be removed once the NWS files are changed to use the new
6361 * mdi_{enable,disable}_path has been putback
6362 */
6363int
6364i_mdi_pi_enable_disable(dev_info_t *cdip, dev_info_t *pdip, int flags, int op)
6365{
6366
6367	mdi_phci_t	*ph;
6368	mdi_vhci_t	*vh = NULL;
6369	mdi_client_t	*ct;
6370	mdi_pathinfo_t	*next, *pip;
6371	int		found_it;
6372
6373	ph = i_devi_get_phci(pdip);
6374	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6375	    "!op = %d pdip = %p cdip = %p", op, (void *)pdip,
6376	    (void *)cdip));
6377	if (ph == NULL) {
6378		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6379		    "!failed: operation %d: NULL ph", op));
6380		return (MDI_FAILURE);
6381	}
6382
6383	if ((op != MDI_ENABLE_OP) && (op != MDI_DISABLE_OP)) {
6384		MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6385		    "!failed: invalid operation %d", op));
6386		return (MDI_FAILURE);
6387	}
6388
6389	vh = ph->ph_vhci;
6390
6391	if (cdip == NULL) {
6392		/*
6393		 * Need to mark the Phci as enabled/disabled.
6394		 */
6395		MDI_DEBUG(4, (MDI_NOTE, cdip ? cdip : pdip,
6396		    "op %d for the phci", op));
6397		MDI_PHCI_LOCK(ph);
6398		switch (flags) {
6399			case USER_DISABLE:
6400				if (op == MDI_DISABLE_OP) {
6401					MDI_PHCI_SET_USER_DISABLE(ph);
6402				} else {
6403					MDI_PHCI_SET_USER_ENABLE(ph);
6404				}
6405				break;
6406			case DRIVER_DISABLE:
6407				if (op == MDI_DISABLE_OP) {
6408					MDI_PHCI_SET_DRV_DISABLE(ph);
6409				} else {
6410					MDI_PHCI_SET_DRV_ENABLE(ph);
6411				}
6412				break;
6413			case DRIVER_DISABLE_TRANSIENT:
6414				if (op == MDI_DISABLE_OP) {
6415					MDI_PHCI_SET_DRV_DISABLE_TRANSIENT(ph);
6416				} else {
6417					MDI_PHCI_SET_DRV_ENABLE_TRANSIENT(ph);
6418				}
6419				break;
6420			default:
6421				MDI_PHCI_UNLOCK(ph);
6422				MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6423				    "!invalid flag argument= %d", flags));
6424		}
6425
6426		/*
6427		 * Phci has been disabled. Now try to enable/disable
6428		 * path info's to each client.
6429		 */
6430		pip = ph->ph_path_head;
6431		while (pip != NULL) {
6432			pip = i_mdi_enable_disable_path(pip, vh, flags, op);
6433		}
6434		MDI_PHCI_UNLOCK(ph);
6435	} else {
6436
6437		/*
6438		 * Disable a specific client.
6439		 */
6440		ct = i_devi_get_client(cdip);
6441		if (ct == NULL) {
6442			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6443			    "!failed: operation = %d: NULL ct", op));
6444			return (MDI_FAILURE);
6445		}
6446
6447		MDI_CLIENT_LOCK(ct);
6448		pip = ct->ct_path_head;
6449		found_it = 0;
6450		while (pip != NULL) {
6451			MDI_PI_LOCK(pip);
6452			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6453			if (MDI_PI(pip)->pi_phci == ph) {
6454				MDI_PI_UNLOCK(pip);
6455				found_it = 1;
6456				break;
6457			}
6458			MDI_PI_UNLOCK(pip);
6459			pip = next;
6460		}
6461
6462
6463		MDI_CLIENT_UNLOCK(ct);
6464		if (found_it == 0) {
6465			MDI_DEBUG(1, (MDI_NOTE, cdip ? cdip : pdip,
6466			    "!failed. Could not find corresponding pip\n"));
6467			return (MDI_FAILURE);
6468		}
6469
6470		(void) i_mdi_enable_disable_path(pip, vh, flags, op);
6471	}
6472
6473	MDI_DEBUG(5, (MDI_NOTE, cdip ? cdip : pdip,
6474	    "!op %d returning success pdip = %p cdip = %p",
6475	    op, (void *)pdip, (void *)cdip));
6476	return (MDI_SUCCESS);
6477}
6478
6479/*
6480 * Ensure phci powered up
6481 */
6482static void
6483i_mdi_pm_hold_pip(mdi_pathinfo_t *pip)
6484{
6485	dev_info_t	*ph_dip;
6486
6487	ASSERT(pip != NULL);
6488	ASSERT(MDI_PI_LOCKED(pip));
6489
6490	if (MDI_PI(pip)->pi_pm_held) {
6491		return;
6492	}
6493
6494	ph_dip = mdi_pi_get_phci(pip);
6495	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6496	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6497	if (ph_dip == NULL) {
6498		return;
6499	}
6500
6501	MDI_PI_UNLOCK(pip);
6502	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt was %d",
6503	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6504	pm_hold_power(ph_dip);
6505	MDI_DEBUG(4, (MDI_NOTE, ph_dip, "kidsupcnt is %d",
6506	    DEVI(ph_dip)->devi_pm_kidsupcnt));
6507	MDI_PI_LOCK(pip);
6508
6509	/* If PM_GET_PM_INFO is NULL the pm_hold_power above was a noop */
6510	if (DEVI(ph_dip)->devi_pm_info)
6511		MDI_PI(pip)->pi_pm_held = 1;
6512}
6513
6514/*
6515 * Allow phci powered down
6516 */
6517static void
6518i_mdi_pm_rele_pip(mdi_pathinfo_t *pip)
6519{
6520	dev_info_t	*ph_dip = NULL;
6521
6522	ASSERT(pip != NULL);
6523	ASSERT(MDI_PI_LOCKED(pip));
6524
6525	if (MDI_PI(pip)->pi_pm_held == 0) {
6526		return;
6527	}
6528
6529	ph_dip = mdi_pi_get_phci(pip);
6530	ASSERT(ph_dip != NULL);
6531
6532	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6533	    "%s %p", mdi_pi_spathname(pip), (void *)pip));
6534
6535	MDI_PI_UNLOCK(pip);
6536	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6537	    "kidsupcnt was %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6538	pm_rele_power(ph_dip);
6539	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6540	    "kidsupcnt is %d", DEVI(ph_dip)->devi_pm_kidsupcnt));
6541	MDI_PI_LOCK(pip);
6542
6543	MDI_PI(pip)->pi_pm_held = 0;
6544}
6545
6546static void
6547i_mdi_pm_hold_client(mdi_client_t *ct, int incr)
6548{
6549	ASSERT(MDI_CLIENT_LOCKED(ct));
6550
6551	ct->ct_power_cnt += incr;
6552	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6553	    "%p ct_power_cnt = %d incr = %d",
6554	    (void *)ct, ct->ct_power_cnt, incr));
6555	ASSERT(ct->ct_power_cnt >= 0);
6556}
6557
6558static void
6559i_mdi_rele_all_phci(mdi_client_t *ct)
6560{
6561	mdi_pathinfo_t  *pip;
6562
6563	ASSERT(MDI_CLIENT_LOCKED(ct));
6564	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6565	while (pip != NULL) {
6566		mdi_hold_path(pip);
6567		MDI_PI_LOCK(pip);
6568		i_mdi_pm_rele_pip(pip);
6569		MDI_PI_UNLOCK(pip);
6570		mdi_rele_path(pip);
6571		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6572	}
6573}
6574
6575static void
6576i_mdi_pm_rele_client(mdi_client_t *ct, int decr)
6577{
6578	ASSERT(MDI_CLIENT_LOCKED(ct));
6579
6580	if (i_ddi_devi_attached(ct->ct_dip)) {
6581		ct->ct_power_cnt -= decr;
6582		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6583		    "%p ct_power_cnt = %d decr = %d",
6584		    (void *)ct, ct->ct_power_cnt, decr));
6585	}
6586
6587	ASSERT(ct->ct_power_cnt >= 0);
6588	if (ct->ct_power_cnt == 0) {
6589		i_mdi_rele_all_phci(ct);
6590		return;
6591	}
6592}
6593
6594static void
6595i_mdi_pm_reset_client(mdi_client_t *ct)
6596{
6597	MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
6598	    "%p ct_power_cnt = %d", (void *)ct, ct->ct_power_cnt));
6599	ASSERT(MDI_CLIENT_LOCKED(ct));
6600	ct->ct_power_cnt = 0;
6601	i_mdi_rele_all_phci(ct);
6602	ct->ct_powercnt_config = 0;
6603	ct->ct_powercnt_unconfig = 0;
6604	ct->ct_powercnt_reset = 1;
6605}
6606
6607static int
6608i_mdi_power_one_phci(mdi_pathinfo_t *pip)
6609{
6610	int		ret;
6611	dev_info_t	*ph_dip;
6612
6613	MDI_PI_LOCK(pip);
6614	i_mdi_pm_hold_pip(pip);
6615
6616	ph_dip = mdi_pi_get_phci(pip);
6617	MDI_PI_UNLOCK(pip);
6618
6619	/* bring all components of phci to full power */
6620	MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6621	    "pm_powerup for %s%d %p", ddi_driver_name(ph_dip),
6622	    ddi_get_instance(ph_dip), (void *)pip));
6623
6624	ret = pm_powerup(ph_dip);
6625
6626	if (ret == DDI_FAILURE) {
6627		MDI_DEBUG(4, (MDI_NOTE, ph_dip,
6628		    "pm_powerup FAILED for %s%d %p",
6629		    ddi_driver_name(ph_dip), ddi_get_instance(ph_dip),
6630		    (void *)pip));
6631
6632		MDI_PI_LOCK(pip);
6633		i_mdi_pm_rele_pip(pip);
6634		MDI_PI_UNLOCK(pip);
6635		return (MDI_FAILURE);
6636	}
6637
6638	return (MDI_SUCCESS);
6639}
6640
6641static int
6642i_mdi_power_all_phci(mdi_client_t *ct)
6643{
6644	mdi_pathinfo_t  *pip;
6645	int		succeeded = 0;
6646
6647	ASSERT(MDI_CLIENT_LOCKED(ct));
6648	pip = (mdi_pathinfo_t *)ct->ct_path_head;
6649	while (pip != NULL) {
6650		/*
6651		 * Don't power if MDI_PATHINFO_STATE_FAULT
6652		 * or MDI_PATHINFO_STATE_OFFLINE.
6653		 */
6654		if (MDI_PI_IS_INIT(pip) ||
6655		    MDI_PI_IS_ONLINE(pip) || MDI_PI_IS_STANDBY(pip)) {
6656			mdi_hold_path(pip);
6657			MDI_CLIENT_UNLOCK(ct);
6658			if (i_mdi_power_one_phci(pip) == MDI_SUCCESS)
6659				succeeded = 1;
6660
6661			ASSERT(ct == MDI_PI(pip)->pi_client);
6662			MDI_CLIENT_LOCK(ct);
6663			mdi_rele_path(pip);
6664		}
6665		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
6666	}
6667
6668	return (succeeded ? MDI_SUCCESS : MDI_FAILURE);
6669}
6670
/*
 * mdi_bus_power():
 *		1. Place the phci(s) into powered up state so that
 *		   client can do power management
 *		2. Ensure phci powered up as client power managing
 *
 *		Only BUS_POWER_PRE_NOTIFICATION, BUS_POWER_POST_NOTIFICATION
 *		and BUS_POWER_HAS_CHANGED are handled here.
 *		BUS_POWER_NOINVOL is rejected, and every other operation is
 *		passed straight to pm_busop_bus_power().
 *
 *		Power level changes of one client are serialized: the
 *		pre-notification waits for and then sets the client's
 *		power-transition state, and the post-notification clears it
 *		and wakes up the waiters.
 * Return Values:
 *		MDI_SUCCESS
 *		MDI_FAILURE
 *		(or the result of pm_busop_bus_power() for operations that
 *		are passed through)
 */
int
mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op,
    void *arg, void *result)
{
	int			ret = MDI_SUCCESS;
	pm_bp_child_pwrchg_t	*bpc;
	mdi_client_t		*ct;
	dev_info_t		*cdip;
	pm_bp_has_changed_t	*bphc;

	/*
	 * BUS_POWER_NOINVOL not supported
	 */
	if (op == BUS_POWER_NOINVOL)
		return (MDI_FAILURE);

	/*
	 * ignore other OPs.
	 * return quickly to save cpu cycles on the ct processing
	 */
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
	case BUS_POWER_POST_NOTIFICATION:
		bpc = (pm_bp_child_pwrchg_t *)arg;
		cdip = bpc->bpc_dip;
		break;
	case BUS_POWER_HAS_CHANGED:
		bphc = (pm_bp_has_changed_t *)arg;
		cdip = bphc->bphc_dip;
		break;
	default:
		return (pm_busop_bus_power(parent, impl_arg, op, arg, result));
	}

	ASSERT(MDI_CLIENT(cdip));

	ct = i_devi_get_client(cdip);
	if (ct == NULL)
		return (MDI_FAILURE);

	/*
	 * wait till the mdi_pathinfo node state change are processed
	 */
	MDI_CLIENT_LOCK(ct);
	switch (op) {
	case BUS_POWER_PRE_NOTIFICATION:
		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
		    "BUS_POWER_PRE_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp));

		/* serialize power level change per client */
		while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
			cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);

		MDI_CLIENT_SET_POWER_TRANSITION(ct);

		/*
		 * Nobody holds the pHCIs for this client yet: power them up.
		 * The result of this call becomes the return value.
		 */
		if (ct->ct_power_cnt == 0) {
			ret = i_mdi_power_all_phci(ct);
		}

		/*
		 * if new_level > 0:
		 *	- hold phci(s)
		 *	- power up phci(s) if not already
		 * ignore power down
		 */
		if (bpc->bpc_nlevel > 0) {
			if (!DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_hold_client\n"));
				i_mdi_pm_hold_client(ct, ct->ct_path_count);
			}
		}
		break;
	case BUS_POWER_POST_NOTIFICATION:
		MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
		    "BUS_POWER_POST_NOTIFICATION:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d result=%d",
		    ddi_node_name(bpc->bpc_dip), PM_ADDR(bpc->bpc_dip),
		    bpc->bpc_olevel, bpc->bpc_nlevel, bpc->bpc_comp,
		    *(int *)result));

		/* Record the new power state only if the change succeeded. */
		if (*(int *)result == DDI_SUCCESS) {
			if (bpc->bpc_nlevel > 0) {
				MDI_CLIENT_SET_POWER_UP(ct);
			} else {
				MDI_CLIENT_SET_POWER_DOWN(ct);
			}
		}

		/* release the hold we did in pre-notification */
		if (bpc->bpc_nlevel > 0 && (*(int *)result != DDI_SUCCESS) &&
		    !DEVI_IS_ATTACHING(ct->ct_dip)) {
			MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}

		/* A successful power down: give up the pHCI holds. */
		if (bpc->bpc_nlevel == 0 && (*(int *)result == DDI_SUCCESS)) {
			/* another thread might have started attaching */
			if (DEVI_IS_ATTACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_rele_client\n"));
				i_mdi_pm_rele_client(ct, ct->ct_path_count);
			/* detaching has been taken care in pm_post_unconfig */
			} else if (!DEVI_IS_DETACHING(ct->ct_dip)) {
				MDI_DEBUG(4, (MDI_NOTE, bpc->bpc_dip,
				    "i_mdi_pm_reset_client\n"));
				i_mdi_pm_reset_client(ct);
			}
		}

		/* End of the transition begun in the pre-notification. */
		MDI_CLIENT_CLEAR_POWER_TRANSITION(ct);
		cv_broadcast(&ct->ct_powerchange_cv);

		break;

	/* need to do more */
	case BUS_POWER_HAS_CHANGED:
		MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
		    "BUS_POWER_HAS_CHANGED:"
		    "%s@%s, olevel=%d, nlevel=%d, comp=%d",
		    ddi_node_name(bphc->bphc_dip), PM_ADDR(bphc->bphc_dip),
		    bphc->bphc_olevel, bphc->bphc_nlevel, bphc->bphc_comp));

		/* Level was raised: power up (if needed) and hold the pHCIs. */
		if (bphc->bphc_nlevel > 0 &&
		    bphc->bphc_nlevel > bphc->bphc_olevel) {
			if (ct->ct_power_cnt == 0) {
				ret = i_mdi_power_all_phci(ct);
			}
			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
			    "i_mdi_pm_hold_client\n"));
			i_mdi_pm_hold_client(ct, ct->ct_path_count);
		}

		/* Level dropped to 0 from an old level other than -1. */
		if (bphc->bphc_nlevel == 0 && bphc->bphc_olevel != -1) {
			MDI_DEBUG(4, (MDI_NOTE, bphc->bphc_dip,
			    "i_mdi_pm_rele_client\n"));
			i_mdi_pm_rele_client(ct, ct->ct_path_count);
		}
		break;
	}

	MDI_CLIENT_UNLOCK(ct);
	return (ret);
}
6828
6829static int
6830i_mdi_pm_pre_config_one(dev_info_t *child)
6831{
6832	int		ret = MDI_SUCCESS;
6833	mdi_client_t	*ct;
6834
6835	ct = i_devi_get_client(child);
6836	if (ct == NULL)
6837		return (MDI_FAILURE);
6838
6839	MDI_CLIENT_LOCK(ct);
6840	while (MDI_CLIENT_IS_POWER_TRANSITION(ct))
6841		cv_wait(&ct->ct_powerchange_cv, &ct->ct_mutex);
6842
6843	if (!MDI_CLIENT_IS_FAILED(ct)) {
6844		MDI_CLIENT_UNLOCK(ct);
6845		MDI_DEBUG(4, (MDI_NOTE, child, "already configured\n"));
6846		return (MDI_SUCCESS);
6847	}
6848
6849	if (ct->ct_powercnt_config) {
6850		MDI_CLIENT_UNLOCK(ct);
6851		MDI_DEBUG(4, (MDI_NOTE, child, "already held\n"));
6852		return (MDI_SUCCESS);
6853	}
6854
6855	if (ct->ct_power_cnt == 0) {
6856		ret = i_mdi_power_all_phci(ct);
6857	}
6858	MDI_DEBUG(4, (MDI_NOTE, child, "i_mdi_pm_hold_client\n"));
6859	i_mdi_pm_hold_client(ct, ct->ct_path_count);
6860	ct->ct_powercnt_config = 1;
6861	ct->ct_powercnt_reset = 0;
6862	MDI_CLIENT_UNLOCK(ct);
6863	return (ret);
6864}
6865
6866static int
6867i_mdi_pm_pre_config(dev_info_t *vdip, dev_info_t *child)
6868{
6869	int			ret = MDI_SUCCESS;
6870	dev_info_t		*cdip;
6871	int			circ;
6872
6873	ASSERT(MDI_VHCI(vdip));
6874
6875	/* ndi_devi_config_one */
6876	if (child) {
6877		ASSERT(DEVI_BUSY_OWNED(vdip));
6878		return (i_mdi_pm_pre_config_one(child));
6879	}
6880
6881	/* devi_config_common */
6882	ndi_devi_enter(vdip, &circ);
6883	cdip = ddi_get_child(vdip);
6884	while (cdip) {
6885		dev_info_t *next = ddi_get_next_sibling(cdip);
6886
6887		ret = i_mdi_pm_pre_config_one(cdip);
6888		if (ret != MDI_SUCCESS)
6889			break;
6890		cdip = next;
6891	}
6892	ndi_devi_exit(vdip, circ);
6893	return (ret);
6894}
6895
6896static int
6897