xref: /illumos-gate/usr/src/uts/common/os/sunmdi.c (revision c6f039c7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2014 Nexenta Systems Inc. All rights reserved.
24  * Copyright (c) 2018, Joyent, Inc.
25  */
26 
27 /*
28  * Multipath driver interface (MDI) implementation; see mdi_impldefs.h for a
29  * more detailed discussion of the overall mpxio architecture.
30  *
31  * Default locking order:
32  *
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_phci_mutex))
 * _NOTE(LOCK_ORDER(mdi_mutex mdi_vhci::vh_client_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_phci_mutex mdi_phci::ph_mutex))
 * _NOTE(LOCK_ORDER(mdi_vhci::vh_client_mutex mdi_client::ct_mutex))
37  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
38  * _NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_client::ct_mutex))
39  * _NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
40  */
41 
42 #include <sys/note.h>
43 #include <sys/types.h>
44 #include <sys/varargs.h>
45 #include <sys/param.h>
46 #include <sys/errno.h>
47 #include <sys/uio.h>
48 #include <sys/buf.h>
49 #include <sys/modctl.h>
50 #include <sys/open.h>
51 #include <sys/kmem.h>
52 #include <sys/poll.h>
53 #include <sys/conf.h>
54 #include <sys/bootconf.h>
55 #include <sys/cmn_err.h>
56 #include <sys/stat.h>
57 #include <sys/ddi.h>
58 #include <sys/sunddi.h>
59 #include <sys/ddipropdefs.h>
60 #include <sys/sunndi.h>
61 #include <sys/ndi_impldefs.h>
62 #include <sys/promif.h>
63 #include <sys/sunmdi.h>
64 #include <sys/mdi_impldefs.h>
65 #include <sys/taskq.h>
66 #include <sys/epm.h>
67 #include <sys/sunpm.h>
68 #include <sys/modhash.h>
69 #include <sys/disp.h>
70 #include <sys/autoconf.h>
71 #include <sys/sysmacros.h>
72 
#ifdef	DEBUG
#include <sys/debug.h>
/*
 * Debug tracing support (DEBUG kernels only).
 *
 * mdi_debug is the verbosity threshold: MDI_DEBUG(level, (args)) calls
 * i_mdi_log() only when mdi_debug >= level.  'pargs' is the complete,
 * parenthesized i_mdi_log() argument list.  MDI_WARN, MDI_NOTE and MDI_CONT
 * each expand to two arguments: a cmn_err level followed by the name of the
 * calling function.
 *
 * The expansion is wrapped in do { } while (0) so that it is exactly one
 * statement.  A bare 'if' expansion would capture the 'else' of an unbraced
 * if/else at the call site, silently changing control flow on DEBUG kernels
 * only.
 */
int	mdi_debug = 1;
int	mdi_debug_logonly = 0;
#define	MDI_DEBUG(dbglevel, pargs)		\
	do {					\
		if (mdi_debug >= (dbglevel))	\
			i_mdi_log pargs;	\
	_NOTE(CONSTCOND) } while (0)
#define	MDI_WARN	CE_WARN, __func__
#define	MDI_NOTE	CE_NOTE, __func__
#define	MDI_CONT	CE_CONT, __func__
static void i_mdi_log(int, const char *, dev_info_t *, const char *, ...);
#else	/* !DEBUG */
/* On non-DEBUG kernels every MDI_DEBUG() call site expands to nothing. */
#define	MDI_DEBUG(dbglevel, pargs)
#endif	/* DEBUG */
85 int	mdi_debug_consoleonly = 0;
86 int	mdi_delay = 3;
87 
88 extern pri_t	minclsyspri;
89 extern int	modrootloaded;
90 
91 /*
92  * Global mutex:
93  * Protects vHCI list and structure members.
94  */
95 kmutex_t	mdi_mutex;
96 
97 /*
98  * Registered vHCI class driver lists
99  */
100 int		mdi_vhci_count;
101 mdi_vhci_t	*mdi_vhci_head;
102 mdi_vhci_t	*mdi_vhci_tail;
103 
104 /*
105  * Client Hash Table size
106  */
107 static int	mdi_client_table_size = CLIENT_HASH_TABLE_SIZE;
108 
109 /*
110  * taskq interface definitions
111  */
112 #define	MDI_TASKQ_N_THREADS	8
113 #define	MDI_TASKQ_PRI		minclsyspri
114 #define	MDI_TASKQ_MINALLOC	(4*mdi_taskq_n_threads)
115 #define	MDI_TASKQ_MAXALLOC	(500*mdi_taskq_n_threads)
116 
117 taskq_t				*mdi_taskq;
118 static uint_t			mdi_taskq_n_threads = MDI_TASKQ_N_THREADS;
119 
120 #define	TICKS_PER_SECOND	(drv_usectohz(1000000))
121 
122 /*
123  * The data should be "quiet" for this interval (in seconds) before the
124  * vhci cached data is flushed to the disk.
125  */
126 static int mdi_vhcache_flush_delay = 10;
127 
128 /* number of seconds the vhcache flush daemon will sleep idle before exiting */
129 static int mdi_vhcache_flush_daemon_idle_time = 60;
130 
131 /*
132  * MDI falls back to discovery of all paths when a bus_config_one fails.
133  * The following parameters can be used to tune this operation.
134  *
135  * mdi_path_discovery_boot
136  *	Number of times path discovery will be attempted during early boot.
137  *	Probably there is no reason to ever set this value to greater than one.
138  *
139  * mdi_path_discovery_postboot
140  *	Number of times path discovery will be attempted after early boot.
141  *	Set it to a minimum of two to allow for discovery of iscsi paths which
142  *	may happen very late during booting.
143  *
144  * mdi_path_discovery_interval
145  *	Minimum number of seconds MDI will wait between successive discovery
146  *	of all paths. Set it to -1 to disable discovery of all paths.
147  */
148 static int mdi_path_discovery_boot = 1;
149 static int mdi_path_discovery_postboot = 2;
150 static int mdi_path_discovery_interval = 10;
151 
152 /*
153  * number of seconds the asynchronous configuration thread will sleep idle
154  * before exiting.
155  */
156 static int mdi_async_config_idle_time = 600;
157 
158 static int mdi_bus_config_cache_hash_size = 256;
159 
160 /* turns off multithreaded configuration for certain operations */
161 static int mdi_mtc_off = 0;
162 
163 /*
164  * The "path" to a pathinfo node is identical to the /devices path to a
165  * devinfo node had the device been enumerated under a pHCI instead of
166  * a vHCI.  This pathinfo "path" is associated with a 'path_instance'.
167  * This association persists across create/delete of the pathinfo nodes,
168  * but not across reboot.
169  */
170 static uint_t		mdi_pathmap_instance = 1;	/* 0 -> any path */
171 static int		mdi_pathmap_hash_size = 256;
172 static kmutex_t		mdi_pathmap_mutex;
173 static mod_hash_t	*mdi_pathmap_bypath;		/* "path"->instance */
174 static mod_hash_t	*mdi_pathmap_byinstance;	/* instance->"path" */
175 static mod_hash_t	*mdi_pathmap_sbyinstance;	/* inst->shortpath */
176 
177 /*
178  * MDI component property name/value string definitions
179  */
180 const char 		*mdi_component_prop = "mpxio-component";
181 const char		*mdi_component_prop_vhci = "vhci";
182 const char		*mdi_component_prop_phci = "phci";
183 const char		*mdi_component_prop_client = "client";
184 
185 /*
186  * MDI client global unique identifier property name
187  */
188 const char		*mdi_client_guid_prop = "client-guid";
189 
190 /*
191  * MDI client load balancing property name/value string definitions
192  */
193 const char		*mdi_load_balance = "load-balance";
194 const char		*mdi_load_balance_none = "none";
195 const char		*mdi_load_balance_rr = "round-robin";
196 const char		*mdi_load_balance_lba = "logical-block";
197 
198 /*
199  * Obsolete vHCI class definition; to be removed after Leadville update
200  */
201 const char *mdi_vhci_class_scsi = MDI_HCI_CLASS_SCSI;
202 
/*
 * Format string used when a second vHCI driver tries to register for a class
 * that already has one; takes the class name as its single %s argument.
 * It is never written to, hence const.
 */
static const char vhci_greeting[] =
	"\tThere already exists one vHCI driver for class %s\n"
	"\tOnly one vHCI driver for each class is allowed\n";
206 
207 /*
208  * Static function prototypes
209  */
210 static int		i_mdi_phci_offline(dev_info_t *, uint_t);
211 static int		i_mdi_client_offline(dev_info_t *, uint_t);
212 static int		i_mdi_phci_pre_detach(dev_info_t *, ddi_detach_cmd_t);
213 static void		i_mdi_phci_post_detach(dev_info_t *,
214 			    ddi_detach_cmd_t, int);
215 static int		i_mdi_client_pre_detach(dev_info_t *,
216 			    ddi_detach_cmd_t);
217 static void		i_mdi_client_post_detach(dev_info_t *,
218 			    ddi_detach_cmd_t, int);
219 static void		i_mdi_pm_hold_pip(mdi_pathinfo_t *);
220 static void		i_mdi_pm_rele_pip(mdi_pathinfo_t *);
221 static int 		i_mdi_lba_lb(mdi_client_t *ct,
222 			    mdi_pathinfo_t **ret_pip, struct buf *buf);
223 static void		i_mdi_pm_hold_client(mdi_client_t *, int);
224 static void		i_mdi_pm_rele_client(mdi_client_t *, int);
225 static void		i_mdi_pm_reset_client(mdi_client_t *);
226 static int		i_mdi_power_all_phci(mdi_client_t *);
227 static void		i_mdi_log_sysevent(dev_info_t *, char *, char *);
228 
229 
230 /*
231  * Internal mdi_pathinfo node functions
232  */
233 static void		i_mdi_pi_kstat_destroy(mdi_pathinfo_t *);
234 
235 static mdi_vhci_t	*i_mdi_vhci_class2vhci(char *);
236 static mdi_vhci_t	*i_devi_get_vhci(dev_info_t *);
237 static mdi_phci_t	*i_devi_get_phci(dev_info_t *);
238 static void		i_mdi_phci_lock(mdi_phci_t *, mdi_pathinfo_t *);
239 static void		i_mdi_phci_unlock(mdi_phci_t *);
240 static mdi_pathinfo_t	*i_mdi_pi_alloc(mdi_phci_t *, char *, mdi_client_t *);
241 static void		i_mdi_phci_add_path(mdi_phci_t *, mdi_pathinfo_t *);
242 static void		i_mdi_client_add_path(mdi_client_t *, mdi_pathinfo_t *);
243 static void		i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *,
244 			    mdi_client_t *);
245 static void		i_mdi_phci_remove_path(mdi_phci_t *, mdi_pathinfo_t *);
246 static void		i_mdi_client_remove_path(mdi_client_t *,
247 			    mdi_pathinfo_t *);
248 
249 static int		i_mdi_pi_state_change(mdi_pathinfo_t *,
250 			    mdi_pathinfo_state_t, int);
251 static int		i_mdi_pi_offline(mdi_pathinfo_t *, int);
252 static dev_info_t	*i_mdi_devinfo_create(mdi_vhci_t *, char *, char *,
253 			    char **, int);
254 static dev_info_t	*i_mdi_devinfo_find(mdi_vhci_t *, char *, char *);
255 static int		i_mdi_devinfo_remove(dev_info_t *, dev_info_t *, int);
256 static int		i_mdi_is_child_present(dev_info_t *, dev_info_t *);
257 static mdi_client_t	*i_mdi_client_alloc(mdi_vhci_t *, char *, char *);
258 static void		i_mdi_client_enlist_table(mdi_vhci_t *, mdi_client_t *);
259 static void		i_mdi_client_delist_table(mdi_vhci_t *, mdi_client_t *);
260 static mdi_client_t	*i_mdi_client_find(mdi_vhci_t *, char *, char *);
261 static void		i_mdi_client_update_state(mdi_client_t *);
262 static int		i_mdi_client_compute_state(mdi_client_t *,
263 			    mdi_phci_t *);
264 static void		i_mdi_client_lock(mdi_client_t *, mdi_pathinfo_t *);
265 static void		i_mdi_client_unlock(mdi_client_t *);
266 static int		i_mdi_client_free(mdi_vhci_t *, mdi_client_t *);
267 static mdi_client_t	*i_devi_get_client(dev_info_t *);
268 /*
269  * NOTE: this will be removed once the NWS files are changed to use the new
270  * mdi_{enable,disable}_path interfaces
271  */
272 static int		i_mdi_pi_enable_disable(dev_info_t *, dev_info_t *,
273 				int, int);
274 static mdi_pathinfo_t 	*i_mdi_enable_disable_path(mdi_pathinfo_t *pip,
275 				mdi_vhci_t *vh, int flags, int op);
276 /*
277  * Failover related function prototypes
278  */
279 static int		i_mdi_failover(void *);
280 
281 /*
282  * misc internal functions
283  */
284 static int		i_mdi_get_hash_key(char *);
285 static int		i_map_nvlist_error_to_mdi(int);
286 static void		i_mdi_report_path_state(mdi_client_t *,
287 			    mdi_pathinfo_t *);
288 
289 static void		setup_vhci_cache(mdi_vhci_t *);
290 static int		destroy_vhci_cache(mdi_vhci_t *);
291 static int		stop_vhcache_async_threads(mdi_vhci_config_t *);
292 static boolean_t	stop_vhcache_flush_thread(void *, int);
293 static void		free_string_array(char **, int);
294 static void		free_vhcache_phci(mdi_vhcache_phci_t *);
295 static void		free_vhcache_pathinfo(mdi_vhcache_pathinfo_t *);
296 static void		free_vhcache_client(mdi_vhcache_client_t *);
297 static int		mainnvl_to_vhcache(mdi_vhci_cache_t *, nvlist_t *);
298 static nvlist_t		*vhcache_to_mainnvl(mdi_vhci_cache_t *);
299 static void		vhcache_phci_add(mdi_vhci_config_t *, mdi_phci_t *);
300 static void		vhcache_phci_remove(mdi_vhci_config_t *, mdi_phci_t *);
301 static void		vhcache_pi_add(mdi_vhci_config_t *,
302 			    struct mdi_pathinfo *);
303 static void		vhcache_pi_remove(mdi_vhci_config_t *,
304 			    struct mdi_pathinfo *);
305 static void		free_phclient_path_list(mdi_phys_path_t *);
306 static void		sort_vhcache_paths(mdi_vhcache_client_t *);
307 static int		flush_vhcache(mdi_vhci_config_t *, int);
308 static void		vhcache_dirty(mdi_vhci_config_t *);
309 static void		free_async_client_config(mdi_async_client_config_t *);
310 static void		single_threaded_vhconfig_enter(mdi_vhci_config_t *);
311 static void		single_threaded_vhconfig_exit(mdi_vhci_config_t *);
312 static nvlist_t		*read_on_disk_vhci_cache(char *);
313 extern int		fread_nvlist(char *, nvlist_t **);
314 extern int		fwrite_nvlist(char *, nvlist_t *);
315 
316 /* called once when first vhci registers with mdi */
317 static void
i_mdi_init()318 i_mdi_init()
319 {
320 	static int initialized = 0;
321 
322 	if (initialized)
323 		return;
324 	initialized = 1;
325 
326 	mutex_init(&mdi_mutex, NULL, MUTEX_DEFAULT, NULL);
327 
328 	/* Create our taskq resources */
329 	mdi_taskq = taskq_create("mdi_taskq", mdi_taskq_n_threads,
330 	    MDI_TASKQ_PRI, MDI_TASKQ_MINALLOC, MDI_TASKQ_MAXALLOC,
331 	    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
332 	ASSERT(mdi_taskq != NULL);	/* taskq_create never fails */
333 
334 	/* Allocate ['path_instance' <-> "path"] maps */
335 	mutex_init(&mdi_pathmap_mutex, NULL, MUTEX_DRIVER, NULL);
336 	mdi_pathmap_bypath = mod_hash_create_strhash(
337 	    "mdi_pathmap_bypath", mdi_pathmap_hash_size,
338 	    mod_hash_null_valdtor);
339 	mdi_pathmap_byinstance = mod_hash_create_idhash(
340 	    "mdi_pathmap_byinstance", mdi_pathmap_hash_size,
341 	    mod_hash_null_valdtor);
342 	mdi_pathmap_sbyinstance = mod_hash_create_idhash(
343 	    "mdi_pathmap_sbyinstance", mdi_pathmap_hash_size,
344 	    mod_hash_null_valdtor);
345 }
346 
347 /*
348  * mdi_get_component_type():
349  *		Return mpxio component type
350  * Return Values:
351  *		MDI_COMPONENT_NONE
352  *		MDI_COMPONENT_VHCI
353  *		MDI_COMPONENT_PHCI
354  *		MDI_COMPONENT_CLIENT
355  * XXX This doesn't work under multi-level MPxIO and should be
356  *	removed when clients migrate mdi_component_is_*() interfaces.
357  */
358 int
mdi_get_component_type(dev_info_t * dip)359 mdi_get_component_type(dev_info_t *dip)
360 {
361 	return (DEVI(dip)->devi_mdi_component);
362 }
363 
364 /*
365  * mdi_vhci_register():
366  *		Register a vHCI module with the mpxio framework
367  *		mdi_vhci_register() is called by vHCI drivers to register the
368  *		'class_driver' vHCI driver and its MDI entrypoints with the
369  *		mpxio framework.  The vHCI driver must call this interface as
370  *		part of its attach(9e) handler.
371  *		Competing threads may try to attach mdi_vhci_register() as
372  *		the vHCI drivers are loaded and attached as a result of pHCI
373  *		driver instance registration (mdi_phci_register()) with the
374  *		framework.
375  * Return Values:
376  *		MDI_SUCCESS
377  *		MDI_FAILURE
378  */
379 /*ARGSUSED*/
380 int
mdi_vhci_register(char * class,dev_info_t * vdip,mdi_vhci_ops_t * vops,int flags)381 mdi_vhci_register(char *class, dev_info_t *vdip, mdi_vhci_ops_t *vops,
382     int flags)
383 {
384 	mdi_vhci_t		*vh = NULL;
385 
386 	/* Registrant can't be older */
387 	ASSERT(vops->vo_revision <= MDI_VHCI_OPS_REV);
388 
389 #ifdef DEBUG
390 	/*
391 	 * IB nexus driver is loaded only when IB hardware is present.
392 	 * In order to be able to do this there is a need to drive the loading
393 	 * and attaching of the IB nexus driver (especially when an IB hardware
394 	 * is dynamically plugged in) when an IB HCA driver (PHCI)
395 	 * is being attached. Unfortunately this gets into the limitations
396 	 * of devfs as there seems to be no clean way to drive configuration
397 	 * of a subtree from another subtree of a devfs. Hence, do not ASSERT
398 	 * for IB.
399 	 */
400 	if (strcmp(class, MDI_HCI_CLASS_IB) != 0)
401 		ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
402 #endif
403 
404 	i_mdi_init();
405 
406 	mutex_enter(&mdi_mutex);
407 	/*
408 	 * Scan for already registered vhci
409 	 */
410 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
411 		if (strcmp(vh->vh_class, class) == 0) {
412 			/*
413 			 * vHCI has already been created.  Check for valid
414 			 * vHCI ops registration.  We only support one vHCI
415 			 * module per class
416 			 */
417 			if (vh->vh_ops != NULL) {
418 				mutex_exit(&mdi_mutex);
419 				cmn_err(CE_NOTE, vhci_greeting, class);
420 				return (MDI_FAILURE);
421 			}
422 			break;
423 		}
424 	}
425 
426 	/*
427 	 * if not yet created, create the vHCI component
428 	 */
429 	if (vh == NULL) {
430 		struct client_hash	*hash = NULL;
431 		char			*load_balance;
432 
433 		/*
434 		 * Allocate and initialize the mdi extensions
435 		 */
436 		vh = kmem_zalloc(sizeof (mdi_vhci_t), KM_SLEEP);
437 		hash = kmem_zalloc(mdi_client_table_size * sizeof (*hash),
438 		    KM_SLEEP);
439 		vh->vh_client_table = hash;
440 		vh->vh_class = kmem_zalloc(strlen(class) + 1, KM_SLEEP);
441 		(void) strcpy(vh->vh_class, class);
442 		vh->vh_lb = LOAD_BALANCE_RR;
443 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vdip,
444 		    0, LOAD_BALANCE_PROP, &load_balance) == DDI_SUCCESS) {
445 			if (strcmp(load_balance, LOAD_BALANCE_PROP_NONE) == 0) {
446 				vh->vh_lb = LOAD_BALANCE_NONE;
447 			} else if (strcmp(load_balance, LOAD_BALANCE_PROP_LBA)
448 				    == 0) {
449 				vh->vh_lb = LOAD_BALANCE_LBA;
450 			}
451 			ddi_prop_free(load_balance);
452 		}
453 
454 		mutex_init(&vh->vh_phci_mutex, NULL, MUTEX_DEFAULT, NULL);
455 		mutex_init(&vh->vh_client_mutex, NULL, MUTEX_DEFAULT, NULL);
456 
457 		/*
458 		 * Store the vHCI ops vectors
459 		 */
460 		vh->vh_dip = vdip;
461 		vh->vh_ops = vops;
462 
463 		setup_vhci_cache(vh);
464 
465 		if (mdi_vhci_head == NULL) {
466 			mdi_vhci_head = vh;
467 		}
468 		if (mdi_vhci_tail) {
469 			mdi_vhci_tail->vh_next = vh;
470 		}
471 		mdi_vhci_tail = vh;
472 		mdi_vhci_count++;
473 	}
474 
475 	/*
476 	 * Claim the devfs node as a vhci component
477 	 */
478 	DEVI(vdip)->devi_mdi_component |= MDI_COMPONENT_VHCI;
479 
480 	/*
481 	 * Initialize our back reference from dev_info node
482 	 */
483 	DEVI(vdip)->devi_mdi_xhci = (caddr_t)vh;
484 	mutex_exit(&mdi_mutex);
485 	return (MDI_SUCCESS);
486 }
487 
488 /*
489  * mdi_vhci_unregister():
490  *		Unregister a vHCI module from mpxio framework
491  *		mdi_vhci_unregister() is called from the detach(9E) entrypoint
492  * 		of a vhci to unregister it from the framework.
493  * Return Values:
494  *		MDI_SUCCESS
495  *		MDI_FAILURE
496  */
497 /*ARGSUSED*/
498 int
mdi_vhci_unregister(dev_info_t * vdip,int flags)499 mdi_vhci_unregister(dev_info_t *vdip, int flags)
500 {
501 	mdi_vhci_t	*found, *vh, *prev = NULL;
502 
503 	ASSERT(DEVI_BUSY_OWNED(ddi_get_parent(vdip)));
504 
505 	/*
506 	 * Check for invalid VHCI
507 	 */
508 	if ((vh = i_devi_get_vhci(vdip)) == NULL)
509 		return (MDI_FAILURE);
510 
511 	/*
512 	 * Scan the list of registered vHCIs for a match
513 	 */
514 	mutex_enter(&mdi_mutex);
515 	for (found = mdi_vhci_head; found != NULL; found = found->vh_next) {
516 		if (found == vh)
517 			break;
518 		prev = found;
519 	}
520 
521 	if (found == NULL) {
522 		mutex_exit(&mdi_mutex);
523 		return (MDI_FAILURE);
524 	}
525 
526 	/*
527 	 * Check the vHCI, pHCI and client count. All the pHCIs and clients
528 	 * should have been unregistered, before a vHCI can be
529 	 * unregistered.
530 	 */
531 	MDI_VHCI_PHCI_LOCK(vh);
532 	if (vh->vh_refcnt || vh->vh_phci_count || vh->vh_client_count) {
533 		MDI_VHCI_PHCI_UNLOCK(vh);
534 		mutex_exit(&mdi_mutex);
535 		return (MDI_FAILURE);
536 	}
537 	MDI_VHCI_PHCI_UNLOCK(vh);
538 
539 	if (destroy_vhci_cache(vh) != MDI_SUCCESS) {
540 		mutex_exit(&mdi_mutex);
541 		return (MDI_FAILURE);
542 	}
543 
544 	/*
545 	 * Remove the vHCI from the global list
546 	 */
547 	if (vh == mdi_vhci_head) {
548 		mdi_vhci_head = vh->vh_next;
549 	} else {
550 		prev->vh_next = vh->vh_next;
551 	}
552 	if (vh == mdi_vhci_tail) {
553 		mdi_vhci_tail = prev;
554 	}
555 	mdi_vhci_count--;
556 	mutex_exit(&mdi_mutex);
557 
558 	vh->vh_ops = NULL;
559 	DEVI(vdip)->devi_mdi_component &= ~MDI_COMPONENT_VHCI;
560 	DEVI(vdip)->devi_mdi_xhci = NULL;
561 	kmem_free(vh->vh_class, strlen(vh->vh_class)+1);
562 	kmem_free(vh->vh_client_table,
563 	    mdi_client_table_size * sizeof (struct client_hash));
564 	mutex_destroy(&vh->vh_phci_mutex);
565 	mutex_destroy(&vh->vh_client_mutex);
566 
567 	kmem_free(vh, sizeof (mdi_vhci_t));
568 	return (MDI_SUCCESS);
569 }
570 
571 /*
572  * i_mdi_vhci_class2vhci():
573  *		Look for a matching vHCI module given a vHCI class name
574  * Return Values:
575  *		Handle to a vHCI component
576  *		NULL
577  */
578 static mdi_vhci_t *
i_mdi_vhci_class2vhci(char * class)579 i_mdi_vhci_class2vhci(char *class)
580 {
581 	mdi_vhci_t	*vh = NULL;
582 
583 	ASSERT(!MUTEX_HELD(&mdi_mutex));
584 
585 	mutex_enter(&mdi_mutex);
586 	for (vh = mdi_vhci_head; vh != NULL; vh = vh->vh_next) {
587 		if (strcmp(vh->vh_class, class) == 0) {
588 			break;
589 		}
590 	}
591 	mutex_exit(&mdi_mutex);
592 	return (vh);
593 }
594 
595 /*
596  * i_devi_get_vhci():
597  *		Utility function to get the handle to a vHCI component
598  * Return Values:
599  *		Handle to a vHCI component
600  *		NULL
601  */
602 mdi_vhci_t *
i_devi_get_vhci(dev_info_t * vdip)603 i_devi_get_vhci(dev_info_t *vdip)
604 {
605 	mdi_vhci_t	*vh = NULL;
606 	if (MDI_VHCI(vdip)) {
607 		vh = (mdi_vhci_t *)DEVI(vdip)->devi_mdi_xhci;
608 	}
609 	return (vh);
610 }
611 
612 /*
613  * mdi_phci_register():
614  *		Register a pHCI module with mpxio framework
615  *		mdi_phci_register() is called by pHCI drivers to register with
616  *		the mpxio framework and a specific 'class_driver' vHCI.  The
617  *		pHCI driver must call this interface as part of its attach(9e)
618  *		handler.
619  * Return Values:
620  *		MDI_SUCCESS
621  *		MDI_FAILURE
622  */
623 /*ARGSUSED*/
624 int
mdi_phci_register(char * class,dev_info_t * pdip,int flags)625 mdi_phci_register(char *class, dev_info_t *pdip, int flags)
626 {
627 	mdi_phci_t		*ph;
628 	mdi_vhci_t		*vh;
629 	char			*data;
630 
631 	/*
632 	 * Some subsystems, like fcp, perform pHCI registration from a
633 	 * different thread than the one doing the pHCI attach(9E) - the
634 	 * driver attach code is waiting for this other thread to complete.
635 	 * This means we can only ASSERT DEVI_BUSY_CHANGING of parent
636 	 * (indicating that some thread has done an ndi_devi_enter of parent)
637 	 * not DEVI_BUSY_OWNED (which would indicate that we did the enter).
638 	 */
639 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
640 
641 	/*
642 	 * Check for mpxio-disable property. Enable mpxio if the property is
643 	 * missing or not set to "yes".
644 	 * If the property is set to "yes" then emit a brief message.
645 	 */
646 	if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, 0, "mpxio-disable",
647 	    &data) == DDI_SUCCESS)) {
648 		if (strcmp(data, "yes") == 0) {
649 			MDI_DEBUG(1, (MDI_CONT, pdip,
650 			    "?multipath capabilities disabled via %s.conf.",
651 			    ddi_driver_name(pdip)));
652 			ddi_prop_free(data);
653 			return (MDI_FAILURE);
654 		}
655 		ddi_prop_free(data);
656 	}
657 
658 	/*
659 	 * Search for a matching vHCI
660 	 */
661 	vh = (mdi_vhci_t *)i_mdi_vhci_class2vhci(class);
662 	if (vh == NULL) {
663 		return (MDI_FAILURE);
664 	}
665 
666 	ph = kmem_zalloc(sizeof (mdi_phci_t), KM_SLEEP);
667 	mutex_init(&ph->ph_mutex, NULL, MUTEX_DEFAULT, NULL);
668 	ph->ph_dip = pdip;
669 	ph->ph_vhci = vh;
670 	ph->ph_next = NULL;
671 	ph->ph_unstable = 0;
672 	ph->ph_vprivate = 0;
673 	cv_init(&ph->ph_unstable_cv, NULL, CV_DRIVER, NULL);
674 
675 	MDI_PHCI_LOCK(ph);
676 	MDI_PHCI_SET_POWER_UP(ph);
677 	MDI_PHCI_UNLOCK(ph);
678 	DEVI(pdip)->devi_mdi_component |= MDI_COMPONENT_PHCI;
679 	DEVI(pdip)->devi_mdi_xhci = (caddr_t)ph;
680 
681 	vhcache_phci_add(vh->vh_config, ph);
682 
683 	MDI_VHCI_PHCI_LOCK(vh);
684 	if (vh->vh_phci_head == NULL) {
685 		vh->vh_phci_head = ph;
686 	}
687 	if (vh->vh_phci_tail) {
688 		vh->vh_phci_tail->ph_next = ph;
689 	}
690 	vh->vh_phci_tail = ph;
691 	vh->vh_phci_count++;
692 	MDI_VHCI_PHCI_UNLOCK(vh);
693 
694 	i_mdi_log_sysevent(pdip, class, ESC_DDI_INITIATOR_REGISTER);
695 	return (MDI_SUCCESS);
696 }
697 
698 /*
699  * mdi_phci_unregister():
700  *		Unregister a pHCI module from mpxio framework
701  *		mdi_phci_unregister() is called by the pHCI drivers from their
702  *		detach(9E) handler to unregister their instances from the
703  *		framework.
704  * Return Values:
705  *		MDI_SUCCESS
706  *		MDI_FAILURE
707  */
708 /*ARGSUSED*/
709 int
mdi_phci_unregister(dev_info_t * pdip,int flags)710 mdi_phci_unregister(dev_info_t *pdip, int flags)
711 {
712 	mdi_vhci_t		*vh;
713 	mdi_phci_t		*ph;
714 	mdi_phci_t		*tmp;
715 	mdi_phci_t		*prev = NULL;
716 	mdi_pathinfo_t		*pip;
717 
718 	ASSERT(DEVI_BUSY_CHANGING(ddi_get_parent(pdip)));
719 
720 	ph = i_devi_get_phci(pdip);
721 	if (ph == NULL) {
722 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid pHCI"));
723 		return (MDI_FAILURE);
724 	}
725 
726 	vh = ph->ph_vhci;
727 	ASSERT(vh != NULL);
728 	if (vh == NULL) {
729 		MDI_DEBUG(1, (MDI_WARN, pdip, "!not a valid vHCI"));
730 		return (MDI_FAILURE);
731 	}
732 
733 	MDI_VHCI_PHCI_LOCK(vh);
734 	tmp = vh->vh_phci_head;
735 	while (tmp) {
736 		if (tmp == ph) {
737 			break;
738 		}
739 		prev = tmp;
740 		tmp = tmp->ph_next;
741 	}
742 
743 	if (ph == vh->vh_phci_head) {
744 		vh->vh_phci_head = ph->ph_next;
745 	} else {
746 		prev->ph_next = ph->ph_next;
747 	}
748 
749 	if (ph == vh->vh_phci_tail) {
750 		vh->vh_phci_tail = prev;
751 	}
752 
753 	vh->vh_phci_count--;
754 	MDI_VHCI_PHCI_UNLOCK(vh);
755 
756 	/* Walk remaining pathinfo nodes and disassociate them from pHCI */
757 	MDI_PHCI_LOCK(ph);
758 	for (pip = (mdi_pathinfo_t *)ph->ph_path_head; pip;
759 	    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link)
760 		MDI_PI(pip)->pi_phci = NULL;
761 	MDI_PHCI_UNLOCK(ph);
762 
763 	i_mdi_log_sysevent(pdip, ph->ph_vhci->vh_class,
764 	    ESC_DDI_INITIATOR_UNREGISTER);
765 	vhcache_phci_remove(vh->vh_config, ph);
766 	cv_destroy(&ph->ph_unstable_cv);
767 	mutex_destroy(&ph->ph_mutex);
768 	kmem_free(ph, sizeof (mdi_phci_t));
769 	DEVI(pdip)->devi_mdi_component &= ~MDI_COMPONENT_PHCI;
770 	DEVI(pdip)->devi_mdi_xhci = NULL;
771 	return (MDI_SUCCESS);
772 }
773 
774 /*
775  * i_devi_get_phci():
776  * 		Utility function to return the phci extensions.
777  */
778 static mdi_phci_t *
i_devi_get_phci(dev_info_t * pdip)779 i_devi_get_phci(dev_info_t *pdip)
780 {
781 	mdi_phci_t	*ph = NULL;
782 
783 	if (MDI_PHCI(pdip)) {
784 		ph = (mdi_phci_t *)DEVI(pdip)->devi_mdi_xhci;
785 	}
786 	return (ph);
787 }
788 
789 /*
790  * Single thread mdi entry into devinfo node for modifying its children.
791  * If necessary we perform an ndi_devi_enter of the vHCI before doing
792  * an ndi_devi_enter of 'dip'.  We maintain circular in two parts: one
793  * for the vHCI and one for the pHCI.
794  */
795 void
mdi_devi_enter(dev_info_t * phci_dip,int * circular)796 mdi_devi_enter(dev_info_t *phci_dip, int *circular)
797 {
798 	dev_info_t	*vdip;
799 	int		vcircular, pcircular;
800 
801 	/* Verify calling context */
802 	ASSERT(MDI_PHCI(phci_dip));
803 	vdip = mdi_devi_get_vdip(phci_dip);
804 	ASSERT(vdip);			/* A pHCI always has a vHCI */
805 
806 	/*
807 	 * If pHCI is detaching then the framework has already entered the
808 	 * vHCI on a threads that went down the code path leading to
809 	 * detach_node().  This framework enter of the vHCI during pHCI
810 	 * detach is done to avoid deadlock with vHCI power management
811 	 * operations which enter the vHCI and the enter down the path
812 	 * to the pHCI. If pHCI is detaching then we piggyback this calls
813 	 * enter of the vHCI on frameworks vHCI enter that has already
814 	 * occurred - this is OK because we know that the framework thread
815 	 * doing detach is waiting for our completion.
816 	 *
817 	 * We should DEVI_IS_DETACHING under an enter of the parent to avoid
818 	 * race with detach - but we can't do that because the framework has
819 	 * already entered the parent, so we have some complexity instead.
820 	 */
821 	for (;;) {
822 		if (ndi_devi_tryenter(vdip, &vcircular)) {
823 			ASSERT(vcircular != -1);
824 			if (DEVI_IS_DETACHING(phci_dip)) {
825 				ndi_devi_exit(vdip, vcircular);
826 				vcircular = -1;
827 			}
828 			break;
829 		} else if (DEVI_IS_DETACHING(phci_dip)) {
830 			vcircular = -1;
831 			break;
832 		} else if (servicing_interrupt()) {
833 			/*
834 			 * Don't delay an interrupt (and ensure adaptive
835 			 * mutex inversion support).
836 			 */
837 			ndi_devi_enter(vdip, &vcircular);
838 			break;
839 		} else {
840 			delay_random(mdi_delay);
841 		}
842 	}
843 
844 	ndi_devi_enter(phci_dip, &pcircular);
845 	*circular = (vcircular << 16) | (pcircular & 0xFFFF);
846 }
847 
848 /*
849  * Attempt to mdi_devi_enter.
850  */
851 int
mdi_devi_tryenter(dev_info_t * phci_dip,int * circular)852 mdi_devi_tryenter(dev_info_t *phci_dip, int *circular)
853 {
854 	dev_info_t	*vdip;
855 	int		vcircular, pcircular;
856 
857 	/* Verify calling context */
858 	ASSERT(MDI_PHCI(phci_dip));
859 	vdip = mdi_devi_get_vdip(phci_dip);
860 	ASSERT(vdip);			/* A pHCI always has a vHCI */
861 
862 	if (ndi_devi_tryenter(vdip, &vcircular)) {
863 		if (ndi_devi_tryenter(phci_dip, &pcircular)) {
864 			*circular = (vcircular << 16) | (pcircular & 0xFFFF);
865 			return (1);	/* locked */
866 		}
867 		ndi_devi_exit(vdip, vcircular);
868 	}
869 	return (0);			/* busy */
870 }
871 
872 /*
873  * Release mdi_devi_enter or successful mdi_devi_tryenter.
874  */
875 void
mdi_devi_exit(dev_info_t * phci_dip,int circular)876 mdi_devi_exit(dev_info_t *phci_dip, int circular)
877 {
878 	dev_info_t	*vdip;
879 	int		vcircular, pcircular;
880 
881 	/* Verify calling context */
882 	ASSERT(MDI_PHCI(phci_dip));
883 	vdip = mdi_devi_get_vdip(phci_dip);
884 	ASSERT(vdip);			/* A pHCI always has a vHCI */
885 
886 	/* extract two circular recursion values from single int */
887 	pcircular = (short)(circular & 0xFFFF);
888 	vcircular = (short)((circular >> 16) & 0xFFFF);
889 
890 	ndi_devi_exit(phci_dip, pcircular);
891 	if (vcircular != -1)
892 		ndi_devi_exit(vdip, vcircular);
893 }
894 
895 /*
896  * The functions mdi_devi_exit_phci() and mdi_devi_enter_phci() are used
897  * around a pHCI drivers calls to mdi_pi_online/offline, after holding
898  * the pathinfo node via mdi_hold_path/mdi_rele_path, to avoid deadlock
899  * with vHCI power management code during path online/offline.  Each
900  * mdi_devi_exit_phci must have a matching mdi_devi_enter_phci, and both must
901  * occur within the scope of an active mdi_devi_enter that establishes the
902  * circular value.
903  */
904 void
mdi_devi_exit_phci(dev_info_t * phci_dip,int circular)905 mdi_devi_exit_phci(dev_info_t *phci_dip, int circular)
906 {
907 	int		pcircular;
908 
909 	/* Verify calling context */
910 	ASSERT(MDI_PHCI(phci_dip));
911 
912 	/* Keep hold on pHCI until we reenter in mdi_devi_enter_phci */
913 	ndi_hold_devi(phci_dip);
914 
915 	pcircular = (short)(circular & 0xFFFF);
916 	ndi_devi_exit(phci_dip, pcircular);
917 }
918 
919 void
mdi_devi_enter_phci(dev_info_t * phci_dip,int * circular)920 mdi_devi_enter_phci(dev_info_t *phci_dip, int *circular)
921 {
922 	int		pcircular;
923 
924 	/* Verify calling context */
925 	ASSERT(MDI_PHCI(phci_dip));
926 
927 	ndi_devi_enter(phci_dip, &pcircular);
928 
929 	/* Drop hold from mdi_devi_exit_phci. */
930 	ndi_rele_devi(phci_dip);
931 
932 	/* verify matching mdi_devi_exit_phci/mdi_devi_enter_phci use */
933 	ASSERT(pcircular == ((short)(*circular & 0xFFFF)));
934 }
935 
936 /*
937  * mdi_devi_get_vdip():
938  *		given a pHCI dip return vHCI dip
939  */
940 dev_info_t *
mdi_devi_get_vdip(dev_info_t * pdip)941 mdi_devi_get_vdip(dev_info_t *pdip)
942 {
943 	mdi_phci_t	*ph;
944 
945 	ph = i_devi_get_phci(pdip);
946 	if (ph && ph->ph_vhci)
947 		return (ph->ph_vhci->vh_dip);
948 	return (NULL);
949 }
950 
951 /*
952  * mdi_devi_pdip_entered():
953  *		Return 1 if we are vHCI and have done an ndi_devi_enter
954  *		of a pHCI
955  */
956 int
mdi_devi_pdip_entered(dev_info_t * vdip)957 mdi_devi_pdip_entered(dev_info_t *vdip)
958 {
959 	mdi_vhci_t	*vh;
960 	mdi_phci_t	*ph;
961 
962 	vh = i_devi_get_vhci(vdip);
963 	if (vh == NULL)
964 		return (0);
965 
966 	MDI_VHCI_PHCI_LOCK(vh);
967 	ph = vh->vh_phci_head;
968 	while (ph) {
969 		if (ph->ph_dip && DEVI_BUSY_OWNED(ph->ph_dip)) {
970 			MDI_VHCI_PHCI_UNLOCK(vh);
971 			return (1);
972 		}
973 		ph = ph->ph_next;
974 	}
975 	MDI_VHCI_PHCI_UNLOCK(vh);
976 	return (0);
977 }
978 
979 /*
980  * mdi_phci_path2devinfo():
981  * 		Utility function to search for a valid phci device given
982  *		the devfs pathname.
983  */
984 dev_info_t *
mdi_phci_path2devinfo(dev_info_t * vdip,caddr_t pathname)985 mdi_phci_path2devinfo(dev_info_t *vdip, caddr_t pathname)
986 {
987 	char		*temp_pathname;
988 	mdi_vhci_t	*vh;
989 	mdi_phci_t	*ph;
990 	dev_info_t 	*pdip = NULL;
991 
992 	vh = i_devi_get_vhci(vdip);
993 	ASSERT(vh != NULL);
994 
995 	if (vh == NULL) {
996 		/*
997 		 * Invalid vHCI component, return failure
998 		 */
999 		return (NULL);
1000 	}
1001 
1002 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1003 	MDI_VHCI_PHCI_LOCK(vh);
1004 	ph = vh->vh_phci_head;
1005 	while (ph != NULL) {
1006 		pdip = ph->ph_dip;
1007 		ASSERT(pdip != NULL);
1008 		*temp_pathname = '\0';
1009 		(void) ddi_pathname(pdip, temp_pathname);
1010 		if (strcmp(temp_pathname, pathname) == 0) {
1011 			break;
1012 		}
1013 		ph = ph->ph_next;
1014 	}
1015 	if (ph == NULL) {
1016 		pdip = NULL;
1017 	}
1018 	MDI_VHCI_PHCI_UNLOCK(vh);
1019 	kmem_free(temp_pathname, MAXPATHLEN);
1020 	return (pdip);
1021 }
1022 
1023 /*
1024  * mdi_phci_get_path_count():
1025  * 		get number of path information nodes associated with a given
1026  *		pHCI device.
1027  */
1028 int
mdi_phci_get_path_count(dev_info_t * pdip)1029 mdi_phci_get_path_count(dev_info_t *pdip)
1030 {
1031 	mdi_phci_t	*ph;
1032 	int		count = 0;
1033 
1034 	ph = i_devi_get_phci(pdip);
1035 	if (ph != NULL) {
1036 		count = ph->ph_path_count;
1037 	}
1038 	return (count);
1039 }
1040 
1041 /*
1042  * i_mdi_phci_lock():
1043  *		Lock a pHCI device
1044  * Return Values:
1045  *		None
1046  * Note:
1047  *		The default locking order is:
1048  *		_NOTE(LOCK_ORDER(mdi_phci::ph_mutex mdi_pathinfo::pi_mutex))
1049  *		But there are number of situations where locks need to be
1050  *		grabbed in reverse order.  This routine implements try and lock
1051  *		mechanism depending on the requested parameter option.
1052  */
1053 static void
i_mdi_phci_lock(mdi_phci_t * ph,mdi_pathinfo_t * pip)1054 i_mdi_phci_lock(mdi_phci_t *ph, mdi_pathinfo_t *pip)
1055 {
1056 	if (pip) {
1057 		/* Reverse locking is requested. */
1058 		while (MDI_PHCI_TRYLOCK(ph) == 0) {
1059 			if (servicing_interrupt()) {
1060 				MDI_PI_HOLD(pip);
1061 				MDI_PI_UNLOCK(pip);
1062 				MDI_PHCI_LOCK(ph);
1063 				MDI_PI_LOCK(pip);
1064 				MDI_PI_RELE(pip);
1065 				break;
1066 			} else {
1067 				/*
1068 				 * tryenter failed. Try to grab again
1069 				 * after a small delay
1070 				 */
1071 				MDI_PI_HOLD(pip);
1072 				MDI_PI_UNLOCK(pip);
1073 				delay_random(mdi_delay);
1074 				MDI_PI_LOCK(pip);
1075 				MDI_PI_RELE(pip);
1076 			}
1077 		}
1078 	} else {
1079 		MDI_PHCI_LOCK(ph);
1080 	}
1081 }
1082 
1083 /*
1084  * i_mdi_phci_unlock():
1085  *		Unlock the pHCI component
1086  */
1087 static void
i_mdi_phci_unlock(mdi_phci_t * ph)1088 i_mdi_phci_unlock(mdi_phci_t *ph)
1089 {
1090 	MDI_PHCI_UNLOCK(ph);
1091 }
1092 
1093 /*
1094  * i_mdi_devinfo_create():
1095  *		create client device's devinfo node
1096  * Return Values:
1097  *		dev_info
1098  *		NULL
1099  * Notes:
1100  */
1101 static dev_info_t *
i_mdi_devinfo_create(mdi_vhci_t * vh,char * name,char * guid,char ** compatible,int ncompatible)1102 i_mdi_devinfo_create(mdi_vhci_t *vh, char *name, char *guid,
1103 	char **compatible, int ncompatible)
1104 {
1105 	dev_info_t *cdip = NULL;
1106 
1107 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1108 
1109 	/* Verify for duplicate entry */
1110 	cdip = i_mdi_devinfo_find(vh, name, guid);
1111 	ASSERT(cdip == NULL);
1112 	if (cdip) {
1113 		cmn_err(CE_WARN,
1114 		    "i_mdi_devinfo_create: client %s@%s already exists",
1115 			name ? name : "", guid ? guid : "");
1116 	}
1117 
1118 	ndi_devi_alloc_sleep(vh->vh_dip, name, DEVI_SID_NODEID, &cdip);
1119 	if (cdip == NULL)
1120 		goto fail;
1121 
1122 	/*
1123 	 * Create component type and Global unique identifier
1124 	 * properties
1125 	 */
1126 	if (ndi_prop_update_string(DDI_DEV_T_NONE, cdip,
1127 	    MDI_CLIENT_GUID_PROP, guid) != DDI_PROP_SUCCESS) {
1128 		goto fail;
1129 	}
1130 
1131 	/* Decorate the node with compatible property */
1132 	if (compatible &&
1133 	    (ndi_prop_update_string_array(DDI_DEV_T_NONE, cdip,
1134 	    "compatible", compatible, ncompatible) != DDI_PROP_SUCCESS)) {
1135 		goto fail;
1136 	}
1137 
1138 	return (cdip);
1139 
1140 fail:
1141 	if (cdip) {
1142 		(void) ndi_prop_remove_all(cdip);
1143 		(void) ndi_devi_free(cdip);
1144 	}
1145 	return (NULL);
1146 }
1147 
1148 /*
1149  * i_mdi_devinfo_find():
1150  *		Find a matching devinfo node for given client node name
1151  *		and its guid.
1152  * Return Values:
1153  *		Handle to a dev_info node or NULL
1154  */
1155 static dev_info_t *
i_mdi_devinfo_find(mdi_vhci_t * vh,caddr_t name,char * guid)1156 i_mdi_devinfo_find(mdi_vhci_t *vh, caddr_t name, char *guid)
1157 {
1158 	char			*data;
1159 	dev_info_t 		*cdip = NULL;
1160 	dev_info_t 		*ndip = NULL;
1161 	int			circular;
1162 
1163 	ndi_devi_enter(vh->vh_dip, &circular);
1164 	ndip = (dev_info_t *)DEVI(vh->vh_dip)->devi_child;
1165 	while ((cdip = ndip) != NULL) {
1166 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1167 
1168 		if (strcmp(DEVI(cdip)->devi_node_name, name)) {
1169 			continue;
1170 		}
1171 
1172 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, cdip,
1173 		    DDI_PROP_DONTPASS, MDI_CLIENT_GUID_PROP,
1174 		    &data) != DDI_PROP_SUCCESS) {
1175 			continue;
1176 		}
1177 
1178 		if (strcmp(data, guid) != 0) {
1179 			ddi_prop_free(data);
1180 			continue;
1181 		}
1182 		ddi_prop_free(data);
1183 		break;
1184 	}
1185 	ndi_devi_exit(vh->vh_dip, circular);
1186 	return (cdip);
1187 }
1188 
1189 /*
1190  * i_mdi_devinfo_remove():
1191  *		Remove a client device node
1192  */
1193 static int
i_mdi_devinfo_remove(dev_info_t * vdip,dev_info_t * cdip,int flags)1194 i_mdi_devinfo_remove(dev_info_t *vdip, dev_info_t *cdip, int flags)
1195 {
1196 	int	rv = MDI_SUCCESS;
1197 
1198 	if (i_mdi_is_child_present(vdip, cdip) == MDI_SUCCESS ||
1199 	    (flags & MDI_CLIENT_FLAGS_DEV_NOT_SUPPORTED)) {
1200 		rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN | NDI_DEVI_REMOVE);
1201 		if (rv != NDI_SUCCESS) {
1202 			MDI_DEBUG(1, (MDI_NOTE, cdip,
1203 			    "!failed: cdip %p", (void *)cdip));
1204 		}
1205 		/*
1206 		 * Convert to MDI error code
1207 		 */
1208 		switch (rv) {
1209 		case NDI_SUCCESS:
1210 			rv = MDI_SUCCESS;
1211 			break;
1212 		case NDI_BUSY:
1213 			rv = MDI_BUSY;
1214 			break;
1215 		default:
1216 			rv = MDI_FAILURE;
1217 			break;
1218 		}
1219 	}
1220 	return (rv);
1221 }
1222 
1223 /*
1224  * i_devi_get_client()
1225  *		Utility function to get mpxio component extensions
1226  */
1227 static mdi_client_t *
i_devi_get_client(dev_info_t * cdip)1228 i_devi_get_client(dev_info_t *cdip)
1229 {
1230 	mdi_client_t	*ct = NULL;
1231 
1232 	if (MDI_CLIENT(cdip)) {
1233 		ct = (mdi_client_t *)DEVI(cdip)->devi_mdi_client;
1234 	}
1235 	return (ct);
1236 }
1237 
1238 /*
1239  * i_mdi_is_child_present():
1240  *		Search for the presence of client device dev_info node
1241  */
1242 static int
i_mdi_is_child_present(dev_info_t * vdip,dev_info_t * cdip)1243 i_mdi_is_child_present(dev_info_t *vdip, dev_info_t *cdip)
1244 {
1245 	int		rv = MDI_FAILURE;
1246 	struct dev_info	*dip;
1247 	int		circular;
1248 
1249 	ndi_devi_enter(vdip, &circular);
1250 	dip = DEVI(vdip)->devi_child;
1251 	while (dip) {
1252 		if (dip == DEVI(cdip)) {
1253 			rv = MDI_SUCCESS;
1254 			break;
1255 		}
1256 		dip = dip->devi_sibling;
1257 	}
1258 	ndi_devi_exit(vdip, circular);
1259 	return (rv);
1260 }
1261 
1262 
1263 /*
1264  * i_mdi_client_lock():
1265  *		Grab client component lock
1266  * Return Values:
1267  *		None
1268  * Note:
1269  *		The default locking order is:
1270  *		_NOTE(LOCK_ORDER(mdi_client::ct_mutex mdi_pathinfo::pi_mutex))
1271  *		But there are number of situations where locks need to be
1272  *		grabbed in reverse order.  This routine implements try and lock
1273  *		mechanism depending on the requested parameter option.
1274  */
1275 static void
i_mdi_client_lock(mdi_client_t * ct,mdi_pathinfo_t * pip)1276 i_mdi_client_lock(mdi_client_t *ct, mdi_pathinfo_t *pip)
1277 {
1278 	if (pip) {
1279 		/*
1280 		 * Reverse locking is requested.
1281 		 */
1282 		while (MDI_CLIENT_TRYLOCK(ct) == 0) {
1283 			if (servicing_interrupt()) {
1284 				MDI_PI_HOLD(pip);
1285 				MDI_PI_UNLOCK(pip);
1286 				MDI_CLIENT_LOCK(ct);
1287 				MDI_PI_LOCK(pip);
1288 				MDI_PI_RELE(pip);
1289 				break;
1290 			} else {
1291 				/*
1292 				 * tryenter failed. Try to grab again
1293 				 * after a small delay
1294 				 */
1295 				MDI_PI_HOLD(pip);
1296 				MDI_PI_UNLOCK(pip);
1297 				delay_random(mdi_delay);
1298 				MDI_PI_LOCK(pip);
1299 				MDI_PI_RELE(pip);
1300 			}
1301 		}
1302 	} else {
1303 		MDI_CLIENT_LOCK(ct);
1304 	}
1305 }
1306 
1307 /*
1308  * i_mdi_client_unlock():
1309  *		Unlock a client component
1310  */
1311 static void
i_mdi_client_unlock(mdi_client_t * ct)1312 i_mdi_client_unlock(mdi_client_t *ct)
1313 {
1314 	MDI_CLIENT_UNLOCK(ct);
1315 }
1316 
1317 /*
1318  * i_mdi_client_alloc():
1319  * 		Allocate and initialize a client structure.  Caller should
1320  *		hold the vhci client lock.
1321  * Return Values:
1322  *		Handle to a client component
1323  */
1324 /*ARGSUSED*/
1325 static mdi_client_t *
i_mdi_client_alloc(mdi_vhci_t * vh,char * name,char * lguid)1326 i_mdi_client_alloc(mdi_vhci_t *vh, char *name, char *lguid)
1327 {
1328 	mdi_client_t	*ct;
1329 
1330 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1331 
1332 	/*
1333 	 * Allocate and initialize a component structure.
1334 	 */
1335 	ct = kmem_zalloc(sizeof (*ct), KM_SLEEP);
1336 	mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1337 	ct->ct_hnext = NULL;
1338 	ct->ct_hprev = NULL;
1339 	ct->ct_dip = NULL;
1340 	ct->ct_vhci = vh;
1341 	ct->ct_drvname = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1342 	(void) strcpy(ct->ct_drvname, name);
1343 	ct->ct_guid = kmem_alloc(strlen(lguid) + 1, KM_SLEEP);
1344 	(void) strcpy(ct->ct_guid, lguid);
1345 	ct->ct_cprivate = NULL;
1346 	ct->ct_vprivate = NULL;
1347 	ct->ct_flags = 0;
1348 	ct->ct_state = MDI_CLIENT_STATE_FAILED;
1349 	MDI_CLIENT_LOCK(ct);
1350 	MDI_CLIENT_SET_OFFLINE(ct);
1351 	MDI_CLIENT_SET_DETACH(ct);
1352 	MDI_CLIENT_SET_POWER_UP(ct);
1353 	MDI_CLIENT_UNLOCK(ct);
1354 	ct->ct_failover_flags = 0;
1355 	ct->ct_failover_status = 0;
1356 	cv_init(&ct->ct_failover_cv, NULL, CV_DRIVER, NULL);
1357 	ct->ct_unstable = 0;
1358 	cv_init(&ct->ct_unstable_cv, NULL, CV_DRIVER, NULL);
1359 	cv_init(&ct->ct_powerchange_cv, NULL, CV_DRIVER, NULL);
1360 	ct->ct_lb = vh->vh_lb;
1361 	ct->ct_lb_args =  kmem_zalloc(sizeof (client_lb_args_t), KM_SLEEP);
1362 	ct->ct_lb_args->region_size = LOAD_BALANCE_DEFAULT_REGION_SIZE;
1363 	ct->ct_path_count = 0;
1364 	ct->ct_path_head = NULL;
1365 	ct->ct_path_tail = NULL;
1366 	ct->ct_path_last = NULL;
1367 
1368 	/*
1369 	 * Add this client component to our client hash queue
1370 	 */
1371 	i_mdi_client_enlist_table(vh, ct);
1372 	return (ct);
1373 }
1374 
1375 /*
1376  * i_mdi_client_enlist_table():
1377  *		Attach the client device to the client hash table. Caller
1378  *		should hold the vhci client lock.
1379  */
1380 static void
i_mdi_client_enlist_table(mdi_vhci_t * vh,mdi_client_t * ct)1381 i_mdi_client_enlist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1382 {
1383 	int 			index;
1384 	struct client_hash	*head;
1385 
1386 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1387 
1388 	index = i_mdi_get_hash_key(ct->ct_guid);
1389 	head = &vh->vh_client_table[index];
1390 	ct->ct_hnext = (mdi_client_t *)head->ct_hash_head;
1391 	head->ct_hash_head = ct;
1392 	head->ct_hash_count++;
1393 	vh->vh_client_count++;
1394 }
1395 
1396 /*
1397  * i_mdi_client_delist_table():
1398  *		Attach the client device to the client hash table.
1399  *		Caller should hold the vhci client lock.
1400  */
1401 static void
i_mdi_client_delist_table(mdi_vhci_t * vh,mdi_client_t * ct)1402 i_mdi_client_delist_table(mdi_vhci_t *vh, mdi_client_t *ct)
1403 {
1404 	int			index;
1405 	char			*guid;
1406 	struct client_hash 	*head;
1407 	mdi_client_t		*next;
1408 	mdi_client_t		*last;
1409 
1410 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1411 
1412 	guid = ct->ct_guid;
1413 	index = i_mdi_get_hash_key(guid);
1414 	head = &vh->vh_client_table[index];
1415 
1416 	last = NULL;
1417 	next = (mdi_client_t *)head->ct_hash_head;
1418 	while (next != NULL) {
1419 		if (next == ct) {
1420 			break;
1421 		}
1422 		last = next;
1423 		next = next->ct_hnext;
1424 	}
1425 
1426 	if (next) {
1427 		head->ct_hash_count--;
1428 		if (last == NULL) {
1429 			head->ct_hash_head = ct->ct_hnext;
1430 		} else {
1431 			last->ct_hnext = ct->ct_hnext;
1432 		}
1433 		ct->ct_hnext = NULL;
1434 		vh->vh_client_count--;
1435 	}
1436 }
1437 
1438 
1439 /*
1440  * i_mdi_client_free():
1441  *		Free a client component
1442  */
1443 static int
i_mdi_client_free(mdi_vhci_t * vh,mdi_client_t * ct)1444 i_mdi_client_free(mdi_vhci_t *vh, mdi_client_t *ct)
1445 {
1446 	int		rv = MDI_SUCCESS;
1447 	int		flags = ct->ct_flags;
1448 	dev_info_t	*cdip;
1449 	dev_info_t	*vdip;
1450 
1451 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1452 
1453 	vdip = vh->vh_dip;
1454 	cdip = ct->ct_dip;
1455 
1456 	(void) ndi_prop_remove(DDI_DEV_T_NONE, cdip, MDI_CLIENT_GUID_PROP);
1457 	DEVI(cdip)->devi_mdi_component &= ~MDI_COMPONENT_CLIENT;
1458 	DEVI(cdip)->devi_mdi_client = NULL;
1459 
1460 	/*
1461 	 * Clear out back ref. to dev_info_t node
1462 	 */
1463 	ct->ct_dip = NULL;
1464 
1465 	/*
1466 	 * Remove this client from our hash queue
1467 	 */
1468 	i_mdi_client_delist_table(vh, ct);
1469 
1470 	/*
1471 	 * Uninitialize and free the component
1472 	 */
1473 	kmem_free(ct->ct_drvname, strlen(ct->ct_drvname) + 1);
1474 	kmem_free(ct->ct_guid, strlen(ct->ct_guid) + 1);
1475 	kmem_free(ct->ct_lb_args, sizeof (client_lb_args_t));
1476 	cv_destroy(&ct->ct_failover_cv);
1477 	cv_destroy(&ct->ct_unstable_cv);
1478 	cv_destroy(&ct->ct_powerchange_cv);
1479 	mutex_destroy(&ct->ct_mutex);
1480 	kmem_free(ct, sizeof (*ct));
1481 
1482 	MDI_VHCI_CLIENT_UNLOCK(vh);
1483 	(void) i_mdi_devinfo_remove(vdip, cdip, flags);
1484 	MDI_VHCI_CLIENT_LOCK(vh);
1485 
1486 	return (rv);
1487 }
1488 
1489 /*
1490  * i_mdi_client_find():
1491  * 		Find the client structure corresponding to a given guid
1492  *		Caller should hold the vhci client lock.
1493  */
1494 static mdi_client_t *
i_mdi_client_find(mdi_vhci_t * vh,char * cname,char * guid)1495 i_mdi_client_find(mdi_vhci_t *vh, char *cname, char *guid)
1496 {
1497 	int			index;
1498 	struct client_hash	*head;
1499 	mdi_client_t		*ct;
1500 
1501 	ASSERT(MDI_VHCI_CLIENT_LOCKED(vh));
1502 
1503 	index = i_mdi_get_hash_key(guid);
1504 	head = &vh->vh_client_table[index];
1505 
1506 	ct = head->ct_hash_head;
1507 	while (ct != NULL) {
1508 		if (strcmp(ct->ct_guid, guid) == 0 &&
1509 		    (cname == NULL || strcmp(ct->ct_drvname, cname) == 0)) {
1510 			break;
1511 		}
1512 		ct = ct->ct_hnext;
1513 	}
1514 	return (ct);
1515 }
1516 
1517 /*
1518  * i_mdi_client_update_state():
1519  *		Compute and update client device state
1520  * Notes:
1521  *		A client device can be in any of three possible states:
1522  *
1523  *		MDI_CLIENT_STATE_OPTIMAL - Client in optimal state with more
1524  *		one online/standby paths. Can tolerate failures.
1525  *		MDI_CLIENT_STATE_DEGRADED - Client device in degraded state with
1526  *		no alternate paths available as standby. A failure on the online
1527  *		would result in loss of access to device data.
1528  *		MDI_CLIENT_STATE_FAILED - Client device in failed state with
1529  *		no paths available to access the device.
1530  */
1531 static void
i_mdi_client_update_state(mdi_client_t * ct)1532 i_mdi_client_update_state(mdi_client_t *ct)
1533 {
1534 	int state;
1535 
1536 	ASSERT(MDI_CLIENT_LOCKED(ct));
1537 	state = i_mdi_client_compute_state(ct, NULL);
1538 	MDI_CLIENT_SET_STATE(ct, state);
1539 }
1540 
1541 /*
1542  * i_mdi_client_compute_state():
1543  *		Compute client device state
1544  *
1545  *		mdi_phci_t *	Pointer to pHCI structure which should
1546  *				while computing the new value.  Used by
1547  *				i_mdi_phci_offline() to find the new
1548  *				client state after DR of a pHCI.
1549  */
1550 static int
i_mdi_client_compute_state(mdi_client_t * ct,mdi_phci_t * ph)1551 i_mdi_client_compute_state(mdi_client_t *ct, mdi_phci_t *ph)
1552 {
1553 	int		state;
1554 	int		online_count = 0;
1555 	int		standby_count = 0;
1556 	mdi_pathinfo_t	*pip, *next;
1557 
1558 	ASSERT(MDI_CLIENT_LOCKED(ct));
1559 	pip = ct->ct_path_head;
1560 	while (pip != NULL) {
1561 		MDI_PI_LOCK(pip);
1562 		next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
1563 		if (MDI_PI(pip)->pi_phci == ph) {
1564 			MDI_PI_UNLOCK(pip);
1565 			pip = next;
1566 			continue;
1567 		}
1568 
1569 		if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1570 				== MDI_PATHINFO_STATE_ONLINE)
1571 			online_count++;
1572 		else if ((MDI_PI(pip)->pi_state & MDI_PATHINFO_STATE_MASK)
1573 				== MDI_PATHINFO_STATE_STANDBY)
1574 			standby_count++;
1575 		MDI_PI_UNLOCK(pip);
1576 		pip = next;
1577 	}
1578 
1579 	if (online_count == 0) {
1580 		if (standby_count == 0) {
1581 			state = MDI_CLIENT_STATE_FAILED;
1582 			MDI_DEBUG(2, (MDI_NOTE, ct->ct_dip,
1583 			    "client state failed: ct = %p", (void *)ct));
1584 		} else if (standby_count == 1) {
1585 			state = MDI_CLIENT_STATE_DEGRADED;
1586 		} else {
1587 			state = MDI_CLIENT_STATE_OPTIMAL;
1588 		}
1589 	} else if (online_count == 1) {
1590 		if (standby_count == 0) {
1591 			state = MDI_CLIENT_STATE_DEGRADED;
1592 		} else {
1593 			state = MDI_CLIENT_STATE_OPTIMAL;
1594 		}
1595 	} else {
1596 		state = MDI_CLIENT_STATE_OPTIMAL;
1597 	}
1598 	return (state);
1599 }
1600 
1601 /*
1602  * i_mdi_client2devinfo():
1603  *		Utility function
1604  */
1605 dev_info_t *
i_mdi_client2devinfo(mdi_client_t * ct)1606 i_mdi_client2devinfo(mdi_client_t *ct)
1607 {
1608 	return (ct->ct_dip);
1609 }
1610 
1611 /*
1612  * mdi_client_path2_devinfo():
1613  * 		Given the parent devinfo and child devfs pathname, search for
1614  *		a valid devfs node handle.
1615  */
1616 dev_info_t *
mdi_client_path2devinfo(dev_info_t * vdip,char * pathname)1617 mdi_client_path2devinfo(dev_info_t *vdip, char *pathname)
1618 {
1619 	dev_info_t 	*cdip = NULL;
1620 	dev_info_t 	*ndip = NULL;
1621 	char		*temp_pathname;
1622 	int		circular;
1623 
1624 	/*
1625 	 * Allocate temp buffer
1626 	 */
1627 	temp_pathname = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1628 
1629 	/*
1630 	 * Lock parent against changes
1631 	 */
1632 	ndi_devi_enter(vdip, &circular);
1633 	ndip = (dev_info_t *)DEVI(vdip)->devi_child;
1634 	while ((cdip = ndip) != NULL) {
1635 		ndip = (dev_info_t *)DEVI(cdip)->devi_sibling;
1636 
1637 		*temp_pathname = '\0';
1638 		(void) ddi_pathname(cdip, temp_pathname);
1639 		if (strcmp(temp_pathname, pathname) == 0) {
1640 			break;
1641 		}
1642 	}
1643 	/*
1644 	 * Release devinfo lock
1645 	 */
1646 	ndi_devi_exit(vdip, circular);
1647 
1648 	/*
1649 	 * Free the temp buffer
1650 	 */
1651 	kmem_free(temp_pathname, MAXPATHLEN);
1652 	return (cdip);
1653 }
1654 
1655 /*
1656  * mdi_client_get_path_count():
1657  * 		Utility function to get number of path information nodes
1658  *		associated with a given client device.
1659  */
1660 int
mdi_client_get_path_count(dev_info_t * cdip)1661 mdi_client_get_path_count(dev_info_t *cdip)
1662 {
1663 	mdi_client_t	*ct;
1664 	int		count = 0;
1665 
1666 	ct = i_devi_get_client(cdip);
1667 	if (ct != NULL) {
1668 		count = ct->ct_path_count;
1669 	}
1670 	return (count);
1671 }
1672 
1673 
1674 /*
1675  * i_mdi_get_hash_key():
1676  * 		Create a hash using strings as keys
1677  *
1678  */
1679 static int
i_mdi_get_hash_key(char * str)1680 i_mdi_get_hash_key(char *str)
1681 {
1682 	uint32_t	g, hash = 0;
1683 	char		*p;
1684 
1685 	for (p = str; *p != '\0'; p++) {
1686 		g = *p;
1687 		hash += g;
1688 	}
1689 	return (hash % (CLIENT_HASH_TABLE_SIZE - 1));
1690 }
1691 
1692 /*
1693  * mdi_get_lb_policy():
1694  * 		Get current load balancing policy for a given client device
1695  */
1696 client_lb_t
mdi_get_lb_policy(dev_info_t * cdip)1697 mdi_get_lb_policy(dev_info_t *cdip)
1698 {
1699 	client_lb_t	lb = LOAD_BALANCE_NONE;
1700 	mdi_client_t	*ct;
1701 
1702 	ct = i_devi_get_client(cdip);
1703 	if (ct != NULL) {
1704 		lb = ct->ct_lb;
1705 	}
1706 	return (lb);
1707 }
1708 
1709 /*
1710  * mdi_set_lb_region_size():
1711  * 		Set current region size for the load-balance
1712  */
1713 int
mdi_set_lb_region_size(dev_info_t * cdip,int region_size)1714 mdi_set_lb_region_size(dev_info_t *cdip, int region_size)
1715 {
1716 	mdi_client_t	*ct;
1717 	int		rv = MDI_FAILURE;
1718 
1719 	ct = i_devi_get_client(cdip);
1720 	if (ct != NULL && ct->ct_lb_args != NULL) {
1721 		ct->ct_lb_args->region_size = region_size;
1722 		rv = MDI_SUCCESS;
1723 	}
1724 	return (rv);
1725 }
1726 
1727 /*
1728  * mdi_Set_lb_policy():
1729  * 		Set current load balancing policy for a given client device
1730  */
1731 int
mdi_set_lb_policy(dev_info_t * cdip,client_lb_t lb)1732 mdi_set_lb_policy(dev_info_t *cdip, client_lb_t lb)
1733 {
1734 	mdi_client_t	*ct;
1735 	int		rv = MDI_FAILURE;
1736 
1737 	ct = i_devi_get_client(cdip);
1738 	if (ct != NULL) {
1739 		ct->ct_lb = lb;
1740 		rv = MDI_SUCCESS;
1741 	}
1742 	return (rv);
1743 }
1744 
/*
 * mdi_failover_cb():
 *		Task queue callback used for asynchronous failover.  It
 *		adapts i_mdi_failover(), which returns a status, to a void
 *		callback; the status is discarded here.  arg is the
 *		mdi_client_t handed to taskq_dispatch() by mdi_failover().
 */
static void
mdi_failover_cb(void *arg)
{
	(void) i_mdi_failover(arg);
}
1750 
1751 /*
1752  * mdi_failover():
1753  *		failover function called by the vHCI drivers to initiate
1754  *		a failover operation.  This is typically due to non-availability
1755  *		of online paths to route I/O requests.  Failover can be
1756  *		triggered through user application also.
1757  *
1758  *		The vHCI driver calls mdi_failover() to initiate a failover
1759  *		operation. mdi_failover() calls back into the vHCI driver's
1760  *		vo_failover() entry point to perform the actual failover
1761  *		operation.  The reason for requiring the vHCI driver to
1762  *		initiate failover by calling mdi_failover(), instead of directly
1763  *		executing vo_failover() itself, is to ensure that the mdi
1764  *		framework can keep track of the client state properly.
1765  *		Additionally, mdi_failover() provides as a convenience the
1766  *		option of performing the failover operation synchronously or
1767  *		asynchronously
1768  *
1769  *		Upon successful completion of the failover operation, the
1770  *		paths that were previously ONLINE will be in the STANDBY state,
1771  *		and the newly activated paths will be in the ONLINE state.
1772  *
1773  *		The flags modifier determines whether the activation is done
1774  *		synchronously: MDI_FAILOVER_SYNC
1775  * Return Values:
1776  *		MDI_SUCCESS
1777  *		MDI_FAILURE
1778  *		MDI_BUSY
1779  */
1780 /*ARGSUSED*/
1781 int
mdi_failover(dev_info_t * vdip,dev_info_t * cdip,int flags)1782 mdi_failover(dev_info_t *vdip, dev_info_t *cdip, int flags)
1783 {
1784 	int			rv;
1785 	mdi_client_t		*ct;
1786 
1787 	ct = i_devi_get_client(cdip);
1788 	ASSERT(ct != NULL);
1789 	if (ct == NULL) {
1790 		/* cdip is not a valid client device. Nothing more to do. */
1791 		return (MDI_FAILURE);
1792 	}
1793 
1794 	MDI_CLIENT_LOCK(ct);
1795 
1796 	if (MDI_CLIENT_IS_PATH_FREE_IN_PROGRESS(ct)) {
1797 		/* A path to the client is being freed */
1798 		MDI_CLIENT_UNLOCK(ct);
1799 		return (MDI_BUSY);
1800 	}
1801 
1802 
1803 	if (MDI_CLIENT_IS_FAILED(ct)) {
1804 		/*
1805 		 * Client is in failed state. Nothing more to do.
1806 		 */
1807 		MDI_CLIENT_UNLOCK(ct);
1808 		return (MDI_FAILURE);
1809 	}
1810 
1811 	if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
1812 		/*
1813 		 * Failover is already in progress; return BUSY
1814 		 */
1815 		MDI_CLIENT_UNLOCK(ct);
1816 		return (MDI_BUSY);
1817 	}
1818 	/*
1819 	 * Make sure that mdi_pathinfo node state changes are processed.
1820 	 * We do not allow failovers to progress while client path state
1821 	 * changes are in progress
1822 	 */
1823 	if (ct->ct_unstable) {
1824 		if (flags == MDI_FAILOVER_ASYNC) {
1825 			MDI_CLIENT_UNLOCK(ct);
1826 			return (MDI_BUSY);
1827 		} else {
1828 			while (ct->ct_unstable)
1829 				cv_wait(&ct->ct_unstable_cv, &ct->ct_mutex);
1830 		}
1831 	}
1832 
1833 	/*
1834 	 * Client device is in stable state. Before proceeding, perform sanity
1835 	 * checks again.
1836 	 */
1837 	if ((MDI_CLIENT_IS_DETACHED(ct)) || (MDI_CLIENT_IS_FAILED(ct)) ||
1838 	    (!i_ddi_devi_attached(cdip))) {
1839 		/*
1840 		 * Client is in failed state. Nothing more to do.
1841 		 */
1842 		MDI_CLIENT_UNLOCK(ct);
1843 		return (MDI_FAILURE);
1844 	}
1845 
1846 	/*
1847 	 * Set the client state as failover in progress.
1848 	 */
1849 	MDI_CLIENT_SET_FAILOVER_IN_PROGRESS(ct);
1850 	ct->ct_failover_flags = flags;
1851 	MDI_CLIENT_UNLOCK(ct);
1852 
1853 	if (flags == MDI_FAILOVER_ASYNC) {
1854 		/*
1855 		 * Submit the initiate failover request via CPR safe
1856 		 * taskq threads.
1857 		 */
1858 		(void) taskq_dispatch(mdi_taskq, mdi_failover_cb, ct, KM_SLEEP);
1859 		return (MDI_ACCEPT);
1860 	} else {
1861 		/*
1862 		 * Synchronous failover mode.  Typically invoked from the user
1863 		 * land.
1864 		 */
1865 		rv = i_mdi_failover(ct);
1866 	}
1867 	return (rv);
1868 }
1869 
1870 /*
1871  * i_mdi_failover():
1872  *		internal failover function. Invokes vHCI drivers failover
1873  *		callback function and process the failover status
1874  * Return Values:
1875  *		None
1876  *
1877  * Note: A client device in failover state can not be detached or freed.
1878  */
1879 static int
i_mdi_failover(void * arg)1880 i_mdi_failover(void *arg)
1881 {
1882 	int		rv = MDI_SUCCESS;
1883 	mdi_client_t	*ct = (mdi_client_t *)arg;
1884 	mdi_vhci_t	*vh = ct->ct_vhci;
1885 
1886 	ASSERT(!MDI_CLIENT_LOCKED(ct));
1887 
1888 	if (vh->vh_ops->vo_failover != NULL) {
1889 		/*
1890 		 * Call vHCI drivers callback routine
1891 		 */
1892 		rv = (*vh->vh_ops->vo_failover)(vh->vh_dip, ct->ct_dip,
1893 		    ct->ct_failover_flags);
1894 	}
1895 
1896 	MDI_CLIENT_LOCK(ct);
1897 	MDI_CLIENT_CLEAR_FAILOVER_IN_PROGRESS(ct);
1898 
1899 	/*
1900 	 * Save the failover return status
1901 	 */
1902 	ct->ct_failover_status = rv;
1903 
1904 	/*
1905 	 * As a result of failover, client status would have been changed.
1906 	 * Update the client state and wake up anyone waiting on this client
1907 	 * device.
1908 	 */
1909 	i_mdi_client_update_state(ct);
1910 
1911 	cv_broadcast(&ct->ct_failover_cv);
1912 	MDI_CLIENT_UNLOCK(ct);
1913 	return (rv);
1914 }
1915 
1916 /*
1917  * Load balancing is logical block.
1918  * IOs within the range described by region_size
1919  * would go on the same path. This would improve the
1920  * performance by cache-hit on some of the RAID devices.
1921  * Search only for online paths(At some point we
1922  * may want to balance across target ports).
1923  * If no paths are found then default to round-robin.
1924  */
1925 static int
i_mdi_lba_lb(mdi_client_t * ct,mdi_pathinfo_t ** ret_pip,struct buf * bp)1926 i_mdi_lba_lb(mdi_client_t *ct, mdi_pathinfo_t **ret_pip, struct buf *bp)
1927 {
1928 	int		path_index = -1;
1929 	int		online_path_count = 0;
1930 	int		online_nonpref_path_count = 0;
1931 	int 		region_size = ct->ct_lb_args->region_size;
1932 	mdi_pathinfo_t	*pip;
1933 	mdi_pathinfo_t	*next;
1934 	int		preferred, path_cnt;
1935 
1936 	pip = ct->ct_path_head;
1937 	while (pip) {
1938 		MDI_PI_LOCK(pip);
1939 		if (MDI_PI(pip)->pi_state ==
1940 		    MDI_PATHINFO_STATE_ONLINE && MDI_PI(pip)->pi_preferred) {
1941 			online_path_count++;
1942 		} else if (MDI_PI(pip)->pi_state ==
1943 		    MDI_PATHINFO_STATE_ONLINE && !MDI_PI(pip)->pi_preferred) {
1944 			online_nonpref_path_count++;
1945 		}
1946 		next = (mdi_pathinfo_t *)
1947 		    MDI_PI(pip)->pi_client_link;
1948 		MDI_PI_UNLOCK(pip);
1949 		pip = next;
1950 	}
1951 	/* if found any online/preferred then use this type */
1952 	if (online_path_count > 0) {
1953 		path_cnt = online_path_count;
1954 		preferred = 1;
1955 	} else if (online_nonpref_path_count > 0) {
1956 		path_cnt = online_nonpref_path_count;
1957 		preferred = 0;
1958 	} else {
1959 		path_cnt = 0;
1960 	}
1961 	if (path_cnt) {
1962 		path_index = (bp->b_blkno >> region_size) % path_cnt;
1963 		pip = ct->ct_path_head;
1964 		while (pip && path_index != -1) {
1965 			MDI_PI_LOCK(pip);
1966 			if (path_index == 0 &&
1967 			    (MDI_PI(pip)->pi_state ==
1968 			    MDI_PATHINFO_STATE_ONLINE) &&
1969 				MDI_PI(pip)->pi_preferred == preferred) {
1970 				MDI_PI_HOLD(pip);
1971 				MDI_PI_UNLOCK(pip);
1972 				*ret_pip = pip;
1973 				return (MDI_SUCCESS);
1974 			}
1975 			path_index --;
1976 			next = (mdi_pathinfo_t *)
1977 			    MDI_PI(pip)->pi_client_link;
1978 			MDI_PI_UNLOCK(pip);
1979 			pip = next;
1980 		}
1981 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
1982 		    "lba %llx: path %s %p",
1983 		    bp->b_lblkno, mdi_pi_spathname(pip), (void *)pip));
1984 	}
1985 	return (MDI_FAILURE);
1986 }
1987 
1988 /*
1989  * mdi_select_path():
1990  *		select a path to access a client device.
1991  *
1992  *		mdi_select_path() function is called by the vHCI drivers to
1993  *		select a path to route the I/O request to.  The caller passes
1994  *		the block I/O data transfer structure ("buf") as one of the
1995  *		parameters.  The mpxio framework uses the buf structure
1996  *		contents to maintain per path statistics (total I/O size /
1997  *		count pending).  If more than one online paths are available to
1998  *		select, the framework automatically selects a suitable path
1999  *		for routing I/O request. If a failover operation is active for
2000  *		this client device the call shall be failed with MDI_BUSY error
2001  *		code.
2002  *
2003  *		By default this function returns a suitable path in online
2004  *		state based on the current load balancing policy.  Currently
2005  *		we support LOAD_BALANCE_NONE (Previously selected online path
2006  *		will continue to be used till the path is usable) and
2007  *		LOAD_BALANCE_RR (Online paths will be selected in a round
2008  *		robin fashion), LOAD_BALANCE_LB(Online paths will be selected
2009  *		based on the logical block).  The load balancing policy is
2010  *		set through the vHCI driver's configuration file (driver.conf).
2011  *
2012  *		vHCI drivers may override this default behavior by specifying
2013  *		appropriate flags.  The meaning of the third argument depends
2014  *		on the flags specified. If MDI_SELECT_PATH_INSTANCE is set
2015  *		then the argument is the "path instance" of the path to select.
2016  *		If MDI_SELECT_PATH_INSTANCE is not set then the argument is
2017  *		"start_pip". A non NULL "start_pip" is the starting point to
2018  *		walk and find the next appropriate path.  The following values
2019  *		are currently defined: MDI_SELECT_ONLINE_PATH (to select an
2020  *		ONLINE path) and/or MDI_SELECT_STANDBY_PATH (to select an
2021  *		STANDBY path).
2022  *
2023  *		The non-standard behavior is used by the scsi_vhci driver,
2024  *		whenever it has to use a STANDBY/FAULTED path.  Eg. during
2025  *		attach of client devices (to avoid an unnecessary failover
2026  *		when the STANDBY path comes up first), during failover
2027  *		(to activate a STANDBY path as ONLINE).
2028  *
2029  *		The selected path is returned in a mdi_hold_path() state
2030  *		(pi_ref_cnt). Caller should release the hold by calling
2031  *		mdi_rele_path().
2032  *
2033  * Return Values:
2034  *		MDI_SUCCESS	- Completed successfully
2035  *		MDI_BUSY 	- Client device is busy failing over
2036  *		MDI_NOPATH	- Client device is online, but no valid paths are
2037  *				  available to access this client device
2038  *		MDI_FAILURE	- Invalid client device or state
2039  *		MDI_DEVI_ONLINING
2040  *				- Client device (struct dev_info state) is in
2041  *				  onlining state.
2042  */
2043 
2044 /*ARGSUSED*/
2045 int
mdi_select_path(dev_info_t * cdip,struct buf * bp,int flags,void * arg,mdi_pathinfo_t ** ret_pip)2046 mdi_select_path(dev_info_t *cdip, struct buf *bp, int flags,
2047     void *arg, mdi_pathinfo_t **ret_pip)
2048 {
2049 	mdi_client_t	*ct;
2050 	mdi_pathinfo_t	*pip;
2051 	mdi_pathinfo_t	*next;
2052 	mdi_pathinfo_t	*head;
2053 	mdi_pathinfo_t	*start;
2054 	client_lb_t	lbp;	/* load balancing policy */
2055 	int		sb = 1;	/* standard behavior */
2056 	int		preferred = 1;	/* preferred path */
2057 	int		cond, cont = 1;
2058 	int		retry = 0;
2059 	mdi_pathinfo_t	*start_pip;	/* request starting pathinfo */
2060 	int		path_instance;	/* request specific path instance */
2061 
2062 	/* determine type of arg based on flags */
2063 	if (flags & MDI_SELECT_PATH_INSTANCE) {
2064 		path_instance = (int)(intptr_t)arg;
2065 		start_pip = NULL;
2066 	} else {
2067 		path_instance = 0;
2068 		start_pip = (mdi_pathinfo_t *)arg;
2069 	}
2070 
2071 	if (flags != 0) {
2072 		/*
2073 		 * disable default behavior
2074 		 */
2075 		sb = 0;
2076 	}
2077 
2078 	*ret_pip = NULL;
2079 	ct = i_devi_get_client(cdip);
2080 	if (ct == NULL) {
2081 		/* mdi extensions are NULL, Nothing more to do */
2082 		return (MDI_FAILURE);
2083 	}
2084 
2085 	MDI_CLIENT_LOCK(ct);
2086 
2087 	if (sb) {
2088 		if (MDI_CLIENT_IS_FAILED(ct)) {
2089 			/*
2090 			 * Client is not ready to accept any I/O requests.
2091 			 * Fail this request.
2092 			 */
2093 			MDI_DEBUG(2, (MDI_NOTE, cdip,
2094 			    "client state offline ct = %p", (void *)ct));
2095 			MDI_CLIENT_UNLOCK(ct);
2096 			return (MDI_FAILURE);
2097 		}
2098 
2099 		if (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct)) {
2100 			/*
2101 			 * Check for Failover is in progress. If so tell the
2102 			 * caller that this device is busy.
2103 			 */
2104 			MDI_DEBUG(2, (MDI_NOTE, cdip,
2105 			    "client failover in progress ct = %p",
2106 			    (void *)ct));
2107 			MDI_CLIENT_UNLOCK(ct);
2108 			return (MDI_BUSY);
2109 		}
2110 
2111 		/*
2112 		 * Check to see whether the client device is attached.
2113 		 * If not so, let the vHCI driver manually select a path
2114 		 * (standby) and let the probe/attach process to continue.
2115 		 */
2116 		if (MDI_CLIENT_IS_DETACHED(ct) || !i_ddi_devi_attached(cdip)) {
2117 			MDI_DEBUG(4, (MDI_NOTE, cdip,
2118 			    "devi is onlining ct = %p", (void *)ct));
2119 			MDI_CLIENT_UNLOCK(ct);
2120 			return (MDI_DEVI_ONLINING);
2121 		}
2122 	}
2123 
2124 	/*
2125 	 * Cache in the client list head.  If head of the list is NULL
2126 	 * return MDI_NOPATH
2127 	 */
2128 	head = ct->ct_path_head;
2129 	if (head == NULL) {
2130 		MDI_CLIENT_UNLOCK(ct);
2131 		return (MDI_NOPATH);
2132 	}
2133 
2134 	/* Caller is specifying a specific pathinfo path by path_instance */
2135 	if (path_instance) {
2136 		/* search for pathinfo with correct path_instance */
2137 		for (pip = head;
2138 		    pip && (mdi_pi_get_path_instance(pip) != path_instance);
2139 		    pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link)
2140 			;
2141 
2142 		/* If path can't be selected then MDI_NOPATH is returned. */
2143 		if (pip == NULL) {
2144 			MDI_CLIENT_UNLOCK(ct);
2145 			return (MDI_NOPATH);
2146 		}
2147 
2148 		/*
2149 		 * Verify state of path. When asked to select a specific
2150 		 * path_instance, we select the requested path in any
2151 		 * state (ONLINE, OFFLINE, STANDBY, FAULT) other than INIT.
2152 		 * We don't however select paths where the pHCI has detached.
2153 		 * NOTE: last pathinfo node of an opened client device may
2154 		 * exist in an OFFLINE state after the pHCI associated with
2155 		 * that path has detached (but pi_phci will be NULL if that
2156 		 * has occurred).
2157 		 */
2158 		MDI_PI_LOCK(pip);
2159 		if ((MDI_PI(pip)->pi_state == MDI_PATHINFO_STATE_INIT) ||
2160 		    (MDI_PI(pip)->pi_phci == NULL)) {
2161 			MDI_PI_UNLOCK(pip);
2162 			MDI_CLIENT_UNLOCK(ct);
2163 			return (MDI_FAILURE);
2164 		}
2165 
2166 		/* Return MDI_BUSY if we have a transient condition */
2167 		if (MDI_PI_IS_TRANSIENT(pip)) {
2168 			MDI_PI_UNLOCK(pip);
2169 			MDI_CLIENT_UNLOCK(ct);
2170 			return (MDI_BUSY);
2171 		}
2172 
2173 		/*
2174 		 * Return the path in hold state. Caller should release the
2175 		 * lock by calling mdi_rele_path()
2176 		 */
2177 		MDI_PI_HOLD(pip);
2178 		MDI_PI_UNLOCK(pip);
2179 		*ret_pip = pip;
2180 		MDI_CLIENT_UNLOCK(ct);
2181 		return (MDI_SUCCESS);
2182 	}
2183 
2184 	/*
2185 	 * for non default behavior, bypass current
2186 	 * load balancing policy and always use LOAD_BALANCE_RR
2187 	 * except that the start point will be adjusted based
2188 	 * on the provided start_pip
2189 	 */
2190 	lbp = sb ? ct->ct_lb : LOAD_BALANCE_RR;
2191 
2192 	switch (lbp) {
2193 	case LOAD_BALANCE_NONE:
2194 		/*
2195 		 * Load balancing is None  or Alternate path mode
2196 		 * Start looking for a online mdi_pathinfo node starting from
2197 		 * last known selected path
2198 		 */
2199 		preferred = 1;
2200 		pip = (mdi_pathinfo_t *)ct->ct_path_last;
2201 		if (pip == NULL) {
2202 			pip = head;
2203 		}
2204 		start = pip;
2205 		do {
2206 			MDI_PI_LOCK(pip);
2207 			/*
2208 			 * No need to explicitly check if the path is disabled.
2209 			 * Since we are checking for state == ONLINE and the
2210 			 * same variable is used for DISABLE/ENABLE information.
2211 			 */
2212 			if ((MDI_PI(pip)->pi_state  ==
2213 				MDI_PATHINFO_STATE_ONLINE) &&
2214 				preferred == MDI_PI(pip)->pi_preferred) {
2215 				/*
2216 				 * Return the path in hold state. Caller should
2217 				 * release the lock by calling mdi_rele_path()
2218 				 */
2219 				MDI_PI_HOLD(pip);
2220 				MDI_PI_UNLOCK(pip);
2221 				ct->ct_path_last = pip;
2222 				*ret_pip = pip;
2223 				MDI_CLIENT_UNLOCK(ct);
2224 				return (MDI_SUCCESS);
2225 			}
2226 
2227 			/*
2228 			 * Path is busy.
2229 			 */
2230 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2231 			    MDI_PI_IS_TRANSIENT(pip))
2232 				retry = 1;
2233 			/*
2234 			 * Keep looking for a next available online path
2235 			 */
2236 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2237 			if (next == NULL) {
2238 				next = head;
2239 			}
2240 			MDI_PI_UNLOCK(pip);
2241 			pip = next;
2242 			if (start == pip && preferred) {
2243 				preferred = 0;
2244 			} else if (start == pip && !preferred) {
2245 				cont = 0;
2246 			}
2247 		} while (cont);
2248 		break;
2249 
2250 	case LOAD_BALANCE_LBA:
2251 		/*
2252 		 * Make sure we are looking
2253 		 * for an online path. Otherwise, if it is for a STANDBY
2254 		 * path request, it will go through and fetch an ONLINE
2255 		 * path which is not desirable.
2256 		 */
2257 		if ((ct->ct_lb_args != NULL) &&
2258 			    (ct->ct_lb_args->region_size) && bp &&
2259 				(sb || (flags == MDI_SELECT_ONLINE_PATH))) {
2260 			if (i_mdi_lba_lb(ct, ret_pip, bp)
2261 				    == MDI_SUCCESS) {
2262 				MDI_CLIENT_UNLOCK(ct);
2263 				return (MDI_SUCCESS);
2264 			}
2265 		}
2266 		/* FALLTHROUGH */
2267 	case LOAD_BALANCE_RR:
2268 		/*
2269 		 * Load balancing is Round Robin. Start looking for a online
2270 		 * mdi_pathinfo node starting from last known selected path
2271 		 * as the start point.  If override flags are specified,
2272 		 * process accordingly.
2273 		 * If the search is already in effect(start_pip not null),
2274 		 * then lets just use the same path preference to continue the
2275 		 * traversal.
2276 		 */
2277 
2278 		if (start_pip != NULL) {
2279 			preferred = MDI_PI(start_pip)->pi_preferred;
2280 		} else {
2281 			preferred = 1;
2282 		}
2283 
2284 		start = sb ? (mdi_pathinfo_t *)ct->ct_path_last : start_pip;
2285 		if (start == NULL) {
2286 			pip = head;
2287 		} else {
2288 			pip = (mdi_pathinfo_t *)MDI_PI(start)->pi_client_link;
2289 			if (pip == NULL) {
2290 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2291 					/*
2292 					 * Return since we hit the end of list
2293 					 */
2294 					MDI_CLIENT_UNLOCK(ct);
2295 					return (MDI_NOPATH);
2296 				}
2297 
2298 				if (!sb) {
2299 					if (preferred == 0) {
2300 						/*
2301 						 * Looks like we have completed
2302 						 * the traversal as preferred
2303 						 * value is 0. Time to bail out.
2304 						 */
2305 						*ret_pip = NULL;
2306 						MDI_CLIENT_UNLOCK(ct);
2307 						return (MDI_NOPATH);
2308 					} else {
2309 						/*
2310 						 * Looks like we reached the
2311 						 * end of the list. Lets enable
2312 						 * traversal of non preferred
2313 						 * paths.
2314 						 */
2315 						preferred = 0;
2316 					}
2317 				}
2318 				pip = head;
2319 			}
2320 		}
2321 		start = pip;
2322 		do {
2323 			MDI_PI_LOCK(pip);
2324 			if (sb) {
2325 				cond = ((MDI_PI(pip)->pi_state ==
2326 				    MDI_PATHINFO_STATE_ONLINE &&
2327 					MDI_PI(pip)->pi_preferred ==
2328 						preferred) ? 1 : 0);
2329 			} else {
2330 				if (flags == MDI_SELECT_ONLINE_PATH) {
2331 					cond = ((MDI_PI(pip)->pi_state ==
2332 					    MDI_PATHINFO_STATE_ONLINE &&
2333 						MDI_PI(pip)->pi_preferred ==
2334 						preferred) ? 1 : 0);
2335 				} else if (flags == MDI_SELECT_STANDBY_PATH) {
2336 					cond = ((MDI_PI(pip)->pi_state ==
2337 					    MDI_PATHINFO_STATE_STANDBY &&
2338 						MDI_PI(pip)->pi_preferred ==
2339 						preferred) ? 1 : 0);
2340 				} else if (flags == (MDI_SELECT_ONLINE_PATH |
2341 				    MDI_SELECT_STANDBY_PATH)) {
2342 					cond = (((MDI_PI(pip)->pi_state ==
2343 					    MDI_PATHINFO_STATE_ONLINE ||
2344 					    (MDI_PI(pip)->pi_state ==
2345 					    MDI_PATHINFO_STATE_STANDBY)) &&
2346 						MDI_PI(pip)->pi_preferred ==
2347 						preferred) ? 1 : 0);
2348 				} else if (flags ==
2349 					(MDI_SELECT_STANDBY_PATH |
2350 					MDI_SELECT_ONLINE_PATH |
2351 					MDI_SELECT_USER_DISABLE_PATH)) {
2352 					cond = (((MDI_PI(pip)->pi_state ==
2353 					    MDI_PATHINFO_STATE_ONLINE ||
2354 					    (MDI_PI(pip)->pi_state ==
2355 					    MDI_PATHINFO_STATE_STANDBY) ||
2356 						(MDI_PI(pip)->pi_state ==
2357 					    (MDI_PATHINFO_STATE_ONLINE|
2358 					    MDI_PATHINFO_STATE_USER_DISABLE)) ||
2359 						(MDI_PI(pip)->pi_state ==
2360 					    (MDI_PATHINFO_STATE_STANDBY |
2361 					    MDI_PATHINFO_STATE_USER_DISABLE)))&&
2362 						MDI_PI(pip)->pi_preferred ==
2363 						preferred) ? 1 : 0);
2364 				} else if (flags ==
2365 				    (MDI_SELECT_STANDBY_PATH |
2366 				    MDI_SELECT_ONLINE_PATH |
2367 				    MDI_SELECT_NO_PREFERRED)) {
2368 					cond = (((MDI_PI(pip)->pi_state ==
2369 					    MDI_PATHINFO_STATE_ONLINE) ||
2370 					    (MDI_PI(pip)->pi_state ==
2371 					    MDI_PATHINFO_STATE_STANDBY))
2372 					    ? 1 : 0);
2373 				} else {
2374 					cond = 0;
2375 				}
2376 			}
2377 			/*
2378 			 * No need to explicitly check if the path is disabled.
2379 			 * Since we are checking for state == ONLINE and the
2380 			 * same variable is used for DISABLE/ENABLE information.
2381 			 */
2382 			if (cond) {
2383 				/*
2384 				 * Return the path in hold state. Caller should
2385 				 * release the lock by calling mdi_rele_path()
2386 				 */
2387 				MDI_PI_HOLD(pip);
2388 				MDI_PI_UNLOCK(pip);
2389 				if (sb)
2390 					ct->ct_path_last = pip;
2391 				*ret_pip = pip;
2392 				MDI_CLIENT_UNLOCK(ct);
2393 				return (MDI_SUCCESS);
2394 			}
2395 			/*
2396 			 * Path is busy.
2397 			 */
2398 			if (MDI_PI_IS_DRV_DISABLE_TRANSIENT(pip) ||
2399 			    MDI_PI_IS_TRANSIENT(pip))
2400 				retry = 1;
2401 
2402 			/*
2403 			 * Keep looking for a next available online path
2404 			 */
2405 do_again:
2406 			next = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2407 			if (next == NULL) {
2408 				if ( flags & MDI_SELECT_NO_PREFERRED) {
2409 					/*
2410 					 * Bail out since we hit the end of list
2411 					 */
2412 					MDI_PI_UNLOCK(pip);
2413 					break;
2414 				}
2415 
2416 				if (!sb) {
2417 					if (preferred == 1) {
2418 						/*
2419 						 * Looks like we reached the
2420 						 * end of the list. Lets enable
2421 						 * traversal of non preferred
2422 						 * paths.
2423 						 */
2424 						preferred = 0;
2425 						next = head;
2426 					} else {
2427 						/*
2428 						 * We have done both the passes
2429 						 * Preferred as well as for
2430 						 * Non-preferred. Bail out now.
2431 						 */
2432 						cont = 0;
2433 					}
2434 				} else {
2435 					/*
2436 					 * Standard behavior case.
2437 					 */
2438 					next = head;
2439 				}
2440 			}
2441 			MDI_PI_UNLOCK(pip);
2442 			if (cont == 0) {
2443 				break;
2444 			}
2445 			pip = next;
2446 
2447 			if (!sb) {
2448 				/*
2449 				 * We need to handle the selection of
2450 				 * non-preferred path in the following
2451 				 * case:
2452 				 *
2453 				 * +------+   +------+   +------+   +-----+
2454 				 * | A : 1| - | B : 1| - | C : 0| - |NULL |
2455 				 * +------+   +------+   +------+   +-----+
2456 				 *
2457 				 * If we start the search with B, we need to
2458 				 * skip beyond B to pick C which is non -
2459 				 * preferred in the second pass. The following
2460 				 * test, if true, will allow us to skip over
2461 				 * the 'start'(B in the example) to select
2462 				 * other non preferred elements.
2463 				 */
2464 				if ((start_pip != NULL) && (start_pip == pip) &&
2465 				    (MDI_PI(start_pip)->pi_preferred
2466 				    != preferred)) {
2467 					/*
2468 					 * try again after going past the start
2469 					 * pip
2470 					 */
2471 					MDI_PI_LOCK(pip);
2472 					goto do_again;
2473 				}
2474 			} else {
2475 				/*
2476 				 * Standard behavior case
2477 				 */
2478 				if (start == pip && preferred) {
2479 					/* look for nonpreferred paths */
2480 					preferred = 0;
2481 				} else if (start == pip && !preferred) {
2482 					/*
2483 					 * Exit condition
2484 					 */
2485 					cont = 0;
2486 				}
2487 			}
2488 		} while (cont);
2489 		break;
2490 	}
2491 
2492 	MDI_CLIENT_UNLOCK(ct);
2493 	if (retry == 1) {
2494 		return (MDI_BUSY);
2495 	} else {
2496 		return (MDI_NOPATH);
2497 	}
2498 }
2499 
2500 /*
2501  * For a client, return the next available path to any phci
2502  *
2503  * Note:
2504  *		Caller should hold the branch's devinfo node to get a consistent
2505  *		snap shot of the mdi_pathinfo nodes.
2506  *
2507  *		Please note that even the list is stable the mdi_pathinfo
2508  *		node state and properties are volatile.  The caller should lock
2509  *		and unlock the nodes by calling mdi_pi_lock() and
2510  *		mdi_pi_unlock() functions to get a stable properties.
2511  *
2512  *		If there is a need to use the nodes beyond the hold of the
2513  *		devinfo node period (For ex. I/O), then mdi_pathinfo node
2514  *		need to be held against unexpected removal by calling
2515  *		mdi_hold_path() and should be released by calling
2516  *		mdi_rele_path() on completion.
2517  */
2518 mdi_pathinfo_t *
mdi_get_next_phci_path(dev_info_t * ct_dip,mdi_pathinfo_t * pip)2519 mdi_get_next_phci_path(dev_info_t *ct_dip, mdi_pathinfo_t *pip)
2520 {
2521 	mdi_client_t *ct;
2522 
2523 	if (!MDI_CLIENT(ct_dip))
2524 		return (NULL);
2525 
2526 	/*
2527 	 * Walk through client link
2528 	 */
2529 	ct = (mdi_client_t *)DEVI(ct_dip)->devi_mdi_client;
2530 	ASSERT(ct != NULL);
2531 
2532 	if (pip == NULL)
2533 		return ((mdi_pathinfo_t *)ct->ct_path_head);
2534 
2535 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link);
2536 }
2537 
2538 /*
2539  * For a phci, return the next available path to any client
2540  * Note: ditto mdi_get_next_phci_path()
2541  */
2542 mdi_pathinfo_t *
mdi_get_next_client_path(dev_info_t * ph_dip,mdi_pathinfo_t * pip)2543 mdi_get_next_client_path(dev_info_t *ph_dip, mdi_pathinfo_t *pip)
2544 {
2545 	mdi_phci_t *ph;
2546 
2547 	if (!MDI_PHCI(ph_dip))
2548 		return (NULL);
2549 
2550 	/*
2551 	 * Walk through pHCI link
2552 	 */
2553 	ph = (mdi_phci_t *)DEVI(ph_dip)->devi_mdi_xhci;
2554 	ASSERT(ph != NULL);
2555 
2556 	if (pip == NULL)
2557 		return ((mdi_pathinfo_t *)ph->ph_path_head);
2558 
2559 	return ((mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link);
2560 }
2561 
2562 /*
2563  * mdi_hold_path():
2564  *		Hold the mdi_pathinfo node against unwanted unexpected free.
2565  * Return Values:
2566  *		None
2567  */
2568 void
mdi_hold_path(mdi_pathinfo_t * pip)2569 mdi_hold_path(mdi_pathinfo_t *pip)
2570 {
2571 	if (pip) {
2572 		MDI_PI_LOCK(pip);
2573 		MDI_PI_HOLD(pip);
2574 		MDI_PI_UNLOCK(pip);
2575 	}
2576 }
2577 
2578 
2579 /*
2580  * mdi_rele_path():
2581  *		Release the mdi_pathinfo node which was selected
2582  *		through mdi_select_path() mechanism or manually held by
2583  *		calling mdi_hold_path().
2584  * Return Values:
2585  *		None
2586  */
2587 void
mdi_rele_path(mdi_pathinfo_t * pip)2588 mdi_rele_path(mdi_pathinfo_t *pip)
2589 {
2590 	if (pip) {
2591 		MDI_PI_LOCK(pip);
2592 		MDI_PI_RELE(pip);
2593 		if (MDI_PI(pip)->pi_ref_cnt == 0) {
2594 			cv_broadcast(&MDI_PI(pip)->pi_ref_cv);
2595 		}
2596 		MDI_PI_UNLOCK(pip);
2597 	}
2598 }
2599 
2600 /*
2601  * mdi_pi_lock():
2602  * 		Lock the mdi_pathinfo node.
2603  * Note:
2604  *		The caller should release the lock by calling mdi_pi_unlock()
2605  */
2606 void
mdi_pi_lock(mdi_pathinfo_t * pip)2607 mdi_pi_lock(mdi_pathinfo_t *pip)
2608 {
2609 	ASSERT(pip != NULL);
2610 	if (pip) {
2611 		MDI_PI_LOCK(pip);
2612 	}
2613 }
2614 
2615 
2616 /*
2617  * mdi_pi_unlock():
2618  * 		Unlock the mdi_pathinfo node.
2619  * Note:
2620  *		The mdi_pathinfo node should have been locked with mdi_pi_lock()
2621  */
2622 void
mdi_pi_unlock(mdi_pathinfo_t * pip)2623 mdi_pi_unlock(mdi_pathinfo_t *pip)
2624 {
2625 	ASSERT(pip != NULL);
2626 	if (pip) {
2627 		MDI_PI_UNLOCK(pip);
2628 	}
2629 }
2630 
2631 /*
2632  * mdi_pi_find():
2633  *		Search the list of mdi_pathinfo nodes attached to the
2634  *		pHCI/Client device node whose path address matches "paddr".
2635  *		Returns a pointer to the mdi_pathinfo node if a matching node is
2636  *		found.
2637  * Return Values:
2638  *		mdi_pathinfo node handle
2639  *		NULL
2640  * Notes:
2641  *		Caller need not hold any locks to call this function.
2642  */
2643 mdi_pathinfo_t *
mdi_pi_find(dev_info_t * pdip,char * caddr,char * paddr)2644 mdi_pi_find(dev_info_t *pdip, char *caddr, char *paddr)
2645 {
2646 	mdi_phci_t		*ph;
2647 	mdi_vhci_t		*vh;
2648 	mdi_client_t		*ct;
2649 	mdi_pathinfo_t		*pip = NULL;
2650 
2651 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2652 	    "caddr@%s paddr@%s", caddr ? caddr : "", paddr ? paddr : ""));
2653 	if ((pdip == NULL) || (paddr == NULL)) {
2654 		return (NULL);
2655 	}
2656 	ph = i_devi_get_phci(pdip);
2657 	if (ph == NULL) {
2658 		/*
2659 		 * Invalid pHCI device, Nothing more to do.
2660 		 */
2661 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid phci"));
2662 		return (NULL);
2663 	}
2664 
2665 	vh = ph->ph_vhci;
2666 	if (vh == NULL) {
2667 		/*
2668 		 * Invalid vHCI device, Nothing more to do.
2669 		 */
2670 		MDI_DEBUG(2, (MDI_WARN, pdip, "invalid vhci"));
2671 		return (NULL);
2672 	}
2673 
2674 	/*
2675 	 * Look for pathinfo node identified by paddr.
2676 	 */
2677 	if (caddr == NULL) {
2678 		/*
2679 		 * Find a mdi_pathinfo node under pHCI list for a matching
2680 		 * unit address.
2681 		 */
2682 		MDI_PHCI_LOCK(ph);
2683 		if (MDI_PHCI_IS_OFFLINE(ph)) {
2684 			MDI_DEBUG(2, (MDI_WARN, pdip,
2685 			    "offline phci %p", (void *)ph));
2686 			MDI_PHCI_UNLOCK(ph);
2687 			return (NULL);
2688 		}
2689 		pip = (mdi_pathinfo_t *)ph->ph_path_head;
2690 
2691 		while (pip != NULL) {
2692 			if (strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2693 				break;
2694 			}
2695 			pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_phci_link;
2696 		}
2697 		MDI_PHCI_UNLOCK(ph);
2698 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2699 		    "found %s %p", mdi_pi_spathname(pip), (void *)pip));
2700 		return (pip);
2701 	}
2702 
2703 	/*
2704 	 * XXX - Is the rest of the code in this function really necessary?
2705 	 * The consumers of mdi_pi_find() can search for the desired pathinfo
2706 	 * node by calling mdi_pi_find(pdip, NULL, paddr). Irrespective of
2707 	 * whether the search is based on the pathinfo nodes attached to
2708 	 * the pHCI or the client node, the result will be the same.
2709 	 */
2710 
2711 	/*
2712 	 * Find the client device corresponding to 'caddr'
2713 	 */
2714 	MDI_VHCI_CLIENT_LOCK(vh);
2715 
2716 	/*
2717 	 * XXX - Passing NULL to the following function works as long as the
2718 	 * the client addresses (caddr) are unique per vhci basis.
2719 	 */
2720 	ct = i_mdi_client_find(vh, NULL, caddr);
2721 	if (ct == NULL) {
2722 		/*
2723 		 * Client not found, Obviously mdi_pathinfo node has not been
2724 		 * created yet.
2725 		 */
2726 		MDI_VHCI_CLIENT_UNLOCK(vh);
2727 		MDI_DEBUG(2, (MDI_NOTE, pdip,
2728 		    "client not found for caddr @%s", caddr ? caddr : ""));
2729 		return (NULL);
2730 	}
2731 
2732 	/*
2733 	 * Hold the client lock and look for a mdi_pathinfo node with matching
2734 	 * pHCI and paddr
2735 	 */
2736 	MDI_CLIENT_LOCK(ct);
2737 
2738 	/*
2739 	 * Release the global mutex as it is no more needed. Note: We always
2740 	 * respect the locking order while acquiring.
2741 	 */
2742 	MDI_VHCI_CLIENT_UNLOCK(vh);
2743 
2744 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2745 	while (pip != NULL) {
2746 		/*
2747 		 * Compare the unit address
2748 		 */
2749 		if ((MDI_PI(pip)->pi_phci == ph) &&
2750 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2751 			break;
2752 		}
2753 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2754 	}
2755 	MDI_CLIENT_UNLOCK(ct);
2756 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2757 	    "found: %s %p", mdi_pi_spathname(pip), (void *)pip));
2758 	return (pip);
2759 }
2760 
2761 /*
2762  * mdi_pi_alloc():
2763  *		Allocate and initialize a new instance of a mdi_pathinfo node.
2764  *		The mdi_pathinfo node returned by this function identifies a
2765  *		unique device path is capable of having properties attached
2766  *		and passed to mdi_pi_online() to fully attach and online the
2767  *		path and client device node.
2768  *		The mdi_pathinfo node returned by this function must be
2769  *		destroyed using mdi_pi_free() if the path is no longer
2770  *		operational or if the caller fails to attach a client device
2771  *		node when calling mdi_pi_online(). The framework will not free
2772  *		the resources allocated.
2773  *		This function can be called from both interrupt and kernel
2774  *		contexts.  DDI_NOSLEEP flag should be used while calling
2775  *		from interrupt contexts.
2776  * Return Values:
2777  *		MDI_SUCCESS
2778  *		MDI_FAILURE
2779  *		MDI_NOMEM
2780  */
2781 /*ARGSUSED*/
2782 int
mdi_pi_alloc_compatible(dev_info_t * pdip,char * cname,char * caddr,char * paddr,char ** compatible,int ncompatible,int flags,mdi_pathinfo_t ** ret_pip)2783 mdi_pi_alloc_compatible(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2784     char **compatible, int ncompatible, int flags, mdi_pathinfo_t **ret_pip)
2785 {
2786 	mdi_vhci_t	*vh;
2787 	mdi_phci_t	*ph;
2788 	mdi_client_t	*ct;
2789 	mdi_pathinfo_t	*pip = NULL;
2790 	dev_info_t	*cdip;
2791 	int		rv = MDI_NOMEM;
2792 	int		path_allocated = 0;
2793 
2794 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2795 	    "cname %s: caddr@%s paddr@%s",
2796 	    cname ? cname : "", caddr ? caddr : "", paddr ? paddr : ""));
2797 
2798 	if (pdip == NULL || cname == NULL || caddr == NULL || paddr == NULL ||
2799 	    ret_pip == NULL) {
2800 		/* Nothing more to do */
2801 		return (MDI_FAILURE);
2802 	}
2803 
2804 	*ret_pip = NULL;
2805 
2806 	/* No allocations on detaching pHCI */
2807 	if (DEVI_IS_DETACHING(pdip)) {
2808 		/* Invalid pHCI device, return failure */
2809 		MDI_DEBUG(1, (MDI_WARN, pdip,
2810 		    "!detaching pHCI=%p", (void *)pdip));
2811 		return (MDI_FAILURE);
2812 	}
2813 
2814 	ph = i_devi_get_phci(pdip);
2815 	ASSERT(ph != NULL);
2816 	if (ph == NULL) {
2817 		/* Invalid pHCI device, return failure */
2818 		MDI_DEBUG(1, (MDI_WARN, pdip,
2819 		    "!invalid pHCI=%p", (void *)pdip));
2820 		return (MDI_FAILURE);
2821 	}
2822 
2823 	MDI_PHCI_LOCK(ph);
2824 	vh = ph->ph_vhci;
2825 	if (vh == NULL) {
2826 		/* Invalid vHCI device, return failure */
2827 		MDI_DEBUG(1, (MDI_WARN, pdip,
2828 		    "!invalid vHCI=%p", (void *)pdip));
2829 		MDI_PHCI_UNLOCK(ph);
2830 		return (MDI_FAILURE);
2831 	}
2832 
2833 	if (MDI_PHCI_IS_READY(ph) == 0) {
2834 		/*
2835 		 * Do not allow new node creation when pHCI is in
2836 		 * offline/suspended states
2837 		 */
2838 		MDI_DEBUG(1, (MDI_WARN, pdip,
2839 		    "pHCI=%p is not ready", (void *)ph));
2840 		MDI_PHCI_UNLOCK(ph);
2841 		return (MDI_BUSY);
2842 	}
2843 	MDI_PHCI_UNSTABLE(ph);
2844 	MDI_PHCI_UNLOCK(ph);
2845 
2846 	/* look for a matching client, create one if not found */
2847 	MDI_VHCI_CLIENT_LOCK(vh);
2848 	ct = i_mdi_client_find(vh, cname, caddr);
2849 	if (ct == NULL) {
2850 		ct = i_mdi_client_alloc(vh, cname, caddr);
2851 		ASSERT(ct != NULL);
2852 	}
2853 
2854 	if (ct->ct_dip == NULL) {
2855 		/*
2856 		 * Allocate a devinfo node
2857 		 */
2858 		ct->ct_dip = i_mdi_devinfo_create(vh, cname, caddr,
2859 		    compatible, ncompatible);
2860 		if (ct->ct_dip == NULL) {
2861 			(void) i_mdi_client_free(vh, ct);
2862 			goto fail;
2863 		}
2864 	}
2865 	cdip = ct->ct_dip;
2866 
2867 	DEVI(cdip)->devi_mdi_component |= MDI_COMPONENT_CLIENT;
2868 	DEVI(cdip)->devi_mdi_client = (caddr_t)ct;
2869 
2870 	MDI_CLIENT_LOCK(ct);
2871 	pip = (mdi_pathinfo_t *)ct->ct_path_head;
2872 	while (pip != NULL) {
2873 		/*
2874 		 * Compare the unit address
2875 		 */
2876 		if ((MDI_PI(pip)->pi_phci == ph) &&
2877 		    strcmp(MDI_PI(pip)->pi_addr, paddr) == 0) {
2878 			break;
2879 		}
2880 		pip = (mdi_pathinfo_t *)MDI_PI(pip)->pi_client_link;
2881 	}
2882 	MDI_CLIENT_UNLOCK(ct);
2883 
2884 	if (pip == NULL) {
2885 		/*
2886 		 * This is a new path for this client device.  Allocate and
2887 		 * initialize a new pathinfo node
2888 		 */
2889 		pip = i_mdi_pi_alloc(ph, paddr, ct);
2890 		ASSERT(pip != NULL);
2891 		path_allocated = 1;
2892 	}
2893 	rv = MDI_SUCCESS;
2894 
2895 fail:
2896 	/*
2897 	 * Release the global mutex.
2898 	 */
2899 	MDI_VHCI_CLIENT_UNLOCK(vh);
2900 
2901 	/*
2902 	 * Mark the pHCI as stable
2903 	 */
2904 	MDI_PHCI_LOCK(ph);
2905 	MDI_PHCI_STABLE(ph);
2906 	MDI_PHCI_UNLOCK(ph);
2907 	*ret_pip = pip;
2908 
2909 	MDI_DEBUG(2, (MDI_NOTE, pdip,
2910 	    "alloc %s %p", mdi_pi_spathname(pip), (void *)pip));
2911 
2912 	if (path_allocated)
2913 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
2914 
2915 	return (rv);
2916 }
2917 
2918 /*ARGSUSED*/
2919 int
mdi_pi_alloc(dev_info_t * pdip,char * cname,char * caddr,char * paddr,int flags,mdi_pathinfo_t ** ret_pip)2920 mdi_pi_alloc(dev_info_t *pdip, char *cname, char *caddr, char *paddr,
2921     int flags, mdi_pathinfo_t **ret_pip)
2922 {
2923 	return (mdi_pi_alloc_compatible(pdip, cname, caddr, paddr, NULL, 0,
2924 	    flags, ret_pip));
2925 }
2926 
2927 /*
2928  * i_mdi_pi_alloc():
2929  *		Allocate a mdi_pathinfo node and add to the pHCI path list
2930  * Return Values:
2931  *		mdi_pathinfo
2932  */
2933 /*ARGSUSED*/
2934 static mdi_pathinfo_t *
i_mdi_pi_alloc(mdi_phci_t * ph,char * paddr,mdi_client_t * ct)2935 i_mdi_pi_alloc(mdi_phci_t *ph, char *paddr, mdi_client_t *ct)
2936 {
2937 	mdi_pathinfo_t	*pip;
2938 	int		ct_circular;
2939 	int		ph_circular;
2940 	static char	path[MAXPATHLEN];	/* mdi_pathmap_mutex protects */
2941 	char		*path_persistent;
2942 	int		path_instance;
2943 	mod_hash_val_t	hv;
2944 
2945 	ASSERT(MDI_VHCI_CLIENT_LOCKED(ph->ph_vhci));
2946 
2947 	pip = kmem_zalloc(sizeof (struct mdi_pathinfo), KM_SLEEP);
2948 	mutex_init(&MDI_PI(pip)->pi_mutex, NULL, MUTEX_DEFAULT, NULL);
2949 	MDI_PI(pip)->pi_state = MDI_PATHINFO_STATE_INIT |
2950 	    MDI_PATHINFO_STATE_TRANSIENT;
2951 
2952 	if (MDI_PHCI_IS_USER_DISABLED(ph))
2953 		MDI_PI_SET_USER_DISABLE(pip);
2954 
2955 	if (MDI_PHCI_IS_DRV_DISABLED_TRANSIENT(ph))
2956 		MDI_PI_SET_DRV_DISABLE_TRANS(pip);
2957 
2958 	if (MDI_PHCI_IS_DRV_DISABLED(ph))
2959 		MDI_PI_SET_DRV_DISABLE(pip);
2960 
2961 	MDI_PI(pip)->pi_old_state = MDI_PATHINFO_STATE_INIT;
2962 	cv_init(&MDI_PI(pip)->pi_state_cv, NULL, CV_DEFAULT, NULL);
2963 	MDI_PI(pip)->pi_client = ct;
2964 	MDI_PI(pip)->pi_phci = ph;
2965 	MDI_PI(pip)->pi_addr = kmem_alloc(strlen(paddr) + 1, KM_SLEEP);
2966 	(void) strcpy(MDI_PI(pip)->pi_addr, paddr);
2967 
2968         /*
2969 	 * We form the "path" to the pathinfo node, and see if we have
2970 	 * already allocated a 'path_instance' for that "path".  If so,
2971 	 * we use the already allocated 'path_instance'.  If not, we
2972 	 * allocate a new 'path_instance' and associate it with a copy of
2973 	 * the "path" string (which is never freed). The association
2974 	 * between a 'path_instance' this "path" string persists until
2975 	 * reboot.
2976 	 */
2977         mutex_enter(&mdi_pathmap_mutex);
2978 	(void) ddi_pathname(ph->ph_dip, path);
2979 	(void) sprintf(path + strlen(path), "/%s@%s",
2980 	    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2981         if (mod_hash_find(mdi_pathmap_bypath, (mod_hash_key_t)path, &hv) == 0) {
2982                 path_instance = (uint_t)(intptr_t)hv;
2983         } else {
2984 		/* allocate a new 'path_instance' and persistent "path" */
2985 		path_instance = mdi_pathmap_instance++;
2986 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2987                 (void) mod_hash_insert(mdi_pathmap_bypath,
2988                     (mod_hash_key_t)path_persistent,
2989                     (mod_hash_val_t)(intptr_t)path_instance);
2990 		(void) mod_hash_insert(mdi_pathmap_byinstance,
2991 		    (mod_hash_key_t)(intptr_t)path_instance,
2992 		    (mod_hash_val_t)path_persistent);
2993 
2994 		/* create shortpath name */
2995 		(void) snprintf(path, sizeof(path), "%s%d/%s@%s",
2996 		    ddi_driver_name(ph->ph_dip), ddi_get_instance(ph->ph_dip),
2997 		    mdi_pi_get_node_name(pip), mdi_pi_get_addr(pip));
2998 		path_persistent = i_ddi_strdup(path, KM_SLEEP);
2999 		(void) mod_hash_insert(mdi_pathmap_sbyinstance,
3000 		    (mod_hash_key_t)(intptr_t)path_instance,
3001 		    (mod_hash_val_t)path_persistent);
3002         }
3003         mutex_exit(&mdi_pathmap_mutex);
3004 	MDI_PI(pip)->pi_path_instance = path_instance;
3005 
3006 	(void) nvlist_alloc(&MDI_PI(pip)->pi_prop, NV_UNIQUE_NAME, KM_SLEEP);
3007 	ASSERT(MDI_PI(pip)->pi_prop != NULL);
3008 	MDI_PI(pip)->pi_pprivate = NULL;
3009 	MDI_PI(pip)->pi_cprivate = NULL;
3010 	MDI_PI(pip)->pi_vprivate = NULL;
3011 	MDI_PI(pip)->pi_client_link = NULL;
3012 	MDI_PI(pip)->pi_phci_link = NULL;
3013 	MDI_PI(pip)->pi_ref_cnt = 0;
3014 	MDI_PI(pip)->pi_kstats = NULL;
3015 	MDI_PI(pip)->pi_preferred = 1;
3016 	cv_init(&MDI_PI(pip)->pi_ref_cv, NULL, CV_DEFAULT, NULL);
3017 
3018 	/*
3019 	 * Lock both dev_info nodes against changes in parallel.
3020 	 *
3021 	 * The ndi_devi_enter(Client), is atypical since the client is a leaf.
3022 	 * This atypical operation is done to synchronize pathinfo nodes
3023 	 * during devinfo snapshot (see di_register_pip) by 'pretending' that
3024 	 * the pathinfo nodes are children of the Client.
3025 	 */
3026 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3027 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3028 
3029 	i_mdi_phci_add_path(ph, pip);
3030 	i_mdi_client_add_path(ct, pip);
3031 
3032 	ndi_devi_exit(ph->ph_dip, ph_circular);
3033 	ndi_devi_exit(ct->ct_dip, ct_circular);
3034 
3035 	return (pip);
3036 }
3037 
3038 /*
3039  * mdi_pi_pathname_by_instance():
3040  *	Lookup of "path" by 'path_instance'. Return "path".
3041  *	NOTE: returned "path" remains valid forever (until reboot).
3042  */
3043 char *
mdi_pi_pathname_by_instance(int path_instance)3044 mdi_pi_pathname_by_instance(int path_instance)
3045 {
3046 	char		*path;
3047 	mod_hash_val_t	hv;
3048 
3049 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3050 	mutex_enter(&mdi_pathmap_mutex);
3051 	if (mod_hash_find(mdi_pathmap_byinstance,
3052 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3053 		path = (char *)hv;
3054 	else
3055 		path = NULL;
3056 	mutex_exit(&mdi_pathmap_mutex);
3057 	return (path);
3058 }
3059 
3060 /*
3061  * mdi_pi_spathname_by_instance():
3062  *	Lookup of "shortpath" by 'path_instance'. Return "shortpath".
3063  *	NOTE: returned "shortpath" remains valid forever (until reboot).
3064  */
3065 char *
mdi_pi_spathname_by_instance(int path_instance)3066 mdi_pi_spathname_by_instance(int path_instance)
3067 {
3068 	char		*path;
3069 	mod_hash_val_t	hv;
3070 
3071 	/* mdi_pathmap lookup of "path" by 'path_instance' */
3072 	mutex_enter(&mdi_pathmap_mutex);
3073 	if (mod_hash_find(mdi_pathmap_sbyinstance,
3074 	    (mod_hash_key_t)(intptr_t)path_instance, &hv) == 0)
3075 		path = (char *)hv;
3076 	else
3077 		path = NULL;
3078 	mutex_exit(&mdi_pathmap_mutex);
3079 	return (path);
3080 }
3081 
3082 
3083 /*
3084  * i_mdi_phci_add_path():
3085  * 		Add a mdi_pathinfo node to pHCI list.
3086  * Notes:
3087  *		Caller should per-pHCI mutex
3088  */
3089 static void
i_mdi_phci_add_path(mdi_phci_t * ph,mdi_pathinfo_t * pip)3090 i_mdi_phci_add_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3091 {
3092 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3093 
3094 	MDI_PHCI_LOCK(ph);
3095 	if (ph->ph_path_head == NULL) {
3096 		ph->ph_path_head = pip;
3097 	} else {
3098 		MDI_PI(ph->ph_path_tail)->pi_phci_link = MDI_PI(pip);
3099 	}
3100 	ph->ph_path_tail = pip;
3101 	ph->ph_path_count++;
3102 	MDI_PHCI_UNLOCK(ph);
3103 }
3104 
3105 /*
3106  * i_mdi_client_add_path():
3107  *		Add mdi_pathinfo node to client list
3108  */
3109 static void
i_mdi_client_add_path(mdi_client_t * ct,mdi_pathinfo_t * pip)3110 i_mdi_client_add_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3111 {
3112 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3113 
3114 	MDI_CLIENT_LOCK(ct);
3115 	if (ct->ct_path_head == NULL) {
3116 		ct->ct_path_head = pip;
3117 	} else {
3118 		MDI_PI(ct->ct_path_tail)->pi_client_link = MDI_PI(pip);
3119 	}
3120 	ct->ct_path_tail = pip;
3121 	ct->ct_path_count++;
3122 	MDI_CLIENT_UNLOCK(ct);
3123 }
3124 
3125 /*
3126  * mdi_pi_free():
3127  *		Free the mdi_pathinfo node and also client device node if this
3128  *		is the last path to the device
3129  * Return Values:
3130  *		MDI_SUCCESS
3131  *		MDI_FAILURE
3132  *		MDI_BUSY
3133  */
3134 /*ARGSUSED*/
3135 int
mdi_pi_free(mdi_pathinfo_t * pip,int flags)3136 mdi_pi_free(mdi_pathinfo_t *pip, int flags)
3137 {
3138 	int		rv;
3139 	mdi_vhci_t	*vh;
3140 	mdi_phci_t	*ph;
3141 	mdi_client_t	*ct;
3142 	int		(*f)();
3143 	int		client_held = 0;
3144 
3145 	MDI_PI_LOCK(pip);
3146 	ph = MDI_PI(pip)->pi_phci;
3147 	ASSERT(ph != NULL);
3148 	if (ph == NULL) {
3149 		/*
3150 		 * Invalid pHCI device, return failure
3151 		 */
3152 		MDI_DEBUG(1, (MDI_WARN, NULL,
3153 		    "!invalid pHCI: pip %s %p",
3154 		    mdi_pi_spathname(pip), (void *)pip));
3155 		MDI_PI_UNLOCK(pip);
3156 		return (MDI_FAILURE);
3157 	}
3158 
3159 	vh = ph->ph_vhci;
3160 	ASSERT(vh != NULL);
3161 	if (vh == NULL) {
3162 		/* Invalid pHCI device, return failure */
3163 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3164 		    "!invalid vHCI: pip %s %p",
3165 		    mdi_pi_spathname(pip), (void *)pip));
3166 		MDI_PI_UNLOCK(pip);
3167 		return (MDI_FAILURE);
3168 	}
3169 
3170 	ct = MDI_PI(pip)->pi_client;
3171 	ASSERT(ct != NULL);
3172 	if (ct == NULL) {
3173 		/*
3174 		 * Invalid Client device, return failure
3175 		 */
3176 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3177 		    "!invalid client: pip %s %p",
3178 		    mdi_pi_spathname(pip), (void *)pip));
3179 		MDI_PI_UNLOCK(pip);
3180 		return (MDI_FAILURE);
3181 	}
3182 
3183 	/*
3184 	 * Check to see for busy condition.  A mdi_pathinfo can only be freed
3185 	 * if the node state is either offline or init and the reference count
3186 	 * is zero.
3187 	 */
3188 	if (!(MDI_PI_IS_OFFLINE(pip) || MDI_PI_IS_INIT(pip) ||
3189 	    MDI_PI_IS_INITING(pip))) {
3190 		/*
3191 		 * Node is busy
3192 		 */
3193 		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3194 		    "!busy: pip %s %p", mdi_pi_spathname(pip), (void *)pip));
3195 		MDI_PI_UNLOCK(pip);
3196 		return (MDI_BUSY);
3197 	}
3198 
3199 	while (MDI_PI(pip)->pi_ref_cnt != 0) {
3200 		/*
3201 		 * Give a chance for pending I/Os to complete.
3202 		 */
3203 		MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3204 		    "!%d cmds still pending on path: %s %p",
3205 		    MDI_PI(pip)->pi_ref_cnt,
3206 		    mdi_pi_spathname(pip), (void *)pip));
3207 		if (cv_reltimedwait(&MDI_PI(pip)->pi_ref_cv,
3208 		    &MDI_PI(pip)->pi_mutex, drv_usectohz(60 * 1000000),
3209 		    TR_CLOCK_TICK) == -1) {
3210 			/*
3211 			 * The timeout time reached without ref_cnt being zero
3212 			 * being signaled.
3213 			 */
3214 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3215 			    "!Timeout reached on path %s %p without the cond",
3216 			    mdi_pi_spathname(pip), (void *)pip));
3217 			MDI_DEBUG(1, (MDI_NOTE, ct->ct_dip,
3218 			    "!%d cmds still pending on path %s %p",
3219 			    MDI_PI(pip)->pi_ref_cnt,
3220 			    mdi_pi_spathname(pip), (void *)pip));
3221 			MDI_PI_UNLOCK(pip);
3222 			return (MDI_BUSY);
3223 		}
3224 	}
3225 	if (MDI_PI(pip)->pi_pm_held) {
3226 		client_held = 1;
3227 	}
3228 	MDI_PI_UNLOCK(pip);
3229 
3230 	vhcache_pi_remove(vh->vh_config, MDI_PI(pip));
3231 
3232 	MDI_CLIENT_LOCK(ct);
3233 
3234 	/* Prevent further failovers till MDI_VHCI_CLIENT_LOCK is held */
3235 	MDI_CLIENT_SET_PATH_FREE_IN_PROGRESS(ct);
3236 
3237 	/*
3238 	 * Wait till failover is complete before removing this node.
3239 	 */
3240 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3241 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3242 
3243 	MDI_CLIENT_UNLOCK(ct);
3244 	MDI_VHCI_CLIENT_LOCK(vh);
3245 	MDI_CLIENT_LOCK(ct);
3246 	MDI_CLIENT_CLEAR_PATH_FREE_IN_PROGRESS(ct);
3247 
3248 	rv = MDI_SUCCESS;
3249 	if (!MDI_PI_IS_INITING(pip)) {
3250 		f = vh->vh_ops->vo_pi_uninit;
3251 		if (f != NULL) {
3252 			rv = (*f)(vh->vh_dip, pip, 0);
3253 		}
3254 	}
3255 
3256 	/*
3257 	 * If vo_pi_uninit() completed successfully.
3258 	 */
3259 	if (rv == MDI_SUCCESS) {
3260 		if (client_held) {
3261 			MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3262 			    "i_mdi_pm_rele_client\n"));
3263 			i_mdi_pm_rele_client(ct, 1);
3264 		}
3265 		i_mdi_pi_free(ph, pip, ct);
3266 		if (ct->ct_path_count == 0) {
3267 			/*
3268 			 * Client lost its last path.
3269 			 * Clean up the client device
3270 			 */
3271 			MDI_CLIENT_UNLOCK(ct);
3272 			(void) i_mdi_client_free(ct->ct_vhci, ct);
3273 			MDI_VHCI_CLIENT_UNLOCK(vh);
3274 			return (rv);
3275 		}
3276 	}
3277 	MDI_CLIENT_UNLOCK(ct);
3278 	MDI_VHCI_CLIENT_UNLOCK(vh);
3279 
3280 	if (rv == MDI_FAILURE)
3281 		vhcache_pi_add(vh->vh_config, MDI_PI(pip));
3282 
3283 	return (rv);
3284 }
3285 
3286 /*
3287  * i_mdi_pi_free():
3288  *		Free the mdi_pathinfo node
3289  */
3290 static void
i_mdi_pi_free(mdi_phci_t * ph,mdi_pathinfo_t * pip,mdi_client_t * ct)3291 i_mdi_pi_free(mdi_phci_t *ph, mdi_pathinfo_t *pip, mdi_client_t *ct)
3292 {
3293 	int	ct_circular;
3294 	int	ph_circular;
3295 
3296 	ASSERT(MDI_CLIENT_LOCKED(ct));
3297 
3298 	/*
3299 	 * remove any per-path kstats
3300 	 */
3301 	i_mdi_pi_kstat_destroy(pip);
3302 
3303 	/* See comments in i_mdi_pi_alloc() */
3304 	ndi_devi_enter(ct->ct_dip, &ct_circular);
3305 	ndi_devi_enter(ph->ph_dip, &ph_circular);
3306 
3307 	i_mdi_client_remove_path(ct, pip);
3308 	i_mdi_phci_remove_path(ph, pip);
3309 
3310 	ndi_devi_exit(ph->ph_dip, ph_circular);
3311 	ndi_devi_exit(ct->ct_dip, ct_circular);
3312 
3313 	mutex_destroy(&MDI_PI(pip)->pi_mutex);
3314 	cv_destroy(&MDI_PI(pip)->pi_state_cv);
3315 	cv_destroy(&MDI_PI(pip)->pi_ref_cv);
3316 	if (MDI_PI(pip)->pi_addr) {
3317 		kmem_free(MDI_PI(pip)->pi_addr,
3318 		    strlen(MDI_PI(pip)->pi_addr) + 1);
3319 		MDI_PI(pip)->pi_addr = NULL;
3320 	}
3321 
3322 	if (MDI_PI(pip)->pi_prop) {
3323 		(void) nvlist_free(MDI_PI(pip)->pi_prop);
3324 		MDI_PI(pip)->pi_prop = NULL;
3325 	}
3326 	kmem_free(pip, sizeof (struct mdi_pathinfo));
3327 }
3328 
3329 
3330 /*
3331  * i_mdi_phci_remove_path():
3332  * 		Remove a mdi_pathinfo node from pHCI list.
3333  * Notes:
3334  *		Caller should hold per-pHCI mutex
3335  */
3336 static void
i_mdi_phci_remove_path(mdi_phci_t * ph,mdi_pathinfo_t * pip)3337 i_mdi_phci_remove_path(mdi_phci_t *ph, mdi_pathinfo_t *pip)
3338 {
3339 	mdi_pathinfo_t	*prev = NULL;
3340 	mdi_pathinfo_t	*path = NULL;
3341 
3342 	ASSERT(DEVI_BUSY_OWNED(ph->ph_dip));
3343 
3344 	MDI_PHCI_LOCK(ph);
3345 	path = ph->ph_path_head;
3346 	while (path != NULL) {
3347 		if (path == pip) {
3348 			break;
3349 		}
3350 		prev = path;
3351 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3352 	}
3353 
3354 	if (path) {
3355 		ph->ph_path_count--;
3356 		if (prev) {
3357 			MDI_PI(prev)->pi_phci_link = MDI_PI(path)->pi_phci_link;
3358 		} else {
3359 			ph->ph_path_head =
3360 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_phci_link;
3361 		}
3362 		if (ph->ph_path_tail == path) {
3363 			ph->ph_path_tail = prev;
3364 		}
3365 	}
3366 
3367 	/*
3368 	 * Clear the pHCI link
3369 	 */
3370 	MDI_PI(pip)->pi_phci_link = NULL;
3371 	MDI_PI(pip)->pi_phci = NULL;
3372 	MDI_PHCI_UNLOCK(ph);
3373 }
3374 
3375 /*
3376  * i_mdi_client_remove_path():
3377  * 		Remove a mdi_pathinfo node from client path list.
3378  */
3379 static void
i_mdi_client_remove_path(mdi_client_t * ct,mdi_pathinfo_t * pip)3380 i_mdi_client_remove_path(mdi_client_t *ct, mdi_pathinfo_t *pip)
3381 {
3382 	mdi_pathinfo_t	*prev = NULL;
3383 	mdi_pathinfo_t	*path;
3384 
3385 	ASSERT(DEVI_BUSY_OWNED(ct->ct_dip));
3386 
3387 	ASSERT(MDI_CLIENT_LOCKED(ct));
3388 	path = ct->ct_path_head;
3389 	while (path != NULL) {
3390 		if (path == pip) {
3391 			break;
3392 		}
3393 		prev = path;
3394 		path = (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3395 	}
3396 
3397 	if (path) {
3398 		ct->ct_path_count--;
3399 		if (prev) {
3400 			MDI_PI(prev)->pi_client_link =
3401 			    MDI_PI(path)->pi_client_link;
3402 		} else {
3403 			ct->ct_path_head =
3404 			    (mdi_pathinfo_t *)MDI_PI(path)->pi_client_link;
3405 		}
3406 		if (ct->ct_path_tail == path) {
3407 			ct->ct_path_tail = prev;
3408 		}
3409 		if (ct->ct_path_last == path) {
3410 			ct->ct_path_last = ct->ct_path_head;
3411 		}
3412 	}
3413 	MDI_PI(pip)->pi_client_link = NULL;
3414 	MDI_PI(pip)->pi_client = NULL;
3415 }
3416 
3417 /*
3418  * i_mdi_pi_state_change():
3419  *		online a mdi_pathinfo node
3420  *
3421  * Return Values:
3422  *		MDI_SUCCESS
3423  *		MDI_FAILURE
3424  */
3425 /*ARGSUSED*/
3426 static int
i_mdi_pi_state_change(mdi_pathinfo_t * pip,mdi_pathinfo_state_t state,int flag)3427 i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag)
3428 {
3429 	int		rv = MDI_SUCCESS;
3430 	mdi_vhci_t	*vh;
3431 	mdi_phci_t	*ph;
3432 	mdi_client_t	*ct;
3433 	int		(*f)();
3434 	dev_info_t	*cdip;
3435 
3436 	MDI_PI_LOCK(pip);
3437 
3438 	ph = MDI_PI(pip)->pi_phci;
3439 	ASSERT(ph);
3440 	if (ph == NULL) {
3441 		/*
3442 		 * Invalid pHCI device, fail the request
3443 		 */
3444 		MDI_PI_UNLOCK(pip);
3445 		MDI_DEBUG(1, (MDI_WARN, NULL,
3446 		    "!invalid phci: pip %s %p",
3447 		    mdi_pi_spathname(pip), (void *)pip));
3448 		return (MDI_FAILURE);
3449 	}
3450 
3451 	vh = ph->ph_vhci;
3452 	ASSERT(vh);
3453 	if (vh == NULL) {
3454 		/*
3455 		 * Invalid vHCI device, fail the request
3456 		 */
3457 		MDI_PI_UNLOCK(pip);
3458 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3459 		    "!invalid vhci: pip %s %p",
3460 		    mdi_pi_spathname(pip), (void *)pip));
3461 		return (MDI_FAILURE);
3462 	}
3463 
3464 	ct = MDI_PI(pip)->pi_client;
3465 	ASSERT(ct != NULL);
3466 	if (ct == NULL) {
3467 		/*
3468 		 * Invalid client device, fail the request
3469 		 */
3470 		MDI_PI_UNLOCK(pip);
3471 		MDI_DEBUG(1, (MDI_WARN, ph->ph_dip,
3472 		    "!invalid client: pip %s %p",
3473 		    mdi_pi_spathname(pip), (void *)pip));
3474 		return (MDI_FAILURE);
3475 	}
3476 
3477 	/*
3478 	 * If this path has not been initialized yet, Callback vHCI driver's
3479 	 * pathinfo node initialize entry point
3480 	 */
3481 
3482 	if (MDI_PI_IS_INITING(pip)) {
3483 		MDI_PI_UNLOCK(pip);
3484 		f = vh->vh_ops->vo_pi_init;
3485 		if (f != NULL) {
3486 			rv = (*f)(vh->vh_dip, pip, 0);
3487 			if (rv != MDI_SUCCESS) {
3488 				MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3489 				    "!vo_pi_init failed: vHCI %p, pip %s %p",
3490 				    (void *)vh, mdi_pi_spathname(pip),
3491 				    (void *)pip));
3492 				return (MDI_FAILURE);
3493 			}
3494 		}
3495 		MDI_PI_LOCK(pip);
3496 		MDI_PI_CLEAR_TRANSIENT(pip);
3497 	}
3498 
3499 	/*
3500 	 * Do not allow state transition when pHCI is in offline/suspended
3501 	 * states
3502 	 */
3503 	i_mdi_phci_lock(ph, pip);
3504 	if (MDI_PHCI_IS_READY(ph) == 0) {
3505 		MDI_DEBUG(1, (MDI_WARN, ct->ct_dip,
3506 		    "!pHCI not ready, pHCI=%p", (void *)ph));
3507 		MDI_PI_UNLOCK(pip);
3508 		i_mdi_phci_unlock(ph);
3509 		return (MDI_BUSY);
3510 	}
3511 	MDI_PHCI_UNSTABLE(ph);
3512 	i_mdi_phci_unlock(ph);
3513 
3514 	/*
3515 	 * Check if mdi_pathinfo state is in transient state.
3516 	 * If yes, offlining is in progress and wait till transient state is
3517 	 * cleared.
3518 	 */
3519 	if (MDI_PI_IS_TRANSIENT(pip)) {
3520 		while (MDI_PI_IS_TRANSIENT(pip)) {
3521 			cv_wait(&MDI_PI(pip)->pi_state_cv,
3522 			    &MDI_PI(pip)->pi_mutex);
3523 		}
3524 	}
3525 
3526 	/*
3527 	 * Grab the client lock in reverse order sequence and release the
3528 	 * mdi_pathinfo mutex.
3529 	 */
3530 	i_mdi_client_lock(ct, pip);
3531 	MDI_PI_UNLOCK(pip);
3532 
3533 	/*
3534 	 * Wait till failover state is cleared
3535 	 */
3536 	while (MDI_CLIENT_IS_FAILOVER_IN_PROGRESS(ct))
3537 		cv_wait(&ct->ct_failover_cv, &ct->ct_mutex);
3538 
3539 	/*
3540 	 * Mark the mdi_pathinfo node state as transient
3541 	 */
3542 	MDI_PI_LOCK(pip);
3543 	switch (state) {
3544 	case MDI_PATHINFO_STATE_ONLINE:
3545 		MDI_PI_SET_ONLINING(pip);
3546 		break;
3547 
3548 	case MDI_PATHINFO_STATE_STANDBY:
3549 		MDI_PI_SET_STANDBYING(pip);
3550 		break;
3551 
3552 	case MDI_PATHINFO_STATE_FAULT:
3553 		/*
3554 		 * Mark the pathinfo state as FAULTED
3555 		 */
3556 		MDI_PI_SET_FAULTING(pip);
3557 		MDI_PI_ERRSTAT(pip, MDI_PI_HARDERR);
3558 		break;
3559 
3560 	case MDI_PATHINFO_STATE_OFFLINE:
3561 		/*
3562 		 * ndi_devi_offline() cannot hold pip or ct locks.
3563 		 */
3564 		MDI_PI_UNLOCK(pip);
3565 
3566 		/*
3567 		 * If this is a user initiated path online->offline operation
3568 		 * who's success would transition a client from DEGRADED to
3569 		 * FAILED then only proceed if we can offline the client first.
3570 		 */
3571 		cdip = ct->ct_dip;
3572 		if ((flag & NDI_USER_REQ) &&
3573 		    MDI_PI_IS_ONLINE(pip) &&
3574 		    (MDI_CLIENT_STATE(ct) == MDI_CLIENT_STATE_DEGRADED)) {
3575 			i_mdi_client_unlock(ct);
3576 			rv = ndi_devi_offline(cdip, NDI_DEVFS_CLEAN);
3577 			if (rv != NDI_SUCCESS) {
3578 				/*
3579 				 * Convert to MDI error code
3580 				 */
3581 				switch (rv) {
3582 				case NDI_BUSY:
3583 					rv = MDI_BUSY;
3584 					break;
3585 				default:
3586 					rv = MDI_FAILURE;
3587 					break;
3588 				}
3589 				goto state_change_exit;
3590 			} else {
3591 				i_mdi_client_lock(ct, NULL);
3592 			}
3593 		}
3594 		/*
3595 		 * Mark the mdi_pathinfo node state as transient
3596 		 */
3597 		MDI_PI_LOCK(pip);
3598 		MDI_PI_SET_OFFLINING(pip);
3599 		break;
3600 	}
3601 	MDI_PI_UNLOCK(pip);
3602 	MDI_CLIENT_UNSTABLE(ct);
3603 	i_mdi_client_unlock(ct);
3604 
3605 	f = vh->vh_ops->vo_pi_state_change;
3606 	if (f != NULL)
3607 		rv = (*f)(vh->vh_dip, pip, state, 0, flag);
3608 
3609 	MDI_CLIENT_LOCK(ct);
3610 	MDI_PI_LOCK(pip);
3611 	if (rv == MDI_NOT_SUPPORTED) {
3612 		MDI_CLIENT_SET_DEV_NOT_SUPPORTED(ct);
3613 	}
3614 	if (rv != MDI_SUCCESS) {
3615 		MDI_DEBUG(2, (MDI_WARN, ct->ct_dip,
3616 		    "vo_pi_state_change failed: rv %x", rv));
3617 	}
3618 	if (MDI_PI_IS_TRANSIENT(pip)) {
3619 		if (rv == MDI_SUCCESS) {
3620 			MDI_PI_CLEAR_TRANSIENT(pip);
3621 		} else {
3622 			MDI_PI(pip)->pi_state = MDI_PI_OLD_STATE(pip);
3623 		}
3624 	}
3625 
3626 	/*
3627 	 * Wake anyone waiting for this mdi_pathinfo node
3628 	 */
3629 	cv_broadcast(&MDI_PI(pip)->pi_state_cv);
3630 	MDI_PI_UNLOCK(pip);
3631 
3632 	/*
3633 	 * Mark the client device as stable
3634 	 */
3635 	MDI_CLIENT_STABLE(ct);
3636 	if (rv == MDI_SUCCESS) {
3637 		if (ct->ct_unstable == 0) {
3638 			cdip = ct->ct_dip;
3639 
3640 			/*
3641 			 * Onlining the mdi_pathinfo node will impact the
3642 			 * client state Update the client and dev_info node
3643 			 * state accordingly
3644 			 */
3645 			rv = NDI_SUCCESS;
3646 			i_mdi_client_update_state(ct);
3647 			switch (MDI_CLIENT_STATE(ct)) {
3648 			case MDI_CLIENT_STATE_OPTIMAL:
3649 			case MDI_CLIENT_STATE_DEGRADED:
3650 				if (cdip && !i_ddi_devi_attached(cdip) &&
3651 				    ((state == MDI_PATHINFO_STATE_ONLINE) ||
3652 				    (state == MDI_PATHINFO_STATE_STANDBY))) {
3653 
3654 					/*
3655 					 * Must do ndi_devi_online() through
3656 					 * hotplug thread for deferred
3657 					 * attach mechanism to work
3658 					 */
3659 					MDI_CLIENT_UNLOCK(ct);
3660 					rv = ndi_devi_online(cdip, 0);
3661 					MDI_CLIENT_LOCK(ct);
3662 					if ((rv != NDI_SUCCESS) &&
3663 					    (MDI_CLIENT_STATE(ct) ==
3664 					    MDI_CLIENT_STATE_DEGRADED)) {
3665 						MDI_DEBUG(1, (MDI_WARN, cdip,
3666 						    "!ndi_devi_online failed "
3667 						    "error %x", rv));
3668 					}
3669 					rv = NDI_SUCCESS;
3670 				}
3671 				break;
3672 
3673 			case MDI_CLIENT_STATE_FAILED:
3674 				/*
3675 				 * This is the last path case for
3676 				 * non-user initiated events.
3677 				 */
3678 				if (((flag & NDI_USER_REQ) == 0) &&
3679 				    cdip && (i_ddi_node_state(cdip) >=
3680 				    DS_INITIALIZED)) {
3681 					MDI_CLIENT_UNLOCK(ct);
3682 					rv = ndi_devi_offline(cdip,
3683 					    NDI_DEVFS_CLEAN);
3684 					MDI_CLIENT_LOCK(ct);
3685 
3686 					if (rv != NDI_SUCCESS) {
3687 						/*
3688 						 * ndi_devi_offline failed.
3689 						 * Reset client flags to
3690 						 * online as the path could not
3691 						 * be offlined.
3692 						 */
3693 						MDI_DEBUG(1, (MDI_WARN, cdip,
3694 						    "!ndi_devi_offline failed: "
3695 						    "error %x", rv));
3696 						MDI_CLIENT_SET_ONLINE(ct);
3697 					}
3698 				}
3699 				break;
3700 			}
3701 			/*
3702 			 * Convert to MDI error code
3703 			 */
3704 			switch (rv) {
3705 			case NDI_SUCCESS:
3706 				MDI_CLIENT_SET_REPORT_DEV_NEEDED(ct);
3707 				i_mdi_report_path_state(ct, pip);
3708 				rv = MDI_SUCCESS;
3709 				break;
3710 			case NDI_BUSY:
3711 				rv = MDI_BUSY;
3712 				break;
3713 			default:
3714 				rv = MDI_FAILURE;
3715 				break;
3716 			}
3717 		}
3718 	}
3719 	MDI_CLIENT_UNLOCK(ct);
3720 
3721 state_change_exit:
3722 	/*
3723 	 * Mark the pHCI as stable again.
3724 	 */
3725 	MDI_PHCI_LOCK(ph);
3726 	MDI_PHCI_STABLE(ph);
3727 	MDI_PHCI_UNLOCK(ph);
3728 	return (rv);
3729 }
3730 
3731 /*
3732  * mdi_pi_online():
3733  *		Place the path_info node in the online state.  The path is
3734  *		now available to be selected by mdi_select_path() for
3735  *		transporting I/O requests to client devices.
3736  * Return Values:
3737  *		MDI_SUCCESS
3738  *		MDI_FAILURE
3739  */
3740 int
mdi_pi_online(mdi_pathinfo_t * pip,int flags)3741 mdi_pi_online(mdi_pathinfo_t *pip, int flags)
3742 {
3743 	mdi_client_t	*ct = MDI_PI(pip)->pi_client;
3744 	int		client_held = 0;
3745 	int		rv;
3746 
3747 	ASSERT(ct != NULL);
3748 	rv = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_ONLINE, flags);
3749 	if (rv != MDI_SUCCESS)
3750 		return (rv);
3751 
3752 	MDI_PI_LOCK(pip);
3753 	if (MDI_PI(pip)->pi_pm_held == 0) {
3754 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3755 		    "i_mdi_pm_hold_pip %p", (void *)pip));
3756 		i_mdi_pm_hold_pip(pip);
3757 		client_held = 1;
3758 	}
3759 	MDI_PI_UNLOCK(pip);
3760 
3761 	if (client_held) {
3762 		MDI_CLIENT_LOCK(ct);
3763 		if (ct->ct_power_cnt == 0) {
3764 			rv = i_mdi_power_all_phci(ct);
3765 		}
3766 
3767 		MDI_DEBUG(4, (MDI_NOTE, ct->ct_dip,
3768 		    "i_mdi_pm_hold_client %p", (void *)ct));
3769 		i_mdi_pm_hold_client(ct, 1);
3770 		MDI_CLIENT_UNLOCK(ct);
3771 	}
3772 
3773 	return (rv);
3774 }
3775 
3776 /*
3777  * mdi_pi_standby():
3778  *		Place the mdi_pathinfo node in standby state
3779  *
3780  * Return Values:
3781  *		MDI_SUCCESS
3782  *		MDI_FAILURE
3783  */
3784 int
mdi_pi_standby(mdi_pathinfo_t * pip,int flags)3785 mdi_pi_standby(mdi_pathinfo_t *pip, int flags)
3786 {
3787 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_STANDBY, flags));
3788 }
3789 
3790 /*
3791  * mdi_pi_fault():
3792  *		Place the mdi_pathinfo node in fault'ed state
3793  * Return Values:
3794  *		MDI_SUCCESS
3795  *		MDI_FAILURE
3796  */
3797 int
mdi_pi_fault(mdi_pathinfo_t * pip,int flags)3798 mdi_pi_fault(mdi_pathinfo_t *pip, int flags)
3799 {
3800 	return (i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_FAULT, flags));
3801 }
3802 
3803 /*
3804  * mdi_pi_offline():
3805  *		Offline a mdi_pathinfo node.
3806  * Return Values:
3807  *		MDI_SUCCESS
3808  *		MDI_FAILURE
3809  */
3810 int
mdi_pi_offline(mdi_pathinfo_t * pip,int flags)3811 mdi_pi_offline(mdi_pathinfo_t *pip, int flags)
3812 {
3813 	int	ret, client_held = 0;
3814 	mdi_client_t	*ct;
3815 
3816 	/*
3817 	 * Original code overloaded NDI_DEVI_REMOVE to this interface, and
3818 	 * used it to mean "user initiated operation" (i.e. devctl). Callers
3819 	 * should now just use NDI_USER_REQ.
3820 	 */
3821 	if (flags & NDI_DEVI_REMOVE) {
3822 		flags &= ~NDI_DEVI_REMOVE;
3823 		flags |= NDI_USER_REQ;
3824 	}
3825 
3826 	ret = i_mdi_pi_state_change(pip, MDI_PATHINFO_STATE_OFFLINE, flags);
3827 
3828 	if (ret == MDI_SUCCESS) {
3829 		MDI_PI_LOCK(pip);
3830 		if (MDI_PI(pip)->pi_pm_held) {
3831 			client_held = 1;
3832 		}
3833 		MDI_PI_UNLOCK(pip);
3834 
3835 		if (client_held) {
3836 			ct =