xref: /illumos-gate/usr/src/uts/common/inet/ip/ipmp.c (revision ab82c29b)
1e11c3f44Smeem /*
2e11c3f44Smeem  * CDDL HEADER START
3e11c3f44Smeem  *
4e11c3f44Smeem  * The contents of this file are subject to the terms of the
5e11c3f44Smeem  * Common Development and Distribution License (the "License").
6e11c3f44Smeem  * You may not use this file except in compliance with the License.
7e11c3f44Smeem  *
8e11c3f44Smeem  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9e11c3f44Smeem  * or http://www.opensolaris.org/os/licensing.
10e11c3f44Smeem  * See the License for the specific language governing permissions
11e11c3f44Smeem  * and limitations under the License.
12e11c3f44Smeem  *
13e11c3f44Smeem  * When distributing Covered Code, include this CDDL HEADER in each
14e11c3f44Smeem  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15e11c3f44Smeem  * If applicable, add the following below this CDDL HEADER, with the
16e11c3f44Smeem  * fields enclosed by brackets "[]" replaced with your own identifying
17e11c3f44Smeem  * information: Portions Copyright [yyyy] [name of copyright owner]
18e11c3f44Smeem  *
19e11c3f44Smeem  * CDDL HEADER END
20e11c3f44Smeem  *
211f19738eSmeem  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
22e11c3f44Smeem  */
23e11c3f44Smeem 
24e11c3f44Smeem #include <inet/ip.h>
25e11c3f44Smeem #include <inet/ip6.h>
26e11c3f44Smeem #include <inet/ip_if.h>
27e11c3f44Smeem #include <inet/ip_ire.h>
28e11c3f44Smeem #include <inet/ip_multi.h>
29bd670b35SErik Nordmark #include <inet/ip_ndp.h>
30e11c3f44Smeem #include <inet/ip_rts.h>
31e11c3f44Smeem #include <inet/mi.h>
32e11c3f44Smeem #include <net/if_types.h>
33e11c3f44Smeem #include <sys/dlpi.h>
34e11c3f44Smeem #include <sys/kmem.h>
35e11c3f44Smeem #include <sys/modhash.h>
36e11c3f44Smeem #include <sys/sdt.h>
37e11c3f44Smeem #include <sys/strsun.h>
38e11c3f44Smeem #include <sys/sunddi.h>
39e11c3f44Smeem #include <sys/types.h>
40e11c3f44Smeem 
/*
 * Convenience macros for getting the ip_stack_t associated with an
 * ipmp_illgrp_t or ipmp_grp_t.  IPMP_GRP_TO_IPST() goes through the group's
 * cached phyint (gr_phyint), while IPMP_ILLGRP_TO_IPST() goes through the
 * illgrp's IPMP meta-interface ill (ig_ipmp_ill).  Both macros dereference
 * their argument, so the relevant field must already be set.
 */
#define	IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
#define	IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)

/*
 * Assorted constants that aren't important enough to be tunable.
 * IPMP_GRP_HASH_SIZE is the size handed to mod_hash_create_extended() when
 * the per-stack group hash is created in ipmp_init().
 */
#define	IPMP_GRP_HASH_SIZE		64
#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */
53e11c3f44Smeem 
/*
 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
 *
 * This table is a template: ipmp_grp_create_kstats() copies it wholesale
 * into each group's kstat data area, and ipmp_grp_update_kstats() then
 * fills in the values, using each entry's data type to decide whether to
 * store into the 32-bit or 64-bit member.  The table has IPMP_KSTAT_MAX
 * entries and is indexed by position (e.g. IPMP_KSTAT_LINK_UP), so the
 * order of the entries must not be changed independently of those indices.
 */
static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
	{ "obytes",	KSTAT_DATA_UINT32 },
	{ "obytes64",	KSTAT_DATA_UINT64 },
	{ "rbytes",	KSTAT_DATA_UINT32 },
	{ "rbytes64",	KSTAT_DATA_UINT64 },
	{ "opackets",	KSTAT_DATA_UINT32 },
	{ "opackets64",	KSTAT_DATA_UINT64 },
	{ "oerrors",	KSTAT_DATA_UINT32 },
	{ "ipackets",	KSTAT_DATA_UINT32 },
	{ "ipackets64",	KSTAT_DATA_UINT64 },
	{ "ierrors",	KSTAT_DATA_UINT32 },
	{ "multircv",	KSTAT_DATA_UINT32 },
	{ "multixmt",	KSTAT_DATA_UINT32 },
	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
	{ "link_up",	KSTAT_DATA_UINT32 }
};
74e11c3f44Smeem 
/*
 * Forward declarations for file-local helpers that are referenced before
 * they are defined.
 */
static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
static int	ipmp_grp_create_kstats(ipmp_grp_t *);
static int	ipmp_grp_update_kstats(kstat_t *, int);
static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t, uint_t);
static boolean_t ipmp_ill_activate(ill_t *);
static void	ipmp_ill_deactivate(ill_t *);
static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
static void	ipmp_ill_refresh_active_timer_start(ill_t *);
static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
static void	ipmp_ncec_delete_nonlocal(ncec_t *, void *);
94e11c3f44Smeem 
95e11c3f44Smeem /*
96e11c3f44Smeem  * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
97e11c3f44Smeem  */
98e11c3f44Smeem void
ipmp_init(ip_stack_t * ipst)99e11c3f44Smeem ipmp_init(ip_stack_t *ipst)
100e11c3f44Smeem {
101e11c3f44Smeem 	ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
102e11c3f44Smeem 	    IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
103e11c3f44Smeem 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
104e11c3f44Smeem 	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
105e11c3f44Smeem }
106e11c3f44Smeem 
107e11c3f44Smeem /*
108e11c3f44Smeem  * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
109e11c3f44Smeem  */
110e11c3f44Smeem void
ipmp_destroy(ip_stack_t * ipst)111e11c3f44Smeem ipmp_destroy(ip_stack_t *ipst)
112e11c3f44Smeem {
113e11c3f44Smeem 	mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
114e11c3f44Smeem 	rw_destroy(&ipst->ips_ipmp_lock);
115e11c3f44Smeem }
116e11c3f44Smeem 
117e11c3f44Smeem /*
118e11c3f44Smeem  * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
119e11c3f44Smeem  * and add it to the hash.  On success, return a pointer to the created group.
120e11c3f44Smeem  * Caller must ensure `grname' is not yet in the hash.  Assumes that the IPMP
121e11c3f44Smeem  * meta-interface associated with the group also has the same name (but they
122e11c3f44Smeem  * may differ later via ipmp_grp_rename()).
123e11c3f44Smeem  */
124e11c3f44Smeem ipmp_grp_t *
ipmp_grp_create(const char * grname,phyint_t * phyi)125e11c3f44Smeem ipmp_grp_create(const char *grname, phyint_t *phyi)
126e11c3f44Smeem {
127e11c3f44Smeem 	ipmp_grp_t *grp;
128e11c3f44Smeem 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
129e11c3f44Smeem 	mod_hash_hndl_t mh;
130e11c3f44Smeem 
131e11c3f44Smeem 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
132e11c3f44Smeem 
133e11c3f44Smeem 	if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
134e11c3f44Smeem 		return (NULL);
135e11c3f44Smeem 
136e11c3f44Smeem 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
137e11c3f44Smeem 	(void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));
138e11c3f44Smeem 
139e11c3f44Smeem 	/*
140e11c3f44Smeem 	 * Cache the group's phyint.  This is safe since a phyint_t will
141e11c3f44Smeem 	 * outlive its ipmp_grp_t.
142e11c3f44Smeem 	 */
143e11c3f44Smeem 	grp->gr_phyint = phyi;
144e11c3f44Smeem 
145e11c3f44Smeem 	/*
146e11c3f44Smeem 	 * Create IPMP group kstats.
147e11c3f44Smeem 	 */
148e11c3f44Smeem 	if (ipmp_grp_create_kstats(grp) != 0) {
149e11c3f44Smeem 		kmem_free(grp, sizeof (ipmp_grp_t));
150e11c3f44Smeem 		return (NULL);
151e11c3f44Smeem 	}
152e11c3f44Smeem 
153e11c3f44Smeem 	/*
154e11c3f44Smeem 	 * Insert the group into the hash.
155e11c3f44Smeem 	 */
156e11c3f44Smeem 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
157e11c3f44Smeem 		ipmp_grp_destroy_kstats(grp);
158e11c3f44Smeem 		kmem_free(grp, sizeof (ipmp_grp_t));
159e11c3f44Smeem 		return (NULL);
160e11c3f44Smeem 	}
161e11c3f44Smeem 	ipmp_grp_insert(grp, mh);
162e11c3f44Smeem 
163e11c3f44Smeem 	return (grp);
164e11c3f44Smeem }
165e11c3f44Smeem 
166e11c3f44Smeem /*
167e11c3f44Smeem  * Create IPMP kstat structures for `grp'.  Return an errno upon failure.
168e11c3f44Smeem  */
169e11c3f44Smeem static int
ipmp_grp_create_kstats(ipmp_grp_t * grp)170e11c3f44Smeem ipmp_grp_create_kstats(ipmp_grp_t *grp)
171e11c3f44Smeem {
172e11c3f44Smeem 	kstat_t *ksp;
173e11c3f44Smeem 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
174e11c3f44Smeem 
175e11c3f44Smeem 	ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
176e11c3f44Smeem 	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
177e11c3f44Smeem 	if (ksp == NULL)
178e11c3f44Smeem 		return (ENOMEM);
179e11c3f44Smeem 
180e11c3f44Smeem 	ksp->ks_update = ipmp_grp_update_kstats;
181e11c3f44Smeem 	ksp->ks_private = grp;
182e11c3f44Smeem 	bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));
183e11c3f44Smeem 
184e11c3f44Smeem 	kstat_install(ksp);
185e11c3f44Smeem 	grp->gr_ksp = ksp;
186e11c3f44Smeem 	return (0);
187e11c3f44Smeem }
188e11c3f44Smeem 
189e11c3f44Smeem /*
190e11c3f44Smeem  * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
191e11c3f44Smeem  */
192e11c3f44Smeem static int
ipmp_grp_update_kstats(kstat_t * ksp,int rw)193e11c3f44Smeem ipmp_grp_update_kstats(kstat_t *ksp, int rw)
194e11c3f44Smeem {
195e11c3f44Smeem 	uint_t		i;
196e11c3f44Smeem 	kstat_named_t	*kn = KSTAT_NAMED_PTR(ksp);
197e11c3f44Smeem 	ipmp_grp_t	*grp = ksp->ks_private;
198e11c3f44Smeem 	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
199e11c3f44Smeem 	ipsq_t		*ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
200e11c3f44Smeem 	phyint_t	*phyi;
201e11c3f44Smeem 	uint64_t	phyi_kstats[IPMP_KSTAT_MAX];
202e11c3f44Smeem 
203e11c3f44Smeem 	if (rw == KSTAT_WRITE)
204e11c3f44Smeem 		return (EACCES);
205e11c3f44Smeem 
206e11c3f44Smeem 	/*
207e11c3f44Smeem 	 * Start with the group's baseline values.
208e11c3f44Smeem 	 */
209e11c3f44Smeem 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
210e11c3f44Smeem 		if (kn[i].data_type == KSTAT_DATA_UINT32) {
211e11c3f44Smeem 			kn[i].value.ui32 = grp->gr_kstats0[i];
212e11c3f44Smeem 		} else {
213e11c3f44Smeem 			ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
214e11c3f44Smeem 			kn[i].value.ui64 = grp->gr_kstats0[i];
215e11c3f44Smeem 		}
216e11c3f44Smeem 	}
217e11c3f44Smeem 
218e11c3f44Smeem 	/*
219e11c3f44Smeem 	 * Add in the stats of each phyint currently in the group.  Since we
220e11c3f44Smeem 	 * don't directly track the phyints in a group, we cheat by walking
221e11c3f44Smeem 	 * the IPSQ set under ill_g_lock.  (The IPSQ list cannot change while
222e11c3f44Smeem 	 * ill_g_lock is held.)
223e11c3f44Smeem 	 */
224e11c3f44Smeem 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
225e11c3f44Smeem 	ipsq = grp_ipsq->ipsq_next;
226e11c3f44Smeem 	for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
227e11c3f44Smeem 		phyi = ipsq->ipsq_phyint;
228e11c3f44Smeem 
229e11c3f44Smeem 		/*
230e11c3f44Smeem 		 * If a phyint in a group is being unplumbed, it's possible
231e11c3f44Smeem 		 * that ill_glist_delete() -> phyint_free() already freed the
232e11c3f44Smeem 		 * phyint (and set ipsq_phyint to NULL), but the unplumb
233e11c3f44Smeem 		 * operation has yet to complete (and thus ipsq_dq() has yet
234e11c3f44Smeem 		 * to remove the phyint's IPSQ from the group IPSQ's phyint
235e11c3f44Smeem 		 * list).  We skip those phyints here (note that their kstats
236e11c3f44Smeem 		 * have already been added to gr_kstats0[]).
237e11c3f44Smeem 		 */
238e11c3f44Smeem 		if (phyi == NULL)
239e11c3f44Smeem 			continue;
240e11c3f44Smeem 
241e11c3f44Smeem 		ipmp_phyint_get_kstats(phyi, phyi_kstats);
242e11c3f44Smeem 
243e11c3f44Smeem 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
244e11c3f44Smeem 			phyi_kstats[i] -= phyi->phyint_kstats0[i];
245e11c3f44Smeem 			if (kn[i].data_type == KSTAT_DATA_UINT32)
246e11c3f44Smeem 				kn[i].value.ui32 += phyi_kstats[i];
247e11c3f44Smeem 			else
248e11c3f44Smeem 				kn[i].value.ui64 += phyi_kstats[i];
249e11c3f44Smeem 		}
250e11c3f44Smeem 	}
251e11c3f44Smeem 
252e11c3f44Smeem 	kn[IPMP_KSTAT_LINK_UP].value.ui32 =
253e11c3f44Smeem 	    (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
254e11c3f44Smeem 
255e11c3f44Smeem 	rw_exit(&ipst->ips_ill_g_lock);
256e11c3f44Smeem 	return (0);
257e11c3f44Smeem }
258e11c3f44Smeem 
259e11c3f44Smeem /*
260e11c3f44Smeem  * Destroy IPMP kstat structures for `grp'.
261e11c3f44Smeem  */
262e11c3f44Smeem static void
ipmp_grp_destroy_kstats(ipmp_grp_t * grp)263e11c3f44Smeem ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
264e11c3f44Smeem {
265e11c3f44Smeem 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
266e11c3f44Smeem 
267e11c3f44Smeem 	kstat_delete_netstack(grp->gr_ksp, id);
268e11c3f44Smeem 	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
269e11c3f44Smeem 	grp->gr_ksp = NULL;
270e11c3f44Smeem }
271e11c3f44Smeem 
272e11c3f44Smeem /*
273e11c3f44Smeem  * Look up an IPMP group named `grname' on IP stack `ipst'.  Return NULL if it
274e11c3f44Smeem  * does not exist.
275e11c3f44Smeem  */
276e11c3f44Smeem ipmp_grp_t *
ipmp_grp_lookup(const char * grname,ip_stack_t * ipst)277e11c3f44Smeem ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
278e11c3f44Smeem {
279e11c3f44Smeem 	ipmp_grp_t *grp;
280e11c3f44Smeem 
281e11c3f44Smeem 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
282e11c3f44Smeem 
283e11c3f44Smeem 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
284e11c3f44Smeem 	    (mod_hash_val_t *)&grp) == 0)
285e11c3f44Smeem 		return (grp);
286e11c3f44Smeem 
287e11c3f44Smeem 	return (NULL);
288e11c3f44Smeem }
289e11c3f44Smeem 
290e11c3f44Smeem /*
291e11c3f44Smeem  * Place information about group `grp' into `lifgr'.
292e11c3f44Smeem  */
293e11c3f44Smeem void
ipmp_grp_info(const ipmp_grp_t * grp,lifgroupinfo_t * lifgr)294e11c3f44Smeem ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
295e11c3f44Smeem {
296e11c3f44Smeem 	ill_t *ill;
297e11c3f44Smeem 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
298e11c3f44Smeem 
299e11c3f44Smeem 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
300e11c3f44Smeem 
301e11c3f44Smeem 	lifgr->gi_v4 = (grp->gr_v4 != NULL);
302e11c3f44Smeem 	lifgr->gi_v6 = (grp->gr_v6 != NULL);
303e11c3f44Smeem 	lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
304e11c3f44Smeem 	lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
305e11c3f44Smeem 	lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
306e11c3f44Smeem 	(void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
307e11c3f44Smeem 	lifgr->gi_m4ifname[0] = '\0';
308e11c3f44Smeem 	lifgr->gi_m6ifname[0] = '\0';
309e11c3f44Smeem 	lifgr->gi_bcifname[0] = '\0';
310e11c3f44Smeem 
311e11c3f44Smeem 	if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
312e11c3f44Smeem 		(void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
313e11c3f44Smeem 		(void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
314e11c3f44Smeem 	}
315e11c3f44Smeem 
316e11c3f44Smeem 	if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
317e11c3f44Smeem 		(void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
318e11c3f44Smeem }
319e11c3f44Smeem 
320e11c3f44Smeem /*
321e11c3f44Smeem  * Insert `grp' into the hash using the reserved hash entry `mh'.
322e11c3f44Smeem  * Caller must ensure `grp' is not yet in the hash.
323e11c3f44Smeem  */
324e11c3f44Smeem static void
ipmp_grp_insert(ipmp_grp_t * grp,mod_hash_hndl_t mh)325e11c3f44Smeem ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
326e11c3f44Smeem {
327e11c3f44Smeem 	int err;
328e11c3f44Smeem 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
329e11c3f44Smeem 
330e11c3f44Smeem 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
331e11c3f44Smeem 
332e11c3f44Smeem 	/*
333e11c3f44Smeem 	 * Since grp->gr_name will exist at least as long as `grp' is in the
334e11c3f44Smeem 	 * hash, we use it directly as the key.
335e11c3f44Smeem 	 */
336e11c3f44Smeem 	err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
337e11c3f44Smeem 	    (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
338e11c3f44Smeem 	if (err != 0) {
339e11c3f44Smeem 		/*
340e11c3f44Smeem 		 * This should never happen since `mh' was preallocated.
341e11c3f44Smeem 		 */
342e11c3f44Smeem 		panic("cannot insert IPMP group \"%s\" (err %d)",
343e11c3f44Smeem 		    grp->gr_name, err);
344e11c3f44Smeem 	}
345e11c3f44Smeem }
346e11c3f44Smeem 
347e11c3f44Smeem /*
348e11c3f44Smeem  * Remove `grp' from the hash.  Caller must ensure `grp' is in it.
349e11c3f44Smeem  */
350e11c3f44Smeem static void
ipmp_grp_remove(ipmp_grp_t * grp)351e11c3f44Smeem ipmp_grp_remove(ipmp_grp_t *grp)
352e11c3f44Smeem {
353e11c3f44Smeem 	int err;
354e11c3f44Smeem 	mod_hash_val_t val;
355e11c3f44Smeem 	mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
356e11c3f44Smeem 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
357e11c3f44Smeem 
358e11c3f44Smeem 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
359e11c3f44Smeem 
360e11c3f44Smeem 	err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
361e11c3f44Smeem 	if (err != 0 || val != grp) {
362e11c3f44Smeem 		panic("cannot remove IPMP group \"%s\" (err %d)",
363e11c3f44Smeem 		    grp->gr_name, err);
364e11c3f44Smeem 	}
365e11c3f44Smeem }
366e11c3f44Smeem 
367e11c3f44Smeem /*
368e11c3f44Smeem  * Attempt to rename `grp' to new name `grname'.  Return an errno if the new
369e11c3f44Smeem  * group name already exists or is invalid, or if there isn't enough memory.
370e11c3f44Smeem  */
371e11c3f44Smeem int
ipmp_grp_rename(ipmp_grp_t * grp,const char * grname)372e11c3f44Smeem ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
373e11c3f44Smeem {
374e11c3f44Smeem 	mod_hash_hndl_t mh;
375e11c3f44Smeem 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
376e11c3f44Smeem 
377e11c3f44Smeem 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
378e11c3f44Smeem 
379e11c3f44Smeem 	if (grname[0] == '\0')
380e11c3f44Smeem 		return (EINVAL);
381e11c3f44Smeem 
382e11c3f44Smeem 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
383e11c3f44Smeem 	    (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
384e11c3f44Smeem 		return (EEXIST);
385e11c3f44Smeem 
386e11c3f44Smeem 	/*
387e11c3f44Smeem 	 * Before we remove the group from the hash, ensure we'll be able to
388e11c3f44Smeem 	 * re-insert it by reserving space.
389e11c3f44Smeem 	 */
390e11c3f44Smeem 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
391e11c3f44Smeem 		return (ENOMEM);
392e11c3f44Smeem 
393e11c3f44Smeem 	ipmp_grp_remove(grp);
394e11c3f44Smeem 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
395e11c3f44Smeem 	ipmp_grp_insert(grp, mh);
396e11c3f44Smeem 
397e11c3f44Smeem 	return (0);
398e11c3f44Smeem }
399e11c3f44Smeem 
400e11c3f44Smeem /*
401e11c3f44Smeem  * Destroy `grp' and remove it from the hash.  Caller must ensure `grp' is in
402e11c3f44Smeem  * the hash, and that there are no interfaces on it.
403e11c3f44Smeem  */
404e11c3f44Smeem void
ipmp_grp_destroy(ipmp_grp_t * grp)405e11c3f44Smeem ipmp_grp_destroy(ipmp_grp_t *grp)
406e11c3f44Smeem {
407e11c3f44Smeem 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
408e11c3f44Smeem 
409e11c3f44Smeem 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
410e11c3f44Smeem 
411e11c3f44Smeem 	/*
412e11c3f44Smeem 	 * If there are still interfaces using this group, panic before things
413e11c3f44Smeem 	 * go really off the rails.
414e11c3f44Smeem 	 */
415e11c3f44Smeem 	if (grp->gr_nif != 0)
416e11c3f44Smeem 		panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);
417e11c3f44Smeem 
418e11c3f44Smeem 	ipmp_grp_remove(grp);
419e11c3f44Smeem 	ipmp_grp_destroy_kstats(grp);
420e11c3f44Smeem 
421e11c3f44Smeem 	ASSERT(grp->gr_v4 == NULL);
422e11c3f44Smeem 	ASSERT(grp->gr_v6 == NULL);
423e11c3f44Smeem 	ASSERT(grp->gr_nv4 == 0);
424e11c3f44Smeem 	ASSERT(grp->gr_nv6 == 0);
425e11c3f44Smeem 	ASSERT(grp->gr_nactif == 0);
426e11c3f44Smeem 	ASSERT(grp->gr_linkdownmp == NULL);
427e11c3f44Smeem 	grp->gr_phyint = NULL;
428e11c3f44Smeem 
429e11c3f44Smeem 	kmem_free(grp, sizeof (ipmp_grp_t));
430e11c3f44Smeem }
431e11c3f44Smeem 
432e11c3f44Smeem /*
433e11c3f44Smeem  * Check whether `ill' is suitable for inclusion into `grp', and return an
434e11c3f44Smeem  * errno describing the problem (if any).  NOTE: many of these errno values
435e11c3f44Smeem  * are interpreted by ifconfig, which will take corrective action and retry
436e11c3f44Smeem  * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
437e11c3f44Smeem  */
438e11c3f44Smeem static int
ipmp_grp_vet_ill(ipmp_grp_t * grp,ill_t * ill)439e11c3f44Smeem ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
440e11c3f44Smeem {
441e11c3f44Smeem 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
442e11c3f44Smeem 
443e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
444e11c3f44Smeem 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
445e11c3f44Smeem 
446e11c3f44Smeem 	/*
447e11c3f44Smeem 	 * To sidestep complicated address migration logic in the kernel and
448e11c3f44Smeem 	 * to force the kernel's all-hosts multicast memberships to be blown
449e11c3f44Smeem 	 * away, all addresses that had been brought up must be brought back
450e11c3f44Smeem 	 * down prior to adding an interface to a group.  (This includes
451e11c3f44Smeem 	 * addresses currently down due to DAD.)  Once the interface has been
452e11c3f44Smeem 	 * added to the group, its addresses can then be brought back up, at
453e11c3f44Smeem 	 * which point they will be moved to the IPMP meta-interface.
454e11c3f44Smeem 	 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
455e11c3f44Smeem 	 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
456e11c3f44Smeem 	 */
457e11c3f44Smeem 	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
458e11c3f44Smeem 		return (EADDRINUSE);
459e11c3f44Smeem 
460e11c3f44Smeem 	/*
461e11c3f44Smeem 	 * To avoid confusing applications by changing addresses that are
462e11c3f44Smeem 	 * under their control, all such control must be removed prior to
463e11c3f44Smeem 	 * adding an interface into a group.
464e11c3f44Smeem 	 */
465e11c3f44Smeem 	if (ill_appaddr_cnt(ill) != 0)
466e11c3f44Smeem 		return (EADDRNOTAVAIL);
467e11c3f44Smeem 
468e11c3f44Smeem 	/*
469e11c3f44Smeem 	 * Since PTP addresses do not share the same broadcast domain, they
470e11c3f44Smeem 	 * are not allowed to be in an IPMP group.
471e11c3f44Smeem 	 */
472e11c3f44Smeem 	if (ill_ptpaddr_cnt(ill) != 0)
473e11c3f44Smeem 		return (EINVAL);
474e11c3f44Smeem 
475e11c3f44Smeem 	/*
476e11c3f44Smeem 	 * An ill must support multicast to be allowed into a group.
477e11c3f44Smeem 	 */
478e11c3f44Smeem 	if (!(ill->ill_flags & ILLF_MULTICAST))
479e11c3f44Smeem 		return (ENOTSUP);
480e11c3f44Smeem 
481e11c3f44Smeem 	/*
482e11c3f44Smeem 	 * An ill must strictly be using ARP and/or ND for address
483e11c3f44Smeem 	 * resolution for it to be allowed into a group.
484e11c3f44Smeem 	 */
485bd670b35SErik Nordmark 	if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
486e11c3f44Smeem 		return (ENOTSUP);
487e11c3f44Smeem 
488e11c3f44Smeem 	/*
489e11c3f44Smeem 	 * An ill cannot also be using usesrc groups.  (Although usesrc uses
490e11c3f44Smeem 	 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
491e11c3f44Smeem 	 * all its modifications as writer.)
492e11c3f44Smeem 	 */
493e11c3f44Smeem 	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
494e11c3f44Smeem 		return (ENOTSUP);
495e11c3f44Smeem 
496e11c3f44Smeem 	/*
497e11c3f44Smeem 	 * All ills in a group must be the same mactype.
498e11c3f44Smeem 	 */
499e11c3f44Smeem 	if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
500e11c3f44Smeem 		return (EINVAL);
501e11c3f44Smeem 
502e11c3f44Smeem 	return (0);
503e11c3f44Smeem }
504e11c3f44Smeem 
505e11c3f44Smeem /*
506e11c3f44Smeem  * Check whether `phyi' is suitable for inclusion into `grp', and return an
507e11c3f44Smeem  * errno describing the problem (if any).  See comment above ipmp_grp_vet_ill()
508e11c3f44Smeem  * regarding errno values.
509e11c3f44Smeem  */
510e11c3f44Smeem int
ipmp_grp_vet_phyint(ipmp_grp_t * grp,phyint_t * phyi)511e11c3f44Smeem ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
512e11c3f44Smeem {
513e11c3f44Smeem 	int err = 0;
514e11c3f44Smeem 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
515e11c3f44Smeem 
516e11c3f44Smeem 	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
517e11c3f44Smeem 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
518e11c3f44Smeem 
519e11c3f44Smeem 	/*
520e11c3f44Smeem 	 * An interface cannot have address families plumbed that are not
521e11c3f44Smeem 	 * configured in the group.
522e11c3f44Smeem 	 */
523e11c3f44Smeem 	if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
524e11c3f44Smeem 	    phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
525e11c3f44Smeem 		return (EAFNOSUPPORT);
526e11c3f44Smeem 
527e11c3f44Smeem 	if (phyi->phyint_illv4 != NULL)
528e11c3f44Smeem 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
529e11c3f44Smeem 	if (err == 0 && phyi->phyint_illv6 != NULL)
530e11c3f44Smeem 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);
531e11c3f44Smeem 
532e11c3f44Smeem 	return (err);
533e11c3f44Smeem }
534e11c3f44Smeem 
535e11c3f44Smeem /*
536e11c3f44Smeem  * Create a new illgrp on IPMP meta-interface `ill'.
537e11c3f44Smeem  */
538e11c3f44Smeem ipmp_illgrp_t *
ipmp_illgrp_create(ill_t * ill)539e11c3f44Smeem ipmp_illgrp_create(ill_t *ill)
540e11c3f44Smeem {
541e11c3f44Smeem 	uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
542e11c3f44Smeem 	ipmp_illgrp_t *illg;
543e11c3f44Smeem 
544e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
545e11c3f44Smeem 	ASSERT(IS_IPMP(ill));
546e11c3f44Smeem 	ASSERT(ill->ill_grp == NULL);
547e11c3f44Smeem 
548e11c3f44Smeem 	if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
549e11c3f44Smeem 		return (NULL);
550e11c3f44Smeem 
551e11c3f44Smeem 	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
552e11c3f44Smeem 	list_create(&illg->ig_actif, sizeof (ill_t),
553e11c3f44Smeem 	    offsetof(ill_t, ill_actnode));
554e11c3f44Smeem 	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
555e11c3f44Smeem 	    offsetof(ipmp_arpent_t, ia_node));
556e11c3f44Smeem 
557e11c3f44Smeem 	illg->ig_ipmp_ill = ill;
558e11c3f44Smeem 	ill->ill_grp = illg;
5591eee170aSErik Nordmark 	ipmp_illgrp_set_mtu(illg, mtu, mtu);
560e11c3f44Smeem 
561e11c3f44Smeem 	return (illg);
562e11c3f44Smeem }
563e11c3f44Smeem 
564e11c3f44Smeem /*
565e11c3f44Smeem  * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
566e11c3f44Smeem  */
567e11c3f44Smeem void
ipmp_illgrp_destroy(ipmp_illgrp_t * illg)568e11c3f44Smeem ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
569e11c3f44Smeem {
570e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
571e11c3f44Smeem 	ASSERT(IS_IPMP(illg->ig_ipmp_ill));
572e11c3f44Smeem 
573e11c3f44Smeem 	/*
574e11c3f44Smeem 	 * Verify `illg' is empty.
575e11c3f44Smeem 	 */
576e11c3f44Smeem 	ASSERT(illg->ig_next_ill == NULL);
577e11c3f44Smeem 	ASSERT(illg->ig_cast_ill == NULL);
578e11c3f44Smeem 	ASSERT(list_is_empty(&illg->ig_arpent));
579e11c3f44Smeem 	ASSERT(list_is_empty(&illg->ig_if));
580e11c3f44Smeem 	ASSERT(list_is_empty(&illg->ig_actif));
581e11c3f44Smeem 	ASSERT(illg->ig_nactif == 0);
582e11c3f44Smeem 
583e11c3f44Smeem 	/*
584e11c3f44Smeem 	 * Destroy `illg'.
585e11c3f44Smeem 	 */
586e11c3f44Smeem 	illg->ig_ipmp_ill->ill_grp = NULL;
587e11c3f44Smeem 	illg->ig_ipmp_ill = NULL;
588e11c3f44Smeem 	list_destroy(&illg->ig_if);
589e11c3f44Smeem 	list_destroy(&illg->ig_actif);
590e11c3f44Smeem 	list_destroy(&illg->ig_arpent);
591e11c3f44Smeem 	kmem_free(illg, sizeof (ipmp_illgrp_t));
592e11c3f44Smeem }
593e11c3f44Smeem 
594e11c3f44Smeem /*
595e11c3f44Smeem  * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
596e11c3f44Smeem  * bind it to an underlying ill, while keeping an even address distribution.
597e11c3f44Smeem  * If the bind is successful, return a pointer to the bound ill.
598e11c3f44Smeem  */
599e11c3f44Smeem ill_t *
ipmp_illgrp_add_ipif(ipmp_illgrp_t * illg,ipif_t * ipif)600e11c3f44Smeem ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
601e11c3f44Smeem {
602e11c3f44Smeem 	ill_t *minill;
603e11c3f44Smeem 	ipmp_arpent_t *entp;
604e11c3f44Smeem 
605e11c3f44Smeem 	ASSERT(IAM_WRITER_IPIF(ipif));
606e11c3f44Smeem 	ASSERT(ipmp_ipif_is_dataaddr(ipif));
607e11c3f44Smeem 
608e11c3f44Smeem 	/*
609e11c3f44Smeem 	 * IPMP data address mappings are internally managed by IP itself, so
610e11c3f44Smeem 	 * delete any existing ARP entries associated with the address.
611e11c3f44Smeem 	 */
612e11c3f44Smeem 	if (!ipif->ipif_isv6) {
613e11c3f44Smeem 		entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
614e11c3f44Smeem 		if (entp != NULL)
615e11c3f44Smeem 			ipmp_illgrp_destroy_arpent(illg, entp);
616e11c3f44Smeem 	}
617e11c3f44Smeem 
618e11c3f44Smeem 	if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
619e11c3f44Smeem 		ipmp_ill_bind_ipif(minill, ipif, Res_act_none);
620e11c3f44Smeem 
621e11c3f44Smeem 	return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
622e11c3f44Smeem }
623e11c3f44Smeem 
624e11c3f44Smeem /*
625e11c3f44Smeem  * Delete `ipif' from the pool of usable data addresses on `illg'.  If it's
626e11c3f44Smeem  * bound, unbind it from the underlying ill while keeping an even address
627e11c3f44Smeem  * distribution.
628e11c3f44Smeem  */
629e11c3f44Smeem void
ipmp_illgrp_del_ipif(ipmp_illgrp_t * illg,ipif_t * ipif)630e11c3f44Smeem ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
631e11c3f44Smeem {
632e11c3f44Smeem 	ill_t *maxill, *boundill = ipif->ipif_bound_ill;
633e11c3f44Smeem 
634e11c3f44Smeem 	ASSERT(IAM_WRITER_IPIF(ipif));
635e11c3f44Smeem 
636e11c3f44Smeem 	if (boundill != NULL) {
637e11c3f44Smeem 		(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);
638e11c3f44Smeem 
639e11c3f44Smeem 		maxill = ipmp_illgrp_max_ill(illg);
640e11c3f44Smeem 		if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
641e11c3f44Smeem 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
642e11c3f44Smeem 			ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
643e11c3f44Smeem 		}
644e11c3f44Smeem 	}
645e11c3f44Smeem }
646e11c3f44Smeem 
647e11c3f44Smeem /*
648e11c3f44Smeem  * Return the active ill with the greatest number of data addresses in `illg'.
649e11c3f44Smeem  */
650e11c3f44Smeem static ill_t *
ipmp_illgrp_max_ill(ipmp_illgrp_t * illg)651e11c3f44Smeem ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
652e11c3f44Smeem {
653e11c3f44Smeem 	ill_t *ill, *bestill = NULL;
654e11c3f44Smeem 
655e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
656e11c3f44Smeem 
657e11c3f44Smeem 	ill = list_head(&illg->ig_actif);
658e11c3f44Smeem 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
659e11c3f44Smeem 		if (bestill == NULL ||
660e11c3f44Smeem 		    ill->ill_bound_cnt > bestill->ill_bound_cnt) {
661e11c3f44Smeem 			bestill = ill;
662e11c3f44Smeem 		}
663e11c3f44Smeem 	}
664e11c3f44Smeem 	return (bestill);
665e11c3f44Smeem }
666e11c3f44Smeem 
667e11c3f44Smeem /*
668e11c3f44Smeem  * Return the active ill with the fewest number of data addresses in `illg'.
669e11c3f44Smeem  */
670e11c3f44Smeem static ill_t *
ipmp_illgrp_min_ill(ipmp_illgrp_t * illg)671e11c3f44Smeem ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
672e11c3f44Smeem {
673e11c3f44Smeem 	ill_t *ill, *bestill = NULL;
674e11c3f44Smeem 
675e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
676e11c3f44Smeem 
677e11c3f44Smeem 	ill = list_head(&illg->ig_actif);
678e11c3f44Smeem 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
679e11c3f44Smeem 		if (bestill == NULL ||
680e11c3f44Smeem 		    ill->ill_bound_cnt < bestill->ill_bound_cnt) {
681e11c3f44Smeem 			if (ill->ill_bound_cnt == 0)
682e11c3f44Smeem 				return (ill);	 /* can't get better */
683e11c3f44Smeem 			bestill = ill;
684e11c3f44Smeem 		}
685e11c3f44Smeem 	}
686e11c3f44Smeem 	return (bestill);
687e11c3f44Smeem }
688e11c3f44Smeem 
689e11c3f44Smeem /*
690e11c3f44Smeem  * Return a pointer to IPMP meta-interface for `illg' (which must exist).
691e11c3f44Smeem  * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
692e11c3f44Smeem  */
693e11c3f44Smeem ill_t *
ipmp_illgrp_ipmp_ill(ipmp_illgrp_t * illg)694e11c3f44Smeem ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
695e11c3f44Smeem {
696e11c3f44Smeem 	return (illg->ig_ipmp_ill);
697e11c3f44Smeem }
698e11c3f44Smeem 
699e11c3f44Smeem /*
700e11c3f44Smeem  * Return a pointer to the next available underlying ill in `illg', or NULL if
701e11c3f44Smeem  * one doesn't exist.  Caller must be inside the IPSQ.
702e11c3f44Smeem  */
703e11c3f44Smeem ill_t *
ipmp_illgrp_next_ill(ipmp_illgrp_t * illg)704e11c3f44Smeem ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
705e11c3f44Smeem {
706e11c3f44Smeem 	ill_t *ill;
707e11c3f44Smeem 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
708e11c3f44Smeem 
709e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
710e11c3f44Smeem 
711e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
712e11c3f44Smeem 	if ((ill = illg->ig_next_ill) != NULL) {
713e11c3f44Smeem 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
714e11c3f44Smeem 		if (illg->ig_next_ill == NULL)
715e11c3f44Smeem 			illg->ig_next_ill = list_head(&illg->ig_actif);
716e11c3f44Smeem 	}
717e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
718e11c3f44Smeem 
719e11c3f44Smeem 	return (ill);
720e11c3f44Smeem }
721e11c3f44Smeem 
722e11c3f44Smeem /*
723e11c3f44Smeem  * Return a held pointer to the next available underlying ill in `illg', or
724e11c3f44Smeem  * NULL if one doesn't exist.  Caller need not be inside the IPSQ.
725e11c3f44Smeem  */
726e11c3f44Smeem ill_t *
ipmp_illgrp_hold_next_ill(ipmp_illgrp_t * illg)727e11c3f44Smeem ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
728e11c3f44Smeem {
729e11c3f44Smeem 	ill_t *ill;
730e11c3f44Smeem 	uint_t i;
731e11c3f44Smeem 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
732e11c3f44Smeem 
733e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
734e11c3f44Smeem 	for (i = 0; i < illg->ig_nactif; i++) {
735e11c3f44Smeem 		ill = illg->ig_next_ill;
736e11c3f44Smeem 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
737e11c3f44Smeem 		if (illg->ig_next_ill == NULL)
738e11c3f44Smeem 			illg->ig_next_ill = list_head(&illg->ig_actif);
739e11c3f44Smeem 
740bd670b35SErik Nordmark 		if (ill_check_and_refhold(ill)) {
741e11c3f44Smeem 			rw_exit(&ipst->ips_ipmp_lock);
742e11c3f44Smeem 			return (ill);
743e11c3f44Smeem 		}
744e11c3f44Smeem 	}
745e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
746e11c3f44Smeem 
747e11c3f44Smeem 	return (NULL);
748e11c3f44Smeem }
749e11c3f44Smeem 
750e11c3f44Smeem /*
751e11c3f44Smeem  * Return a held pointer to the nominated multicast ill in `illg', or NULL if
752e11c3f44Smeem  * one doesn't exist.  Caller need not be inside the IPSQ.
753e11c3f44Smeem  */
754e11c3f44Smeem ill_t *
ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t * illg)755e11c3f44Smeem ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
756e11c3f44Smeem {
757e11c3f44Smeem 	ill_t *castill;
758e11c3f44Smeem 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
759e11c3f44Smeem 
760e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
761e11c3f44Smeem 	castill = illg->ig_cast_ill;
762bd670b35SErik Nordmark 	if (castill != NULL && ill_check_and_refhold(castill)) {
763e11c3f44Smeem 		rw_exit(&ipst->ips_ipmp_lock);
764e11c3f44Smeem 		return (castill);
765e11c3f44Smeem 	}
766e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
767e11c3f44Smeem 	return (NULL);
768e11c3f44Smeem }
769e11c3f44Smeem 
770e11c3f44Smeem /*
771e11c3f44Smeem  * Set the nominated cast ill on `illg' to `castill'.  If `castill' is NULL,
772e11c3f44Smeem  * any existing nomination is removed.  Caller must be inside the IPSQ.
773e11c3f44Smeem  */
774e11c3f44Smeem static void
ipmp_illgrp_set_cast(ipmp_illgrp_t * illg,ill_t * castill)775e11c3f44Smeem ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
776e11c3f44Smeem {
777e11c3f44Smeem 	ill_t *ocastill = illg->ig_cast_ill;
778e11c3f44Smeem 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
779e11c3f44Smeem 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
780e11c3f44Smeem 
781e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
782e11c3f44Smeem 
783e11c3f44Smeem 	/*
784e11c3f44Smeem 	 * Disable old nominated ill (if any).
785e11c3f44Smeem 	 */
786e11c3f44Smeem 	if (ocastill != NULL) {
787e11c3f44Smeem 		DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
788e11c3f44Smeem 		    illg, ill_t *, ocastill);
789e11c3f44Smeem 		ASSERT(ocastill->ill_nom_cast);
790e11c3f44Smeem 		ocastill->ill_nom_cast = B_FALSE;
791e11c3f44Smeem 		/*
792e11c3f44Smeem 		 * If the IPMP meta-interface is down, we never did the join,
793e11c3f44Smeem 		 * so we must not try to leave.
794e11c3f44Smeem 		 */
795e11c3f44Smeem 		if (ipmp_ill->ill_dl_up)
796e11c3f44Smeem 			ill_leave_multicast(ipmp_ill);
797bd670b35SErik Nordmark 
798bd670b35SErik Nordmark 		/*
799bd670b35SErik Nordmark 		 * Delete any NCEs tied to the old nomination.  We must do this
800bd670b35SErik Nordmark 		 * last since ill_leave_multicast() may trigger IREs to be
801bd670b35SErik Nordmark 		 * built using ig_cast_ill.
802bd670b35SErik Nordmark 		 */
8038a06b3d6SToomas Soome 		ncec_walk(ocastill, ipmp_ncec_delete_nonlocal, ocastill,
804bd670b35SErik Nordmark 		    ocastill->ill_ipst);
805e11c3f44Smeem 	}
806e11c3f44Smeem 
807e11c3f44Smeem 	/*
808e11c3f44Smeem 	 * Set new nomination.
809e11c3f44Smeem 	 */
810e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
811e11c3f44Smeem 	illg->ig_cast_ill = castill;
812e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
813e11c3f44Smeem 
814e11c3f44Smeem 	/*
815e11c3f44Smeem 	 * Enable new nominated ill (if any).
816e11c3f44Smeem 	 */
817e11c3f44Smeem 	if (castill != NULL) {
818e11c3f44Smeem 		DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
819e11c3f44Smeem 		    illg, ill_t *, castill);
820e11c3f44Smeem 		ASSERT(!castill->ill_nom_cast);
821e11c3f44Smeem 		castill->ill_nom_cast = B_TRUE;
822e11c3f44Smeem 		/*
823e11c3f44Smeem 		 * If the IPMP meta-interface is down, the attempt to recover
824e11c3f44Smeem 		 * will silently fail but ill_need_recover_multicast will be
825e11c3f44Smeem 		 * erroneously cleared -- so check first.
826e11c3f44Smeem 		 */
827e11c3f44Smeem 		if (ipmp_ill->ill_dl_up)
828e11c3f44Smeem 			ill_recover_multicast(ipmp_ill);
829e11c3f44Smeem 	}
830e11c3f44Smeem }
831e11c3f44Smeem 
832e11c3f44Smeem /*
833e11c3f44Smeem  * Create an IPMP ARP entry and add it to the set tracked on `illg'.  If an
834e11c3f44Smeem  * entry for the same IP address already exists, destroy it first.  Return the
835e11c3f44Smeem  * created IPMP ARP entry, or NULL on failure.
836e11c3f44Smeem  */
837e11c3f44Smeem ipmp_arpent_t *
ipmp_illgrp_create_arpent(ipmp_illgrp_t * illg,boolean_t proxyarp,ipaddr_t ipaddr,uchar_t * lladdr,size_t lladdr_len,uint16_t flags)838bd670b35SErik Nordmark ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
839bd670b35SErik Nordmark     ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
840e11c3f44Smeem {
841e11c3f44Smeem 	ipmp_arpent_t *entp, *oentp;
842e11c3f44Smeem 
843e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
844e11c3f44Smeem 
845bd670b35SErik Nordmark 	if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
846bd670b35SErik Nordmark 	    KM_NOSLEEP)) == NULL)
847e11c3f44Smeem 		return (NULL);
848e11c3f44Smeem 
849bd670b35SErik Nordmark 	/*
850bd670b35SErik Nordmark 	 * Delete any existing ARP entry for this address.
851bd670b35SErik Nordmark 	 */
852e11c3f44Smeem 	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
853e11c3f44Smeem 		ipmp_illgrp_destroy_arpent(illg, oentp);
854e11c3f44Smeem 
855bd670b35SErik Nordmark 	/*
856bd670b35SErik Nordmark 	 * Prepend the new entry.
857bd670b35SErik Nordmark 	 */
858bd670b35SErik Nordmark 	entp->ia_ipaddr = ipaddr;
859bd670b35SErik Nordmark 	entp->ia_flags = flags;
860bd670b35SErik Nordmark 	entp->ia_lladdr_len = lladdr_len;
861bd670b35SErik Nordmark 	entp->ia_lladdr = (uchar_t *)&entp[1];
862bd670b35SErik Nordmark 	bcopy(lladdr, entp->ia_lladdr, lladdr_len);
863bd670b35SErik Nordmark 	entp->ia_proxyarp = proxyarp;
864bd670b35SErik Nordmark 	entp->ia_notified = B_TRUE;
865e11c3f44Smeem 	list_insert_head(&illg->ig_arpent, entp);
866e11c3f44Smeem 	return (entp);
867e11c3f44Smeem }
868e11c3f44Smeem 
869e11c3f44Smeem /*
870e11c3f44Smeem  * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
871e11c3f44Smeem  */
872e11c3f44Smeem void
ipmp_illgrp_destroy_arpent(ipmp_illgrp_t * illg,ipmp_arpent_t * entp)873e11c3f44Smeem ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
874e11c3f44Smeem {
875e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
876e11c3f44Smeem 
877e11c3f44Smeem 	list_remove(&illg->ig_arpent, entp);
878bd670b35SErik Nordmark 	kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
879e11c3f44Smeem }
880e11c3f44Smeem 
881e11c3f44Smeem /*
882e11c3f44Smeem  * Mark that ARP has been notified about the IP address on `entp'; `illg' is
883e11c3f44Smeem  * taken as a debugging aid for DTrace FBT probes.
884e11c3f44Smeem  */
885e11c3f44Smeem /* ARGSUSED */
886e11c3f44Smeem void
ipmp_illgrp_mark_arpent(ipmp_illgrp_t * illg,ipmp_arpent_t * entp)887e11c3f44Smeem ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
888e11c3f44Smeem {
889e11c3f44Smeem 	entp->ia_notified = B_TRUE;
890e11c3f44Smeem }
891e11c3f44Smeem 
892e11c3f44Smeem /*
893e11c3f44Smeem  * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
894e11c3f44Smeem  * NULL, any IPMP ARP entry is requested.  Return NULL if it does not exist.
895e11c3f44Smeem  */
896e11c3f44Smeem ipmp_arpent_t *
ipmp_illgrp_lookup_arpent(ipmp_illgrp_t * illg,ipaddr_t * addrp)897e11c3f44Smeem ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
898e11c3f44Smeem {
899e11c3f44Smeem 	ipmp_arpent_t *entp = list_head(&illg->ig_arpent);
900e11c3f44Smeem 
901e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
902e11c3f44Smeem 
903e11c3f44Smeem 	if (addrp == NULL)
904e11c3f44Smeem 		return (entp);
905e11c3f44Smeem 
906e11c3f44Smeem 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
907e11c3f44Smeem 		if (entp->ia_ipaddr == *addrp)
908e11c3f44Smeem 			break;
909e11c3f44Smeem 	return (entp);
910e11c3f44Smeem }
911e11c3f44Smeem 
912e11c3f44Smeem /*
913e11c3f44Smeem  * Refresh ARP entries on `illg' to be distributed across its active
914e11c3f44Smeem  * interfaces.  Entries that cannot be refreshed (e.g., because there are no
915e11c3f44Smeem  * active interfaces) are marked so that subsequent calls can try again.
916e11c3f44Smeem  */
917e11c3f44Smeem void
ipmp_illgrp_refresh_arpent(ipmp_illgrp_t * illg)918e11c3f44Smeem ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
919e11c3f44Smeem {
920e11c3f44Smeem 	ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
921e11c3f44Smeem 	uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
922e11c3f44Smeem 	ipmp_arpent_t *entp;
923bd670b35SErik Nordmark 	ncec_t *ncec;
924bd670b35SErik Nordmark 	nce_t  *nce;
925e11c3f44Smeem 
926e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
927e11c3f44Smeem 	ASSERT(!ipmp_ill->ill_isv6);
928e11c3f44Smeem 
929e11c3f44Smeem 	ill = list_head(&illg->ig_actif);
930e11c3f44Smeem 	entp = list_head(&illg->ig_arpent);
931e11c3f44Smeem 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
932e11c3f44Smeem 		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
933e11c3f44Smeem 			entp->ia_notified = B_FALSE;
934e11c3f44Smeem 			continue;
935e11c3f44Smeem 		}
936e11c3f44Smeem 
937e11c3f44Smeem 		ASSERT(paddrlen == ill->ill_phys_addr_length);
938e11c3f44Smeem 
939e11c3f44Smeem 		/*
940e11c3f44Smeem 		 * If this is a proxy ARP entry, we can skip notifying ARP if
941e11c3f44Smeem 		 * the entry is already up-to-date.  If it has changed, we
942e11c3f44Smeem 		 * update the entry's hardware address before notifying ARP.
943e11c3f44Smeem 		 */
944e11c3f44Smeem 		if (entp->ia_proxyarp) {
945bd670b35SErik Nordmark 			if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
946bd670b35SErik Nordmark 			    paddrlen) == 0 && entp->ia_notified)
947e11c3f44Smeem 				continue;
948bd670b35SErik Nordmark 			bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
949e11c3f44Smeem 		}
950e11c3f44Smeem 
951bd670b35SErik Nordmark 		(void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
952bd670b35SErik Nordmark 		    paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
953bd670b35SErik Nordmark 		    &nce);
954bd670b35SErik Nordmark 		if (nce == NULL || !entp->ia_proxyarp) {
955bd670b35SErik Nordmark 			if (nce != NULL)
956bd670b35SErik Nordmark 				nce_refrele(nce);
957e11c3f44Smeem 			continue;
958e11c3f44Smeem 		}
959bd670b35SErik Nordmark 		ncec = nce->nce_common;
960bd670b35SErik Nordmark 		mutex_enter(&ncec->ncec_lock);
961bd670b35SErik Nordmark 		nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
962bd670b35SErik Nordmark 		mutex_exit(&ncec->ncec_lock);
963bd670b35SErik Nordmark 		nce_refrele(nce);
964e11c3f44Smeem 		ipmp_illgrp_mark_arpent(illg, entp);
965e11c3f44Smeem 
966e11c3f44Smeem 		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
967e11c3f44Smeem 			ill = list_head(&illg->ig_actif);
968e11c3f44Smeem 	}
969e11c3f44Smeem }
970e11c3f44Smeem 
971e11c3f44Smeem /*
972e11c3f44Smeem  * Return an interface in `illg' with the specified `physaddr', or NULL if one
973e11c3f44Smeem  * doesn't exist.  Caller must hold ill_g_lock if it's not inside the IPSQ.
974e11c3f44Smeem  */
975e11c3f44Smeem ill_t *
ipmp_illgrp_find_ill(ipmp_illgrp_t * illg,uchar_t * physaddr,uint_t paddrlen)976e11c3f44Smeem ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
977e11c3f44Smeem {
978e11c3f44Smeem 	ill_t *ill;
979e11c3f44Smeem 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
980e11c3f44Smeem 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
981e11c3f44Smeem 
982e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
983e11c3f44Smeem 
984e11c3f44Smeem 	ill = list_head(&illg->ig_if);
985e11c3f44Smeem 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
986e11c3f44Smeem 		if (ill->ill_phys_addr_length == paddrlen &&
987e11c3f44Smeem 		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
988e11c3f44Smeem 			return (ill);
989e11c3f44Smeem 	}
990e11c3f44Smeem 	return (NULL);
991e11c3f44Smeem }
992e11c3f44Smeem 
993e11c3f44Smeem /*
994e11c3f44Smeem  * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
995e11c3f44Smeem  * Caller must be inside the IPSQ unless this is initialization.
996e11c3f44Smeem  */
997e11c3f44Smeem static void
ipmp_illgrp_set_mtu(ipmp_illgrp_t * illg,uint_t mtu,uint_t mc_mtu)9981eee170aSErik Nordmark ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu, uint_t mc_mtu)
999e11c3f44Smeem {
1000e11c3f44Smeem 	ill_t *ill = illg->ig_ipmp_ill;
1001e11c3f44Smeem 	mblk_t *mp;
1002e11c3f44Smeem 
1003e11c3f44Smeem 	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));
1004e11c3f44Smeem 
1005e11c3f44Smeem 	/*
1006e11c3f44Smeem 	 * If allocation fails, we have bigger problems than MTU.
1007e11c3f44Smeem 	 */
10081eee170aSErik Nordmark 	if ((mp = ip_dlnotify_alloc2(DL_NOTE_SDU_SIZE2, mtu, mc_mtu)) != NULL) {
1009e11c3f44Smeem 		illg->ig_mtu = mtu;
10101eee170aSErik Nordmark 		illg->ig_mc_mtu = mc_mtu;
1011e11c3f44Smeem 		put(ill->ill_rq, mp);
1012e11c3f44Smeem 	}
1013e11c3f44Smeem }
1014e11c3f44Smeem 
1015e11c3f44Smeem /*
1016e11c3f44Smeem  * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
1017e11c3f44Smeem  * ill MTU if necessary.
1018e11c3f44Smeem  */
1019e11c3f44Smeem void
ipmp_illgrp_refresh_mtu(ipmp_illgrp_t * illg)1020e11c3f44Smeem ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
1021e11c3f44Smeem {
1022e11c3f44Smeem 	ill_t *ill;
1023e11c3f44Smeem 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
1024e11c3f44Smeem 	uint_t mtu = 0;
10251eee170aSErik Nordmark 	uint_t mc_mtu = 0;
1026e11c3f44Smeem 
1027e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
1028e11c3f44Smeem 
1029e11c3f44Smeem 	/*
1030bd670b35SErik Nordmark 	 * Since ill_mtu can only change under ill_lock, we hold ill_lock
1031e11c3f44Smeem 	 * for each ill as we iterate through the list.  Any changes to the
1032bd670b35SErik Nordmark 	 * ill_mtu will also trigger an update, so even if we missed it
1033e11c3f44Smeem 	 * this time around, the update will catch it.
1034e11c3f44Smeem 	 */
1035e11c3f44Smeem 	ill = list_head(&illg->ig_if);
1036e11c3f44Smeem 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
1037e11c3f44Smeem 		mutex_enter(&ill->ill_lock);
1038bd670b35SErik Nordmark 		if (mtu == 0 || ill->ill_mtu < mtu)
1039bd670b35SErik Nordmark 			mtu = ill->ill_mtu;
10401eee170aSErik Nordmark 		if (mc_mtu == 0 || ill->ill_mc_mtu < mc_mtu)
10411eee170aSErik Nordmark 			mc_mtu = ill->ill_mc_mtu;
1042e11c3f44Smeem 		mutex_exit(&ill->ill_lock);
1043e11c3f44Smeem 	}
1044e11c3f44Smeem 
1045e11c3f44Smeem 	/*
1046e11c3f44Smeem 	 * MTU must be at least the minimum MTU.
1047e11c3f44Smeem 	 */
1048e11c3f44Smeem 	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
10491eee170aSErik Nordmark 	mc_mtu = MAX(mc_mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
10501eee170aSErik Nordmark 	if (illg->ig_mtu != mtu || illg->ig_mc_mtu != mc_mtu)
10511eee170aSErik Nordmark 		ipmp_illgrp_set_mtu(illg, mtu, mc_mtu);
1052e11c3f44Smeem }
1053e11c3f44Smeem 
1054e11c3f44Smeem /*
1055e11c3f44Smeem  * Link illgrp `illg' to IPMP group `grp'.  To simplify the caller, silently
1056e11c3f44Smeem  * allow the same link to be established more than once.
1057e11c3f44Smeem  */
1058e11c3f44Smeem void
ipmp_illgrp_link_grp(ipmp_illgrp_t * illg,ipmp_grp_t * grp)1059e11c3f44Smeem ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
1060e11c3f44Smeem {
1061e11c3f44Smeem 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1062e11c3f44Smeem 
1063e11c3f44Smeem 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1064e11c3f44Smeem 
1065e11c3f44Smeem 	if (illg->ig_ipmp_ill->ill_isv6) {
1066e11c3f44Smeem 		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
1067e11c3f44Smeem 		grp->gr_v6 = illg;
1068e11c3f44Smeem 	} else {
1069e11c3f44Smeem 		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
1070e11c3f44Smeem 		grp->gr_v4 = illg;
1071e11c3f44Smeem 	}
1072e11c3f44Smeem }
1073e11c3f44Smeem 
1074e11c3f44Smeem /*
1075e11c3f44Smeem  * Unlink illgrp `illg' from its IPMP group.  Return an errno if the illgrp
1076e11c3f44Smeem  * cannot be unlinked (e.g., because there are still interfaces using it).
1077e11c3f44Smeem  */
1078e11c3f44Smeem int
ipmp_illgrp_unlink_grp(ipmp_illgrp_t * illg)1079e11c3f44Smeem ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
1080e11c3f44Smeem {
1081e11c3f44Smeem 	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
1082e11c3f44Smeem 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1083e11c3f44Smeem 
1084e11c3f44Smeem 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1085e11c3f44Smeem 
1086e11c3f44Smeem 	if (illg->ig_ipmp_ill->ill_isv6) {
1087e11c3f44Smeem 		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
1088e11c3f44Smeem 			return (EBUSY);
1089e11c3f44Smeem 		grp->gr_v6 = NULL;
1090e11c3f44Smeem 	} else {
1091e11c3f44Smeem 		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
1092e11c3f44Smeem 			return (EBUSY);
1093e11c3f44Smeem 		grp->gr_v4 = NULL;
1094e11c3f44Smeem 	}
1095e11c3f44Smeem 	return (0);
1096e11c3f44Smeem }
1097e11c3f44Smeem 
1098e11c3f44Smeem /*
1099e11c3f44Smeem  * Place `ill' into `illg', and rebalance the data addresses on `illg'
1100e11c3f44Smeem  * to be spread evenly across the ills now in it.  Also, adjust the IPMP
1101e11c3f44Smeem  * ill as necessary to account for `ill' (e.g., MTU).
1102e11c3f44Smeem  */
1103e11c3f44Smeem void
ipmp_ill_join_illgrp(ill_t * ill,ipmp_illgrp_t * illg)1104e11c3f44Smeem ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
1105e11c3f44Smeem {
1106e11c3f44Smeem 	ill_t *ipmp_ill;
1107e11c3f44Smeem 	ipif_t *ipif;
1108e11c3f44Smeem 	ip_stack_t *ipst = ill->ill_ipst;
1109e11c3f44Smeem 
1110e11c3f44Smeem 	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
1111e11c3f44Smeem 	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
1112e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1113e11c3f44Smeem 	ASSERT(ill->ill_grp == NULL);
1114e11c3f44Smeem 
1115e11c3f44Smeem 	ipmp_ill = illg->ig_ipmp_ill;
1116e11c3f44Smeem 
1117e11c3f44Smeem 	/*
1118e11c3f44Smeem 	 * Account for `ill' joining the illgrp.
1119e11c3f44Smeem 	 */
1120e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1121e11c3f44Smeem 	if (ill->ill_isv6)
1122e11c3f44Smeem 		ill->ill_phyint->phyint_grp->gr_nv6++;
1123e11c3f44Smeem 	else
1124e11c3f44Smeem 		ill->ill_phyint->phyint_grp->gr_nv4++;
1125e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
1126e11c3f44Smeem 
1127e11c3f44Smeem 	/*
1128e11c3f44Smeem 	 * Ensure the ILLF_ROUTER flag remains consistent across the group.
1129e11c3f44Smeem 	 */
1130e11c3f44Smeem 	mutex_enter(&ill->ill_lock);
1131e11c3f44Smeem 	if (ipmp_ill->ill_flags & ILLF_ROUTER)
1132e11c3f44Smeem 		ill->ill_flags |= ILLF_ROUTER;
1133e11c3f44Smeem 	else
1134e11c3f44Smeem 		ill->ill_flags &= ~ILLF_ROUTER;
1135e11c3f44Smeem 	mutex_exit(&ill->ill_lock);
1136e11c3f44Smeem 
1137e11c3f44Smeem 	/*
1138e11c3f44Smeem 	 * Blow away all multicast memberships that currently exist on `ill'.
1139e11c3f44Smeem 	 * This may seem odd, but it's consistent with the application view
1140e11c3f44Smeem 	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
1141f1c454b4SSowmini Varadhan 	 * The ill_grp_pending bit prevents multicast group joins after
1142f1c454b4SSowmini Varadhan 	 * update_conn_ill() and before ill_grp assignment.
1143e11c3f44Smeem 	 */
1144f1c454b4SSowmini Varadhan 	mutex_enter(&ill->ill_mcast_serializer);
1145f1c454b4SSowmini Varadhan 	ill->ill_grp_pending = 1;
1146f1c454b4SSowmini Varadhan 	mutex_exit(&ill->ill_mcast_serializer);
1147bd670b35SErik Nordmark 	update_conn_ill(ill, ill->ill_ipst);
1148e11c3f44Smeem 	if (ill->ill_isv6) {
1149e11c3f44Smeem 		reset_mrt_ill(ill);
1150e11c3f44Smeem 	} else {
1151e11c3f44Smeem 		ipif = ill->ill_ipif;
1152e11c3f44Smeem 		for (; ipif != NULL; ipif = ipif->ipif_next) {
1153e11c3f44Smeem 			reset_mrt_vif_ipif(ipif);
1154e11c3f44Smeem 		}
1155e11c3f44Smeem 	}
1156e11c3f44Smeem 	ip_purge_allmulti(ill);
1157e11c3f44Smeem 
1158e11c3f44Smeem 	/*
1159e11c3f44Smeem 	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
1160e11c3f44Smeem 	 * physical address length.  All other ills must have the same value,
1161e11c3f44Smeem 	 * since they are required to all be the same mactype.  Also update
1162e11c3f44Smeem 	 * the IPMP ill's MTU and CoS marking, if necessary.
1163e11c3f44Smeem 	 */
1164e11c3f44Smeem 	if (list_is_empty(&illg->ig_if)) {
1165e11c3f44Smeem 		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
1166e11c3f44Smeem 		/*
1167e11c3f44Smeem 		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
1168e11c3f44Smeem 		 * doesn't have a physical address.  This means that code must
1169e11c3f44Smeem 		 * not assume that ill_phys_addr is non-NULL just because
1170e11c3f44Smeem 		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
1171e11c3f44Smeem 		 */
1172e11c3f44Smeem 		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
1173e11c3f44Smeem 		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
1174e11c3f44Smeem 		ipmp_ill->ill_type = ill->ill_type;
1175e11c3f44Smeem 
1176e11c3f44Smeem 		if (ill->ill_flags & ILLF_COS_ENABLED) {
1177e11c3f44Smeem 			mutex_enter(&ipmp_ill->ill_lock);
1178e11c3f44Smeem 			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1179e11c3f44Smeem 			mutex_exit(&ipmp_ill->ill_lock);
1180e11c3f44Smeem 		}
11811eee170aSErik Nordmark 		ipmp_illgrp_set_mtu(illg, ill->ill_mtu, ill->ill_mc_mtu);
1182e11c3f44Smeem 	} else {
1183e11c3f44Smeem 		ASSERT(ipmp_ill->ill_phys_addr_length ==
1184e11c3f44Smeem 		    ill->ill_phys_addr_length);
1185e11c3f44Smeem 		ASSERT(ipmp_ill->ill_type == ill->ill_type);
1186e11c3f44Smeem 
1187e11c3f44Smeem 		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1188e11c3f44Smeem 			mutex_enter(&ipmp_ill->ill_lock);
1189e11c3f44Smeem 			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1190e11c3f44Smeem 			mutex_exit(&ipmp_ill->ill_lock);
1191e11c3f44Smeem 		}
11921eee170aSErik Nordmark 		if (illg->ig_mtu > ill->ill_mtu ||
11931eee170aSErik Nordmark 		    illg->ig_mc_mtu > ill->ill_mc_mtu) {
11941eee170aSErik Nordmark 			ipmp_illgrp_set_mtu(illg, ill->ill_mtu,
11951eee170aSErik Nordmark 			    ill->ill_mc_mtu);
11961eee170aSErik Nordmark 		}
1197e11c3f44Smeem 	}
1198e11c3f44Smeem 
1199e11c3f44Smeem 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1200e11c3f44Smeem 	list_insert_tail(&illg->ig_if, ill);
1201e11c3f44Smeem 	ill->ill_grp = illg;
1202e11c3f44Smeem 	rw_exit(&ipst->ips_ill_g_lock);
1203e11c3f44Smeem 
1204f1c454b4SSowmini Varadhan 	mutex_enter(&ill->ill_mcast_serializer);
1205f1c454b4SSowmini Varadhan 	ill->ill_grp_pending = 0;
1206f1c454b4SSowmini Varadhan 	mutex_exit(&ill->ill_mcast_serializer);
1207f1c454b4SSowmini Varadhan 
1208e11c3f44Smeem 	/*
1209e11c3f44Smeem 	 * Hide the IREs on `ill' so that we don't accidentally find them when
1210e11c3f44Smeem 	 * sending data traffic.
1211e11c3f44Smeem 	 */
1212e11c3f44Smeem 	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
1213e11c3f44Smeem 
1214e11c3f44Smeem 	ipmp_ill_refresh_active(ill);
1215e11c3f44Smeem }
1216e11c3f44Smeem 
1217e11c3f44Smeem /*
1218e11c3f44Smeem  * Remove `ill' from its illgrp, and rebalance the data addresses in that
1219e11c3f44Smeem  * illgrp to be spread evenly across the remaining ills.  Also, adjust the
1220e11c3f44Smeem  * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
1221e11c3f44Smeem  */
1222e11c3f44Smeem void
ipmp_ill_leave_illgrp(ill_t * ill)1223e11c3f44Smeem ipmp_ill_leave_illgrp(ill_t *ill)
1224e11c3f44Smeem {
1225e11c3f44Smeem 	ill_t *ipmp_ill;
1226e11c3f44Smeem 	ipif_t *ipif;
1227e11c3f44Smeem 	ipmp_arpent_t *entp;
1228e11c3f44Smeem 	ipmp_illgrp_t *illg = ill->ill_grp;
1229e11c3f44Smeem 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1230e11c3f44Smeem 
1231e11c3f44Smeem 	ASSERT(IS_UNDER_IPMP(ill));
1232e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1233e11c3f44Smeem 	ASSERT(illg != NULL);
1234e11c3f44Smeem 
1235e11c3f44Smeem 	ipmp_ill = illg->ig_ipmp_ill;
1236e11c3f44Smeem 
1237e11c3f44Smeem 	/*
1238e11c3f44Smeem 	 * Cancel IPMP-specific ill timeouts.
1239e11c3f44Smeem 	 */
1240e11c3f44Smeem 	(void) untimeout(ill->ill_refresh_tid);
1241e11c3f44Smeem 
1242e11c3f44Smeem 	/*
1243e11c3f44Smeem 	 * Expose any previously-hidden IREs on `ill'.
1244e11c3f44Smeem 	 */
1245e11c3f44Smeem 	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
1246e11c3f44Smeem 
1247e11c3f44Smeem 	/*
1248e11c3f44Smeem 	 * Ensure the multicast state for each ipif on `ill' is down so that
1249e11c3f44Smeem 	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
1250e11c3f44Smeem 	 * all eligible groups.
1251e11c3f44Smeem 	 */
1252e11c3f44Smeem 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1253e11c3f44Smeem 		if (ipif->ipif_flags & IPIF_UP)
1254e11c3f44Smeem 			ipif_multicast_down(ipif);
1255e11c3f44Smeem 
1256e11c3f44Smeem 	/*
1257e11c3f44Smeem 	 * Account for `ill' leaving the illgrp.
1258e11c3f44Smeem 	 */
1259e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1260e11c3f44Smeem 	if (ill->ill_isv6)
1261e11c3f44Smeem 		ill->ill_phyint->phyint_grp->gr_nv6--;
1262e11c3f44Smeem 	else
1263e11c3f44Smeem 		ill->ill_phyint->phyint_grp->gr_nv4--;
1264e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
1265e11c3f44Smeem 
1266e11c3f44Smeem 	/*
1267e11c3f44Smeem 	 * Pull `ill' out of the interface lists.
1268e11c3f44Smeem 	 */
1269e11c3f44Smeem 	if (list_link_active(&ill->ill_actnode))
1270e11c3f44Smeem 		ipmp_ill_deactivate(ill);
1271e11c3f44Smeem 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1272e11c3f44Smeem 	list_remove(&illg->ig_if, ill);
1273e11c3f44Smeem 	ill->ill_grp = NULL;
1274e11c3f44Smeem 	rw_exit(&ipst->ips_ill_g_lock);
1275e11c3f44Smeem 
1276e11c3f44Smeem 	/*
1277e11c3f44Smeem 	 * Re-establish multicast memberships that were previously being
1278e11c3f44Smeem 	 * handled by the IPMP meta-interface.
1279e11c3f44Smeem 	 */
1280e11c3f44Smeem 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1281e11c3f44Smeem 		if (ipif->ipif_flags & IPIF_UP)
1282e11c3f44Smeem 			ipif_multicast_up(ipif);
1283e11c3f44Smeem 
1284e11c3f44Smeem 	/*
1285e11c3f44Smeem 	 * Refresh the group MTU based on the new interface list.
1286e11c3f44Smeem 	 */
1287e11c3f44Smeem 	ipmp_illgrp_refresh_mtu(illg);
1288e11c3f44Smeem 
1289e11c3f44Smeem 	if (list_is_empty(&illg->ig_if)) {
1290e11c3f44Smeem 		/*
1291e11c3f44Smeem 		 * No ills left in the illgrp; we no longer have a physical
1292e11c3f44Smeem 		 * address length, nor can we support ARP, CoS, or anything
1293e11c3f44Smeem 		 * else that depends on knowing the link layer type.
1294e11c3f44Smeem 		 */
1295e11c3f44Smeem 		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
1296e11c3f44Smeem 			ipmp_illgrp_destroy_arpent(illg, entp);
1297e11c3f44Smeem 
1298e11c3f44Smeem 		ipmp_ill->ill_phys_addr_length = 0;
1299e11c3f44Smeem 		ipmp_ill->ill_nd_lla_len = 0;
1300e11c3f44Smeem 		ipmp_ill->ill_type = IFT_OTHER;
1301e11c3f44Smeem 		mutex_enter(&ipmp_ill->ill_lock);
1302e11c3f44Smeem 		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1303e11c3f44Smeem 		mutex_exit(&ipmp_ill->ill_lock);
1304e11c3f44Smeem 	} else {
1305e11c3f44Smeem 		/*
1306e11c3f44Smeem 		 * If `ill' didn't support CoS, see if it can now be enabled.
1307e11c3f44Smeem 		 */
1308e11c3f44Smeem 		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1309e11c3f44Smeem 			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
1310e11c3f44Smeem 
1311e11c3f44Smeem 			ill = list_head(&illg->ig_if);
1312e11c3f44Smeem 			do {
1313e11c3f44Smeem 				if (!(ill->ill_flags & ILLF_COS_ENABLED))
1314e11c3f44Smeem 					break;
1315e11c3f44Smeem 			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);
1316e11c3f44Smeem 
1317e11c3f44Smeem 			if (ill == NULL) {
1318e11c3f44Smeem 				mutex_enter(&ipmp_ill->ill_lock);
1319e11c3f44Smeem 				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1320e11c3f44Smeem 				mutex_exit(&ipmp_ill->ill_lock);
1321e11c3f44Smeem 			}
1322e11c3f44Smeem 		}
1323e11c3f44Smeem 	}
1324e11c3f44Smeem }
1325e11c3f44Smeem 
1326e11c3f44Smeem /*
1327e11c3f44Smeem  * Check if `ill' should be active, and activate or deactivate if need be.
1328e11c3f44Smeem  * Return B_FALSE if a refresh was necessary but could not be performed.
1329e11c3f44Smeem  */
1330e11c3f44Smeem static boolean_t
ipmp_ill_try_refresh_active(ill_t * ill)1331e11c3f44Smeem ipmp_ill_try_refresh_active(ill_t *ill)
1332e11c3f44Smeem {
1333e11c3f44Smeem 	boolean_t refreshed = B_TRUE;
1334e11c3f44Smeem 
1335e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1336e11c3f44Smeem 	ASSERT(IS_UNDER_IPMP(ill));
1337e11c3f44Smeem 
1338e11c3f44Smeem 	if (ipmp_ill_is_active(ill)) {
1339e11c3f44Smeem 		if (!list_link_active(&ill->ill_actnode))
1340e11c3f44Smeem 			refreshed = ipmp_ill_activate(ill);
1341e11c3f44Smeem 	} else {
1342e11c3f44Smeem 		if (list_link_active(&ill->ill_actnode))
1343e11c3f44Smeem 			ipmp_ill_deactivate(ill);
1344e11c3f44Smeem 	}
1345e11c3f44Smeem 
1346e11c3f44Smeem 	return (refreshed);
1347e11c3f44Smeem }
1348e11c3f44Smeem 
1349e11c3f44Smeem /*
1350e11c3f44Smeem  * Check if `ill' should be active, and activate or deactivate if need be.
1351e11c3f44Smeem  * If the refresh fails, schedule a timer to try again later.
1352e11c3f44Smeem  */
1353e11c3f44Smeem void
ipmp_ill_refresh_active(ill_t * ill)1354e11c3f44Smeem ipmp_ill_refresh_active(ill_t *ill)
1355e11c3f44Smeem {
1356e11c3f44Smeem 	if (!ipmp_ill_try_refresh_active(ill))
1357e11c3f44Smeem 		ipmp_ill_refresh_active_timer_start(ill);
1358e11c3f44Smeem }
1359e11c3f44Smeem 
1360e11c3f44Smeem /*
1361e11c3f44Smeem  * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
1362e11c3f44Smeem  */
1363e11c3f44Smeem static void
ipmp_ill_refresh_active_timer(void * ill_arg)1364e11c3f44Smeem ipmp_ill_refresh_active_timer(void *ill_arg)
1365e11c3f44Smeem {
1366e11c3f44Smeem 	ill_t *ill = ill_arg;
1367e11c3f44Smeem 	boolean_t refreshed = B_FALSE;
1368e11c3f44Smeem 
1369e11c3f44Smeem 	/*
1370e11c3f44Smeem 	 * Clear ill_refresh_tid to indicate that no timeout is pending
1371e11c3f44Smeem 	 * (another thread could schedule a new timeout while we're still
1372e11c3f44Smeem 	 * running, but that's harmless).  If the ill is going away, bail.
1373e11c3f44Smeem 	 */
1374e11c3f44Smeem 	mutex_enter(&ill->ill_lock);
1375e11c3f44Smeem 	ill->ill_refresh_tid = 0;
1376e11c3f44Smeem 	if (ill->ill_state_flags & ILL_CONDEMNED) {
1377e11c3f44Smeem 		mutex_exit(&ill->ill_lock);
1378e11c3f44Smeem 		return;
1379e11c3f44Smeem 	}
1380e11c3f44Smeem 	mutex_exit(&ill->ill_lock);
1381e11c3f44Smeem 
1382e11c3f44Smeem 	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
1383e11c3f44Smeem 		refreshed = ipmp_ill_try_refresh_active(ill);
1384e11c3f44Smeem 		ipsq_exit(ill->ill_phyint->phyint_ipsq);
1385e11c3f44Smeem 	}
1386e11c3f44Smeem 
1387e11c3f44Smeem 	/*
1388e11c3f44Smeem 	 * If the refresh failed, schedule another attempt.
1389e11c3f44Smeem 	 */
1390e11c3f44Smeem 	if (!refreshed)
1391e11c3f44Smeem 		ipmp_ill_refresh_active_timer_start(ill);
1392e11c3f44Smeem }
1393e11c3f44Smeem 
1394e11c3f44Smeem /*
1395e11c3f44Smeem  * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'.
1396e11c3f44Smeem  */
1397e11c3f44Smeem static void
ipmp_ill_refresh_active_timer_start(ill_t * ill)1398e11c3f44Smeem ipmp_ill_refresh_active_timer_start(ill_t *ill)
1399e11c3f44Smeem {
1400e11c3f44Smeem 	mutex_enter(&ill->ill_lock);
1401e11c3f44Smeem 
1402e11c3f44Smeem 	/*
1403e11c3f44Smeem 	 * If the ill is going away or a refresh is already scheduled, bail.
1404e11c3f44Smeem 	 */
1405e11c3f44Smeem 	if (ill->ill_refresh_tid != 0 ||
1406e11c3f44Smeem 	    (ill->ill_state_flags & ILL_CONDEMNED)) {
1407e11c3f44Smeem 		mutex_exit(&ill->ill_lock);
1408e11c3f44Smeem 		return;
1409e11c3f44Smeem 	}
1410e11c3f44Smeem 
1411e11c3f44Smeem 	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
1412e11c3f44Smeem 	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
1413e11c3f44Smeem 
1414e11c3f44Smeem 	mutex_exit(&ill->ill_lock);
1415e11c3f44Smeem }
1416e11c3f44Smeem 
1417e11c3f44Smeem /*
1418e11c3f44Smeem  * Activate `ill' so it will be used to send and receive data traffic.  Return
1419e11c3f44Smeem  * B_FALSE if `ill' cannot be activated.  Note that we allocate any messages
1420e11c3f44Smeem  * needed to deactivate `ill' here as well so that deactivation cannot fail.
1421e11c3f44Smeem  */
1422e11c3f44Smeem static boolean_t
ipmp_ill_activate(ill_t * ill)1423e11c3f44Smeem ipmp_ill_activate(ill_t *ill)
1424e11c3f44Smeem {
1425e11c3f44Smeem 	ipif_t		*ipif;
1426e11c3f44Smeem 	mblk_t		*linkupmp = NULL, *linkdownmp = NULL;
1427e11c3f44Smeem 	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
1428e11c3f44Smeem 	ipmp_illgrp_t	*illg = ill->ill_grp;
1429e11c3f44Smeem 	ill_t		*maxill;
1430e11c3f44Smeem 	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
1431e11c3f44Smeem 
1432e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1433e11c3f44Smeem 	ASSERT(IS_UNDER_IPMP(ill));
1434e11c3f44Smeem 
1435e11c3f44Smeem 	/*
1436e11c3f44Smeem 	 * If this will be the first active interface in the group, allocate
1437e11c3f44Smeem 	 * the link-up and link-down messages.
1438e11c3f44Smeem 	 */
1439e11c3f44Smeem 	if (grp->gr_nactif == 0) {
1440e11c3f44Smeem 		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
1441e11c3f44Smeem 		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
1442e11c3f44Smeem 		if (linkupmp == NULL || linkdownmp == NULL)
1443e11c3f44Smeem 			goto fail;
1444e11c3f44Smeem 	}
1445e11c3f44Smeem 
1446e11c3f44Smeem 	if (list_is_empty(&illg->ig_actif)) {
1447e11c3f44Smeem 		/*
1448e11c3f44Smeem 		 * Now that we have an active ill, nominate it for multicast
1449e11c3f44Smeem 		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
1450e11c3f44Smeem 		 * since that may need to send multicast packets (e.g., IPv6
1451e11c3f44Smeem 		 * neighbor discovery probes).
1452e11c3f44Smeem 		 */
1453e11c3f44Smeem 		ipmp_illgrp_set_cast(illg, ill);
1454e11c3f44Smeem 
1455e11c3f44Smeem 		/*
1456e11c3f44Smeem 		 * This is the first active ill in the illgrp -- add 'em all.
1457e11c3f44Smeem 		 * We can access/walk ig_ipmp_ill's ipif list since we're
1458e11c3f44Smeem 		 * writer on its IPSQ as well.
1459e11c3f44Smeem 		 */
1460e11c3f44Smeem 		ipif = illg->ig_ipmp_ill->ill_ipif;
1461e11c3f44Smeem 		for (; ipif != NULL; ipif = ipif->ipif_next)
1462e11c3f44Smeem 			if (ipmp_ipif_is_up_dataaddr(ipif))
1463e11c3f44Smeem 				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
1464e11c3f44Smeem 	} else {
1465e11c3f44Smeem 		/*
1466e11c3f44Smeem 		 * Redistribute the addresses by moving them from the ill with
1467e11c3f44Smeem 		 * the most addresses until the ill being activated is at the
1468e11c3f44Smeem 		 * same level as the rest of the ills.
1469e11c3f44Smeem 		 */
1470e11c3f44Smeem 		for (;;) {
1471e11c3f44Smeem 			maxill = ipmp_illgrp_max_ill(illg);
1472e11c3f44Smeem 			ASSERT(maxill != NULL);
1473e11c3f44Smeem 			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
1474e11c3f44Smeem 				break;
1475e11c3f44Smeem 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
1476e11c3f44Smeem 			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
1477e11c3f44Smeem 		}
1478e11c3f44Smeem 	}
1479e11c3f44Smeem 
1480e11c3f44Smeem 	/*
1481e11c3f44Smeem 	 * Put the interface in the active list.
1482e11c3f44Smeem 	 */
1483e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1484e11c3f44Smeem 	list_insert_tail(&illg->ig_actif, ill);
1485e11c3f44Smeem 	illg->ig_nactif++;
1486e11c3f44Smeem 	illg->ig_next_ill = ill;
1487e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
1488e11c3f44Smeem 
1489e11c3f44Smeem 	/*
1490bd670b35SErik Nordmark 	 * Refresh static/proxy ARP entries to use `ill', if need be.
1491e11c3f44Smeem 	 */
1492e11c3f44Smeem 	if (!ill->ill_isv6)
1493e11c3f44Smeem 		ipmp_illgrp_refresh_arpent(illg);
1494e11c3f44Smeem 
1495e11c3f44Smeem 	/*
1496e11c3f44Smeem 	 * Finally, mark the group link up, if necessary.
1497e11c3f44Smeem 	 */
1498e11c3f44Smeem 	if (grp->gr_nactif++ == 0) {
1499e11c3f44Smeem 		ASSERT(grp->gr_linkdownmp == NULL);
1500e11c3f44Smeem 		grp->gr_linkdownmp = linkdownmp;
1501e11c3f44Smeem 		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
1502e11c3f44Smeem 	}
1503e11c3f44Smeem 	return (B_TRUE);
1504e11c3f44Smeem fail:
1505e11c3f44Smeem 	freemsg(linkupmp);
1506e11c3f44Smeem 	freemsg(linkdownmp);
1507e11c3f44Smeem 	return (B_FALSE);
1508e11c3f44Smeem }
1509e11c3f44Smeem 
1510e11c3f44Smeem /*
1511e11c3f44Smeem  * Deactivate `ill' so it will not be used to send or receive data traffic.
1512e11c3f44Smeem  */
1513e11c3f44Smeem static void
ipmp_ill_deactivate(ill_t * ill)1514e11c3f44Smeem ipmp_ill_deactivate(ill_t *ill)
1515e11c3f44Smeem {
15161f19738eSmeem 	ill_t		*minill, *ipmp_ill;
1517e11c3f44Smeem 	ipif_t		*ipif, *ubnextipif, *ubheadipif = NULL;
1518e11c3f44Smeem 	mblk_t		*mp;
1519e11c3f44Smeem 	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
1520e11c3f44Smeem 	ipmp_illgrp_t	*illg = ill->ill_grp;
1521e11c3f44Smeem 	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
1522e11c3f44Smeem 
1523e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1524e11c3f44Smeem 	ASSERT(IS_UNDER_IPMP(ill));
1525e11c3f44Smeem 
15261f19738eSmeem 	ipmp_ill = illg->ig_ipmp_ill;
15271f19738eSmeem 
1528e11c3f44Smeem 	/*
1529e11c3f44Smeem 	 * Pull the interface out of the active list.
1530e11c3f44Smeem 	 */
1531e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1532e11c3f44Smeem 	list_remove(&illg->ig_actif, ill);
1533e11c3f44Smeem 	illg->ig_nactif--;
1534e11c3f44Smeem 	illg->ig_next_ill = list_head(&illg->ig_actif);
1535e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
1536e11c3f44Smeem 
1537e11c3f44Smeem 	/*
1538e11c3f44Smeem 	 * If the ill that's being deactivated had been nominated for
1539e11c3f44Smeem 	 * multicast/broadcast, nominate a new one.
1540e11c3f44Smeem 	 */
1541e11c3f44Smeem 	if (ill == illg->ig_cast_ill)
1542e11c3f44Smeem 		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
1543e11c3f44Smeem 
1544bd670b35SErik Nordmark 	/*
1545bd670b35SErik Nordmark 	 * Delete all nce_t entries using this ill, so that the next attempt
1546bd670b35SErik Nordmark 	 * to send data traffic will revalidate cached nce's.
1547bd670b35SErik Nordmark 	 */
1548bd670b35SErik Nordmark 	nce_flush(ill, B_TRUE);
1549bd670b35SErik Nordmark 
1550e11c3f44Smeem 	/*
1551e11c3f44Smeem 	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
1552e11c3f44Smeem 	 * we'll rebind them after we tell the resolver the ill is no longer
1553e11c3f44Smeem 	 * active.  We must do things in this order or the resolver could
1554e11c3f44Smeem 	 * accidentally rebind to the ill we're trying to remove if multiple
1555e11c3f44Smeem 	 * ills in the group have the same hardware address (which is
1556e11c3f44Smeem 	 * unsupported, but shouldn't lead to a wedged machine).
1557e11c3f44Smeem 	 */
1558e11c3f44Smeem 	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
1559e11c3f44Smeem 		ipif->ipif_bound_next = ubheadipif;
1560e11c3f44Smeem 		ubheadipif = ipif;
1561e11c3f44Smeem 	}
1562e11c3f44Smeem 
15631f19738eSmeem 	if (!ill->ill_isv6) {
1564e11c3f44Smeem 		/*
1565bd670b35SErik Nordmark 		 * Refresh static/proxy ARP entries that had been using `ill'.
1566e11c3f44Smeem 		 */
1567e11c3f44Smeem 		ipmp_illgrp_refresh_arpent(illg);
1568e11c3f44Smeem 	}
1569e11c3f44Smeem 
1570e11c3f44Smeem 	/*
1571e11c3f44Smeem 	 * Rebind each ipif from the deactivated ill to the active ill with
1572e11c3f44Smeem 	 * the fewest ipifs.  If there are no active ills, the ipifs will
1573e11c3f44Smeem 	 * remain unbound.
1574e11c3f44Smeem 	 */
1575e11c3f44Smeem 	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
1576e11c3f44Smeem 		ubnextipif = ipif->ipif_bound_next;
1577e11c3f44Smeem 		ipif->ipif_bound_next = NULL;
1578e11c3f44Smeem 
1579e11c3f44Smeem 		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
1580e11c3f44Smeem 			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
1581e11c3f44Smeem 	}
1582e11c3f44Smeem 
1583bd670b35SErik Nordmark 	/*
15841f19738eSmeem 	 * Remove any IRE_IF_CLONEs for this ill since they might have an
15851f19738eSmeem 	 * ire_nce_cache/nce_common which refers to another ill in the group.
1586bd670b35SErik Nordmark 	 */
15871f19738eSmeem 	ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone, ill,
15881f19738eSmeem 	    ill);
1589bd670b35SErik Nordmark 
1590e11c3f44Smeem 	/*
15911f19738eSmeem 	 * Finally, if there are no longer any active interfaces, then delete
15921f19738eSmeem 	 * any NCECs associated with the group and mark the group link down.
1593e11c3f44Smeem 	 */
1594e11c3f44Smeem 	if (--grp->gr_nactif == 0) {
15958a06b3d6SToomas Soome 		ncec_walk(ipmp_ill, ncec_delete_per_ill, ipmp_ill, ipst);
1596e11c3f44Smeem 		mp = grp->gr_linkdownmp;
1597e11c3f44Smeem 		grp->gr_linkdownmp = NULL;
1598e11c3f44Smeem 		ASSERT(mp != NULL);
15991f19738eSmeem 		put(ipmp_ill->ill_rq, mp);
1600e11c3f44Smeem 	}
1601e11c3f44Smeem }
1602e11c3f44Smeem 
1603e11c3f44Smeem /*
1604e11c3f44Smeem  * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
1605e11c3f44Smeem  * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
1606e11c3f44Smeem  */
1607e11c3f44Smeem static void
ipmp_ill_rtsaddrmsg(ill_t * ill,int cmd)1608e11c3f44Smeem ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
1609e11c3f44Smeem {
1610e11c3f44Smeem 	ipif_t *ipif;
1611e11c3f44Smeem 
1612e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1613e11c3f44Smeem 	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
1614e11c3f44Smeem 
1615e11c3f44Smeem 	/*
1616e11c3f44Smeem 	 * If `ill' is truly down, there are no messages to generate since:
1617e11c3f44Smeem 	 *
1618e11c3f44Smeem 	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
1619e11c3f44Smeem 	 *    and its addresses by bringing them down.  But that's already
1620e11c3f44Smeem 	 *    true, so there's nothing to hide.
1621e11c3f44Smeem 	 *
1622e11c3f44Smeem 	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
1623e11c3f44Smeem 	 *    indicating that any previously-hidden up addresses are again
1624e11c3f44Smeem 	 *    back up (along with the interface).  But they aren't, so
1625e11c3f44Smeem 	 *    there's nothing to expose.
1626e11c3f44Smeem 	 */
1627e11c3f44Smeem 	if (ill->ill_ipif_up_count == 0)
1628e11c3f44Smeem 		return;
1629e11c3f44Smeem 
1630e11c3f44Smeem 	if (cmd == RTM_ADD)
1631e11c3f44Smeem 		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
1632e11c3f44Smeem 
1633e11c3f44Smeem 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1634e11c3f44Smeem 		if (ipif->ipif_flags & IPIF_UP)
1635e11c3f44Smeem 			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
1636e11c3f44Smeem 
1637e11c3f44Smeem 	if (cmd == RTM_DELETE)
1638e11c3f44Smeem 		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
1639e11c3f44Smeem }
1640e11c3f44Smeem 
1641e11c3f44Smeem /*
1642e11c3f44Smeem  * Bind the address named by `ipif' to the underlying ill named by `ill'.
1643e11c3f44Smeem  * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
1644e11c3f44Smeem  * will indicate to the resolver whether this is an initial bringup of
1645e11c3f44Smeem  * `ipif', or just a rebind to another ill.
1646e11c3f44Smeem  */
1647e11c3f44Smeem static void
ipmp_ill_bind_ipif(ill_t * ill,ipif_t * ipif,enum ip_resolver_action act)1648e11c3f44Smeem ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
1649e11c3f44Smeem {
1650e11c3f44Smeem 	int err = 0;
1651e11c3f44Smeem 	ip_stack_t *ipst = ill->ill_ipst;
1652e11c3f44Smeem 
1653e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
1654e11c3f44Smeem 	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
1655e11c3f44Smeem 	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
1656e11c3f44Smeem 	ASSERT(ipif->ipif_bound_ill == NULL);
1657e11c3f44Smeem 	ASSERT(ipif->ipif_bound_next == NULL);
1658e11c3f44Smeem 
1659e11c3f44Smeem 	ipif->ipif_bound_next = ill->ill_bound_ipif;
1660e11c3f44Smeem 	ill->ill_bound_ipif = ipif;
1661e11c3f44Smeem 	ill->ill_bound_cnt++;
1662e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1663e11c3f44Smeem 	ipif->ipif_bound_ill = ill;
1664e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
1665e11c3f44Smeem 
1666e11c3f44Smeem 	/*
1667e11c3f44Smeem 	 * If necessary, tell ARP/NDP about the new mapping.  Note that
1668bd670b35SErik Nordmark 	 * ipif_resolver_up() cannot fail for IPv6 ills.
1669e11c3f44Smeem 	 */
1670e11c3f44Smeem 	if (act != Res_act_none) {
1671e11c3f44Smeem 		if (ill->ill_isv6) {
1672e11c3f44Smeem 			VERIFY(ipif_resolver_up(ipif, act) == 0);
1673e11c3f44Smeem 			err = ipif_ndp_up(ipif, act == Res_act_initial);
1674e11c3f44Smeem 		} else {
1675e11c3f44Smeem 			err = ipif_resolver_up(ipif, act);
1676e11c3f44Smeem 		}
1677e11c3f44Smeem 
1678e11c3f44Smeem 		/*
1679e11c3f44Smeem 		 * Since ipif_ndp_up() never returns EINPROGRESS and
1680e11c3f44Smeem 		 * ipif_resolver_up() only returns EINPROGRESS when the
1681e11c3f44Smeem 		 * associated ill is not up, we should never be here with
1682e11c3f44Smeem 		 * EINPROGRESS.  We rely on this to simplify the design.
1683e11c3f44Smeem 		 */
1684e11c3f44Smeem 		ASSERT(err != EINPROGRESS);
1685e11c3f44Smeem 	}
1686e11c3f44Smeem 	/* TODO: retry binding on failure? when? */
1687e11c3f44Smeem 	ipif->ipif_bound = (err == 0);
1688e11c3f44Smeem }
1689e11c3f44Smeem 
1690e11c3f44Smeem /*
1691e11c3f44Smeem  * Unbind the address named by `ipif' from the underlying ill named by `ill'.
1692e11c3f44Smeem  * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
1693e11c3f44Smeem  * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
1694e11c3f44Smeem  * B_TRUE, notify the resolver about the change.
1695e11c3f44Smeem  */
1696e11c3f44Smeem static ipif_t *
ipmp_ill_unbind_ipif(ill_t * ill,ipif_t * ipif,boolean_t notifyres)1697e11c3f44Smeem ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
1698e11c3f44Smeem {
1699e11c3f44Smeem 	ipif_t *previpif;
1700e11c3f44Smeem 	ip_stack_t *ipst = ill->ill_ipst;
1701e11c3f44Smeem 
1702e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1703e11c3f44Smeem 	ASSERT(IS_UNDER_IPMP(ill));
1704e11c3f44Smeem 
1705e11c3f44Smeem 	/*
1706e11c3f44Smeem 	 * If necessary, find an ipif to unbind.
1707e11c3f44Smeem 	 */
1708e11c3f44Smeem 	if (ipif == NULL) {
1709e11c3f44Smeem 		if ((ipif = ill->ill_bound_ipif) == NULL) {
1710e11c3f44Smeem 			ASSERT(ill->ill_bound_cnt == 0);
1711e11c3f44Smeem 			return (NULL);
1712e11c3f44Smeem 		}
1713e11c3f44Smeem 	}
1714e11c3f44Smeem 
1715e11c3f44Smeem 	ASSERT(IAM_WRITER_IPIF(ipif));
1716e11c3f44Smeem 	ASSERT(IS_IPMP(ipif->ipif_ill));
1717e11c3f44Smeem 	ASSERT(ipif->ipif_bound_ill == ill);
1718e11c3f44Smeem 	ASSERT(ill->ill_bound_cnt > 0);
1719e11c3f44Smeem 
1720e11c3f44Smeem 	/*
1721e11c3f44Smeem 	 * Unbind it.
1722e11c3f44Smeem 	 */
1723e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1724e11c3f44Smeem 	ipif->ipif_bound_ill = NULL;
1725e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
1726e11c3f44Smeem 	ill->ill_bound_cnt--;
1727e11c3f44Smeem 
1728e11c3f44Smeem 	if (ill->ill_bound_ipif == ipif) {
1729e11c3f44Smeem 		ill->ill_bound_ipif = ipif->ipif_bound_next;
1730e11c3f44Smeem 	} else {
1731e11c3f44Smeem 		previpif = ill->ill_bound_ipif;
1732e11c3f44Smeem 		while (previpif->ipif_bound_next != ipif)
1733e11c3f44Smeem 			previpif = previpif->ipif_bound_next;
1734e11c3f44Smeem 
1735e11c3f44Smeem 		previpif->ipif_bound_next = ipif->ipif_bound_next;
1736e11c3f44Smeem 	}
1737e11c3f44Smeem 	ipif->ipif_bound_next = NULL;
1738e11c3f44Smeem 
1739e11c3f44Smeem 	/*
1740e11c3f44Smeem 	 * If requested, notify the resolvers (provided we're bound).
1741e11c3f44Smeem 	 */
1742e11c3f44Smeem 	if (notifyres && ipif->ipif_bound) {
1743bd670b35SErik Nordmark 		if (ill->ill_isv6)
1744e11c3f44Smeem 			ipif_ndp_down(ipif);
1745bd670b35SErik Nordmark 		else
1746bd670b35SErik Nordmark 			(void) ipif_arp_down(ipif);
1747e11c3f44Smeem 	}
1748e11c3f44Smeem 	ipif->ipif_bound = B_FALSE;
1749e11c3f44Smeem 
1750e11c3f44Smeem 	return (ipif);
1751e11c3f44Smeem }
1752e11c3f44Smeem 
1753e11c3f44Smeem /*
1754e11c3f44Smeem  * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
1755e11c3f44Smeem  * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
1756e11c3f44Smeem  * to determine whether an ill should be considered active, other consumers
1757e11c3f44Smeem  * may race and learn about an ill that should be deactivated/activated before
1758e11c3f44Smeem  * IPMP has performed the activation/deactivation.  This should be safe though
1759e11c3f44Smeem  * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
1760e11c3f44Smeem  * would've been cleaned up by ipmp_ill_deactivate().
1761e11c3f44Smeem  */
1762e11c3f44Smeem boolean_t
ipmp_ill_is_active(ill_t * ill)1763e11c3f44Smeem ipmp_ill_is_active(ill_t *ill)
1764e11c3f44Smeem {
1765e11c3f44Smeem 	phyint_t *phyi = ill->ill_phyint;
1766e11c3f44Smeem 
1767e11c3f44Smeem 	ASSERT(IS_UNDER_IPMP(ill));
1768e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill) ||
1769e11c3f44Smeem 	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
1770e11c3f44Smeem 
1771e11c3f44Smeem 	/*
1772e11c3f44Smeem 	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
1773e11c3f44Smeem 	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.  This allows the
1774e11c3f44Smeem 	 * link flapping logic to be just in in.mpathd and allows us to ignore
1775e11c3f44Smeem 	 * changes to PHYI_RUNNING.
1776e11c3f44Smeem 	 */
1777e11c3f44Smeem 	return (!(ill->ill_ipif_up_count == 0 ||
1778e11c3f44Smeem 	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
1779e11c3f44Smeem }
1780e11c3f44Smeem 
1781e11c3f44Smeem /*
1782bd670b35SErik Nordmark  * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
1783bd670b35SErik Nordmark  * with `ill_arg'.
1784e11c3f44Smeem  */
1785e11c3f44Smeem static void
ipmp_ill_ire_mark_testhidden(ire_t * ire,char * ill_arg)1786e11c3f44Smeem ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
1787e11c3f44Smeem {
1788e11c3f44Smeem 	ill_t *ill = (ill_t *)ill_arg;
1789e11c3f44Smeem 
1790e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1791e11c3f44Smeem 	ASSERT(!IS_IPMP(ill));
1792e11c3f44Smeem 
1793bd670b35SErik Nordmark 	if (ire->ire_ill != ill)
1794e11c3f44Smeem 		return;
1795e11c3f44Smeem 
1796bd670b35SErik Nordmark 	if (IRE_HIDDEN_TYPE(ire->ire_type)) {
1797e11c3f44Smeem 		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
1798bd670b35SErik Nordmark 		ire->ire_testhidden = B_TRUE;
1799e11c3f44Smeem 	}
1800e11c3f44Smeem }
1801e11c3f44Smeem 
1802e11c3f44Smeem /*
1803bd670b35SErik Nordmark  * IRE walker callback: clear ire_testhidden if the IRE has a source address
1804bd670b35SErik Nordmark  * on `ill_arg'.
1805e11c3f44Smeem  */
1806e11c3f44Smeem static void
ipmp_ill_ire_clear_testhidden(ire_t * ire,char * ill_arg)1807e11c3f44Smeem ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
1808e11c3f44Smeem {
1809e11c3f44Smeem 	ill_t *ill = (ill_t *)ill_arg;
1810e11c3f44Smeem 
1811e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ill));
1812e11c3f44Smeem 	ASSERT(!IS_IPMP(ill));
1813e11c3f44Smeem 
1814bd670b35SErik Nordmark 	if (ire->ire_ill == ill) {
1815e11c3f44Smeem 		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
1816bd670b35SErik Nordmark 		ire->ire_testhidden = B_FALSE;
1817e11c3f44Smeem 	}
1818e11c3f44Smeem }
1819e11c3f44Smeem 
1820e11c3f44Smeem /*
1821e11c3f44Smeem  * Return a held pointer to the IPMP ill for underlying interface `ill', or
1822e11c3f44Smeem  * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
1823e11c3f44Smeem  * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
18240bd79941Smeem  * ill_grp pointer may become stale when not inside an IPSQ and not holding
1825e11c3f44Smeem  * ipmp_lock.)  Caller need not be inside the IPSQ.
1826e11c3f44Smeem  */
1827e11c3f44Smeem ill_t *
ipmp_ill_hold_ipmp_ill(ill_t * ill)1828e11c3f44Smeem ipmp_ill_hold_ipmp_ill(ill_t *ill)
1829e11c3f44Smeem {
1830e11c3f44Smeem 	ip_stack_t *ipst = ill->ill_ipst;
1831e11c3f44Smeem 	ipmp_illgrp_t *illg;
1832e11c3f44Smeem 
1833e11c3f44Smeem 	ASSERT(!IS_IPMP(ill));
1834e11c3f44Smeem 
1835e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1836e11c3f44Smeem 	illg = ill->ill_grp;
1837bd670b35SErik Nordmark 	if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
1838e11c3f44Smeem 		rw_exit(&ipst->ips_ipmp_lock);
1839e11c3f44Smeem 		return (illg->ig_ipmp_ill);
1840e11c3f44Smeem 	}
1841e11c3f44Smeem 	/*
1842e11c3f44Smeem 	 * Assume `ill' was removed from the illgrp in the meantime.
1843e11c3f44Smeem 	 */
1844e11c3f44Smeem 	rw_exit(&ill->ill_ipst->ips_ipmp_lock);
1845e11c3f44Smeem 	return (NULL);
1846e11c3f44Smeem }
1847e11c3f44Smeem 
18481f19738eSmeem /*
18491f19738eSmeem  * Return a held pointer to the appropriate underlying ill for sending the
18501f19738eSmeem  * specified type of packet.  (Unfortunately, this function needs to take an
18511f19738eSmeem  * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
18521f19738eSmeem  * ill_grp pointer may become stale when not inside an IPSQ and not holding
18531f19738eSmeem  * ipmp_lock.)  Caller need not be inside the IPSQ.
18541f19738eSmeem  */
18551f19738eSmeem ill_t *
ipmp_ill_hold_xmit_ill(ill_t * ill,boolean_t is_unicast)18561f19738eSmeem ipmp_ill_hold_xmit_ill(ill_t *ill, boolean_t is_unicast)
18571f19738eSmeem {
18581f19738eSmeem 	ill_t *xmit_ill;
18591f19738eSmeem 	ip_stack_t *ipst = ill->ill_ipst;
18601f19738eSmeem 
18611f19738eSmeem 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
18621f19738eSmeem 	if (ill->ill_grp == NULL) {
18631f19738eSmeem 		/*
18641f19738eSmeem 		 * The ill was taken out of the group, so just send on it.
18651f19738eSmeem 		 */
18661f19738eSmeem 		rw_exit(&ipst->ips_ill_g_lock);
18671f19738eSmeem 		ill_refhold(ill);
18681f19738eSmeem 		return (ill);
18691f19738eSmeem 	}
18701f19738eSmeem 	if (is_unicast)
18711f19738eSmeem 		xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
18721f19738eSmeem 	else
18731f19738eSmeem 		xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
18741f19738eSmeem 	rw_exit(&ipst->ips_ill_g_lock);
18751f19738eSmeem 
18761f19738eSmeem 	return (xmit_ill);
18771f19738eSmeem }
18781f19738eSmeem 
1879e11c3f44Smeem /*
1880e11c3f44Smeem  * Return the interface index for the IPMP ill tied to underlying interface
1881e11c3f44Smeem  * `ill', or zero if one doesn't exist.  Caller need not be inside the IPSQ.
1882e11c3f44Smeem  */
1883e11c3f44Smeem uint_t
ipmp_ill_get_ipmp_ifindex(const ill_t * ill)1884e11c3f44Smeem ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
1885e11c3f44Smeem {
1886e11c3f44Smeem 	uint_t ifindex = 0;
1887e11c3f44Smeem 	ip_stack_t *ipst = ill->ill_ipst;
1888e11c3f44Smeem 	ipmp_grp_t *grp;
1889e11c3f44Smeem 
1890e11c3f44Smeem 	ASSERT(!IS_IPMP(ill));
1891e11c3f44Smeem 
1892e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1893e11c3f44Smeem 	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
1894e11c3f44Smeem 		ifindex = grp->gr_phyint->phyint_ifindex;
1895e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
1896e11c3f44Smeem 	return (ifindex);
1897e11c3f44Smeem }
1898e11c3f44Smeem 
1899e11c3f44Smeem /*
1900e11c3f44Smeem  * Place phyint `phyi' into IPMP group `grp'.
1901e11c3f44Smeem  */
1902e11c3f44Smeem void
ipmp_phyint_join_grp(phyint_t * phyi,ipmp_grp_t * grp)1903e11c3f44Smeem ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
1904e11c3f44Smeem {
1905e11c3f44Smeem 	ill_t *ill;
1906e11c3f44Smeem 	ipsq_t *ipsq = phyi->phyint_ipsq;
1907e11c3f44Smeem 	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
1908e11c3f44Smeem 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1909e11c3f44Smeem 
1910e11c3f44Smeem 	ASSERT(IAM_WRITER_IPSQ(ipsq));
1911e11c3f44Smeem 	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
1912*ab82c29bSToomas Soome 	ill = NULL;
1913e11c3f44Smeem 
1914e11c3f44Smeem 	/*
1915e11c3f44Smeem 	 * Send routing socket messages indicating that the phyint's ills
1916e11c3f44Smeem 	 * and ipifs vanished.
1917e11c3f44Smeem 	 */
1918e11c3f44Smeem 	if (phyi->phyint_illv4 != NULL) {
1919e11c3f44Smeem 		ill = phyi->phyint_illv4;
1920e11c3f44Smeem 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1921e11c3f44Smeem 	}
1922e11c3f44Smeem 
1923e11c3f44Smeem 	if (phyi->phyint_illv6 != NULL) {
1924e11c3f44Smeem 		ill = phyi->phyint_illv6;
1925e11c3f44Smeem 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1926e11c3f44Smeem 	}
1927e11c3f44Smeem 
1928e11c3f44Smeem 	/*
1929e11c3f44Smeem 	 * Snapshot the phyint's initial kstats as a baseline.
1930e11c3f44Smeem 	 */
1931e11c3f44Smeem 	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
1932e11c3f44Smeem 
1933e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1934e11c3f44Smeem 
1935e11c3f44Smeem 	phyi->phyint_grp = grp;
1936e11c3f44Smeem 	if (++grp->gr_nif == 1)
1937e11c3f44Smeem 		grp->gr_mactype = ill->ill_mactype;
1938e11c3f44Smeem 	else
1939e11c3f44Smeem 		ASSERT(grp->gr_mactype == ill->ill_mactype);
1940e11c3f44Smeem 
1941e11c3f44Smeem 	/*
1942e11c3f44Smeem 	 * Now that we're in the group, request a switch to the group's xop
1943e11c3f44Smeem 	 * when we ipsq_exit().  All future operations will be exclusive on
1944e11c3f44Smeem 	 * the group xop until ipmp_phyint_leave_grp() is called.
1945e11c3f44Smeem 	 */
1946e11c3f44Smeem 	ASSERT(ipsq->ipsq_swxop == NULL);
1947e11c3f44Smeem 	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
1948e11c3f44Smeem 	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
1949e11c3f44Smeem 
1950e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
1951e11c3f44Smeem }
1952e11c3f44Smeem 
1953e11c3f44Smeem /*
1954e11c3f44Smeem  * Remove phyint `phyi' from its current IPMP group.
1955e11c3f44Smeem  */
1956e11c3f44Smeem void
ipmp_phyint_leave_grp(phyint_t * phyi)1957e11c3f44Smeem ipmp_phyint_leave_grp(phyint_t *phyi)
1958e11c3f44Smeem {
1959e11c3f44Smeem 	uint_t i;
1960e11c3f44Smeem 	ipsq_t *ipsq = phyi->phyint_ipsq;
1961e11c3f44Smeem 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1962e11c3f44Smeem 	uint64_t phyi_kstats[IPMP_KSTAT_MAX];
1963e11c3f44Smeem 
1964e11c3f44Smeem 	ASSERT(IAM_WRITER_IPSQ(ipsq));
1965e11c3f44Smeem 
1966e11c3f44Smeem 	/*
1967e11c3f44Smeem 	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
1968e11c3f44Smeem 	 */
1969e11c3f44Smeem 	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
1970e11c3f44Smeem 		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
1971e11c3f44Smeem 	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
1972e11c3f44Smeem 		ipmp_ill_leave_illgrp(phyi->phyint_illv6);
1973e11c3f44Smeem 
1974e11c3f44Smeem 	/*
1975e11c3f44Smeem 	 * Send routing socket messages indicating that the phyint's ills
1976e11c3f44Smeem 	 * and ipifs have reappeared.
1977e11c3f44Smeem 	 */
1978e11c3f44Smeem 	if (phyi->phyint_illv4 != NULL)
1979e11c3f44Smeem 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
1980e11c3f44Smeem 	if (phyi->phyint_illv6 != NULL)
1981e11c3f44Smeem 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
1982e11c3f44Smeem 
1983e11c3f44Smeem 	/*
1984e11c3f44Smeem 	 * Calculate the phyint's cumulative kstats while it was in the group,
1985e11c3f44Smeem 	 * and add that to the group's baseline.
1986e11c3f44Smeem 	 */
1987e11c3f44Smeem 	ipmp_phyint_get_kstats(phyi, phyi_kstats);
1988e11c3f44Smeem 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
1989e11c3f44Smeem 		phyi_kstats[i] -= phyi->phyint_kstats0[i];
1990e11c3f44Smeem 		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
1991e11c3f44Smeem 	}
1992e11c3f44Smeem 
1993e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1994e11c3f44Smeem 
1995e11c3f44Smeem 	phyi->phyint_grp->gr_nif--;
1996e11c3f44Smeem 	phyi->phyint_grp = NULL;
1997e11c3f44Smeem 
1998e11c3f44Smeem 	/*
1999e11c3f44Smeem 	 * As our final act in leaving the group, request a switch back to our
2000e11c3f44Smeem 	 * IPSQ's own xop when we ipsq_exit().
2001e11c3f44Smeem 	 */
2002e11c3f44Smeem 	ASSERT(ipsq->ipsq_swxop == NULL);
2003e11c3f44Smeem 	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
2004e11c3f44Smeem 
2005e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
2006e11c3f44Smeem }
2007e11c3f44Smeem 
2008e11c3f44Smeem /*
2009e11c3f44Smeem  * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
2010e11c3f44Smeem  * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
2011e11c3f44Smeem  */
2012e11c3f44Smeem static void
ipmp_phyint_get_kstats(phyint_t * phyi,uint64_t kstats[])2013e11c3f44Smeem ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
2014e11c3f44Smeem {
2015e11c3f44Smeem 	uint_t		i, j;
2016e11c3f44Smeem 	const char	*name;
2017e11c3f44Smeem 	kstat_t		*ksp;
2018e11c3f44Smeem 	kstat_named_t	*kn;
20192b24ab6bSSebastien Roy 	ip_stack_t	*ipst = PHYINT_TO_IPST(phyi);
20202b24ab6bSSebastien Roy 	zoneid_t	zoneid;
2021e11c3f44Smeem 
2022e11c3f44Smeem 	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
20232b24ab6bSSebastien Roy 	zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
20242b24ab6bSSebastien Roy 	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
2025e11c3f44Smeem 	if (ksp == NULL)
2026e11c3f44Smeem 		return;
2027e11c3f44Smeem 
2028e11c3f44Smeem 	KSTAT_ENTER(ksp);
2029e11c3f44Smeem 
2030e11c3f44Smeem 	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
2031e11c3f44Smeem 		/*
2032e11c3f44Smeem 		 * Bring kstats up-to-date before recording.
2033e11c3f44Smeem 		 */
2034e11c3f44Smeem 		(void) KSTAT_UPDATE(ksp, KSTAT_READ);
2035e11c3f44Smeem 
2036e11c3f44Smeem 		kn = KSTAT_NAMED_PTR(ksp);
2037e11c3f44Smeem 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
2038e11c3f44Smeem 			name = ipmp_kstats[i].name;
2039e11c3f44Smeem 			kstats[i] = 0;
2040e11c3f44Smeem 			for (j = 0; j < ksp->ks_ndata; j++) {
2041e11c3f44Smeem 				if (strcmp(kn[j].name, name) != 0)
2042e11c3f44Smeem 					continue;
2043e11c3f44Smeem 
2044e11c3f44Smeem 				switch (kn[j].data_type) {
2045e11c3f44Smeem 				case KSTAT_DATA_INT32:
2046e11c3f44Smeem 				case KSTAT_DATA_UINT32:
2047e11c3f44Smeem 					kstats[i] = kn[j].value.ui32;
2048e11c3f44Smeem 					break;
2049e11c3f44Smeem #ifdef	_LP64
2050e11c3f44Smeem 				case KSTAT_DATA_LONG:
2051e11c3f44Smeem 				case KSTAT_DATA_ULONG:
2052e11c3f44Smeem 					kstats[i] = kn[j].value.ul;
2053e11c3f44Smeem 					break;
2054e11c3f44Smeem #endif
2055e11c3f44Smeem 				case KSTAT_DATA_INT64:
2056e11c3f44Smeem 				case KSTAT_DATA_UINT64:
2057e11c3f44Smeem 					kstats[i] = kn[j].value.ui64;
2058e11c3f44Smeem 					break;
2059e11c3f44Smeem 				}
2060e11c3f44Smeem 				break;
2061e11c3f44Smeem 			}
2062e11c3f44Smeem 		}
2063e11c3f44Smeem 	}
2064e11c3f44Smeem 
2065e11c3f44Smeem 	KSTAT_EXIT(ksp);
2066e11c3f44Smeem 	kstat_rele(ksp);
2067e11c3f44Smeem }
2068e11c3f44Smeem 
2069e11c3f44Smeem /*
2070e11c3f44Smeem  * Refresh the active state of all ills on `phyi'.
2071e11c3f44Smeem  */
2072e11c3f44Smeem void
ipmp_phyint_refresh_active(phyint_t * phyi)2073e11c3f44Smeem ipmp_phyint_refresh_active(phyint_t *phyi)
2074e11c3f44Smeem {
2075e11c3f44Smeem 	if (phyi->phyint_illv4 != NULL)
2076e11c3f44Smeem 		ipmp_ill_refresh_active(phyi->phyint_illv4);
2077e11c3f44Smeem 	if (phyi->phyint_illv6 != NULL)
2078e11c3f44Smeem 		ipmp_ill_refresh_active(phyi->phyint_illv6);
2079e11c3f44Smeem }
2080e11c3f44Smeem 
2081e11c3f44Smeem /*
2082e11c3f44Smeem  * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
2083e11c3f44Smeem  * doesn't exist.  Caller need not be inside the IPSQ.
2084e11c3f44Smeem  */
2085e11c3f44Smeem ill_t *
ipmp_ipif_hold_bound_ill(const ipif_t * ipif)2086e11c3f44Smeem ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
2087e11c3f44Smeem {
2088e11c3f44Smeem 	ill_t *boundill;
2089e11c3f44Smeem 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2090e11c3f44Smeem 
2091e11c3f44Smeem 	ASSERT(IS_IPMP(ipif->ipif_ill));
2092e11c3f44Smeem 
2093e11c3f44Smeem 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2094e11c3f44Smeem 	boundill = ipif->ipif_bound_ill;
2095bd670b35SErik Nordmark 	if (boundill != NULL && ill_check_and_refhold(boundill)) {
2096e11c3f44Smeem 		rw_exit(&ipst->ips_ipmp_lock);
2097e11c3f44Smeem 		return (boundill);
2098e11c3f44Smeem 	}
2099e11c3f44Smeem 	rw_exit(&ipst->ips_ipmp_lock);
2100e11c3f44Smeem 	return (NULL);
2101e11c3f44Smeem }
2102e11c3f44Smeem 
2103e11c3f44Smeem /*
2104e11c3f44Smeem  * Return a pointer to the underlying ill bound to `ipif', or NULL if one
2105e11c3f44Smeem  * doesn't exist.  Caller must be inside the IPSQ.
2106e11c3f44Smeem  */
2107e11c3f44Smeem ill_t *
ipmp_ipif_bound_ill(const ipif_t * ipif)2108e11c3f44Smeem ipmp_ipif_bound_ill(const ipif_t *ipif)
2109e11c3f44Smeem {
2110e11c3f44Smeem 	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
2111e11c3f44Smeem 	ASSERT(IS_IPMP(ipif->ipif_ill));
2112e11c3f44Smeem 
2113e11c3f44Smeem 	return (ipif->ipif_bound_ill);
2114e11c3f44Smeem }
2115e11c3f44Smeem 
2116e11c3f44Smeem /*
2117e11c3f44Smeem  * Check if `ipif' is a "stub" (placeholder address not being used).
2118e11c3f44Smeem  */
2119e11c3f44Smeem boolean_t
ipmp_ipif_is_stubaddr(const ipif_t * ipif)2120e11c3f44Smeem ipmp_ipif_is_stubaddr(const ipif_t *ipif)
2121e11c3f44Smeem {
2122e11c3f44Smeem 	if (ipif->ipif_flags & IPIF_UP)
2123e11c3f44Smeem 		return (B_FALSE);
2124e11c3f44Smeem 	if (ipif->ipif_ill->ill_isv6)
2125e11c3f44Smeem 		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2126e11c3f44Smeem 	else
2127e11c3f44Smeem 		return (ipif->ipif_lcl_addr == INADDR_ANY);
2128e11c3f44Smeem }
2129e11c3f44Smeem 
2130e11c3f44Smeem /*
2131e11c3f44Smeem  * Check if `ipif' is an IPMP data address.
2132e11c3f44Smeem  */
2133e11c3f44Smeem boolean_t
ipmp_ipif_is_dataaddr(const ipif_t * ipif)2134e11c3f44Smeem ipmp_ipif_is_dataaddr(const ipif_t *ipif)
2135e11c3f44Smeem {
2136e11c3f44Smeem 	if (ipif->ipif_flags & IPIF_NOFAILOVER)
2137e11c3f44Smeem 		return (B_FALSE);
2138e11c3f44Smeem 	if (ipif->ipif_ill->ill_isv6)
2139e11c3f44Smeem 		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2140e11c3f44Smeem 	else
2141e11c3f44Smeem 		return (ipif->ipif_lcl_addr != INADDR_ANY);
2142e11c3f44Smeem }
2143e11c3f44Smeem 
2144e11c3f44Smeem /*
2145e11c3f44Smeem  * Check if `ipif' is an IPIF_UP IPMP data address.
2146e11c3f44Smeem  */
2147e11c3f44Smeem static boolean_t
ipmp_ipif_is_up_dataaddr(const ipif_t * ipif)2148e11c3f44Smeem ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
2149e11c3f44Smeem {
2150e11c3f44Smeem 	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
2151e11c3f44Smeem }
2152bd670b35SErik Nordmark 
2153bd670b35SErik Nordmark /*
21541f19738eSmeem  * Check if `mp' contains a probe packet by checking if the IP source address
21551f19738eSmeem  * is a test address on underlying interface `ill'.  Caller need not be inside
2156bd670b35SErik Nordmark  * the IPSQ.
2157bd670b35SErik Nordmark  */
2158bd670b35SErik Nordmark boolean_t
ipmp_packet_is_probe(mblk_t * mp,ill_t * ill)2159bd670b35SErik Nordmark ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
2160bd670b35SErik Nordmark {
2161bd670b35SErik Nordmark 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2162bd670b35SErik Nordmark 	ipha_t *ipha = (ipha_t *)mp->b_rptr;
2163bd670b35SErik Nordmark 
2164bd670b35SErik Nordmark 	ASSERT(DB_TYPE(mp) != M_CTL);
2165bd670b35SErik Nordmark 
2166bd670b35SErik Nordmark 	if (!IS_UNDER_IPMP(ill))
2167bd670b35SErik Nordmark 		return (B_FALSE);
2168bd670b35SErik Nordmark 
2169bd670b35SErik Nordmark 	if (ill->ill_isv6) {
2170bd670b35SErik Nordmark 		if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
2171bd670b35SErik Nordmark 		    ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
2172bd670b35SErik Nordmark 			return (B_TRUE);
2173bd670b35SErik Nordmark 	} else {
21741f19738eSmeem 		if (ipha->ipha_src != INADDR_ANY &&
2175bd670b35SErik Nordmark 		    ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
2176bd670b35SErik Nordmark 			return (B_TRUE);
2177bd670b35SErik Nordmark 	}
2178bd670b35SErik Nordmark 	return (B_FALSE);
2179bd670b35SErik Nordmark }
2180bd670b35SErik Nordmark 
2181bd670b35SErik Nordmark /*
21821f19738eSmeem  * NCEC walker callback: delete `ncec' if it is associated with `ill_arg' and
21831f19738eSmeem  * is not one of our local addresses.  Caller must be inside the IPSQ.
2184bd670b35SErik Nordmark  */
21851f19738eSmeem static void
ipmp_ncec_delete_nonlocal(ncec_t * ncec,void * ill_arg)21868a06b3d6SToomas Soome ipmp_ncec_delete_nonlocal(ncec_t *ncec, void *ill_arg)
2187bd670b35SErik Nordmark {
21881f19738eSmeem 	if (!NCE_MYADDR(ncec) && ncec->ncec_ill == (ill_t *)ill_arg)
21891f19738eSmeem 		ncec_delete(ncec);
2190bd670b35SErik Nordmark }
2191bd670b35SErik Nordmark 
2192bd670b35SErik Nordmark /*
21931f19738eSmeem  * Delete any NCEs tied to the illgrp associated with `ncec'.  Caller need not
21941f19738eSmeem  * be inside the IPSQ.
2195bd670b35SErik Nordmark  */
2196bd670b35SErik Nordmark void
ipmp_ncec_delete_nce(ncec_t * ncec)21971f19738eSmeem ipmp_ncec_delete_nce(ncec_t *ncec)
2198bd670b35SErik Nordmark {
21991f19738eSmeem 	ipmp_illgrp_t	*illg = ncec->ncec_ill->ill_grp;
22001f19738eSmeem 	ip_stack_t	*ipst = ncec->ncec_ipst;
2201bd670b35SErik Nordmark 	ill_t		*ill;
2202bd670b35SErik Nordmark 	nce_t		*nce;
22031f19738eSmeem 	list_t		dead;
2204bd670b35SErik Nordmark 
22051f19738eSmeem 	ASSERT(IS_IPMP(ncec->ncec_ill));
2206bd670b35SErik Nordmark 
22071f19738eSmeem 	/*
22081f19738eSmeem 	 * For each underlying interface, delete `ncec' from its ill_nce list
22091f19738eSmeem 	 * via nce_fastpath_list_delete().  Defer the actual nce_refrele()
22101f19738eSmeem 	 * until we've dropped ill_g_lock.
22111f19738eSmeem 	 */
2212bd670b35SErik Nordmark 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
2213bd670b35SErik Nordmark 
2214bd670b35SErik Nordmark 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2215bd670b35SErik Nordmark 	ill = list_head(&illg->ig_if);
22161f19738eSmeem 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2217bd670b35SErik Nordmark 		nce_fastpath_list_delete(ill, ncec, &dead);
2218bd670b35SErik Nordmark 	rw_exit(&ipst->ips_ill_g_lock);
2219bd670b35SErik Nordmark 
22201f19738eSmeem 	while ((nce = list_remove_head(&dead)) != NULL)
2221bd670b35SErik Nordmark 		nce_refrele(nce);
22221f19738eSmeem 
2223bd670b35SErik Nordmark 	list_destroy(&dead);
2224bd670b35SErik Nordmark }
2225bd670b35SErik Nordmark 
2226bd670b35SErik Nordmark /*
22271f19738eSmeem  * Refresh any NCE entries tied to the illgrp associated with `ncec' to
22281f19738eSmeem  * use the information in `ncec'.  Caller need not be inside the IPSQ.
2229bd670b35SErik Nordmark  */
2230bd670b35SErik Nordmark void
ipmp_ncec_refresh_nce(ncec_t * ncec)22311f19738eSmeem ipmp_ncec_refresh_nce(ncec_t *ncec)
2232bd670b35SErik Nordmark {
22331f19738eSmeem 	ipmp_illgrp_t	*illg = ncec->ncec_ill->ill_grp;
22341f19738eSmeem 	ip_stack_t	*ipst = ncec->ncec_ipst;
2235bd670b35SErik Nordmark 	ill_t		*ill;
2236bd670b35SErik Nordmark 	nce_t		*nce, *nce_next;
2237bd670b35SErik Nordmark 	list_t		replace;
2238bd670b35SErik Nordmark 
22391f19738eSmeem 	ASSERT(IS_IPMP(ncec->ncec_ill));
2240bd670b35SErik Nordmark 
2241bd670b35SErik Nordmark 	/*
22421f19738eSmeem 	 * If `ncec' is not reachable, there is no use in refreshing NCEs.
2243bd670b35SErik Nordmark 	 */
2244bd670b35SErik Nordmark 	if (!NCE_ISREACHABLE(ncec))
2245bd670b35SErik Nordmark 		return;
2246bd670b35SErik Nordmark 
22471f19738eSmeem 	/*
22481f19738eSmeem 	 * Find all the NCEs matching ncec->ncec_addr.  We cannot update them
22491f19738eSmeem 	 * in-situ because we're holding ipmp_lock to prevent changes to IPMP
22501f19738eSmeem 	 * group membership and updating indirectly calls nce_fastpath_probe()
22511f19738eSmeem 	 * -> putnext() which cannot hold locks.  Thus, move the NCEs to a
22521f19738eSmeem 	 * separate list and process that list after dropping ipmp_lock.
22531f19738eSmeem 	 */
2254bd670b35SErik Nordmark 	list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
2255bd670b35SErik Nordmark 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2256bd670b35SErik Nordmark 	ill = list_head(&illg->ig_actif);
2257bd670b35SErik Nordmark 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
2258bd670b35SErik Nordmark 		mutex_enter(&ill->ill_lock);
22591f19738eSmeem 		nce = list_head(&ill->ill_nce);
22601f19738eSmeem 		for (; nce != NULL; nce = nce_next) {
2261bd670b35SErik Nordmark 			nce_next = list_next(&ill->ill_nce, nce);
22621f19738eSmeem 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
2263bd670b35SErik Nordmark 			    &ncec->ncec_addr)) {
22641f19738eSmeem 				nce_refhold(nce);
22651f19738eSmeem 				nce_delete(nce);
22661f19738eSmeem 				list_insert_tail(&replace, nce);
2267bd670b35SErik Nordmark 			}
2268bd670b35SErik Nordmark 		}
2269bd670b35SErik Nordmark 		mutex_exit(&ill->ill_lock);
2270bd670b35SErik Nordmark 	}
2271bd670b35SErik Nordmark 	rw_exit(&ipst->ips_ipmp_lock);
22721f19738eSmeem 
2273bd670b35SErik Nordmark 	/*
22741f19738eSmeem 	 * Process the list; nce_lookup_then_add_v* ensures that nce->nce_ill
22751f19738eSmeem 	 * is still in the group for ncec->ncec_ill.
2276bd670b35SErik Nordmark 	 */
22771f19738eSmeem 	while ((nce = list_remove_head(&replace)) != NULL) {
2278bd670b35SErik Nordmark 		if (ncec->ncec_ill->ill_isv6) {
2279bd670b35SErik Nordmark 			(void) nce_lookup_then_add_v6(nce->nce_ill,
22801f19738eSmeem 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
2281bd670b35SErik Nordmark 			    &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
2282bd670b35SErik Nordmark 			    NULL);
2283bd670b35SErik Nordmark 		} else {
2284bd670b35SErik Nordmark 			ipaddr_t ipaddr;
2285bd670b35SErik Nordmark 
2286bd670b35SErik Nordmark 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
2287bd670b35SErik Nordmark 			(void) nce_lookup_then_add_v4(nce->nce_ill,
2288bd670b35SErik Nordmark 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
2289bd670b35SErik Nordmark 			    &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
2290bd670b35SErik Nordmark 		}
2291bd670b35SErik Nordmark 		nce_refrele(nce);
2292bd670b35SErik Nordmark 	}
22931f19738eSmeem 
2294bd670b35SErik Nordmark 	list_destroy(&replace);
2295bd670b35SErik Nordmark }
2296