1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28 
/*
 * Heads of the global lists of phyints, phyint instances and phyint groups.
 * New entries are linked in at the head by the *_insert() routines below.
 */
struct phyint *phyints = NULL;
struct phyint_instance *phyint_instances = NULL;
struct phyint_group *phyint_groups = NULL;

/*
 * The anonymous group.  It is created by phyint_init() when
 * track_all_phyints is set.
 */
struct phyint_group *phyint_anongroup = NULL;

/*
 * Grouplist signature; seeded in phyint_init() and bumped whenever a group
 * is inserted into the list of groups.
 */
static uint64_t phyint_grouplistsig;
42 
43 static void phyint_inst_insert(struct phyint_instance *pii);
44 static void phyint_inst_print(struct phyint_instance *pii);
45 
46 static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
47 static void phyint_delete(struct phyint *pi);
48 static boolean_t phyint_is_usable(struct phyint *pi);
49 
50 static void logint_print(struct logint *li);
51 static void logint_insert(struct phyint_instance *pii, struct logint *li);
52 static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);
53 
54 static void target_print(struct target *tg);
55 static void target_insert(struct phyint_instance *pii, struct target *tg);
56 static struct target *target_first(struct phyint_instance *pii);
57 static struct target *target_select_best(struct phyint_instance *pii);
58 static void target_flush_hosts(struct phyint_group *pg);
59 
60 static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);
61 
62 static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
63 static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);
64 
65 static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
66 static int phyint_group_state_event(struct phyint_group *pg);
67 static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
68 static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
69     ipmp_if_op_t op);
70 
71 static int logint_upcount(struct phyint *pi);
72 static uint64_t gensig(void);
73 
74 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
75 int
phyint_init(void)76 phyint_init(void)
77 {
78 	phyint_grouplistsig = gensig();
79 	if (track_all_phyints) {
80 		phyint_anongroup = phyint_group_create("");
81 		if (phyint_anongroup == NULL)
82 			return (-1);
83 		phyint_group_insert(phyint_anongroup);
84 	}
85 	return (0);
86 }
87 
88 /* Return the phyint with the given name */
89 struct phyint *
phyint_lookup(const char * name)90 phyint_lookup(const char *name)
91 {
92 	struct phyint *pi;
93 
94 	if (debug & D_PHYINT)
95 		logdebug("phyint_lookup(%s)\n", name);
96 
97 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
98 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
99 			break;
100 	}
101 	return (pi);
102 }
103 
104 /*
105  * Lookup a phyint in the group that has the same hardware address as `pi', or
106  * NULL if there's none.  If `online_only' is set, then only online phyints
107  * are considered when matching.  Otherwise, phyints that had been offlined
108  * due to a duplicate hardware address will also be considered.
109  */
110 static struct phyint *
phyint_lookup_hwaddr(struct phyint * pi,boolean_t online_only)111 phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only)
112 {
113 	struct phyint *pi2;
114 
115 	if (pi->pi_group == phyint_anongroup)
116 		return (NULL);
117 
118 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
119 		if (pi2 == pi)
120 			continue;
121 
122 		/*
123 		 * NOTE: even when online_only is B_FALSE, we ignore phyints
124 		 * that are administratively offline (rather than offline
125 		 * because they're dups); when they're brought back online,
126 		 * they'll be flagged as dups if need be.
127 		 */
128 		if (pi2->pi_state == PI_OFFLINE &&
129 		    (online_only || !pi2->pi_hwaddrdup))
130 			continue;
131 
132 		if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen &&
133 		    bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0)
134 			return (pi2);
135 	}
136 	return (NULL);
137 }
138 
139 /*
140  * Respond to DLPI notifications.  Currently, this only processes physical
141  * address changes for the phyint passed via `arg' by onlining or offlining
142  * phyints in the group.
143  */
144 /* ARGSUSED */
145 static void
phyint_link_notify(dlpi_handle_t dh,dlpi_notifyinfo_t * dnip,void * arg)146 phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
147 {
148 	struct phyint *pi = arg;
149 	struct phyint *oduppi = NULL, *duppi = NULL;
150 
151 	assert((dnip->dni_note & pi->pi_notes) != 0);
152 
153 	if (dnip->dni_note != DL_NOTE_PHYS_ADDR)
154 		return;
155 
156 	assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX);
157 
158 	/*
159 	 * If our hardware address hasn't changed, there's nothing to do.
160 	 */
161 	if (pi->pi_hwaddrlen == dnip->dni_physaddrlen &&
162 	    bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0)
163 		return;
164 
165 	oduppi = phyint_lookup_hwaddr(pi, _B_FALSE);
166 	pi->pi_hwaddrlen = dnip->dni_physaddrlen;
167 	(void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen);
168 	duppi = phyint_lookup_hwaddr(pi, _B_FALSE);
169 
170 	if (oduppi != NULL || pi->pi_hwaddrdup) {
171 		/*
172 		 * Our old hardware address was a duplicate.  If we'd been
173 		 * offlined because of it, and our new hardware address is not
174 		 * a duplicate, then bring us online.  Otherwise, `oduppi'
175 		 * must've been the one brought offline; bring it online.
176 		 */
177 		if (pi->pi_hwaddrdup) {
178 			if (duppi == NULL)
179 				(void) phyint_undo_offline(pi);
180 		} else {
181 			assert(oduppi->pi_hwaddrdup);
182 			(void) phyint_undo_offline(oduppi);
183 		}
184 	}
185 
186 	if (duppi != NULL && !pi->pi_hwaddrdup) {
187 		/*
188 		 * Our new hardware address was a duplicate and we're not
189 		 * yet flagged as a duplicate; bring us offline.
190 		 */
191 		pi->pi_hwaddrdup = _B_TRUE;
192 		(void) phyint_offline(pi, 0);
193 	}
194 }
195 
196 /*
197  * Initialize information about the underlying link for `pi', and set us
198  * up to be notified about future changes.  Returns _B_TRUE on success.
199  */
200 boolean_t
phyint_link_init(struct phyint * pi)201 phyint_link_init(struct phyint *pi)
202 {
203 	int retval;
204 	uint_t notes;
205 	const char *errmsg;
206 	dlpi_notifyid_t id;
207 
208 	pi->pi_notes = 0;
209 	retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0);
210 	if (retval != DLPI_SUCCESS) {
211 		pi->pi_dh = NULL;
212 		errmsg = "cannot open";
213 		goto failed;
214 	}
215 
216 	pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX;
217 	retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr,
218 	    &pi->pi_hwaddrlen);
219 	if (retval != DLPI_SUCCESS) {
220 		errmsg = "cannot get hardware address";
221 		goto failed;
222 	}
223 
224 	/*
225 	 * Check if the link supports DLPI link state notifications.  For
226 	 * historical reasons, the actual changes are tracked through routing
227 	 * sockets, so we immediately disable the notification upon success.
228 	 */
229 	notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
230 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
231 	if (retval == DLPI_SUCCESS) {
232 		(void) dlpi_disabnotify(pi->pi_dh, id, NULL);
233 		pi->pi_notes |= notes;
234 	}
235 
236 	/*
237 	 * Enable notification of hardware address changes to keep pi_hwaddr
238 	 * up-to-date and track if we need to offline/undo-offline phyints.
239 	 */
240 	notes = DL_NOTE_PHYS_ADDR;
241 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
242 	if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0)
243 		pi->pi_notes |= notes;
244 
245 	return (_B_TRUE);
246 failed:
247 	logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval));
248 	if (pi->pi_dh != NULL) {
249 		dlpi_close(pi->pi_dh);
250 		pi->pi_dh = NULL;
251 	}
252 	return (_B_FALSE);
253 }
254 
255 /*
256  * Close use of link on `pi'.
257  */
258 void
phyint_link_close(struct phyint * pi)259 phyint_link_close(struct phyint *pi)
260 {
261 	if (pi->pi_notes & DL_NOTE_PHYS_ADDR) {
262 		(void) poll_remove(dlpi_fd(pi->pi_dh));
263 		pi->pi_notes &= ~DL_NOTE_PHYS_ADDR;
264 	}
265 
266 	/*
267 	 * NOTE: we don't clear pi_notes here so that iflinkstate() can still
268 	 * properly report the link state even when offline (which is possible
269 	 * since we use IFF_RUNNING to track link state).
270 	 */
271 	dlpi_close(pi->pi_dh);
272 	pi->pi_dh = NULL;
273 }
274 
275 /* Return the phyint instance with the given name and the given family */
276 struct phyint_instance *
phyint_inst_lookup(int af,char * name)277 phyint_inst_lookup(int af, char *name)
278 {
279 	struct phyint *pi;
280 
281 	if (debug & D_PHYINT)
282 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
283 
284 	assert(af == AF_INET || af == AF_INET6);
285 
286 	pi = phyint_lookup(name);
287 	if (pi == NULL)
288 		return (NULL);
289 
290 	return (PHYINT_INSTANCE(pi, af));
291 }
292 
293 struct phyint_group *
phyint_group_lookup(const char * pg_name)294 phyint_group_lookup(const char *pg_name)
295 {
296 	struct phyint_group *pg;
297 
298 	if (debug & D_PHYINT)
299 		logdebug("phyint_group_lookup(%s)\n", pg_name);
300 
301 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
302 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
303 			break;
304 	}
305 	return (pg);
306 }
307 
308 /*
309  * Insert the phyint in the linked list of all phyints. If the phyint belongs
310  * to some group, insert it in the phyint group list.
311  */
312 static void
phyint_insert(struct phyint * pi,struct phyint_group * pg)313 phyint_insert(struct phyint *pi, struct phyint_group *pg)
314 {
315 	if (debug & D_PHYINT)
316 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
317 
318 	/* Insert the phyint at the head of the 'all phyints' list */
319 	pi->pi_next = phyints;
320 	pi->pi_prev = NULL;
321 	if (phyints != NULL)
322 		phyints->pi_prev = pi;
323 	phyints = pi;
324 
325 	/*
326 	 * Insert the phyint at the head of the 'phyint_group members' list
327 	 * of the phyint group to which it belongs.
328 	 */
329 	pi->pi_pgnext = NULL;
330 	pi->pi_pgprev = NULL;
331 	pi->pi_group = pg;
332 
333 	pi->pi_pgnext = pg->pg_phyint;
334 	if (pi->pi_pgnext != NULL)
335 		pi->pi_pgnext->pi_pgprev = pi;
336 	pg->pg_phyint = pi;
337 
338 	/* Refresh the group state now that this phyint has been added */
339 	phyint_group_refresh_state(pg);
340 
341 	pg->pg_sig++;
342 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
343 }
344 
345 /* Insert the phyint instance in the linked list of all phyint instances. */
346 static void
phyint_inst_insert(struct phyint_instance * pii)347 phyint_inst_insert(struct phyint_instance *pii)
348 {
349 	if (debug & D_PHYINT) {
350 		logdebug("phyint_inst_insert(%s %s)\n",
351 		    AF_STR(pii->pii_af), pii->pii_name);
352 	}
353 
354 	/*
355 	 * Insert the phyint at the head of the 'all phyint instances' list.
356 	 */
357 	pii->pii_next = phyint_instances;
358 	pii->pii_prev = NULL;
359 	if (phyint_instances != NULL)
360 		phyint_instances->pii_prev = pii;
361 	phyint_instances = pii;
362 }
363 
364 /*
365  * Create a new phyint with the given parameters. Also insert it into
366  * the list of all phyints and the list of phyint group members by calling
367  * phyint_insert().
368  */
369 static struct phyint *
phyint_create(char * pi_name,struct phyint_group * pg,uint_t ifindex,uint64_t flags)370 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
371     uint64_t flags)
372 {
373 	struct phyint *pi;
374 
375 	pi = calloc(1, sizeof (struct phyint));
376 	if (pi == NULL) {
377 		logperror("phyint_create: calloc");
378 		return (NULL);
379 	}
380 
381 	/*
382 	 * Record the phyint values.
383 	 */
384 	(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
385 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
386 	pi->pi_ifindex = ifindex;
387 	pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF));
388 
389 	pi->pi_state = PI_INIT;
390 	pi->pi_flags = PHYINT_FLAGS(flags);
391 
392 	/*
393 	 * Initialize the link state.  The link state is initialized to
394 	 * up, so that if the link is down when IPMP starts monitoring
395 	 * the interface, it will appear as though there has been a
396 	 * transition from the link up to link down.  This avoids
397 	 * having to treat this situation as a special case.
398 	 */
399 	INIT_LINK_STATE(pi);
400 
401 	if (!phyint_link_init(pi)) {
402 		free(pi);
403 		return (NULL);
404 	}
405 
406 	/*
407 	 * Insert the phyint in the list of all phyints, and the
408 	 * list of phyint group members
409 	 */
410 	phyint_insert(pi, pg);
411 
412 	return (pi);
413 }
414 
415 /*
416  * Create a new phyint instance belonging to the phyint 'pi' and address
417  * family 'af'. Also insert it into the list of all phyint instances by
418  * calling phyint_inst_insert().
419  */
420 static struct phyint_instance *
phyint_inst_create(struct phyint * pi,int af)421 phyint_inst_create(struct phyint *pi, int af)
422 {
423 	struct phyint_instance *pii;
424 
425 	pii = calloc(1, sizeof (struct phyint_instance));
426 	if (pii == NULL) {
427 		logperror("phyint_inst_create: calloc");
428 		return (NULL);
429 	}
430 
431 	/*
432 	 * Attach the phyint instance to the phyint.
433 	 * Set the back pointers as well
434 	 */
435 	pii->pii_phyint = pi;
436 	if (af == AF_INET)
437 		pi->pi_v4 = pii;
438 	else
439 		pi->pi_v6 = pii;
440 
441 	pii->pii_in_use = 1;
442 	pii->pii_probe_sock = -1;
443 	pii->pii_snxt = 1;
444 	pii->pii_af = af;
445 	pii->pii_fd_hrtime = gethrtime() +
446 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
447 	pii->pii_flags = pi->pi_flags;
448 
449 	/* Insert the phyint instance in the list of all phyint instances. */
450 	phyint_inst_insert(pii);
451 	return (pii);
452 }
453 
454 /*
455  * Change the state of phyint `pi' to state `state'.
456  */
457 void
phyint_chstate(struct phyint * pi,enum pi_state state)458 phyint_chstate(struct phyint *pi, enum pi_state state)
459 {
460 	/*
461 	 * To simplify things, some callers always set a given state
462 	 * regardless of the previous state of the phyint (e.g., setting
463 	 * PI_RUNNING when it's already set).  We shouldn't bother
464 	 * generating an event or consuming a signature for these, since
465 	 * the actual state of the interface is unchanged.
466 	 */
467 	if (pi->pi_state == state)
468 		return;
469 
470 	pi->pi_state = state;
471 	phyint_changed(pi);
472 }
473 
474 /*
475  * Note that `pi' has changed state.
476  */
477 void
phyint_changed(struct phyint * pi)478 phyint_changed(struct phyint *pi)
479 {
480 	pi->pi_group->pg_sig++;
481 	(void) phyint_state_event(pi->pi_group, pi);
482 }
483 
484 /*
485  * Insert the phyint group in the linked list of all phyint groups
486  * at the head of the list
487  */
488 void
phyint_group_insert(struct phyint_group * pg)489 phyint_group_insert(struct phyint_group *pg)
490 {
491 	pg->pg_next = phyint_groups;
492 	pg->pg_prev = NULL;
493 	if (phyint_groups != NULL)
494 		phyint_groups->pg_prev = pg;
495 	phyint_groups = pg;
496 
497 	phyint_grouplistsig++;
498 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
499 }
500 
501 /*
502  * Create a new phyint group called 'name'.
503  */
504 struct phyint_group *
phyint_group_create(const char * name)505 phyint_group_create(const char *name)
506 {
507 	struct	phyint_group *pg;
508 
509 	if (debug & D_PHYINT)
510 		logdebug("phyint_group_create(%s)\n", name);
511 
512 	pg = calloc(1, sizeof (struct phyint_group));
513 	if (pg == NULL) {
514 		logperror("phyint_group_create: calloc");
515 		return (NULL);
516 	}
517 
518 	(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
519 	pg->pg_sig = gensig();
520 	pg->pg_fdt = user_failure_detection_time;
521 	pg->pg_probeint = user_probe_interval;
522 	pg->pg_in_use = _B_TRUE;
523 
524 	/*
525 	 * Normal groups always start in the PG_FAILED state since they
526 	 * have no active interfaces.  In contrast, anonymous groups are
527 	 * heterogeneous and thus always PG_OK.
528 	 */
529 	pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED);
530 
531 	return (pg);
532 }
533 
534 /*
535  * Change the state of the phyint group `pg' to state `state'.
536  */
537 void
phyint_group_chstate(struct phyint_group * pg,enum pg_state state)538 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
539 {
540 	assert(pg != phyint_anongroup);
541 
542 	/*
543 	 * To simplify things, some callers always set a given state
544 	 * regardless of the previous state of the group (e.g., setting
545 	 * PG_DEGRADED when it's already set).  We shouldn't bother
546 	 * generating an event or consuming a signature for these, since
547 	 * the actual state of the group is unchanged.
548 	 */
549 	if (pg->pg_state == state)
550 		return;
551 
552 	pg->pg_state = state;
553 
554 	switch (state) {
555 	case PG_FAILED:
556 		/*
557 		 * We can never know with certainty that a group has
558 		 * failed.  It is possible that all known targets have
559 		 * failed simultaneously, and new targets have come up
560 		 * instead. If the targets are routers then router
561 		 * discovery will kick in, and we will see the new routers
562 		 * thru routing socket messages. But if the targets are
563 		 * hosts, we have to discover it by multicast.	So flush
564 		 * all the host targets. The next probe will send out a
565 		 * multicast echo request. If this is a group failure, we
566 		 * will still not see any response, otherwise the group
567 		 * will be repaired after we get NUM_PROBE_REPAIRS
568 		 * consecutive unicast replies on any phyint.
569 		 */
570 		target_flush_hosts(pg);
571 		break;
572 
573 	case PG_OK:
574 	case PG_DEGRADED:
575 		break;
576 
577 	default:
578 		logerr("phyint_group_chstate: invalid group state %d; "
579 		    "aborting\n", state);
580 		abort();
581 	}
582 
583 	pg->pg_sig++;
584 	(void) phyint_group_state_event(pg);
585 }
586 
587 /*
588  * Create a new phyint instance and initialize it from the values supplied by
589  * the kernel. Always check for ENXIO before logging any error, because the
590  * interface could have vanished after completion of SIOCGLIFCONF.
591  * Return values:
592  *	pointer to the phyint instance on success
593  *	NULL on failure Eg. if the phyint instance is not found in the kernel
594  */
595 struct phyint_instance *
phyint_inst_init_from_k(int af,char * pi_name)596 phyint_inst_init_from_k(int af, char *pi_name)
597 {
598 	char	pg_name[LIFNAMSIZ + 1];
599 	int	ifsock;
600 	uint_t	ifindex;
601 	uint64_t	flags;
602 	struct lifreq	lifr;
603 	struct phyint	*pi;
604 	struct phyint_instance	*pii;
605 	boolean_t	pi_created;
606 	struct phyint_group	*pg;
607 
608 retry:
609 	pii = NULL;
610 	pi = NULL;
611 	pg = NULL;
612 	pi_created = _B_FALSE;
613 
614 	if (debug & D_PHYINT) {
615 		logdebug("phyint_inst_init_from_k(%s %s)\n",
616 		    AF_STR(af), pi_name);
617 	}
618 
619 	assert(af == AF_INET || af == AF_INET6);
620 
621 	/* Get the socket for doing ioctls */
622 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
623 
624 	/*
625 	 * Get the interface flags.  Ignore virtual interfaces, IPMP
626 	 * meta-interfaces, point-to-point interfaces, and interfaces
627 	 * that can't support multicast.
628 	 */
629 	(void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
630 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
631 		if (errno != ENXIO) {
632 			logperror("phyint_inst_init_from_k:"
633 			    " ioctl (get flags)");
634 		}
635 		return (NULL);
636 	}
637 	flags = lifr.lifr_flags;
638 	if (!(flags & IFF_MULTICAST) ||
639 	    (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT)))
640 		return (NULL);
641 
642 	/*
643 	 * Get the ifindex for recording later in our tables, in case we need
644 	 * to create a new phyint.
645 	 */
646 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
647 		if (errno != ENXIO) {
648 			logperror("phyint_inst_init_from_k: "
649 			    " ioctl (get lifindex)");
650 		}
651 		return (NULL);
652 	}
653 	ifindex = lifr.lifr_index;
654 
655 	/*
656 	 * Get the phyint group name of this phyint, from the kernel.
657 	 */
658 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
659 		if (errno != ENXIO) {
660 			logperror("phyint_inst_init_from_k: "
661 			    "ioctl (get group name)");
662 		}
663 		return (NULL);
664 	}
665 	(void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
666 
667 	/*
668 	 * If the phyint is not part of any group, pg_name is the
669 	 * null string. If 'track_all_phyints' is false, there is no
670 	 * need to create a phyint.
671 	 */
672 	if (pg_name[0] == '\0' && !track_all_phyints) {
673 		/*
674 		 * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are
675 		 * set, reset them. These flags shouldn't be set if in.mpathd
676 		 * isn't tracking the interface.
677 		 */
678 		if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) {
679 			lifr.lifr_flags = flags &
680 			    ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE);
681 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
682 				if (errno != ENXIO) {
683 					logperror("phyint_inst_init_from_k:"
684 					    " ioctl (set flags)");
685 				}
686 			}
687 		}
688 		return (NULL);
689 	}
690 
691 	/*
692 	 * We need to create a new phyint instance.  We may also need to
693 	 * create the group if e.g. the SIOCGLIFCONF loop in initifs() found
694 	 * an underlying interface before it found its IPMP meta-interface.
695 	 * Note that we keep any created groups even if phyint_inst_from_k()
696 	 * fails since a group's existence is not dependent on the ability of
697 	 * in.mpathd to the track the group's interfaces.
698 	 */
699 	if ((pg = phyint_group_lookup(pg_name)) == NULL) {
700 		if ((pg = phyint_group_create(pg_name)) == NULL) {
701 			logerr("phyint_inst_init_from_k: cannot create group "
702 			    "%s\n", pg_name);
703 			return (NULL);
704 		}
705 		phyint_group_insert(pg);
706 	}
707 
708 	/*
709 	 * Lookup the phyint. If the phyint does not exist create it.
710 	 */
711 	pi = phyint_lookup(pi_name);
712 	if (pi == NULL) {
713 		pi = phyint_create(pi_name, pg, ifindex, flags);
714 		if (pi == NULL) {
715 			logerr("phyint_inst_init_from_k:"
716 			    " unable to create phyint %s\n", pi_name);
717 			return (NULL);
718 		}
719 		pi_created = _B_TRUE;
720 	} else {
721 		/* The phyint exists already. */
722 		assert(pi_created == _B_FALSE);
723 		/*
724 		 * Normally we should see consistent values for the IPv4 and
725 		 * IPv6 instances, for phyint properties. If we don't, it
726 		 * means things have changed underneath us, and we should
727 		 * resync our tables with the kernel. Check whether the
728 		 * interface index has changed. If so, it is most likely
729 		 * the interface has been unplumbed and replumbed,
730 		 * while we are yet to update our tables. Do it now.
731 		 */
732 		if (pi->pi_ifindex != ifindex) {
733 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
734 			goto retry;
735 		}
736 		assert(PHYINT_INSTANCE(pi, af) == NULL);
737 
738 		/*
739 		 * If the group name seen by the IPv4 and IPv6 instances
740 		 * are different, it is most likely the groupname has
741 		 * changed, while we are yet to update our tables. Do it now.
742 		 */
743 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
744 			phyint_inst_delete(PHYINT_INSTANCE(pi,
745 			    AF_OTHER(af)));
746 			goto retry;
747 		}
748 	}
749 
750 	/*
751 	 * Create a new phyint instance, corresponding to the 'af'
752 	 * passed in.
753 	 */
754 	pii = phyint_inst_create(pi, af);
755 	if (pii == NULL) {
756 		logerr("phyint_inst_init_from_k: unable to create"
757 		    "phyint inst %s\n", pi->pi_name);
758 		if (pi_created)
759 			phyint_delete(pi);
760 
761 		return (NULL);
762 	}
763 
764 	/*
765 	 * NOTE: the change_pif_flags() implementation requires a phyint
766 	 * instance before it can function, so a number of tasks that would
767 	 * otherwise be done in phyint_create() are deferred to here.
768 	 */
769 	if (pi_created) {
770 		/*
771 		 * If the interface is offline, set the state to PI_OFFLINE.
772 		 * Otherwise, optimistically consider this interface running.
773 		 * Later (in process_link_state_changes()), we will adjust
774 		 * this to match the current state of the link.  Further, if
775 		 * test addresses are subsequently assigned, we will
776 		 * transition to PI_NOTARGETS and then to either PI_RUNNING or
777 		 * PI_FAILED depending on the probe results.
778 		 */
779 		if (pi->pi_flags & IFF_OFFLINE) {
780 			phyint_chstate(pi, PI_OFFLINE);
781 		} else {
782 			/* calls phyint_chstate() */
783 			phyint_transition_to_running(pi);
784 		}
785 
786 		/*
787 		 * If this a standby phyint, determine whether it should be
788 		 * IFF_INACTIVE.
789 		 */
790 		if (pi->pi_flags & IFF_STANDBY)
791 			phyint_standby_refresh_inactive(pi);
792 
793 		/*
794 		 * If this phyint does not have a unique hardware address in its
795 		 * group, offline it.
796 		 */
797 		if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
798 			pi->pi_hwaddrdup = _B_TRUE;
799 			(void) phyint_offline(pi, 0);
800 		}
801 	}
802 
803 	return (pii);
804 }
805 
806 /*
807  * Bind pii_probe_sock to the address associated with pii_probe_logint.
808  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
809  * targets. Do the common part in this function, and complete the
810  * initializations by calling the protocol specific functions
811  * phyint_inst_v{4,6}_sockinit() respectively.
812  *
813  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
814  */
815 boolean_t
phyint_inst_sockinit(struct phyint_instance * pii)816 phyint_inst_sockinit(struct phyint_instance *pii)
817 {
818 	boolean_t success;
819 	struct phyint_group *pg;
820 
821 	if (debug & D_PHYINT) {
822 		logdebug("phyint_inst_sockinit(%s %s)\n",
823 		    AF_STR(pii->pii_af), pii->pii_name);
824 	}
825 
826 	assert(pii->pii_probe_logint != NULL);
827 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
828 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
829 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
830 
831 	/*
832 	 * If the socket is already bound, close pii_probe_sock
833 	 */
834 	if (pii->pii_probe_sock != -1)
835 		close_probe_socket(pii, _B_TRUE);
836 
837 	/*
838 	 * If the phyint is not part of a named group and track_all_phyints is
839 	 * false, simply return.
840 	 */
841 	pg = pii->pii_phyint->pi_group;
842 	if (pg == phyint_anongroup && !track_all_phyints) {
843 		if (debug & D_PHYINT)
844 			logdebug("phyint_inst_sockinit: no group\n");
845 		return (_B_FALSE);
846 	}
847 
848 	/*
849 	 * Initialize the socket by calling the protocol specific function.
850 	 * If it succeeds, add the socket to the poll list.
851 	 */
852 	if (pii->pii_af == AF_INET6)
853 		success = phyint_inst_v6_sockinit(pii);
854 	else
855 		success = phyint_inst_v4_sockinit(pii);
856 
857 	if (success && (poll_add(pii->pii_probe_sock) == 0))
858 		return (_B_TRUE);
859 
860 	/* Something failed, cleanup and return false */
861 	if (pii->pii_probe_sock != -1)
862 		close_probe_socket(pii, _B_FALSE);
863 
864 	return (_B_FALSE);
865 }
866 
867 /*
868  * IPv6 specific part in initializing the pii_probe_sock. This socket is
869  * used to send/receive ICMPv6 probe packets.
870  */
871 static boolean_t
phyint_inst_v6_sockinit(struct phyint_instance * pii)872 phyint_inst_v6_sockinit(struct phyint_instance *pii)
873 {
874 	icmp6_filter_t filter;
875 	int hopcount = 1;
876 	int off = 0;
877 	int on = 1;
878 	struct	sockaddr_in6	testaddr;
879 	int flags;
880 
881 	/*
882 	 * Open a raw socket with ICMPv6 protocol.
883 	 *
884 	 * Use IPV6_BOUND_IF to make sure that probes are sent and received on
885 	 * the specified phyint only.  Bind to the test address to ensure that
886 	 * the responses are sent to the specified phyint.
887 	 *
888 	 * Set the hopcount to 1 so that probe packets are not routed.
889 	 * Disable multicast loopback. Set the receive filter to
890 	 * receive only ICMPv6 echo replies.
891 	 */
892 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
893 	if (pii->pii_probe_sock < 0) {
894 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
895 		return (_B_FALSE);
896 	}
897 
898 	/*
899 	 * Probes must not block in case of lower layer issues.
900 	 */
901 	if ((flags = fcntl(pii->pii_probe_sock, F_GETFL, 0)) == -1) {
902 		logperror_pii(pii, "phyint_inst_v6_sockinit: fcntl"
903 		    " F_GETFL");
904 		return (_B_FALSE);
905 	}
906 	if (fcntl(pii->pii_probe_sock, F_SETFL,
907 	    flags | O_NONBLOCK) == -1) {
908 		logperror_pii(pii, "phyint_inst_v6_sockinit: fcntl"
909 		    " F_SETFL O_NONBLOCK");
910 		return (_B_FALSE);
911 	}
912 
913 	bzero(&testaddr, sizeof (testaddr));
914 	testaddr.sin6_family = AF_INET6;
915 	testaddr.sin6_port = 0;
916 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
917 
918 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
919 	    sizeof (testaddr)) < 0) {
920 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
921 		return (_B_FALSE);
922 	}
923 
924 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF,
925 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
926 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
927 		    " IPV6_MULTICAST_IF");
928 		return (_B_FALSE);
929 	}
930 
931 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF,
932 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
933 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
934 		    " IPV6_BOUND_IF");
935 		return (_B_FALSE);
936 	}
937 
938 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
939 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
940 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
941 		    " IPV6_UNICAST_HOPS");
942 		return (_B_FALSE);
943 	}
944 
945 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
946 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
947 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
948 		    " IPV6_MULTICAST_HOPS");
949 		return (_B_FALSE);
950 	}
951 
952 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
953 	    (char *)&off, sizeof (off)) < 0) {
954 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
955 		    " IPV6_MULTICAST_LOOP");
956 		return (_B_FALSE);
957 	}
958 
959 	/*
960 	 * Filter out so that we only receive ICMP echo replies
961 	 */
962 	ICMP6_FILTER_SETBLOCKALL(&filter);
963 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
964 
965 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
966 	    (char *)&filter, sizeof (filter)) < 0) {
967 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
968 		    " ICMP6_FILTER");
969 		return (_B_FALSE);
970 	}
971 
972 	/* Enable receipt of hoplimit */
973 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
974 	    &on, sizeof (on)) < 0) {
975 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
976 		    " IPV6_RECVHOPLIMIT");
977 		return (_B_FALSE);
978 	}
979 
980 	/* Enable receipt of timestamp */
981 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP,
982 	    &on, sizeof (on)) < 0) {
983 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
984 		    " SO_TIMESTAMP");
985 		return (_B_FALSE);
986 	}
987 
988 	return (_B_TRUE);
989 }
990 
991 /*
992  * IPv4 specific part in initializing the pii_probe_sock. This socket is
993  * used to send/receive ICMPv4 probe packets.
994  */
995 static boolean_t
phyint_inst_v4_sockinit(struct phyint_instance * pii)996 phyint_inst_v4_sockinit(struct phyint_instance *pii)
997 {
998 	struct sockaddr_in  testaddr;
999 	char	char_off = 0;
1000 	int	ttl = 1;
1001 	char	char_ttl = 1;
1002 	int	on = 1;
1003 	int	flags;
1004 
1005 	/*
1006 	 * Open a raw socket with ICMPv4 protocol.
1007 	 *
1008 	 * Use IP_BOUND_IF to make sure that probes are sent and received on
1009 	 * the specified phyint only.  Bind to the test address to ensure that
1010 	 * the responses are sent to the specified phyint.
1011 	 *
1012 	 * Set the ttl to 1 so that probe packets are not routed.
1013 	 * Disable multicast loopback.  Enable receipt of timestamp.
1014 	 */
1015 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
1016 	if (pii->pii_probe_sock < 0) {
1017 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
1018 		return (_B_FALSE);
1019 	}
1020 
1021 	/*
1022 	 * Probes must not block in case of lower layer issues.
1023 	 */
1024 	if ((flags = fcntl(pii->pii_probe_sock, F_GETFL, 0)) == -1) {
1025 		logperror_pii(pii, "phyint_inst_v4_sockinit: fcntl"
1026 		    " F_GETFL");
1027 		return (_B_FALSE);
1028 	}
1029 	if (fcntl(pii->pii_probe_sock, F_SETFL,
1030 	    flags | O_NONBLOCK) == -1) {
1031 		logperror_pii(pii, "phyint_inst_v4_sockinit: fcntl"
1032 		    " F_SETFL O_NONBLOCK");
1033 		return (_B_FALSE);
1034 	}
1035 
1036 	bzero(&testaddr, sizeof (testaddr));
1037 	testaddr.sin_family = AF_INET;
1038 	testaddr.sin_port = 0;
1039 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
1040 	    &testaddr.sin_addr);
1041 
1042 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
1043 	    sizeof (testaddr)) < 0) {
1044 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
1045 		return (_B_FALSE);
1046 	}
1047 
1048 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF,
1049 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
1050 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1051 		    " IP_BOUND_IF");
1052 		return (_B_FALSE);
1053 	}
1054 
1055 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF,
1056 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
1057 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1058 		    " IP_MULTICAST_IF");
1059 		return (_B_FALSE);
1060 	}
1061 
1062 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
1063 	    (char *)&ttl, sizeof (ttl)) < 0) {
1064 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1065 		    " IP_TTL");
1066 		return (_B_FALSE);
1067 	}
1068 
1069 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
1070 	    (char *)&char_off, sizeof (char_off)) == -1) {
1071 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1072 		    " IP_MULTICAST_LOOP");
1073 		return (_B_FALSE);
1074 	}
1075 
1076 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
1077 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
1078 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1079 		    " IP_MULTICAST_TTL");
1080 		return (_B_FALSE);
1081 	}
1082 
1083 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on,
1084 	    sizeof (on)) < 0) {
1085 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1086 		    " SO_TIMESTAMP");
1087 		return (_B_FALSE);
1088 	}
1089 
1090 	return (_B_TRUE);
1091 }
1092 
1093 /*
1094  * Remove the phyint group from the list of 'all phyint groups'
1095  * and free it.
1096  */
1097 void
phyint_group_delete(struct phyint_group * pg)1098 phyint_group_delete(struct phyint_group *pg)
1099 {
1100 	/*
1101 	 * The anonymous group always exists, even when empty.
1102 	 */
1103 	if (pg == phyint_anongroup)
1104 		return;
1105 
1106 	if (debug & D_PHYINT)
1107 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
1108 
1109 	/*
1110 	 * The phyint group must be empty, and must not have any phyints.
1111 	 * The phyint group must be in the list of all phyint groups
1112 	 */
1113 	assert(pg->pg_phyint == NULL);
1114 	assert(phyint_groups == pg || pg->pg_prev != NULL);
1115 
1116 	if (pg->pg_prev != NULL)
1117 		pg->pg_prev->pg_next = pg->pg_next;
1118 	else
1119 		phyint_groups = pg->pg_next;
1120 
1121 	if (pg->pg_next != NULL)
1122 		pg->pg_next->pg_prev = pg->pg_prev;
1123 
1124 	pg->pg_next = NULL;
1125 	pg->pg_prev = NULL;
1126 
1127 	phyint_grouplistsig++;
1128 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
1129 
1130 	addrlist_free(&pg->pg_addrs);
1131 	free(pg);
1132 }
1133 
1134 /*
1135  * Refresh the state of `pg' based on its current members.
1136  */
1137 void
phyint_group_refresh_state(struct phyint_group * pg)1138 phyint_group_refresh_state(struct phyint_group *pg)
1139 {
1140 	enum pg_state state;
1141 	enum pg_state origstate = pg->pg_state;
1142 	struct phyint *pi, *usablepi;
1143 	uint_t nif = 0, nusable = 0;
1144 
1145 	/*
1146 	 * Anonymous groups never change state.
1147 	 */
1148 	if (pg == phyint_anongroup)
1149 		return;
1150 
1151 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1152 		nif++;
1153 		if (phyint_is_usable(pi)) {
1154 			nusable++;
1155 			usablepi = pi;
1156 		}
1157 	}
1158 
1159 	if (nusable == 0)
1160 		state = PG_FAILED;
1161 	else if (nif == nusable)
1162 		state = PG_OK;
1163 	else
1164 		state = PG_DEGRADED;
1165 
1166 	phyint_group_chstate(pg, state);
1167 
1168 	/*
1169 	 * If we're shutting down, skip logging messages since otherwise our
1170 	 * shutdown housecleaning will make us report that groups are unusable.
1171 	 */
1172 	if (cleanup_started)
1173 		return;
1174 
1175 	/*
1176 	 * NOTE: We use pg_failmsg_printed rather than origstate since
1177 	 * otherwise at startup we'll log a "now usable" message when the
1178 	 * first usable phyint is added to an empty group.
1179 	 */
1180 	if (state != PG_FAILED && pg->pg_failmsg_printed) {
1181 		assert(origstate == PG_FAILED);
1182 		logerr("At least 1 IP interface (%s) in group %s is now "
1183 		    "usable\n", usablepi->pi_name, pg->pg_name);
1184 		pg->pg_failmsg_printed = _B_FALSE;
1185 	} else if (origstate != PG_FAILED && state == PG_FAILED) {
1186 		logerr("All IP interfaces in group %s are now unusable\n",
1187 		    pg->pg_name);
1188 		pg->pg_failmsg_printed = _B_TRUE;
1189 	}
1190 }
1191 
1192 /*
1193  * Extract information from the kernel about the desired phyint.
1194  * Look only for properties of the phyint and not properties of logints.
1195  * Take appropriate action on the changes.
1196  * Return codes:
1197  *	PI_OK
1198  *		The phyint exists in the kernel and matches our knowledge
1199  *		of the phyint.
1200  *	PI_DELETED
1201  *		The phyint has vanished in the kernel.
1202  *	PI_IFINDEX_CHANGED
1203  *		The phyint's interface index has changed.
1204  *		Ask the caller to delete and recreate the phyint.
1205  *	PI_IOCTL_ERROR
1206  *		Some ioctl error. Don't change anything.
1207  *	PI_GROUP_CHANGED
1208  *		The phyint has changed group.
1209  */
1210 int
phyint_inst_update_from_k(struct phyint_instance * pii)1211 phyint_inst_update_from_k(struct phyint_instance *pii)
1212 {
1213 	struct lifreq lifr;
1214 	int	ifsock;
1215 	struct phyint *pi;
1216 
1217 	pi = pii->pii_phyint;
1218 
1219 	if (debug & D_PHYINT) {
1220 		logdebug("phyint_inst_update_from_k(%s %s)\n",
1221 		    AF_STR(pii->pii_af), pi->pi_name);
1222 	}
1223 
1224 	/*
1225 	 * Get the ifindex from the kernel, for comparison with the
1226 	 * value in our tables.
1227 	 */
1228 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
1229 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1230 
1231 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1232 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
1233 		if (errno == ENXIO) {
1234 			return (PI_DELETED);
1235 		} else {
1236 			logperror_pii(pii, "phyint_inst_update_from_k:"
1237 			    " ioctl (get lifindex)");
1238 			return (PI_IOCTL_ERROR);
1239 		}
1240 	}
1241 
1242 	if (lifr.lifr_index != pi->pi_ifindex) {
1243 		/*
1244 		 * The index has changed. Most likely the interface has
1245 		 * been unplumbed and replumbed. Ask the caller to take
1246 		 * appropriate action.
1247 		 */
1248 		if (debug & D_PHYINT) {
1249 			logdebug("phyint_inst_update_from_k:"
1250 			    " old index %d new index %d\n",
1251 			    pi->pi_ifindex, lifr.lifr_index);
1252 		}
1253 		return (PI_IFINDEX_CHANGED);
1254 	}
1255 
1256 	/*
1257 	 * Get the group name from the kernel, for comparison with
1258 	 * the value in our tables.
1259 	 */
1260 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
1261 		if (errno == ENXIO) {
1262 			return (PI_DELETED);
1263 		} else {
1264 			logperror_pii(pii, "phyint_inst_update_from_k:"
1265 			    " ioctl (get groupname)");
1266 			return (PI_IOCTL_ERROR);
1267 		}
1268 	}
1269 
1270 	/*
1271 	 * If the phyint has changed group i.e. if the phyint group name
1272 	 * returned by the kernel is different, ask the caller to delete
1273 	 * and recreate the phyint in the right group
1274 	 */
1275 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
1276 		/* Groupname has changed */
1277 		if (debug & D_PHYINT) {
1278 			logdebug("phyint_inst_update_from_k:"
1279 			    " groupname change\n");
1280 		}
1281 		return (PI_GROUP_CHANGED);
1282 	}
1283 
1284 	/*
1285 	 * Get the current phyint flags from the kernel, and determine what
1286 	 * flags have changed by comparing against our tables.	Note that the
1287 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
1288 	 * that IFF_INACTIVE is really still set on the interface.
1289 	 */
1290 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
1291 		if (errno == ENXIO) {
1292 			return (PI_DELETED);
1293 		} else {
1294 			logperror_pii(pii, "phyint_inst_update_from_k: "
1295 			    " ioctl (get flags)");
1296 			return (PI_IOCTL_ERROR);
1297 		}
1298 	}
1299 
1300 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
1301 	if (pi->pi_v4 != NULL)
1302 		pi->pi_v4->pii_flags = pi->pi_flags;
1303 	if (pi->pi_v6 != NULL)
1304 		pi->pi_v6->pii_flags = pi->pi_flags;
1305 
1306 	/*
1307 	 * Make sure the IFF_FAILED flag is set if and only if we think
1308 	 * the interface should be failed.
1309 	 */
1310 	if (pi->pi_flags & IFF_FAILED) {
1311 		if (pi->pi_state == PI_RUNNING)
1312 			(void) change_pif_flags(pi, 0, IFF_FAILED);
1313 	} else {
1314 		if (pi->pi_state == PI_FAILED)
1315 			(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1316 	}
1317 
1318 	/* No change in phyint status */
1319 	return (PI_OK);
1320 }
1321 
1322 /*
1323  * Delete the phyint. Remove it from the list of all phyints, and the
1324  * list of phyint group members.
1325  */
1326 static void
phyint_delete(struct phyint * pi)1327 phyint_delete(struct phyint *pi)
1328 {
1329 	boolean_t active;
1330 	struct phyint *pi2;
1331 	struct phyint_group *pg = pi->pi_group;
1332 
1333 	if (debug & D_PHYINT)
1334 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1335 
1336 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1337 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1338 
1339 	/*
1340 	 * The phyint must belong to a group.
1341 	 */
1342 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1343 
1344 	/* The phyint must be in the list of all phyints */
1345 	assert(phyints == pi || pi->pi_prev != NULL);
1346 
1347 	/* Remove the phyint from the phyint group list */
1348 	pg->pg_sig++;
1349 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1350 
1351 	if (pi->pi_pgprev == NULL) {
1352 		/* Phyint is the 1st in the phyint group list */
1353 		pg->pg_phyint = pi->pi_pgnext;
1354 	} else {
1355 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1356 	}
1357 	if (pi->pi_pgnext != NULL)
1358 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1359 	pi->pi_pgnext = NULL;
1360 	pi->pi_pgprev = NULL;
1361 
1362 	/* Refresh the group state now that this phyint has been removed */
1363 	phyint_group_refresh_state(pg);
1364 
1365 	/* Remove the phyint from the global list of phyints */
1366 	if (pi->pi_prev == NULL) {
1367 		/* Phyint is the 1st in the list */
1368 		phyints = pi->pi_next;
1369 	} else {
1370 		pi->pi_prev->pi_next = pi->pi_next;
1371 	}
1372 	if (pi->pi_next != NULL)
1373 		pi->pi_next->pi_prev = pi->pi_prev;
1374 	pi->pi_next = NULL;
1375 	pi->pi_prev = NULL;
1376 
1377 	/*
1378 	 * See if another phyint in the group had been offlined because
1379 	 * it was a dup of `pi' -- and if so, online it.
1380 	 */
1381 	if (!pi->pi_hwaddrdup &&
1382 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1383 		assert(pi2->pi_hwaddrdup);
1384 		(void) phyint_undo_offline(pi2);
1385 	}
1386 
1387 	/*
1388 	 * If the interface was in a named group and was either an active
1389 	 * standby or the last active interface, try to activate another
1390 	 * interface to compensate.
1391 	 */
1392 	if (pg != phyint_anongroup) {
1393 		active = _B_FALSE;
1394 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1395 			if (phyint_is_functioning(pi2) &&
1396 			    !(pi2->pi_flags & IFF_INACTIVE)) {
1397 				active = _B_TRUE;
1398 				break;
1399 			}
1400 		}
1401 
1402 		if (!active ||
1403 		    (pi->pi_flags & (IFF_STANDBY|IFF_INACTIVE)) == IFF_STANDBY)
1404 			phyint_activate_another(pi);
1405 	}
1406 
1407 	phyint_link_close(pi);
1408 	free(pi);
1409 }
1410 
1411 /*
1412  * Offline phyint `pi' if at least `minred' usable interfaces remain in the
1413  * group.  Returns an IPMP error code.
1414  */
1415 int
phyint_offline(struct phyint * pi,uint_t minred)1416 phyint_offline(struct phyint *pi, uint_t minred)
1417 {
1418 	boolean_t was_active;
1419 	unsigned int nusable = 0;
1420 	struct phyint *pi2;
1421 	struct phyint_group *pg = pi->pi_group;
1422 
1423 	/*
1424 	 * Verify that enough usable interfaces in the group would remain.
1425 	 * As a special case, if the group has failed, allow any non-offline
1426 	 * phyints to be offlined.
1427 	 */
1428 	if (pg != phyint_anongroup) {
1429 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1430 			if (pi2 == pi)
1431 				continue;
1432 			if (phyint_is_usable(pi2) ||
1433 			    (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE))
1434 				nusable++;
1435 		}
1436 	}
1437 	if (nusable < minred)
1438 		return (IPMP_EMINRED);
1439 
1440 	was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1441 
1442 	if (!change_pif_flags(pi, IFF_OFFLINE, IFF_INACTIVE))
1443 		return (IPMP_FAILURE);
1444 
1445 	/*
1446 	 * The interface is now offline, so stop probing it.  Note that
1447 	 * if_mpadm(8) will down the test addresses, after receiving a
1448 	 * success reply from us. The routing socket message will then make us
1449 	 * close the socket used for sending probes. But it is more logical
1450 	 * that an offlined interface must not be probed, even if it has test
1451 	 * addresses.
1452 	 *
1453 	 * NOTE: stop_probing() also sets PI_OFFLINE.
1454 	 */
1455 	stop_probing(pi);
1456 
1457 	/*
1458 	 * If we're offlining the phyint because it has a duplicate hardware
1459 	 * address, print a warning -- and leave the link open so that we can
1460 	 * be notified of hardware address changes that make it usable again.
1461 	 * Otherwise, close the link so that we won't prevent a detach.
1462 	 */
1463 	if (pi->pi_hwaddrdup) {
1464 		logerr("IP interface %s has a hardware address which is not "
1465 		    "unique in group %s; offlining\n", pi->pi_name,
1466 		    pg->pg_name);
1467 	} else {
1468 		phyint_link_close(pi);
1469 	}
1470 
1471 	/*
1472 	 * If this phyint was preventing another phyint with a duplicate
1473 	 * hardware address from being online, bring that one online now.
1474 	 */
1475 	if (!pi->pi_hwaddrdup &&
1476 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1477 		assert(pi2->pi_hwaddrdup);
1478 		(void) phyint_undo_offline(pi2);
1479 	}
1480 
1481 	/*
1482 	 * If this interface was active, try to activate another INACTIVE
1483 	 * interface in the group.
1484 	 */
1485 	if (was_active)
1486 		phyint_activate_another(pi);
1487 
1488 	return (IPMP_SUCCESS);
1489 }
1490 
1491 /*
1492  * Undo a previous offline of `pi'.  Returns an IPMP error code.
1493  */
1494 int
phyint_undo_offline(struct phyint * pi)1495 phyint_undo_offline(struct phyint *pi)
1496 {
1497 	if (pi->pi_state != PI_OFFLINE) {
1498 		errno = EINVAL;
1499 		return (IPMP_FAILURE);
1500 	}
1501 
1502 	/*
1503 	 * If necessary, reinitialize our link information and verify that its
1504 	 * hardware address is still unique across the group.
1505 	 */
1506 	if (pi->pi_dh == NULL && !phyint_link_init(pi)) {
1507 		errno = EIO;
1508 		return (IPMP_FAILURE);
1509 	}
1510 
1511 	if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
1512 		pi->pi_hwaddrdup = _B_TRUE;
1513 		return (IPMP_EHWADDRDUP);
1514 	}
1515 
1516 	if (pi->pi_hwaddrdup) {
1517 		logerr("IP interface %s now has a unique hardware address in "
1518 		    "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name);
1519 		pi->pi_hwaddrdup = _B_FALSE;
1520 	}
1521 
1522 	if (!change_pif_flags(pi, 0, IFF_OFFLINE))
1523 		return (IPMP_FAILURE);
1524 
1525 	/*
1526 	 * While the interface was offline, it may have failed (e.g. the link
1527 	 * may have gone down).  phyint_inst_check_for_failure() will have
1528 	 * already set pi_flags with IFF_FAILED, so we can use that to decide
1529 	 * whether the phyint should transition to running.  Note that after
1530 	 * we transition to running, we will start sending probes again (if
1531 	 * test addresses are configured), which may also reveal that the
1532 	 * interface is in fact failed.
1533 	 */
1534 	if (pi->pi_flags & IFF_FAILED) {
1535 		phyint_chstate(pi, PI_FAILED);
1536 	} else {
1537 		/* calls phyint_chstate() */
1538 		phyint_transition_to_running(pi);
1539 	}
1540 
1541 	/*
1542 	 * Give the requestor time to configure test addresses before
1543 	 * complaining that they're missing.
1544 	 */
1545 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
1546 
1547 	return (IPMP_SUCCESS);
1548 }
1549 
1550 /*
1551  * Delete (unlink and free), the phyint instance.
1552  */
1553 void
phyint_inst_delete(struct phyint_instance * pii)1554 phyint_inst_delete(struct phyint_instance *pii)
1555 {
1556 	struct phyint *pi = pii->pii_phyint;
1557 
1558 	assert(pi != NULL);
1559 
1560 	if (debug & D_PHYINT) {
1561 		logdebug("phyint_inst_delete(%s %s)\n",
1562 		    AF_STR(pii->pii_af), pi->pi_name);
1563 	}
1564 
1565 	/*
1566 	 * If the phyint instance has associated probe targets
1567 	 * delete all the targets
1568 	 */
1569 	while (pii->pii_targets != NULL)
1570 		target_delete(pii->pii_targets);
1571 
1572 	/*
1573 	 * Delete all the logints associated with this phyint
1574 	 * instance.
1575 	 */
1576 	while (pii->pii_logint != NULL)
1577 		logint_delete(pii->pii_logint);
1578 
1579 	/*
1580 	 * Close the socket used to send probes to targets from this phyint.
1581 	 */
1582 	if (pii->pii_probe_sock != -1)
1583 		close_probe_socket(pii, _B_TRUE);
1584 
1585 	/*
1586 	 * Phyint instance must be in the list of all phyint instances.
1587 	 * Remove phyint instance from the global list of phyint instances.
1588 	 */
1589 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1590 	if (pii->pii_prev == NULL) {
1591 		/* Phyint is the 1st in the list */
1592 		phyint_instances = pii->pii_next;
1593 	} else {
1594 		pii->pii_prev->pii_next = pii->pii_next;
1595 	}
1596 	if (pii->pii_next != NULL)
1597 		pii->pii_next->pii_prev = pii->pii_prev;
1598 	pii->pii_next = NULL;
1599 	pii->pii_prev = NULL;
1600 
1601 	/*
1602 	 * Reset the phyint instance pointer in the phyint.
1603 	 * If this is the last phyint instance (being deleted) on this
1604 	 * phyint, then delete the phyint.
1605 	 */
1606 	if (pii->pii_af == AF_INET)
1607 		pi->pi_v4 = NULL;
1608 	else
1609 		pi->pi_v6 = NULL;
1610 
1611 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1612 		phyint_delete(pi);
1613 
1614 	free(pii);
1615 }
1616 
1617 static void
phyint_inst_print(struct phyint_instance * pii)1618 phyint_inst_print(struct phyint_instance *pii)
1619 {
1620 	struct logint *li;
1621 	struct target *tg;
1622 	char abuf[INET6_ADDRSTRLEN];
1623 	int most_recent;
1624 	int i;
1625 
1626 	if (pii->pii_phyint == NULL) {
1627 		logdebug("pii->pi_phyint NULL can't print\n");
1628 		return;
1629 	}
1630 
1631 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1632 	    "sock %x in_use %d\n",
1633 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1634 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1635 	    pii->pii_in_use);
1636 
1637 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1638 		logint_print(li);
1639 
1640 	logdebug("\n");
1641 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1642 		target_print(tg);
1643 
1644 	if (pii->pii_targets == NULL)
1645 		logdebug("pi_targets NULL\n");
1646 
1647 	if (pii->pii_target_next != NULL) {
1648 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1649 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1650 		    abuf, sizeof (abuf)));
1651 	} else {
1652 		logdebug("pi_target_next NULL\n");
1653 	}
1654 
1655 	if (pii->pii_rtt_target_next != NULL) {
1656 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1657 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1658 		    abuf, sizeof (abuf)));
1659 	} else {
1660 		logdebug("pi_rtt_target_next NULL\n");
1661 	}
1662 
1663 	if (pii->pii_targets != NULL) {
1664 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1665 
1666 		i = most_recent;
1667 		do {
1668 			if (pii->pii_probes[i].pr_target != NULL) {
1669 				logdebug("#%d target %s ", i,
1670 				    pr_addr(pii->pii_af,
1671 				    pii->pii_probes[i].pr_target->tg_address,
1672 				    abuf, sizeof (abuf)));
1673 			} else {
1674 				logdebug("#%d target NULL ", i);
1675 			}
1676 			logdebug("time_start %lld status %d "
1677 			    "time_ackproc %lld time_lost %u",
1678 			    pii->pii_probes[i].pr_hrtime_start,
1679 			    pii->pii_probes[i].pr_status,
1680 			    pii->pii_probes[i].pr_hrtime_ackproc,
1681 			    pii->pii_probes[i].pr_time_lost);
1682 			i = PROBE_INDEX_PREV(i);
1683 		} while (i != most_recent);
1684 	}
1685 }
1686 
1687 /*
1688  * Lookup a logint based on the logical interface name, on the given
1689  * phyint instance.
1690  */
1691 static struct logint *
logint_lookup(struct phyint_instance * pii,char * name)1692 logint_lookup(struct phyint_instance *pii, char *name)
1693 {
1694 	struct logint *li;
1695 
1696 	if (debug & D_LOGINT) {
1697 		logdebug("logint_lookup(%s, %s)\n",
1698 		    AF_STR(pii->pii_af), name);
1699 	}
1700 
1701 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1702 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1703 			break;
1704 	}
1705 	return (li);
1706 }
1707 
1708 /*
1709  * Insert a logint at the head of the list of logints of the given
1710  * phyint instance
1711  */
1712 static void
logint_insert(struct phyint_instance * pii,struct logint * li)1713 logint_insert(struct phyint_instance *pii, struct logint *li)
1714 {
1715 	li->li_next = pii->pii_logint;
1716 	li->li_prev = NULL;
1717 	if (pii->pii_logint != NULL)
1718 		pii->pii_logint->li_prev = li;
1719 	pii->pii_logint = li;
1720 	li->li_phyint_inst = pii;
1721 }
1722 
1723 /*
1724  * Create a new named logint, on the specified phyint instance.
1725  */
1726 static struct logint *
logint_create(struct phyint_instance * pii,char * name)1727 logint_create(struct phyint_instance *pii, char *name)
1728 {
1729 	struct logint *li;
1730 
1731 	if (debug & D_LOGINT) {
1732 		logdebug("logint_create(%s %s %s)\n",
1733 		    AF_STR(pii->pii_af), pii->pii_name, name);
1734 	}
1735 
1736 	li = calloc(1, sizeof (struct logint));
1737 	if (li == NULL) {
1738 		logperror("logint_create: calloc");
1739 		return (NULL);
1740 	}
1741 
1742 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1743 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1744 	logint_insert(pii, li);
1745 	return (li);
1746 }
1747 
1748 /*
1749  * Initialize the logint based on the data returned by the kernel.
1750  */
1751 void
logint_init_from_k(struct phyint_instance * pii,char * li_name)1752 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1753 {
1754 	int	ifsock;
1755 	uint64_t flags;
1756 	uint64_t saved_flags;
1757 	struct	logint	*li;
1758 	struct lifreq	lifr;
1759 	struct in6_addr	test_subnet;
1760 	struct in6_addr	testaddr;
1761 	int	test_subnet_len;
1762 	struct sockaddr_in6	*sin6;
1763 	struct sockaddr_in	*sin;
1764 	char abuf[INET6_ADDRSTRLEN];
1765 	boolean_t  ptp = _B_FALSE;
1766 	struct in6_addr tgaddr;
1767 
1768 	if (debug & D_LOGINT) {
1769 		logdebug("logint_init_from_k(%s %s)\n",
1770 		    AF_STR(pii->pii_af), li_name);
1771 	}
1772 
1773 	/* Get the socket for doing ioctls */
1774 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1775 
1776 	/*
1777 	 * Get the flags from the kernel. Also serves as a check whether
1778 	 * the logical still exists. If it doesn't exist, no need to proceed
1779 	 * any further. li_in_use will make the caller clean up the logint
1780 	 */
1781 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1782 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1783 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1784 		/* Interface may have vanished */
1785 		if (errno != ENXIO) {
1786 			logperror_pii(pii, "logint_init_from_k: "
1787 			    "ioctl (get flags)");
1788 		}
1789 		return;
1790 	}
1791 
1792 	flags = lifr.lifr_flags;
1793 
1794 	/*
1795 	 * Verified the logint exists. Now lookup the logint in our tables.
1796 	 * If it does not exist, create a new logint.
1797 	 */
1798 	li = logint_lookup(pii, li_name);
1799 	if (li == NULL) {
1800 		li = logint_create(pii, li_name);
1801 		if (li == NULL) {
1802 			/*
1803 			 * Pretend the interface does not exist
1804 			 * in the kernel
1805 			 */
1806 			return;
1807 		}
1808 	}
1809 
1810 	/*
1811 	 * Update li->li_flags with the new flags, after saving the old
1812 	 * value. This is used later to check what flags has changed and
1813 	 * take any action
1814 	 */
1815 	saved_flags = li->li_flags;
1816 	li->li_flags = flags;
1817 
1818 	/*
1819 	 * Get the address, prefix, prefixlength and update the logint.
1820 	 * Check if anything has changed. If the logint used for the
1821 	 * test address has changed, take suitable action.
1822 	 */
1823 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1824 		/* Interface may have vanished */
1825 		if (errno != ENXIO) {
1826 			logperror_li(li, "logint_init_from_k: (get addr)");
1827 		}
1828 		goto error;
1829 	}
1830 
1831 	if (pii->pii_af == AF_INET) {
1832 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1833 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1834 	} else {
1835 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1836 		testaddr = sin6->sin6_addr;
1837 	}
1838 
1839 	if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1840 		/* Interface may have vanished */
1841 		if (errno != ENXIO)
1842 			logperror_li(li, "logint_init_from_k: (get subnet)");
1843 		goto error;
1844 	}
1845 	if (lifr.lifr_subnet.ss_family == AF_INET6) {
1846 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1847 		test_subnet = sin6->sin6_addr;
1848 		test_subnet_len = lifr.lifr_addrlen;
1849 	} else {
1850 		sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1851 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1852 		test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS);
1853 	}
1854 
1855 	/*
1856 	 * If this is the logint corresponding to the test address used for
1857 	 * sending probes, then if anything significant has changed we need to
1858 	 * determine the test address again.  We ignore changes to the
1859 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1860 	 * course.
1861 	 */
1862 	if (pii->pii_probe_logint == li) {
1863 		if (((li->li_flags ^ saved_flags) &
1864 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1865 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1866 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1867 		    &li->li_subnet)) ||
1868 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1869 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1870 			/*
1871 			 * Something significant that affects the testaddress
1872 			 * has changed. Redo the testaddress selection later on
1873 			 * in select_test_ifs(). For now do the cleanup and
1874 			 * set pii_probe_logint to NULL.
1875 			 */
1876 			if (pii->pii_probe_sock != -1)
1877 				close_probe_socket(pii, _B_TRUE);
1878 			pii->pii_probe_logint = NULL;
1879 		}
1880 	}
1881 
1882 
1883 	/* Update the logint with the values obtained from the kernel.	*/
1884 	li->li_addr = testaddr;
1885 	li->li_in_use = 1;
1886 	if (ptp) {
1887 		li->li_dstaddr = tgaddr;
1888 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1889 		    IP_ABITS : IPV6_ABITS;
1890 	} else {
1891 		li->li_subnet = test_subnet;
1892 		li->li_subnet_len = test_subnet_len;
1893 	}
1894 
1895 	if (debug & D_LOGINT)
1896 		logint_print(li);
1897 
1898 	return;
1899 
1900 error:
1901 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1902 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1903 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1904 	logint_delete(li);
1905 }
1906 
1907 /*
1908  * Delete (unlink and free) a logint.
1909  */
1910 void
logint_delete(struct logint * li)1911 logint_delete(struct logint *li)
1912 {
1913 	struct phyint_instance *pii;
1914 
1915 	pii = li->li_phyint_inst;
1916 	assert(pii != NULL);
1917 
1918 	if (debug & D_LOGINT) {
1919 		int af;
1920 		char abuf[INET6_ADDRSTRLEN];
1921 
1922 		af = pii->pii_af;
1923 		logdebug("logint_delete(%s %s %s/%u)\n",
1924 		    AF_STR(af), li->li_name,
1925 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1926 		    li->li_subnet_len);
1927 	}
1928 
1929 	/* logint must be in the list of logints */
1930 	assert(pii->pii_logint == li || li->li_prev != NULL);
1931 
1932 	/* Remove the logint from the list of logints  */
1933 	if (li->li_prev == NULL) {
1934 		/* logint is the 1st in the list */
1935 		pii->pii_logint = li->li_next;
1936 	} else {
1937 		li->li_prev->li_next = li->li_next;
1938 	}
1939 	if (li->li_next != NULL)
1940 		li->li_next->li_prev = li->li_prev;
1941 	li->li_next = NULL;
1942 	li->li_prev = NULL;
1943 
1944 	/*
1945 	 * If this logint is also being used for probing, then close the
1946 	 * associated socket, if it exists.
1947 	 */
1948 	if (pii->pii_probe_logint == li) {
1949 		if (pii->pii_probe_sock != -1)
1950 			close_probe_socket(pii, _B_TRUE);
1951 		pii->pii_probe_logint = NULL;
1952 	}
1953 
1954 	free(li);
1955 }
1956 
1957 static void
logint_print(struct logint * li)1958 logint_print(struct logint *li)
1959 {
1960 	char abuf[INET6_ADDRSTRLEN];
1961 	int af = li->li_phyint_inst->pii_af;
1962 
1963 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1964 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1965 
1966 	logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use);
1967 }
1968 
/*
 * Format `addr' as a printable string in the `len'-byte buffer `abuf' and
 * return `abuf'.  If `af' is AF_INET, `addr' is taken to be an IPv4-mapped
 * address and its embedded IPv4 address is formatted; for any other `af'
 * it is formatted as an IPv6 address.
 *
 * If the conversion fails (for instance because `len' is too small), `abuf'
 * is set to the empty string, so that the result is always safe to hand to
 * a "%s" conversion.
 */
char *
pr_addr(int af, struct in6_addr addr, char *abuf, int len)
{
	struct in_addr	addr_v4;
	const char	*res;

	if (af == AF_INET) {
		IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4);
		res = inet_ntop(AF_INET, &addr_v4, abuf, len);
	} else {
		res = inet_ntop(AF_INET6, &addr, abuf, len);
	}

	/* On failure inet_ntop() leaves the buffer contents unspecified. */
	if (res == NULL && len > 0)
		abuf[0] = '\0';

	return (abuf);
}
1982 
/*
 * Fill in the sockaddr_storage pointed to by `ssp' with the IP address
 * represented by the [`af',`addr'] pair.  Needed because in.mpathd internally
 * stores all addresses as in6_addrs, but we don't want to expose that.
 *
 * `af' must be AF_INET or AF_INET6 (asserted).  For AF_INET, `addr' is
 * converted from its IPv4-mapped form.  The entire sockaddr_storage is zeroed
 * first, because callers such as probe_state_event() hand all
 * sizeof (struct sockaddr_storage) bytes to consumers; zeroing only the
 * sockaddr_in/sockaddr_in6 prefix would leave the tail uninitialized.
 */
void
addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp)
{
	struct sockaddr_in *sinp = (struct sockaddr_in *)ssp;
	struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)ssp;

	assert(af == AF_INET || af == AF_INET6);

	(void) memset(ssp, 0, sizeof (*ssp));

	switch (af) {
	case AF_INET:
		sinp->sin_family = AF_INET;
		IN6_V4MAPPED_TO_INADDR(addr, &sinp->sin_addr);
		break;
	case AF_INET6:
		sin6p->sin6_family = AF_INET6;
		sin6p->sin6_addr = *addr;
		break;
	default:
		/* Excluded by the assertion above; storage stays zeroed. */
		break;
	}
}
2009 
2010 /* Lookup target on its address */
2011 struct target *
target_lookup(struct phyint_instance * pii,struct in6_addr addr)2012 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
2013 {
2014 	struct target *tg;
2015 
2016 	if (debug & D_TARGET) {
2017 		char abuf[INET6_ADDRSTRLEN];
2018 
2019 		logdebug("target_lookup(%s %s): addr %s\n",
2020 		    AF_STR(pii->pii_af), pii->pii_name,
2021 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
2022 	}
2023 
2024 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2025 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
2026 			break;
2027 	}
2028 	return (tg);
2029 }
2030 
2031 /*
2032  * Find and return the next active target, for the next probe.
2033  * If no active targets are available, return NULL.
2034  */
2035 struct target *
target_next(struct target * tg)2036 target_next(struct target *tg)
2037 {
2038 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
2039 	struct	target	*marker = tg;
2040 	hrtime_t now;
2041 
2042 	now = gethrtime();
2043 
2044 	/*
2045 	 * Target must be in the list of targets for this phyint
2046 	 * instance.
2047 	 */
2048 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
2049 	assert(pii->pii_targets != NULL);
2050 
2051 	/* Return the next active target */
2052 	do {
2053 		/*
2054 		 * Go to the next target. If we hit the end,
2055 		 * reset the ptr to the head
2056 		 */
2057 		tg = tg->tg_next;
2058 		if (tg == NULL)
2059 			tg = pii->pii_targets;
2060 
2061 		assert(TG_STATUS_VALID(tg->tg_status));
2062 
2063 		switch (tg->tg_status) {
2064 		case TG_ACTIVE:
2065 			return (tg);
2066 
2067 		case TG_UNUSED:
2068 			assert(pii->pii_targets_are_routers);
2069 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
2070 				/*
2071 				 * Bubble up the unused target to active
2072 				 */
2073 				tg->tg_status = TG_ACTIVE;
2074 				pii->pii_ntargets++;
2075 				return (tg);
2076 			}
2077 			break;
2078 
2079 		case TG_SLOW:
2080 			assert(pii->pii_targets_are_routers);
2081 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2082 				/*
2083 				 * Bubble up the slow target to unused
2084 				 */
2085 				tg->tg_status = TG_UNUSED;
2086 			}
2087 			break;
2088 
2089 		case TG_DEAD:
2090 			assert(pii->pii_targets_are_routers);
2091 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2092 				/*
2093 				 * Bubble up the dead target to slow
2094 				 */
2095 				tg->tg_status = TG_SLOW;
2096 				tg->tg_latime = now;
2097 			}
2098 			break;
2099 		}
2100 
2101 	} while (tg != marker);
2102 
2103 	return (NULL);
2104 }
2105 
2106 /*
2107  * Select the best available target, that is not already TG_ACTIVE,
2108  * for the caller. The caller will determine whether it wants to
2109  * make the returned target TG_ACTIVE.
2110  * The selection order is as follows.
2111  * 1. pick a TG_UNSED target, if it exists.
2112  * 2. else pick a TG_SLOW target that has recovered, if it exists
2113  * 3. else pick any TG_SLOW target, if it exists
2114  * 4. else pick a TG_DEAD target that has recovered, if it exists
2115  * 5. else pick any TG_DEAD target, if it exists
2116  * 6. else return null
2117  */
2118 static struct target *
target_select_best(struct phyint_instance * pii)2119 target_select_best(struct phyint_instance *pii)
2120 {
2121 	struct target *tg;
2122 	struct target *slow = NULL;
2123 	struct target *dead = NULL;
2124 	struct target *slow_recovered = NULL;
2125 	struct target *dead_recovered = NULL;
2126 	hrtime_t now;
2127 
2128 	now = gethrtime();
2129 
2130 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2131 		assert(TG_STATUS_VALID(tg->tg_status));
2132 
2133 		switch (tg->tg_status) {
2134 		case TG_UNUSED:
2135 			return (tg);
2136 
2137 		case TG_SLOW:
2138 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2139 				slow_recovered = tg;
2140 				/*
2141 				 * Promote the slow_recovered to unused
2142 				 */
2143 				tg->tg_status = TG_UNUSED;
2144 			} else {
2145 				slow = tg;
2146 			}
2147 			break;
2148 
2149 		case TG_DEAD:
2150 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2151 				dead_recovered = tg;
2152 				/*
2153 				 * Promote the dead_recovered to slow
2154 				 */
2155 				tg->tg_status = TG_SLOW;
2156 				tg->tg_latime = now;
2157 			} else {
2158 				dead = tg;
2159 			}
2160 			break;
2161 
2162 		default:
2163 			break;
2164 		}
2165 	}
2166 
2167 	if (slow_recovered != NULL)
2168 		return (slow_recovered);
2169 	else if (slow != NULL)
2170 		return (slow);
2171 	else if (dead_recovered != NULL)
2172 		return (dead_recovered);
2173 	else
2174 		return (dead);
2175 }
2176 
2177 /*
2178  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
2179  * that are active, pick the next best below.
2180  */
2181 static void
target_activate_all(struct phyint_instance * pii)2182 target_activate_all(struct phyint_instance *pii)
2183 {
2184 	struct target *tg;
2185 
2186 	assert(pii->pii_ntargets == 0);
2187 	assert(pii->pii_target_next == NULL);
2188 	assert(pii->pii_rtt_target_next == NULL);
2189 	assert(pii->pii_targets_are_routers);
2190 
2191 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
2192 		tg = target_select_best(pii);
2193 		if (tg == NULL) {
2194 			/* We are out of targets */
2195 			return;
2196 		}
2197 
2198 		assert(TG_STATUS_VALID(tg->tg_status));
2199 		assert(tg->tg_status != TG_ACTIVE);
2200 		tg->tg_status = TG_ACTIVE;
2201 		pii->pii_ntargets++;
2202 		if (pii->pii_target_next == NULL) {
2203 			pii->pii_target_next = tg;
2204 			pii->pii_rtt_target_next = tg;
2205 		}
2206 	}
2207 }
2208 
2209 static struct target *
target_first(struct phyint_instance * pii)2210 target_first(struct phyint_instance *pii)
2211 {
2212 	struct target *tg;
2213 
2214 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2215 		assert(TG_STATUS_VALID(tg->tg_status));
2216 		if (tg->tg_status == TG_ACTIVE)
2217 			break;
2218 	}
2219 
2220 	return (tg);
2221 }
2222 
2223 /*
2224  * Create a default target entry.
2225  */
2226 void
target_create(struct phyint_instance * pii,struct in6_addr addr,boolean_t is_router)2227 target_create(struct phyint_instance *pii, struct in6_addr addr,
2228     boolean_t is_router)
2229 {
2230 	struct target *tg;
2231 	struct phyint *pi;
2232 	struct logint *li;
2233 
2234 	if (debug & D_TARGET) {
2235 		char abuf[INET6_ADDRSTRLEN];
2236 
2237 		logdebug("target_create(%s %s, %s)\n",
2238 		    AF_STR(pii->pii_af), pii->pii_name,
2239 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
2240 	}
2241 
2242 	/*
2243 	 * If the test address is not yet initialized, do not add
2244 	 * any target, since we cannot determine whether the target
2245 	 * belongs to the same subnet as the test address.
2246 	 */
2247 	li = pii->pii_probe_logint;
2248 	if (li == NULL)
2249 		return;
2250 
2251 	/*
2252 	 * If there are multiple subnets associated with an interface, then
2253 	 * add the target to this phyint instance only if it belongs to the
2254 	 * same subnet as the test address.  This assures us that we will
2255 	 * be able to reach this target through our routing table.
2256 	 */
2257 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
2258 		return;
2259 
2260 	if (pii->pii_targets != NULL) {
2261 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
2262 		if (is_router) {
2263 			if (!pii->pii_targets_are_routers) {
2264 				/*
2265 				 * Prefer router over hosts. Using hosts is a
2266 				 * fallback mechanism, hence delete all host
2267 				 * targets.
2268 				 */
2269 				while (pii->pii_targets != NULL)
2270 					target_delete(pii->pii_targets);
2271 			}
2272 		} else {
2273 			/*
2274 			 * Routers take precedence over hosts. If this
2275 			 * is a router list and we are trying to add a
2276 			 * host, just return. If this is a host list
2277 			 * and if we have sufficient targets, just return
2278 			 */
2279 			if (pii->pii_targets_are_routers ||
2280 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
2281 				return;
2282 		}
2283 	}
2284 
2285 	tg = calloc(1, sizeof (struct target));
2286 	if (tg == NULL) {
2287 		logperror("target_create: calloc");
2288 		return;
2289 	}
2290 
2291 	tg->tg_phyint_inst = pii;
2292 	tg->tg_address = addr;
2293 	tg->tg_in_use = 1;
2294 	tg->tg_rtt_sa = -1;
2295 	tg->tg_num_deferred = 0;
2296 
2297 	/*
2298 	 * If this is the first target, set 'pii_targets_are_routers'
2299 	 * The list of targets is either a list of hosts or list or
2300 	 * routers, but not a mix.
2301 	 */
2302 	if (pii->pii_targets == NULL) {
2303 		assert(pii->pii_ntargets == 0);
2304 		assert(pii->pii_target_next == NULL);
2305 		assert(pii->pii_rtt_target_next == NULL);
2306 		pii->pii_targets_are_routers = is_router ? 1 : 0;
2307 	}
2308 
2309 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
2310 		assert(pii->pii_targets_are_routers);
2311 		assert(pii->pii_target_next != NULL);
2312 		assert(pii->pii_rtt_target_next != NULL);
2313 		tg->tg_status = TG_UNUSED;
2314 	} else {
2315 		if (pii->pii_ntargets == 0) {
2316 			assert(pii->pii_target_next == NULL);
2317 			pii->pii_target_next = tg;
2318 			pii->pii_rtt_target_next = tg;
2319 		}
2320 		pii->pii_ntargets++;
2321 		tg->tg_status = TG_ACTIVE;
2322 	}
2323 
2324 	target_insert(pii, tg);
2325 
2326 	/*
2327 	 * Change state to PI_RUNNING if this phyint instance is capable of
2328 	 * sending and receiving probes -- that is, if we know of at least 1
2329 	 * target, and this phyint instance is probe-capable.  For more
2330 	 * details, see the phyint state diagram in mpd_probe.c.
2331 	 */
2332 	pi = pii->pii_phyint;
2333 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
2334 		if (pi->pi_flags & IFF_FAILED)
2335 			phyint_chstate(pi, PI_FAILED);
2336 		else
2337 			phyint_chstate(pi, PI_RUNNING);
2338 	}
2339 }
2340 
2341 /*
2342  * Add the target address named by `addr' to phyint instance `pii' if it does
2343  * not already exist.  If the target is a router, `is_router' should be set to
2344  * B_TRUE.
2345  */
2346 void
target_add(struct phyint_instance * pii,struct in6_addr addr,boolean_t is_router)2347 target_add(struct phyint_instance *pii, struct in6_addr addr,
2348     boolean_t is_router)
2349 {
2350 	struct target *tg;
2351 
2352 	if (pii == NULL)
2353 		return;
2354 
2355 	tg = target_lookup(pii, addr);
2356 
2357 	/*
2358 	 * If the target does not exist, create it; target_create() will set
2359 	 * tg_in_use to true.  Even if it exists already, if it's a router
2360 	 * target and we'd previously learned of it through multicast, then we
2361 	 * need to recreate it as a router target.  Otherwise, just set
2362 	 * tg_in_use to to true so that init_router_targets() won't delete it.
2363 	 */
2364 	if (tg == NULL || (is_router && !pii->pii_targets_are_routers))
2365 		target_create(pii, addr, is_router);
2366 	else if (is_router)
2367 		tg->tg_in_use = 1;
2368 }
2369 
2370 /*
2371  * Insert target at head of linked list of targets for the associated
2372  * phyint instance
2373  */
2374 static void
target_insert(struct phyint_instance * pii,struct target * tg)2375 target_insert(struct phyint_instance *pii, struct target *tg)
2376 {
2377 	tg->tg_next = pii->pii_targets;
2378 	tg->tg_prev = NULL;
2379 	if (tg->tg_next != NULL)
2380 		tg->tg_next->tg_prev = tg;
2381 	pii->pii_targets = tg;
2382 }
2383 
2384 /*
2385  * Delete a target (unlink and free).
2386  */
2387 void
target_delete(struct target * tg)2388 target_delete(struct target *tg)
2389 {
2390 	int af;
2391 	struct phyint_instance	*pii;
2392 	struct phyint_instance	*pii_other;
2393 
2394 	pii = tg->tg_phyint_inst;
2395 	af = pii->pii_af;
2396 
2397 	if (debug & D_TARGET) {
2398 		char abuf[INET6_ADDRSTRLEN];
2399 
2400 		logdebug("target_delete(%s %s, %s)\n",
2401 		    AF_STR(af), pii->pii_name,
2402 		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
2403 	}
2404 
2405 	/*
2406 	 * Target must be in the list of targets for this phyint
2407 	 * instance.
2408 	 */
2409 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
2410 
2411 	/*
2412 	 * Reset all references to 'tg' in the probe information
2413 	 * for this phyint.
2414 	 */
2415 	reset_pii_probes(pii, tg);
2416 
2417 	/*
2418 	 * Remove this target from the list of targets of this
2419 	 * phyint instance.
2420 	 */
2421 	if (tg->tg_prev == NULL) {
2422 		pii->pii_targets = tg->tg_next;
2423 	} else {
2424 		tg->tg_prev->tg_next = tg->tg_next;
2425 	}
2426 
2427 	if (tg->tg_next != NULL)
2428 		tg->tg_next->tg_prev = tg->tg_prev;
2429 
2430 	tg->tg_next = NULL;
2431 	tg->tg_prev = NULL;
2432 
2433 	if (tg->tg_status == TG_ACTIVE)
2434 		pii->pii_ntargets--;
2435 
2436 	/*
2437 	 * Adjust the next target to probe, if it points to
2438 	 * to the currently deleted target.
2439 	 */
2440 	if (pii->pii_target_next == tg)
2441 		pii->pii_target_next = target_first(pii);
2442 
2443 	if (pii->pii_rtt_target_next == tg)
2444 		pii->pii_rtt_target_next = target_first(pii);
2445 
2446 	free(tg);
2447 
2448 	/*
2449 	 * The number of active targets pii_ntargets == 0 iff
2450 	 * the next active target pii->pii_target_next == NULL
2451 	 */
2452 	if (pii->pii_ntargets != 0) {
2453 		assert(pii->pii_target_next != NULL);
2454 		assert(pii->pii_rtt_target_next != NULL);
2455 		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2456 		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
2457 		return;
2458 	}
2459 
2460 	/* At this point, we don't have any active targets. */
2461 	assert(pii->pii_target_next == NULL);
2462 	assert(pii->pii_rtt_target_next == NULL);
2463 
2464 	if (pii->pii_targets_are_routers) {
2465 		/*
2466 		 * Activate any TG_SLOW or TG_DEAD router targets,
2467 		 * since we don't have any other targets
2468 		 */
2469 		target_activate_all(pii);
2470 
2471 		if (pii->pii_ntargets != 0) {
2472 			assert(pii->pii_target_next != NULL);
2473 			assert(pii->pii_rtt_target_next != NULL);
2474 			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2475 			assert(pii->pii_rtt_target_next->tg_status ==
2476 			    TG_ACTIVE);
2477 			return;
2478 		}
2479 	}
2480 
2481 	/*
2482 	 * If we still don't have any active targets, the list must
2483 	 * must be really empty. There aren't even TG_SLOW or TG_DEAD
2484 	 * targets. Zero out the probe stats since it will not be
2485 	 * relevant any longer.
2486 	 */
2487 	assert(pii->pii_targets == NULL);
2488 	pii->pii_targets_are_routers = _B_FALSE;
2489 	clear_pii_probe_stats(pii);
2490 	pii_other = phyint_inst_other(pii);
2491 
2492 	/*
2493 	 * If there are no targets on both instances and the interface would
2494 	 * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state,
2495 	 * since we cannot probe this phyint any more.  For more details,
2496 	 * please see phyint state diagram in mpd_probe.c.
2497 	 */
2498 	if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) &&
2499 	    pii->pii_phyint->pi_state != PI_OFFLINE)
2500 		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
2501 }
2502 
2503 /*
2504  * Flush the target list of every phyint in the group, if the list
2505  * is a host target list. This is called if group failure is suspected.
2506  * If all targets have failed, multicast will subsequently discover new
2507  * targets. Else it is a group failure.
2508  * Note: This function is a no-op if the list is a router target list.
2509  */
2510 static void
target_flush_hosts(struct phyint_group * pg)2511 target_flush_hosts(struct phyint_group *pg)
2512 {
2513 	struct phyint *pi;
2514 	struct phyint_instance *pii;
2515 
2516 	if (debug & D_TARGET)
2517 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2518 
2519 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2520 		pii = pi->pi_v4;
2521 		if (pii != NULL && !pii->pii_targets_are_routers) {
2522 			/*
2523 			 * Delete all the targets. When the list becomes
2524 			 * empty, target_delete() will set pii->pii_targets
2525 			 * to NULL.
2526 			 */
2527 			while (pii->pii_targets != NULL)
2528 				target_delete(pii->pii_targets);
2529 		}
2530 		pii = pi->pi_v6;
2531 		if (pii != NULL && !pii->pii_targets_are_routers) {
2532 			/*
2533 			 * Delete all the targets. When the list becomes
2534 			 * empty, target_delete() will set pii->pii_targets
2535 			 * to NULL.
2536 			 */
2537 			while (pii->pii_targets != NULL)
2538 				target_delete(pii->pii_targets);
2539 		}
2540 	}
2541 }
2542 
2543 /*
2544  * Reset all references to 'target' in the probe info, as this target is
2545  * being deleted. The pr_target field is guaranteed to be non-null if
2546  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2547  * pr_target will not be accessed unconditionally.
2548  */
2549 static void
reset_pii_probes(struct phyint_instance * pii,struct target * tg)2550 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2551 {
2552 	int i;
2553 
2554 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2555 		if (pii->pii_probes[i].pr_target == tg) {
2556 			if (pii->pii_probes[i].pr_status == PR_UNACKED) {
2557 				probe_chstate(&pii->pii_probes[i], pii,
2558 				    PR_LOST);
2559 			}
2560 			pii->pii_probes[i].pr_target = NULL;
2561 		}
2562 	}
2563 }
2564 
2565 /*
2566  * Clear the probe statistics array.
2567  */
2568 void
clear_pii_probe_stats(struct phyint_instance * pii)2569 clear_pii_probe_stats(struct phyint_instance *pii)
2570 {
2571 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2572 	/* Reset the next probe index in the probe stats array */
2573 	pii->pii_probe_next = 0;
2574 }
2575 
2576 static void
target_print(struct target * tg)2577 target_print(struct target *tg)
2578 {
2579 	char	abuf[INET6_ADDRSTRLEN];
2580 	char	buf[128];
2581 	char	buf2[128];
2582 	int	af;
2583 	int	i;
2584 
2585 	af = tg->tg_phyint_inst->pii_af;
2586 
2587 	logdebug("Target on %s %s addr %s\n"
2588 	    "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n",
2589 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2590 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2591 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2592 	    tg->tg_crtt, tg->tg_in_use);
2593 
2594 	buf[0] = '\0';
2595 	for (i = 0; i < tg->tg_num_deferred; i++) {
2596 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2597 		    tg->tg_deferred[i]);
2598 		(void) strlcat(buf, buf2, sizeof (buf));
2599 	}
2600 	logdebug("deferred rtts:%s\n", buf);
2601 }
2602 
2603 void
phyint_inst_print_all(void)2604 phyint_inst_print_all(void)
2605 {
2606 	struct phyint_instance *pii;
2607 
2608 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2609 		phyint_inst_print(pii);
2610 	}
2611 }
2612 
2613 /*
2614  * Compare two prefixes that have the same prefix length.
2615  * Fails if the prefix length is unreasonable.
2616  */
2617 boolean_t
prefix_equal(struct in6_addr p1,struct in6_addr p2,uint_t prefix_len)2618 prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len)
2619 {
2620 	uchar_t mask;
2621 	int j;
2622 
2623 	if (prefix_len > IPV6_ABITS)
2624 		return (_B_FALSE);
2625 
2626 	for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
2627 		if (p1.s6_addr[j] != p2.s6_addr[j])
2628 			return (_B_FALSE);
2629 
2630 	/* Make the N leftmost bits one */
2631 	mask = 0xff << (8 - prefix_len);
2632 	if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask))
2633 		return (_B_FALSE);
2634 
2635 	return (_B_TRUE);
2636 }
2637 
2638 /*
2639  * Get the number of UP logints on phyint `pi'.
2640  */
2641 static int
logint_upcount(struct phyint * pi)2642 logint_upcount(struct phyint *pi)
2643 {
2644 	struct	logint	*li;
2645 	int count = 0;
2646 
2647 	if (pi->pi_v4 != NULL) {
2648 		for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) {
2649 			if (li->li_flags & IFF_UP)
2650 				count++;
2651 		}
2652 	}
2653 
2654 	if (pi->pi_v6 != NULL) {
2655 		for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) {
2656 			if (li->li_flags & IFF_UP)
2657 				count++;
2658 		}
2659 	}
2660 
2661 	return (count);
2662 }
2663 
2664 /*
2665  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2666  */
2667 struct phyint_instance *
phyint_inst_other(struct phyint_instance * pii)2668 phyint_inst_other(struct phyint_instance *pii)
2669 {
2670 	if (pii->pii_af == AF_INET)
2671 		return (pii->pii_phyint->pi_v6);
2672 	else
2673 		return (pii->pii_phyint->pi_v4);
2674 }
2675 
2676 /*
2677  * Check whether a phyint is functioning.
2678  */
2679 boolean_t
phyint_is_functioning(struct phyint * pi)2680 phyint_is_functioning(struct phyint *pi)
2681 {
2682 	if (pi->pi_state == PI_RUNNING)
2683 		return (_B_TRUE);
2684 	return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED));
2685 }
2686 
2687 /*
2688  * Check whether a phyint is usable.
2689  */
2690 boolean_t
phyint_is_usable(struct phyint * pi)2691 phyint_is_usable(struct phyint *pi)
2692 {
2693 	if (logint_upcount(pi) == 0)
2694 		return (_B_FALSE);
2695 	return (phyint_is_functioning(pi));
2696 }
2697 
2698 /*
2699  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2700  * Before sending the event, it prepends the current version of the IPMP
2701  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2702  * `nvl' is freed).
2703  */
2704 static int
post_event(const char * subclass,nvlist_t * nvl)2705 post_event(const char *subclass, nvlist_t *nvl)
2706 {
2707 	static evchan_t *evchp = NULL;
2708 
2709 	/*
2710 	 * Initialize the event channel if we haven't already done so.
2711 	 */
2712 	if (evchp == NULL) {
2713 		errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT);
2714 		if (errno != 0) {
2715 			logerr("cannot create event channel `%s': %s\n",
2716 			    IPMP_EVENT_CHAN, strerror(errno));
2717 			goto failed;
2718 		}
2719 	}
2720 
2721 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2722 	    IPMP_EVENT_CUR_VERSION);
2723 	if (errno != 0) {
2724 		logerr("cannot create `%s' event: %s", subclass,
2725 		    strerror(errno));
2726 		goto failed;
2727 	}
2728 
2729 	errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun",
2730 	    "in.mpathd", nvl, EVCH_NOSLEEP);
2731 	if (errno != 0) {
2732 		logerr("cannot send `%s' event: %s\n", subclass,
2733 		    strerror(errno));
2734 		goto failed;
2735 	}
2736 
2737 	nvlist_free(nvl);
2738 	return (0);
2739 failed:
2740 	nvlist_free(nvl);
2741 	return (-1);
2742 }
2743 
2744 /*
2745  * Return the external IPMP state associated with phyint `pi'.
2746  */
2747 static ipmp_if_state_t
ifstate(struct phyint * pi)2748 ifstate(struct phyint *pi)
2749 {
2750 	switch (pi->pi_state) {
2751 	case PI_INIT:
2752 		return (IPMP_IF_UNKNOWN);
2753 
2754 	case PI_NOTARGETS:
2755 		if (pi->pi_flags & IFF_FAILED)
2756 			return (IPMP_IF_FAILED);
2757 		return (IPMP_IF_UNKNOWN);
2758 
2759 	case PI_OFFLINE:
2760 		return (IPMP_IF_OFFLINE);
2761 
2762 	case PI_FAILED:
2763 		return (IPMP_IF_FAILED);
2764 
2765 	case PI_RUNNING:
2766 		return (IPMP_IF_OK);
2767 	}
2768 
2769 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2770 	abort();
2771 	/* NOTREACHED */
2772 }
2773 
2774 /*
2775  * Return the external IPMP interface type associated with phyint `pi'.
2776  */
2777 static ipmp_if_type_t
iftype(struct phyint * pi)2778 iftype(struct phyint *pi)
2779 {
2780 	if (pi->pi_flags & IFF_STANDBY)
2781 		return (IPMP_IF_STANDBY);
2782 	else
2783 		return (IPMP_IF_NORMAL);
2784 }
2785 
2786 /*
2787  * Return the external IPMP link state associated with phyint `pi'.
2788  */
2789 static ipmp_if_linkstate_t
iflinkstate(struct phyint * pi)2790 iflinkstate(struct phyint *pi)
2791 {
2792 	if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN)))
2793 		return (IPMP_LINK_UNKNOWN);
2794 
2795 	return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP);
2796 }
2797 
2798 /*
2799  * Return the external IPMP probe state associated with phyint `pi'.
2800  */
2801 static ipmp_if_probestate_t
ifprobestate(struct phyint * pi)2802 ifprobestate(struct phyint *pi)
2803 {
2804 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6))
2805 		return (IPMP_PROBE_DISABLED);
2806 
2807 	if (pi->pi_state == PI_FAILED)
2808 		return (IPMP_PROBE_FAILED);
2809 
2810 	if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6))
2811 		return (IPMP_PROBE_UNKNOWN);
2812 
2813 	return (IPMP_PROBE_OK);
2814 }
2815 
2816 /*
2817  * Return the external IPMP target mode associated with phyint instance `pii'.
2818  */
2819 static ipmp_if_targmode_t
iftargmode(struct phyint_instance * pii)2820 iftargmode(struct phyint_instance *pii)
2821 {
2822 	if (!PROBE_ENABLED(pii))
2823 		return (IPMP_TARG_DISABLED);
2824 	else if (pii->pii_targets_are_routers)
2825 		return (IPMP_TARG_ROUTES);
2826 	else
2827 		return (IPMP_TARG_MULTICAST);
2828 }
2829 
2830 /*
2831  * Return the external IPMP flags associated with phyint `pi'.
2832  */
2833 static ipmp_if_flags_t
ifflags(struct phyint * pi)2834 ifflags(struct phyint *pi)
2835 {
2836 	ipmp_if_flags_t flags = 0;
2837 
2838 	if (logint_upcount(pi) == 0)
2839 		flags |= IPMP_IFFLAG_DOWN;
2840 	if (pi->pi_flags & IFF_INACTIVE)
2841 		flags |= IPMP_IFFLAG_INACTIVE;
2842 	if (pi->pi_hwaddrdup)
2843 		flags |= IPMP_IFFLAG_HWADDRDUP;
2844 	if (phyint_is_functioning(pi) && flags == 0)
2845 		flags |= IPMP_IFFLAG_ACTIVE;
2846 
2847 	return (flags);
2848 }
2849 
2850 /*
2851  * Store the test address used on phyint instance `pii' in `ssp'.  If there's
2852  * no test address, 0.0.0.0 is stored.
2853  */
2854 static struct sockaddr_storage *
iftestaddr(struct phyint_instance * pii,struct sockaddr_storage * ssp)2855 iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp)
2856 {
2857 	if (PROBE_ENABLED(pii))
2858 		addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp);
2859 	else
2860 		addr2storage(AF_INET6, &in6addr_any, ssp);
2861 
2862 	return (ssp);
2863 }
2864 
2865 /*
2866  * Return the external IPMP group state associated with phyint group `pg'.
2867  */
2868 static ipmp_group_state_t
groupstate(struct phyint_group * pg)2869 groupstate(struct phyint_group *pg)
2870 {
2871 	switch (pg->pg_state) {
2872 	case PG_FAILED:
2873 		return (IPMP_GROUP_FAILED);
2874 	case PG_DEGRADED:
2875 		return (IPMP_GROUP_DEGRADED);
2876 	case PG_OK:
2877 		return (IPMP_GROUP_OK);
2878 	}
2879 
2880 	logerr("groupstate: unknown state %d; aborting\n", pg->pg_state);
2881 	abort();
2882 	/* NOTREACHED */
2883 }
2884 
2885 /*
2886  * Return the external IPMP probe state associated with probe `ps'.
2887  */
2888 static ipmp_probe_state_t
probestate(struct probe_stats * ps)2889 probestate(struct probe_stats *ps)
2890 {
2891 	switch (ps->pr_status) {
2892 	case PR_UNUSED:
2893 	case PR_LOST:
2894 		return (IPMP_PROBE_LOST);
2895 	case PR_UNACKED:
2896 		return (IPMP_PROBE_SENT);
2897 	case PR_ACKED:
2898 		return (IPMP_PROBE_ACKED);
2899 	}
2900 
2901 	logerr("probestate: unknown state %d; aborting\n", ps->pr_status);
2902 	abort();
2903 	/* NOTREACHED */
2904 }
2905 
2906 /*
2907  * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr'
2908  * on phyint instance `pii'.  Returns 0 on success, -1 on failure.
2909  */
2910 int
probe_state_event(struct probe_stats * pr,struct phyint_instance * pii)2911 probe_state_event(struct probe_stats *pr, struct phyint_instance *pii)
2912 {
2913 	nvlist_t *nvl;
2914 	hrtime_t proc_time = 0, recv_time = 0;
2915 	struct sockaddr_storage ss;
2916 	struct target *tg = pr->pr_target;
2917 	int64_t rttavg, rttdev;
2918 
2919 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2920 	if (errno != 0) {
2921 		logperror("cannot create `interface change' event");
2922 		return (-1);
2923 	}
2924 
2925 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id);
2926 	if (errno != 0)
2927 		goto failed;
2928 
2929 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name);
2930 	if (errno != 0)
2931 		goto failed;
2932 
2933 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr));
2934 	if (errno != 0)
2935 		goto failed;
2936 
2937 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME,
2938 	    pr->pr_hrtime_start);
2939 	if (errno != 0)
2940 		goto failed;
2941 
2942 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME,
2943 	    pr->pr_hrtime_sent);
2944 	if (errno != 0)
2945 		goto failed;
2946 
2947 	if (pr->pr_status == PR_ACKED) {
2948 		recv_time = pr->pr_hrtime_ackrecv;
2949 		proc_time = pr->pr_hrtime_ackproc;
2950 	}
2951 
2952 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time);
2953 	if (errno != 0)
2954 		goto failed;
2955 
2956 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time);
2957 	if (errno != 0)
2958 		goto failed;
2959 
2960 	if (tg != NULL)
2961 		addr2storage(pii->pii_af, &tg->tg_address, &ss);
2962 	else
2963 		addr2storage(pii->pii_af, &in6addr_any, &ss);
2964 
2965 	errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss,
2966 	    sizeof (ss));
2967 	if (errno != 0)
2968 		goto failed;
2969 
2970 	rttavg = (tg != NULL) ? (tg->tg_rtt_sa / 8) : 0;
2971 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, rttavg);
2972 	if (errno != 0)
2973 		goto failed;
2974 
2975 	rttdev = (tg != NULL) ? (tg->tg_rtt_sd / 4) : 0;
2976 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, rttdev);
2977 	if (errno != 0)
2978 		goto failed;
2979 
2980 	return (post_event(ESC_IPMP_PROBE_STATE, nvl));
2981 failed:
2982 	logperror("cannot create `probe state' event");
2983 	nvlist_free(nvl);
2984 	return (-1);
2985 }
2986 
2987 /*
2988  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2989  * Returns 0 on success, -1 on failure.
2990  */
2991 static int
phyint_group_state_event(struct phyint_group * pg)2992 phyint_group_state_event(struct phyint_group *pg)
2993 {
2994 	nvlist_t	*nvl;
2995 
2996 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2997 	if (errno != 0) {
2998 		logperror("cannot create `group state change' event");
2999 		return (-1);
3000 	}
3001 
3002 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3003 	if (errno != 0)
3004 		goto failed;
3005 
3006 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3007 	if (errno != 0)
3008 		goto failed;
3009 
3010 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
3011 	if (errno != 0)
3012 		goto failed;
3013 
3014 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
3015 failed:
3016 	logperror("cannot create `group state change' event");
3017 	nvlist_free(nvl);
3018 	return (-1);
3019 }
3020 
3021 /*
3022  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
3023  * `pg'.  Returns 0 on success, -1 on failure.
3024  */
3025 static int
phyint_group_change_event(struct phyint_group * pg,ipmp_group_op_t op)3026 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
3027 {
3028 	nvlist_t *nvl;
3029 
3030 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3031 	if (errno != 0) {
3032 		logperror("cannot create `group change' event");
3033 		return (-1);
3034 	}
3035 
3036 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3037 	if (errno != 0)
3038 		goto failed;
3039 
3040 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3041 	if (errno != 0)
3042 		goto failed;
3043 
3044 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
3045 	    phyint_grouplistsig);
3046 	if (errno != 0)
3047 		goto failed;
3048 
3049 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
3050 	if (errno != 0)
3051 		goto failed;
3052 
3053 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
3054 failed:
3055 	logperror("cannot create `group change' event");
3056 	nvlist_free(nvl);
3057 	return (-1);
3058 }
3059 
3060 /*
3061  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
3062  * group `pg'.	Returns 0 on success, -1 on failure.
3063  */
3064 static int
phyint_group_member_event(struct phyint_group * pg,struct phyint * pi,ipmp_if_op_t op)3065 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
3066     ipmp_if_op_t op)
3067 {
3068 	nvlist_t *nvl;
3069 
3070 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3071 	if (errno != 0) {
3072 		logperror("cannot create `group member change' event");
3073 		return (-1);
3074 	}
3075 
3076 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3077 	if (errno != 0)
3078 		goto failed;
3079 
3080 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3081 	if (errno != 0)
3082 		goto failed;
3083 
3084 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
3085 	if (errno != 0)
3086 		goto failed;
3087 
3088 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3089 	if (errno != 0)
3090 		goto failed;
3091 
3092 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3093 	if (errno != 0)
3094 		goto failed;
3095 
3096 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3097 	if (errno != 0)
3098 		goto failed;
3099 
3100 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
3101 failed:
3102 	logperror("cannot create `group member change' event");
3103 	nvlist_free(nvl);
3104 	return (-1);
3105 
3106 }
3107 
3108 /*
3109  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
3110  * Returns 0 on success, -1 on failure.
3111  */
3112 static int
phyint_state_event(struct phyint_group * pg,struct phyint * pi)3113 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
3114 {
3115 	nvlist_t *nvl;
3116 
3117 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3118 	if (errno != 0) {
3119 		logperror("cannot create `interface change' event");
3120 		return (-1);
3121 	}
3122 
3123 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3124 	if (errno != 0)
3125 		goto failed;
3126 
3127 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3128 	if (errno != 0)
3129 		goto failed;
3130 
3131 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3132 	if (errno != 0)
3133 		goto failed;
3134 
3135 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3136 	if (errno != 0)
3137 		goto failed;
3138 
3139 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3140 	if (errno != 0)
3141 		goto failed;
3142 
3143 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
3144 failed:
3145 	logperror("cannot create `interface change' event");
3146 	nvlist_free(nvl);
3147 	return (-1);
3148 
3149 }
3150 
/*
 * Produce a fresh signature.  A signature has two conceptual parts: a random
 * 16-bit "generation number" in the upper bits and a 48-bit monotonically
 * increasing integer in the lower bits.  The generation number guards
 * against stale updates to entities (e.g., IPMP groups) that were deleted
 * and later recreated.  The lower part of a new signature is always 1.
 */
static uint64_t
gensig(void)
{
	static int	rng_ready = 0;
	uint64_t	generation;

	/* Seed the random number generator the first time through. */
	if (rng_ready == 0) {
		srand48((long)gethrtime());
		rng_ready = 1;
	}

	/* Only the bits that survive the shift act as the generation. */
	generation = (uint64_t)lrand48() << 48;

	return (generation | 1);
}
3170 
3171 /*
3172  * Store the information associated with group `grname' into a dynamically
3173  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
3174  */
3175 unsigned int
getgroupinfo(const char * grname,ipmp_groupinfo_t ** grinfopp)3176 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
3177 {
3178 	struct phyint		*pi;
3179 	struct phyint_group	*pg;
3180 	char			(*ifs)[LIFNAMSIZ];
3181 	unsigned int		i, j;
3182 	unsigned int		nif = 0, naddr = 0;
3183 	lifgroupinfo_t		lifgr;
3184 	addrlist_t		*addrp;
3185 	struct sockaddr_storage	*addrs;
3186 	int			fdt = 0;
3187 
3188 	pg = phyint_group_lookup(grname);
3189 	if (pg == NULL)
3190 		return (IPMP_EUNKGROUP);
3191 
3192 	/*
3193 	 * Tally up the number of interfaces, allocate an array to hold them,
3194 	 * and insert their names into the array.  While we're at it, if any
3195 	 * interface is actually enabled to send probes, save the group fdt.
3196 	 */
3197 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
3198 		nif++;
3199 
3200 	ifs = alloca(nif * sizeof (*ifs));
3201 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
3202 		assert(i < nif);
3203 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
3204 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6))
3205 			fdt = pg->pg_fdt;
3206 	}
3207 	assert(i == nif);
3208 
3209 	/*
3210 	 * If this is the anonymous group, there's no other information to
3211 	 * collect (since there's no IPMP interface).
3212 	 */
3213 	if (pg == phyint_anongroup) {
3214 		*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3215 		    groupstate(pg), nif, ifs, "", "", "", "", 0, NULL);
3216 		return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3217 	}
3218 
3219 	/*
3220 	 * Grab some additional information about the group from the kernel.
3221 	 * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name,
3222 	 * we can use ifsock_v4 even for a V6-only group.)
3223 	 */
3224 	(void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
3225 	if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) {
3226 		if (errno == ENOENT)
3227 			return (IPMP_EUNKGROUP);
3228 
3229 		logperror("getgroupinfo: SIOCGLIFGROUPINFO");
3230 		return (IPMP_FAILURE);
3231 	}
3232 
3233 	/*
3234 	 * Tally up the number of data addresses, allocate an array to hold
3235 	 * them, and insert their values into the array.
3236 	 */
3237 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next)
3238 		naddr++;
3239 
3240 	addrs = alloca(naddr * sizeof (*addrs));
3241 	i = 0;
3242 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3243 		/*
3244 		 * It's possible to have duplicate addresses (if some are
3245 		 * down).  Weed the dups out to avoid confusing consumers.
3246 		 * (If groups start having tons of addresses, we'll need a
3247 		 * better algorithm here.)
3248 		 */
3249 		for (j = 0; j < i; j++) {
3250 			if (sockaddrcmp(&addrs[j], &addrp->al_addr))
3251 				break;
3252 		}
3253 		if (j == i) {
3254 			assert(i < naddr);
3255 			addrs[i++] = addrp->al_addr;
3256 		}
3257 	}
3258 	naddr = i;
3259 
3260 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3261 	    groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname,
3262 	    lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs);
3263 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3264 }
3265 
3266 /*
3267  * Store the target information associated with phyint instance `pii' into a
3268  * dynamically allocated structure pointed to by `*targinfopp'.  Returns an
3269  * IPMP error code.
3270  */
3271 unsigned int
gettarginfo(struct phyint_instance * pii,const char * name,ipmp_targinfo_t ** targinfopp)3272 gettarginfo(struct phyint_instance *pii, const char *name,
3273     ipmp_targinfo_t **targinfopp)
3274 {
3275 	uint_t ntarg = 0;
3276 	struct target *tg;
3277 	struct sockaddr_storage	ss;
3278 	struct sockaddr_storage *targs = NULL;
3279 
3280 	if (PROBE_CAPABLE(pii)) {
3281 		targs = alloca(pii->pii_ntargets * sizeof (*targs));
3282 		tg = pii->pii_target_next;
3283 		do {
3284 			if (tg->tg_status == TG_ACTIVE) {
3285 				assert(ntarg < pii->pii_ntargets);
3286 				addr2storage(pii->pii_af, &tg->tg_address,
3287 				    &targs[ntarg++]);
3288 			}
3289 			if ((tg = tg->tg_next) == NULL)
3290 				tg = pii->pii_targets;
3291 		} while (tg != pii->pii_target_next);
3292 
3293 		assert(ntarg == pii->pii_ntargets);
3294 	}
3295 
3296 	*targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss),
3297 	    iftargmode(pii), ntarg, targs);
3298 	return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3299 }
3300 
3301 /*
3302  * Store the information associated with interface `ifname' into a dynamically
3303  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
3304  */
3305 unsigned int
getifinfo(const char * ifname,ipmp_ifinfo_t ** ifinfopp)3306 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
3307 {
3308 	int		retval;
3309 	struct phyint	*pi;
3310 	ipmp_targinfo_t	*targinfo4;
3311 	ipmp_targinfo_t	*targinfo6;
3312 
3313 	pi = phyint_lookup(ifname);
3314 	if (pi == NULL)
3315 		return (IPMP_EUNKIF);
3316 
3317 	if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 ||
3318 	    (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0)
3319 		goto out;
3320 
3321 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
3322 	    ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi),
3323 	    ifflags(pi), targinfo4, targinfo6);
3324 	retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3325 out:
3326 	if (targinfo4 != NULL)
3327 		ipmp_freetarginfo(targinfo4);
3328 	if (targinfo6 != NULL)
3329 		ipmp_freetarginfo(targinfo6);
3330 	return (retval);
3331 }
3332 
3333 /*
3334  * Store the current list of IPMP groups into a dynamically allocated
3335  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
3336  */
3337 unsigned int
getgrouplist(ipmp_grouplist_t ** grlistpp)3338 getgrouplist(ipmp_grouplist_t **grlistpp)
3339 {
3340 	struct phyint_group	*pg;
3341 	char			(*groups)[LIFGRNAMSIZ];
3342 	unsigned int		i, ngroup;
3343 
3344 	/*
3345 	 * Tally up the number of groups, allocate an array to hold them, and
3346 	 * insert their names into the array.
3347 	 */
3348 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
3349 		ngroup++;
3350 
3351 	groups = alloca(ngroup * sizeof (*groups));
3352 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
3353 		assert(i < ngroup);
3354 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
3355 	}
3356 	assert(i == ngroup);
3357 
3358 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
3359 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3360 }
3361 
3362 /*
3363  * Store the address information for `ssp' (in group `grname') into a
3364  * dynamically allocated structure pointed to by `*adinfopp'.  Returns an IPMP
3365  * error code.  (We'd call this function getaddrinfo(), but it would conflict
3366  * with getaddrinfo(3SOCKET)).
3367  */
3368 unsigned int
getgraddrinfo(const char * grname,struct sockaddr_storage * ssp,ipmp_addrinfo_t ** adinfopp)3369 getgraddrinfo(const char *grname, struct sockaddr_storage *ssp,
3370     ipmp_addrinfo_t **adinfopp)
3371 {
3372 	int ifsock;
3373 	addrlist_t *addrp, *addrmatchp = NULL;
3374 	ipmp_addr_state_t state;
3375 	const char *binding = "";
3376 	struct lifreq lifr;
3377 	struct phyint_group *pg;
3378 
3379 	if ((pg = phyint_group_lookup(grname)) == NULL)
3380 		return (IPMP_EUNKADDR);
3381 
3382 	/*
3383 	 * Walk through the data addresses, and find a match.  Note that since
3384 	 * some of the addresses may be down, more than one may match.  We
3385 	 * prefer an up address (if one exists).
3386 	 */
3387 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3388 		if (sockaddrcmp(ssp, &addrp->al_addr)) {
3389 			addrmatchp = addrp;
3390 			if (addrmatchp->al_flags & IFF_UP)
3391 				break;
3392 		}
3393 	}
3394 
3395 	if (addrmatchp == NULL)
3396 		return (IPMP_EUNKADDR);
3397 
3398 	state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN;
3399 	if (state == IPMP_ADDR_UP) {
3400 		ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6;
3401 		(void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ);
3402 		if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0)
3403 			binding = lifr.lifr_binding;
3404 	}
3405 
3406 	*adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding);
3407 	return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3408 }
3409 
3410 /*
3411  * Store a snapshot of the IPMP subsystem into a dynamically allocated
3412  * structure pointed to by `*snapp'.  Returns an IPMP error code.
3413  */
3414 unsigned int
getsnap(ipmp_snap_t ** snapp)3415 getsnap(ipmp_snap_t **snapp)
3416 {
3417 	ipmp_grouplist_t	*grlistp;
3418 	ipmp_groupinfo_t	*grinfop;
3419 	ipmp_addrinfo_t		*adinfop;
3420 	ipmp_addrlist_t		*adlistp;
3421 	ipmp_ifinfo_t		*ifinfop;
3422 	ipmp_snap_t		*snap;
3423 	struct phyint		*pi;
3424 	unsigned int		i, j;
3425 	int			retval;
3426 
3427 	snap = ipmp_snap_create();
3428 	if (snap == NULL)
3429 		return (IPMP_ENOMEM);
3430 
3431 	/*
3432 	 * Add group list.
3433 	 */
3434 	retval = getgrouplist(&snap->sn_grlistp);
3435 	if (retval != IPMP_SUCCESS)
3436 		goto failed;
3437 
3438 	/*
3439 	 * Add information for each group in the list, along with all of its
3440 	 * data addresses.
3441 	 */
3442 	grlistp = snap->sn_grlistp;
3443 	for (i = 0; i < grlistp->gl_ngroup; i++) {
3444 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
3445 		if (retval != IPMP_SUCCESS)
3446 			goto failed;
3447 
3448 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
3449 		if (retval != IPMP_SUCCESS) {
3450 			ipmp_freegroupinfo(grinfop);
3451 			goto failed;
3452 		}
3453 
3454 		adlistp = grinfop->gr_adlistp;
3455 		for (j = 0; j < adlistp->al_naddr; j++) {
3456 			retval = getgraddrinfo(grinfop->gr_name,
3457 			    &adlistp->al_addrs[j], &adinfop);
3458 			if (retval != IPMP_SUCCESS)
3459 				goto failed;
3460 
3461 			retval = ipmp_snap_addaddrinfo(snap, adinfop);
3462 			if (retval != IPMP_SUCCESS) {
3463 				ipmp_freeaddrinfo(adinfop);
3464 				goto failed;
3465 			}
3466 		}
3467 	}
3468 
3469 	/*
3470 	 * Add information for each configured phyint.
3471 	 */
3472 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
3473 		retval = getifinfo(pi->pi_name, &ifinfop);
3474 		if (retval != IPMP_SUCCESS)
3475 			goto failed;
3476 
3477 		retval = ipmp_snap_addifinfo(snap, ifinfop);
3478 		if (retval != IPMP_SUCCESS) {
3479 			ipmp_freeifinfo(ifinfop);
3480 			goto failed;
3481 		}
3482 	}
3483 
3484 	*snapp = snap;
3485 	return (IPMP_SUCCESS);
3486 failed:
3487 	ipmp_snap_free(snap);
3488 	return (retval);
3489 }
3490