1 /*
2  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Copyright (c) 1987 Regents of the University of California.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms are permitted
11  * provided that the above copyright notice and this paragraph are
12  * duplicated in all such forms and that any documentation,
13  * advertising materials, and other materials related to such
14  * distribution and use acknowledge that the software was developed
15  * by the University of California, Berkeley. The name of the
16  * University may not be used to endorse or promote products derived
17  * from this software without specific prior written permission.
18  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21  */
22 
23 #pragma ident	"%Z%%M%	%I%	%E% SMI"
24 
25 #include "mpd_defs.h"
26 #include "mpd_tables.h"
27 
28 /*
29  * Probe types for probe()
30  */
31 #define	PROBE_UNI	0x1234		/* Unicast probe packet */
32 #define	PROBE_MULTI	0x5678		/* Multicast probe packet */
33 #define	PROBE_RTT	0x9abc		/* RTT only probe packet */
34 
35 #define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */
36 
/*
 * On-the-wire layout shared by probes and probe responses. A probe is an
 * ICMP Echo request and a response is an ICMP Echo reply; the layout is
 * identical for IPv4 and IPv6.
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* ICMP type */
	uint8_t		pr_icmp_code;		/* ICMP code */
	uint16_t	pr_icmp_cksum;		/* ICMP checksum */
	uint16_t	pr_icmp_id;		/* identification */
	uint16_t	pr_icmp_seq;		/* sequence number */
	uint32_t	pr_icmp_timestamp;	/* time stamp */
	uint32_t	pr_icmp_mtype;		/* message type (PROBE_*) */
};
51 
52 static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
53 				    0x0, 0x0, 0x0, 0x0,
54 				    0x0, 0x0, 0x0, 0x0,
55 				    0x0, 0x0, 0x0, 0x1 } };
56 
57 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
58 
59 static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
60 
61 static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
62 static void		pi_set_crtt(struct target *tg, int m,
63     boolean_t is_probe_uni);
64 static void		incoming_echo_reply(struct phyint_instance *pii,
65     struct pr_icmp *reply, struct in6_addr fromaddr);
66 static void		incoming_rtt_reply(struct phyint_instance *pii,
67     struct pr_icmp *reply, struct in6_addr fromaddr);
68 static void		incoming_mcast_reply(struct phyint_instance *pii,
69     struct pr_icmp *reply, struct in6_addr fromaddr);
70 
71 static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
72 static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
73 static boolean_t	check_exception_target(struct phyint_instance *pii,
74     struct target *target);
75 static void		probe_fail_info(struct phyint_instance *pii,
76     struct target *cur_tg, struct probe_fail_count *pfinfo);
77 static void		probe_success_info(struct phyint_instance *pii,
78     struct target *cur_tg, struct probe_success_count *psinfo);
79 static boolean_t	phyint_repaired(struct phyint *pi);
80 
81 static int		failover(struct phyint *from, struct phyint *to);
82 static int		failback(struct phyint *from, struct phyint *to);
83 static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);
84 
85 static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
86 static int 		in_cksum(ushort_t *addr, int len);
87 static void		reset_snxt_basetimes(void);
88 
89 /*
90  * CRTT - Conservative Round Trip Time Estimate
91  * Probe success - A matching probe reply received before CRTT ms has elapsed
92  *	after sending the probe.
93  * Probe failure - No probe reply received and more than CRTT ms has elapsed
94  *	after sending the probe.
95  *
96  * TLS - Time last success. Most recent probe ack received at this time.
97  * TFF - Time first fail. The time of the earliest probe failure in
98  *	a consecutive series of probe failures.
99  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
100  * 	before declaring phyint repair.
101  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
102  *	declare a phyint failure.
103  *
104  * 			Phyint state diagram
105  *
106  * The state of a phyint that is capable of being probed, is completely
107  * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
108  *
109  * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
110  * of the link (according to the driver).  If the phyint is also configured
111  * with a test address (the common case) and probe targets, then a phyint must
112  * also successfully be able to send and receive probes in order to remain in
113  * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
114  *
115  * Further, if a PI_RUNNING phyint is configured with a test address but is
116  * unable to find any probe targets, it will transition to the PI_NOTARGETS
117  * state, which indicates that the link is apparently functional but that
118  * in.mpathd is unable to send probes to verify functionality (in this case,
119  * in.mpathd makes the optimistic assumption that the interface is working
120  * correctly and thus does not perform a failover, but reports the interface
121  * as IPMP_IF_UNKNOWN through the async events and query interfaces).
122  *
123  * At any point, a phyint may be administratively marked offline via if_mpadm.
124  * In this case, the interface always transitions to PI_OFFLINE, regardless
125  * of its previous state.  When the interface is later brought back online,
126  * in.mpathd acts as if the interface is new (and thus it transitions to
127  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
128  * its probes, if probes are sent).
129  *
130  * pi_state -  PI_RUNNING or PI_FAILED
131  *	PI_RUNNING: The failure detection logic says the phyint is good.
132  *	PI_FAILED: The failure detection logic says the phyint has failed.
133  *
134  * pg_groupfailed  - Group failure, all interfaces in the group have failed.
135  *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
136  *	In the case of router targets, we assume that the current list of
137  *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
139  *	list of targets, and multicast to the all hosts, to reconstruct the
140  *	target list. So the phyints are in the PI_NOTARGETS state.
141  *
142  * I -	value of (pi_flags & IFF_INACTIVE)
143  *	IFF_INACTIVE: No failovers have been done to this phyint, from
144  *		other phyints. This phyint is inactive. Phyint can be a Standby.
 *		When failback has been disabled (FAILBACK=no configured),
146  *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
147  *		is set when phyint subsequently recovers after a failure.
148  *
149  * pi_empty
150  *	This phyint has failed over successfully to another phyint, and
151  *	this phyint is currently "empty". It does not host any addresses or
152  *	multicast membership etc. This is the state of a phyint after a
153  *	failover from the phyint has completed successfully and no subsequent
154  *	'failover to' or 'failback to' has occurred on the phyint.
155  *	IP guarantees that no new logicals will be hosted nor any multicast
156  *	joins permitted on the phyint, since the phyint is either failed or
 *	inactive. pi_empty being set implies the phyint is either failed or
158  *	inactive.
159  *
160  * pi_full
161  *	The phyint hosts all of its own addresses that it "owns". If the
162  *	phyint was previously failed or inactive, failbacks to the phyint
 *	have completed successfully, i.e. no more failbacks to this phyint
164  *	can produce any change in system state whatsoever.
165  *
166  * Not all 32 possible combinations of the above 5-tuple are possible.
167  * Furthermore some of the above combinations are transient. They may occur
168  * only because the failover or failback did not complete successfully. The
169  * failover/failback will be retried and eventually a stable state will be
170  * reached.
171  *
172  * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
173  * The following are the state machines. 'from' and 'to' are the src and
174  * dst of the failover/failback, below
175  *
176  *			pi_empty state machine
177  * ---------------------------------------------------------------------------
178  *	Event				State	->	New State
179  * ---------------------------------------------------------------------------
180  *	successful completion 		from.pi_empty = 0 -> from.pi_empty = 1
181  *	of failover
182  *
183  *	Initiate failover 		to.pi_empty = X   -> to.pi_empty = 0
184  *
185  * 	Initiate failback 		to.pi_empty = X   -> to.pi_empty = 0
186  *
187  * 	group failure			pi_empty = X	  -> pi_empty = 0
188  * ---------------------------------------------------------------------------
189  *
190  *			pi_full state machine
191  * ---------------------------------------------------------------------------
192  *	Event				State		  -> New State
193  * ---------------------------------------------------------------------------
194  *	successful completion		to.pi_full = 0    -> to.pi_full = 1
195  *	of failback from
196  *	each of the other phyints
197  *
198  *	Initiate failover 		from.pi_full = X  -> from.pi_full = 0
199  *
200  *	group failure			pi_full = X	  -> pi_full = 0
201  * ---------------------------------------------------------------------------
202  *
203  *			pi_state state machine
204  * ---------------------------------------------------------------------------
205  *	Event			State			New State
206  *				Action:
207  * ---------------------------------------------------------------------------
208  *	NIC failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
209  *	detection		: set IFF_FAILED on this phyint
210  *				: failover from this phyint to another
211  *
212  *	NIC failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
213  *	detection		: set IFF_FAILED on this phyint
214  *
215  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=yes)
216  *	detection				     -> (PI_RUNNING, I == 0)
217  *				: to.pi_empty = 0
218  *				: clear IFF_FAILED on this phyint
219  *				: failback to this phyint if enabled
220  *
221  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=no)
222  *	detection				     ->	(PI_RUNNING, I == 1)
223  *				: to.pi_empty = 0
224  *				: clear IFF_FAILED on this phyint
225  *				: if failback is disabled set I == 1
226  *
227  *	Group failure		(perform on all phyints in the group)
228  *	detection 		PI_RUNNING		PI_FAILED
229  *	(Router targets)	: set IFF_FAILED
230  *				: clear pi_empty and pi_full
231  *
232  *	Group failure		(perform on all phyints in the group)
233  *	detection 		PI_RUNNING		PI_NOTARGETS
234  *	(Host targets)		: set IFF_FAILED
235  *				: clear pi_empty and pi_full
236  *				: delete the target list on all phyints
237  * ---------------------------------------------------------------------------
238  *
239  *			I state machine
240  * ---------------------------------------------------------------------------
241  *	Event		State			Action:
242  * ---------------------------------------------------------------------------
243  *	Turn on I 	pi_empty == 0, STANDBY 	: failover from standby
244  *
245  *	Turn off I 	PI_RUNNING, STANDBY	: pi_empty = 0
246  *			pi_full == 0		: failback to this if enabled
247  * ---------------------------------------------------------------------------
248  *
249  * Assertions: (Read '==>' as implies)
250  *
251  * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
252  * (pi_empty == 1) ==> (pi_full == 0)
253  * (pi_full  == 1) ==> (pi_empty == 0)
254  *
255  * Invariants
256  *
257  * pg_groupfailed = 0  &&
258  *   1. (I == 1, pi_empty == 0)		   ==> initiate failover from standby
259  *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
260  *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
261  *
262  * 1. says that an inactive standby, that is not empty, has to be failed
263  * over. For a standby to be truly inactive, it should not host any
264  * addresses. So we move them to some other phyint. Usually we catch the
265  * turn on of IFF_INACTIVE, and perform this action. However if the failover
266  * did not complete successfully, then subsequently we have lost the edge
267  * trigger, and this invariant kicks in and completes the action.
268  *
269  * 2. says that any failed phyint that is not empty must be failed over.
270  * Usually we do the failover when we detect NIC failure. However if the
271  * failover does not complete successfully, this invariant kicks in and
272  * completes the failover. We exclude inactive standby which is covered by 1.
273  *
274  * 3. says that any running phyint that is not full must be failed back.
275  * Usually we do the failback when we detect NIC repair. However if the
276  * failback does not complete successfully, this invariant kicks in and
277  * completes the failback. Note that we don't want to failback to an inactive
278  * standby.
279  *
280  * The invariants 1 - 3 and the actions are in initifs().
281  */
282 
/*
 * File-global instance of struct probes_missed (external linkage).
 * NOTE(review): presumably accumulates counts of probes that were not sent
 * on schedule -- confirm against the struct definition and its users.
 */
struct probes_missed probes_missed;
284 
285 /*
286  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
287  * will be added on by the kernel.  The id field identifies this phyint.
288  * and the sequence number is an increasing (modulo 2^^16) integer. The data
289  * portion holds the time value when the packet is sent. On echo this is
290  * extracted to compute the round-trip time. Three different types of
291  * probe packets are used.
292  *
293  * PROBE_UNI: This type is used to do failure detection / failure recovery
294  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
295  *	not less than the current CRTT. pii_probes[] stores data
296  *	about these probes. These packets consume sequence number space.
297  *
298  * PROBE_RTT: This type is used to make only rtt measurments. Normally these
299  * 	are not used. Under heavy network load, the rtt may go up very high,
300  *	due to a spike, or may appear to go high, due to extreme scheduling
301  * 	delays. Once the network stress is removed, mpathd takes long time to
302  *	recover, because the probe_interval is already high, and it takes
303  *	a long time to send out sufficient number of probes to bring down the
304  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
305  *	user_probe_interval ms. and will cause only rtt updates. These packets
306  *	do not consume sequence number space nor is information about these
307  *	packets stored in the pii_probes[]
308  *
309  * PROBE_MULTI: This type is only used to construct a list of targets, when
310  *	no targets are known. The packet is multicast to the all hosts addr.
311  */
312 static void
313 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
314 {
315 	struct pr_icmp probe_pkt;	/* Probe packet */
316 	struct sockaddr_in6 whereto6; 	/* target address IPv6 */
317 	struct sockaddr_in whereto; 	/* target address IPv4 */
318 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
319 	boolean_t sent = _B_TRUE;
320 
321 	if (debug & D_TARGET) {
322 		logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
323 		    pii->pii_name, probe_type, cur_time);
324 	}
325 
326 	assert(pii->pii_probe_sock != -1);
327 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
328 	    probe_type == PROBE_RTT);
329 
330 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
331 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
332 	probe_pkt.pr_icmp_code = 0;
333 	probe_pkt.pr_icmp_cksum = 0;
334 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
335 
336 	/*
337 	 * Since there is no need to do arithmetic on the icmpid,
338 	 * (only equality check is done) pii_icmpid is stored in
339 	 * network byte order at initialization itself.
340 	 */
341 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
342 	probe_pkt.pr_icmp_timestamp = htonl(cur_time);
343 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
344 
345 	/*
346 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
347 	 * the all hosts address. Otherwise it is unicast to the next target.
348 	 */
349 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
350 	    pii->pii_rtt_target_next != NULL));
351 
352 	if (pii->pii_af == AF_INET6) {
353 		bzero(&whereto6, sizeof (whereto6));
354 		whereto6.sin6_family = AF_INET6;
355 		if (probe_type == PROBE_MULTI) {
356 			whereto6.sin6_addr = all_nodes_mcast_v6;
357 		} else if (probe_type == PROBE_UNI) {
358 			whereto6.sin6_addr = pii->pii_target_next->tg_address;
359 		} else  {
360 			/* type is PROBE_RTT */
361 			whereto6.sin6_addr =
362 			    pii->pii_rtt_target_next->tg_address;
363 		}
364 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
365 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
366 		    sizeof (whereto6)) != sizeof (probe_pkt)) {
367 			logperror_pii(pii, "probe: probe sendto");
368 			sent = _B_FALSE;
369 		}
370 	} else {
371 		bzero(&whereto, sizeof (whereto));
372 		whereto.sin_family = AF_INET;
373 		if (probe_type == PROBE_MULTI) {
374 			whereto.sin_addr = all_nodes_mcast_v4;
375 		} else if (probe_type == PROBE_UNI) {
376 			IN6_V4MAPPED_TO_INADDR(
377 			    &pii->pii_target_next->tg_address,
378 			    &whereto.sin_addr);
379 		} else {
380 			/* type is PROBE_RTT */
381 			IN6_V4MAPPED_TO_INADDR(
382 			    &pii->pii_rtt_target_next->tg_address,
383 			    &whereto.sin_addr);
384 		}
385 
386 		/*
387 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
388 		 */
389 		probe_pkt.pr_icmp_cksum =
390 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
391 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
392 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
393 		    sizeof (whereto)) != sizeof (probe_pkt)) {
394 			logperror_pii(pii, "probe: probe sendto");
395 			sent = _B_FALSE;
396 		}
397 	}
398 
399 	/*
400 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
401 	 * update our tables. We will need this info in processing the probe
402 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
403 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
404 	 * are only used to construct a list of targets. PROBE_RTT packets are
405 	 * used only for updating the rtt and not for failure detection.
406 	 */
407 	if (probe_type == PROBE_UNI && sent) {
408 		pr_ndx = pii->pii_probe_next;
409 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
410 
411 		/* Collect statistics, before we reuse the last slot. */
412 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
413 			pii->pii_cum_stats.lost++;
414 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
415 			pii->pii_cum_stats.acked++;
416 		pii->pii_cum_stats.sent++;
417 
418 		pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
419 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
420 		pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
421 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
422 		pii->pii_target_next = target_next(pii->pii_target_next);
423 		assert(pii->pii_target_next != NULL);
424 		/*
425 		 * If we have a single variable to denote the next target to
426 		 * probe for both rtt probes and failure detection probes, we
427 		 * could end up with a situation where the failure detection
428 		 * probe targets become disjoint from the rtt probe targets.
429 		 * Eg. if 2 targets and the actual fdt is double the user
430 		 * specified fdt. So we have 2 variables. In this scheme
431 		 * we also reset pii_rtt_target_next for every fdt probe,
432 		 * though that may not be necessary.
433 		 */
434 		pii->pii_rtt_target_next = pii->pii_target_next;
435 		pii->pii_snxt++;
436 	} else if (probe_type == PROBE_RTT) {
437 		pii->pii_rtt_target_next =
438 		    target_next(pii->pii_rtt_target_next);
439 		assert(pii->pii_rtt_target_next != NULL);
440 	}
441 }
442 
443 /*
444  * Incoming IPv4 data from wire, is received here. Called from main.
445  */
446 void
447 in_data(struct phyint_instance *pii)
448 {
449 	struct	sockaddr_in 	from;
450 	struct	in6_addr	fromaddr;
451 	uint_t	fromlen;
452 	static uint_t in_packet[(IP_MAXPACKET + 1)/4];
453 	struct ip *ip;
454 	int 	iphlen;
455 	int 	len;
456 	char 	abuf[INET_ADDRSTRLEN];
457 	struct	pr_icmp	*reply;
458 
459 	if (debug & D_PROBE) {
460 		logdebug("in_data(%s %s)\n",
461 		    AF_STR(pii->pii_af), pii->pii_name);
462 	}
463 
464 	/*
465 	 * Poll has already told us that a message is waiting,
466 	 * on this socket. Read it now. We should not block.
467 	 */
468 	fromlen = sizeof (from);
469 	len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
470 	    sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
471 	if (len < 0) {
472 		logperror_pii(pii, "in_data: recvfrom");
473 		return;
474 	}
475 
476 	/*
477 	 * If the NIC has indicated the link is down, don't go
478 	 * any further.
479 	 */
480 	if (LINK_DOWN(pii->pii_phyint))
481 		return;
482 
483 	/* Get the printable address for error reporting */
484 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
485 
486 	/* Make sure packet contains at least minimum ICMP header */
487 	ip = (struct ip *)in_packet;
488 	iphlen = ip->ip_hl << 2;
489 	if (len < iphlen + ICMP_MINLEN) {
490 		if (debug & D_PKTBAD) {
491 			logdebug("in_data: packet too short (%d bytes)"
492 			    " from %s\n", len, abuf);
493 		}
494 		return;
495 	}
496 
497 	/*
498 	 * Subtract the IP hdr length, 'len' will be length of the probe
499 	 * reply, starting from the icmp hdr.
500 	 */
501 	len -= iphlen;
502 	/* LINTED */
503 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
504 
505 	/* Probe replies are icmp echo replies. Ignore anything else */
506 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
507 		return;
508 
509 	/*
510 	 * The icmp id should match what we sent, which is stored
511 	 * in pi_icmpid. The icmp code for reply must be 0.
512 	 * The reply content must be a struct pr_icmp
513 	 */
514 	if (reply->pr_icmp_id != pii->pii_icmpid) {
515 		/* Not in response to our probe */
516 		return;
517 	}
518 
519 	if (reply->pr_icmp_code != 0) {
520 		logtrace("probe reply code %d from %s on %s\n",
521 		    reply->pr_icmp_code, abuf, pii->pii_name);
522 		return;
523 	}
524 
525 	if (len < sizeof (struct pr_icmp)) {
526 		logtrace("probe reply too short: %d bytes from %s on %s\n",
527 		    len, abuf, pii->pii_name);
528 		return;
529 	}
530 
531 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
532 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
533 		/* Unicast probe reply */
534 		incoming_echo_reply(pii, reply, fromaddr);
535 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
536 		/* Multicast reply */
537 		incoming_mcast_reply(pii, reply, fromaddr);
538 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
539 		incoming_rtt_reply(pii, reply, fromaddr);
540 	} else {
541 		/* Probably not in response to our probe */
542 		logtrace("probe reply type: %d from %s on %s\n",
543 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
544 		return;
545 	}
546 
547 }
548 
549 /*
550  * Incoming IPv6 data from wire is received here. Called from main.
551  */
552 void
553 in6_data(struct phyint_instance *pii)
554 {
555 	struct sockaddr_in6 from;
556 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
557 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
558 	int len;
559 	char abuf[INET6_ADDRSTRLEN];
560 	struct msghdr msg;
561 	struct iovec iov;
562 	uchar_t *opt;
563 	struct	pr_icmp *reply;
564 
565 	if (debug & D_PROBE) {
566 		logdebug("in6_data(%s %s)\n",
567 		    AF_STR(pii->pii_af), pii->pii_name);
568 	}
569 
570 	iov.iov_base = (char *)in_packet;
571 	iov.iov_len = sizeof (in_packet);
572 	msg.msg_iov = &iov;
573 	msg.msg_iovlen = 1;
574 	msg.msg_name = (struct sockaddr *)&from;
575 	msg.msg_namelen = sizeof (from);
576 	msg.msg_control = ancillary_data;
577 	msg.msg_controllen = sizeof (ancillary_data);
578 
579 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
580 		logperror_pii(pii, "in6_data: recvfrom");
581 		return;
582 	}
583 
584 	/*
585 	 * If the NIC has indicated that the link is down, don't go
586 	 * any further.
587 	 */
588 	if (LINK_DOWN(pii->pii_phyint))
589 		return;
590 
591 	/* Get the printable address for error reporting */
592 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
593 	if (len < ICMP_MINLEN) {
594 		if (debug & D_PKTBAD) {
595 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
596 			    msg.msg_flags, abuf);
597 		}
598 		return;
599 	}
600 	/* Ignore packets > 64k or control buffers that don't fit */
601 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
602 		if (debug & D_PKTBAD) {
603 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
604 			    msg.msg_flags, abuf);
605 		}
606 		return;
607 	}
608 
609 	reply = (struct pr_icmp *)in_packet;
610 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
611 		return;
612 
613 	if (reply->pr_icmp_id != pii->pii_icmpid) {
614 		/* Not in response to our probe */
615 		return;
616 	}
617 
618 	/*
619 	 * The kernel has already verified the the ICMP checksum.
620 	 */
621 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
622 		logtrace("ICMPv6 echo reply source address not linklocal from "
623 		    "%s on %s\n", abuf, pii->pii_name);
624 		return;
625 	}
626 	opt = find_ancillary(&msg, IPV6_RTHDR);
627 	if (opt != NULL) {
628 		/* Can't allow routing headers in probe replies  */
629 		logtrace("message with routing header from %s on %s\n",
630 		    abuf, pii->pii_name);
631 		return;
632 	}
633 	if (reply->pr_icmp_code != 0) {
634 		logtrace("probe reply code: %d from %s on %s\n",
635 		    reply->pr_icmp_code, abuf, pii->pii_name);
636 		return;
637 	}
638 	if (len < (sizeof (struct pr_icmp))) {
639 		logtrace("probe reply too short: %d bytes from %s on %s\n",
640 		    len, abuf, pii->pii_name);
641 		return;
642 	}
643 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
644 		incoming_echo_reply(pii, reply, from.sin6_addr);
645 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
646 		incoming_mcast_reply(pii, reply, from.sin6_addr);
647 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
648 		incoming_rtt_reply(pii, reply, from.sin6_addr);
649 	} else  {
650 		/* Probably not in response to our probe */
651 		logtrace("probe reply type: %d from %s on %s\n",
652 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
653 	}
654 }
655 
656 /*
657  * Process the incoming rtt reply, in response to our rtt probe.
658  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
659  * have any stored information about the probe we sent. So we don't log
660  * any errors if we receive bad replies.
661  */
662 static void
663 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
664     struct in6_addr fromaddr)
665 {
666 	int 	m;		/* rtt measurment in ms */
667 	uint32_t cur_time;	/* in ms from some arbitrary point */
668 	char	abuf[INET6_ADDRSTRLEN];
669 	struct	target	*target;
670 	uint32_t pr_icmp_timestamp;
671 	struct 	phyint_group *pg;
672 
673 	/* Get the printable address for error reporting */
674 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
675 
676 	if (debug & D_PROBE) {
677 		logdebug("incoming_rtt_reply: %s %s %s\n",
678 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
679 	}
680 
681 	/* Do we know this target ? */
682 	target = target_lookup(pii, fromaddr);
683 	if (target == NULL)
684 		return;
685 
686 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
687 	cur_time = getcurrenttime();
688 	m = (int)(cur_time - pr_icmp_timestamp);
689 
690 	/* Invalid rtt. It has wrapped around */
691 	if (m < 0)
692 		return;
693 
694 	/*
695 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
696 	 * The initial few responses after the interface is repaired may
697 	 * contain high rtt's because they could have been queued up waiting
698 	 * for ARP/NDP resolution on a failed interface.
699 	 */
700 	pg = pii->pii_phyint->pi_group;
701 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
702 		return;
703 
704 	/*
705 	 * Update rtt only if the new rtt is lower than the current rtt.
706 	 * (specified by the 3rd parameter to pi_set_crtt).
707 	 * If a spike has caused the current probe_interval to be >
708 	 * user_probe_interval, then this mechanism is used to bring down
709 	 * the rtt rapidly once the network stress is removed.
710 	 * If the new rtt is higher than the current rtt, we don't want to
711 	 * update the rtt. We are having more than 1 outstanding probe and
712 	 * the increase in rtt we are seeing is being unnecessarily weighted
713 	 * many times. The regular rtt update will be handled by
714 	 * incoming_echo_reply() and will take care of any rtt increase.
715 	 */
716 	pi_set_crtt(target, m, _B_FALSE);
717 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
718 	    (user_failure_detection_time < pg->pg_fdt) &&
719 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
720 		/*
721 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
722 		 * investigate if we can improve the failure detection time to
723 		 * meet whatever the user specified.
724 		 */
725 		if (check_pg_crtt_improved(pg)) {
726 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
727 			    user_failure_detection_time);
728 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
729 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
730 				logerr("Improved failure detection time %d ms "
731 				    "on (%s %s) for group \"%s\"\n",
732 				    pg->pg_fdt, AF_STR(pii->pii_af),
733 				    pii->pii_name,
734 				    pii->pii_phyint->pi_group->pg_name);
735 			}
736 			if (user_failure_detection_time == pg->pg_fdt) {
737 				/* Avoid any truncation or rounding errors */
738 				pg->pg_probeint = user_probe_interval;
739 				/*
740 				 * No more rtt probes will be sent. The actual
741 				 * fdt has dropped to the user specified value.
742 				 * pii_fd_snxt_basetime and pii_snxt_basetime
743 				 * will be in sync henceforth.
744 				 */
745 				reset_snxt_basetimes();
746 			}
747 		}
748 	}
749 }
750 
/*
 * Process the incoming echo reply, in response to our unicast probe.
 * Common for both IPv4 and IPv6
 *
 *	pii		phyint instance the reply is being processed for
 *	reply		the received echo reply; the sequence number and the
 *			timestamp are converted from network byte order here
 *	fromaddr	source address of the reply
 *
 * The reply is first validated: it must fall inside the receive window
 * [pii_snxt - PROBE_STATS_COUNT, pii_snxt), the probe stats slot it maps to
 * must have a target whose address matches fromaddr, and the slot must not
 * have been acked already. Replies failing these checks are counted in
 * pii_cum_stats.unknown and dropped.
 *
 * For a valid reply with a plausible rtt, received while the phyint instance
 * is PI_RUNNING and the group has not failed, the target's crtt is updated.
 * Depending on the new crtt the target may be marked TG_SLOW or deleted, or
 * the group's probe interval / failure detection time may be raised or
 * lowered. In all cases a valid reply marks its probe slot PR_ACKED, records
 * the ack time and possibly advances pii_rack.
 */
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr)
{
	int 	m;		/* rtt measurement in ms */
	uint32_t cur_time;	/* in ms from some arbitrary point */
	char	abuf[INET6_ADDRSTRLEN];
	int	pr_ndx;
	struct	target	*target;
	boolean_t exception;
	uint32_t pr_icmp_timestamp;
	uint16_t pr_icmp_seq;
	struct 	phyint_group *pg = pii->pii_phyint->pi_group;

	/* Get the printable address for error reporting */
	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));

	if (debug & D_PROBE) {
		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
		    AF_STR(pii->pii_af), pii->pii_name, abuf,
		    ntohs(reply->pr_icmp_seq));
	}

	/* Convert the fields we need from network to host byte order */
	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
	pr_icmp_seq  = ntohs(reply->pr_icmp_seq);

	/* Reject out of window probe replies */
	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		pii->pii_cum_stats.unknown++;
		return;
	}
	cur_time = getcurrenttime();
	/*
	 * The rtt is the difference between now and the timestamp carried in
	 * the reply. The subtraction is done on unsigned 32-bit values and the
	 * result is then interpreted as a signed int.
	 */
	m = (int)(cur_time - pr_icmp_timestamp);
	if (m < 0) {
		/*
		 * This is a ridiculously high value of rtt. rtt has wrapped
		 * around. Log a message, and ignore the rtt. Processing
		 * continues; the negative rtt is discarded by the (m < 0)
		 * check further below, after the reply has been validated.
		 */
		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
	}

	/*
	 * Get the probe index pr_ndx corresponding to the received icmp seq.
	 * number in our pii->pii_probes[] array. The icmp sequence number
	 * pii_snxt corresponds to the probe index pii->pii_probe_next
	 */
	pr_ndx = MOD_SUB(pii->pii_probe_next,
	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);

	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));

	target = pii->pii_probes[pr_ndx].pr_target;

	/*
	 * Perform sanity checks, whether this probe reply that we
	 * have received is genuine
	 */
	if (target != NULL) {
		/*
		 * Compare the src. addr of the received ICMP or ICMPv6
		 * probe reply with the target address in our tables.
		 */
		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
			/*
			 * We don't have any record of having sent a probe to
			 * this target. This is a fake probe reply. Log an error
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			/*
			 * The address matches, but our tables indicate that
			 * this probe reply has been acked already. So this
			 * is a duplicate probe reply. Log an error
			 */
			logtrace("probe status %d Duplicate probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		}
	} else {
		/*
		 * Target must not be NULL in the PR_UNACKED state
		 */
		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
			/*
			 * The probe stats slot is unused. So we didn't
			 * send out any probe to this target. This is a fake.
			 * Log an error.
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		}
		/*
		 * Whatever the slot status, a slot without a target cannot be
		 * matched to this reply; count it as unknown and drop it.
		 */
		pii->pii_cum_stats.unknown++;
		return;
	}

	/*
	 * If the rtt does not appear to be right, don't update the
	 * rtt stats. This can happen if the system dropped into the
	 * debugger, or the system was hung or too busy for a
	 * substantial time that we didn't get a chance to run.
	 */
	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
		/*
		 * If the probe corresponding to this received response
		 * was truly sent 'm' ms. ago, then this response must
		 * have been rejected by the sequence number checks. The
		 * fact that it has passed the sequence number checks
		 * means that the measured rtt is wrong. We were probably
		 * scheduled long after the packet was received.
		 * The probe is still marked acked at 'out'.
		 */
		goto out;
	}

	/*
	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
	 * The initial few responses after the interface is repaired may
	 * contain high rtt's because they could have been queued up waiting
	 * for ARP/NDP resolution on a failed interface.
	 */
	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
		goto out;

	/*
	 * Don't update the Conservative Round Trip Time estimate for this
	 * (phyint, target) pair if this is not the highest ack seq seen
	 * thus far on this target.
	 */
	if (!highest_ack_tg(pr_icmp_seq, target))
		goto out;

	/*
	 * Always update the rtt. This is a failure detection probe
	 * and we want to measure both increase / decrease in rtt.
	 */
	pi_set_crtt(target, m, _B_TRUE);

	/*
	 * If the crtt exceeds the average time between probes,
	 * investigate if this slow target is an exception. If so we
	 * can avoid this target and still meet the failure detection
	 * time. Otherwise we can't meet the failure detection time.
	 */
	if (target->tg_crtt > pg->pg_probeint) {
		exception = check_exception_target(pii, target);
		if (exception) {
			/*
			 * This target is exceptionally slow. Don't use it
			 * for future probes. check_exception_target() has
			 * made sure that we have at least MIN_PROBE_TARGETS
			 * other active targets
			 */
			if (pii->pii_targets_are_routers) {
				/*
				 * This is a slow router, mark it as slow
				 * and don't use it for further probes. We
				 * don't delete it, since it will be populated
				 * again when we do a router scan. Hence we
				 * need to maintain extra state (unlike the
				 * host case below).  Mark it as TG_SLOW.
				 */
				if (target->tg_status == TG_ACTIVE)
					pii->pii_ntargets--;
				target->tg_status = TG_SLOW;
				target->tg_latime = gethrtime();
				/* Reset the rtt estimators of the slow target */
				target->tg_rtt_sa = -1;
				target->tg_crtt = 0;
				target->tg_rtt_sd = 0;
				if (pii->pii_target_next == target) {
					pii->pii_target_next =
					    target_next(target);
				}
			} else {
				/*
				 * the slow target is not a router, we can
				 * just delete it. Send an icmp multicast and
				 * pick the fastest responder that is not
				 * already an active target. target_delete()
				 * adjusts pii->pii_target_next
				 */
				target_delete(target);
				probe(pii, PROBE_MULTI, cur_time);
			}
		} else {
			/*
			 * We can't meet the failure detection time.
			 * Log a message, and update the detection time to
			 * whatever we can achieve.
			 */
			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
			last_fdt_bumpup_time = gethrtime();
			if (pg != phyint_anongroup) {
				logerr("Cannot meet requested failure detection"
				    " time of %d ms on (%s %s) new failure"
				    " detection time for group \"%s\" is %d"
				    " ms\n", user_failure_detection_time,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name, pg->pg_fdt);
			}
		}
	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
	    (user_failure_detection_time < pg->pg_fdt) &&
	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
		/*
		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
		 * investigate if we can improve the failure detection time to
		 * meet whatever the user specified. This is only attempted
		 * when the current fdt is above the user specified value and
		 * MIN_SETTLING_TIME has passed since the fdt was last raised.
		 */
		if (check_pg_crtt_improved(pg)) {
			/* Never go below the user specified fdt */
			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
			    user_failure_detection_time);
			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
			if (pg != phyint_anongroup) {
				logerr("Improved failure detection time %d ms "
				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name);
			}
			if (user_failure_detection_time == pg->pg_fdt) {
				/* Avoid any truncation or rounding errors */
				pg->pg_probeint = user_probe_interval;
				/*
				 * No more rtt probes will be sent. The actual
				 * fdt has dropped to the user specified value.
				 * pii_fd_snxt_basetime and pii_snxt_basetime
				 * will be in sync henceforth.
				 */
				reset_snxt_basetimes();
			}
		}
	}
out:
	/* The reply is genuine: record the ack and the time it was seen */
	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;

	/*
	 * Update pii->pii_rack, i.e. the sequence number of the last received
	 * probe response, based on the echo reply we have received now, if
	 * either of the following conditions are satisfied.
	 * a. pii_rack is outside the current receive window of
	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
	 *    This means we have not received probe responses for a
	 *    long time, and the sequence number has wrapped around.
	 * b. pii_rack is within the current receive window and this echo
	 *    reply corresponds to the highest sequence number we have seen
	 *    so far.
	 */
	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
		pii->pii_rack = pr_icmp_seq;
	}
}
1023 
1024 /*
1025  * Returns true if seq is the highest unacknowledged seq for target tg
1026  * else returns false
1027  */
1028 static boolean_t
1029 highest_ack_tg(uint16_t seq, struct target *tg)
1030 {
1031 	struct phyint_instance *pii;
1032 	int	 pr_ndx;
1033 	uint16_t pr_seq;
1034 
1035 	pii = tg->tg_phyint_inst;
1036 
1037 	/*
1038 	 * Get the seq number of the most recent probe sent so far,
1039 	 * and also get the corresponding probe index in the probe stats
1040 	 * array.
1041 	 */
1042 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1043 	pr_seq = pii->pii_snxt;
1044 	pr_seq--;
1045 
1046 	/*
1047 	 * Start from the most recent probe and walk back, trying to find
1048 	 * an acked probe corresponding to target tg.
1049 	 */
1050 	for (; pr_ndx != pii->pii_probe_next;
1051 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1052 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
1053 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1054 			if (SEQ_GT(pr_seq, seq))
1055 				return (_B_FALSE);
1056 		}
1057 	}
1058 	return (_B_TRUE);
1059 }
1060 
1061 /*
1062  * Check whether the crtt for the group has improved by a factor of
1063  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1064  * detection time flapping in the face of small crtt changes.
1065  */
1066 static boolean_t
1067 check_pg_crtt_improved(struct phyint_group *pg)
1068 {
1069 	struct	phyint *pi;
1070 
1071 	if (debug & D_PROBE)
1072 		logdebug("check_pg_crtt_improved()\n");
1073 
1074 	/*
1075 	 * The crtt for the group is only improved if each phyint_instance
1076 	 * for both ipv4 and ipv6 is improved.
1077 	 */
1078 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1079 		if (!check_pii_crtt_improved(pi->pi_v4) ||
1080 		    !check_pii_crtt_improved(pi->pi_v6))
1081 			return (_B_FALSE);
1082 	}
1083 
1084 	return (_B_TRUE);
1085 }
1086 
1087 /*
1088  * Check whether the crtt has improved substantially on this phyint_instance.
1089  * Returns _B_TRUE if there's no crtt information available, because pii
1090  * is NULL or the phyint_instance is not capable of probing.
1091  */
1092 boolean_t
1093 check_pii_crtt_improved(struct phyint_instance *pii) {
1094 	struct 	target *tg;
1095 
1096 	if (pii == NULL)
1097 		return (_B_TRUE);
1098 
1099 	if (!PROBE_CAPABLE(pii) ||
1100 	    pii->pii_phyint->pi_state == PI_FAILED)
1101 		return (_B_TRUE);
1102 
1103 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1104 		if (tg->tg_status != TG_ACTIVE)
1105 			continue;
1106 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1107 		    LOWER_FDT_TRIGGER)) {
1108 			return (_B_FALSE);
1109 		}
1110 	}
1111 
1112 	return (_B_TRUE);
1113 }
1114 
1115 /*
1116  * This target responds very slowly to probes. The target's crtt exceeds
1117  * the probe interval of its group. Compare against other targets
1118  * and determine if this target is an exception, if so return true, else false
1119  */
1120 static boolean_t
1121 check_exception_target(struct phyint_instance *pii, struct target *target)
1122 {
1123 	struct	target *tg;
1124 	char abuf[INET6_ADDRSTRLEN];
1125 
1126 	if (debug & D_PROBE) {
1127 		logdebug("check_exception_target(%s %s target %s)\n",
1128 		    AF_STR(pii->pii_af), pii->pii_name,
1129 		    pr_addr(pii->pii_af, target->tg_address,
1130 			abuf, sizeof (abuf)));
1131 	}
1132 
1133 	/*
1134 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1135 	 * to make a good judgement. Otherwise don't drop this target.
1136 	 */
1137 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1138 		return (_B_FALSE);
1139 
1140 	/*
1141 	 * Determine whether only this particular target is slow.
1142 	 * We know that this target's crtt exceeds the group's probe interval.
1143 	 * If all other active targets have a
1144 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1145 	 * then this target is considered slow.
1146 	 */
1147 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1148 		if (tg != target && tg->tg_status == TG_ACTIVE) {
1149 			if (tg->tg_crtt >
1150 			    pii->pii_phyint->pi_group->pg_probeint /
1151 			    EXCEPTION_FACTOR) {
1152 				return (_B_FALSE);
1153 			}
1154 		}
1155 	}
1156 
1157 	return (_B_TRUE);
1158 }
1159 
1160 /*
1161  * Update the target list. The icmp all hosts multicast has given us
1162  * some host to which we can send probes. If we already have sufficient
1163  * targets, discard it.
1164  */
1165 static void
1166 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1167     struct in6_addr fromaddr)
1168 /* ARGSUSED */
1169 {
1170 	int af;
1171 	char abuf[INET6_ADDRSTRLEN];
1172 	struct phyint *pi;
1173 
1174 	if (debug & D_PROBE) {
1175 		logdebug("incoming_mcast_reply(%s %s %s)\n",
1176 		    AF_STR(pii->pii_af), pii->pii_name,
1177 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1178 	}
1179 
1180 	/*
1181 	 * Using host targets is a fallback mechanism. If we have
1182 	 * found a router, don't add this host target. If we already
1183 	 * know MAX_PROBE_TARGETS, don't add another target.
1184 	 */
1185 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1186 	if (pii->pii_targets != NULL) {
1187 		if (pii->pii_targets_are_routers ||
1188 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1189 			return;
1190 		}
1191 	}
1192 
1193 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1194 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1195 		/*
1196 		 * Guard against response from 0.0.0.0
1197 		 * and ::. Log a trace message
1198 		 */
1199 		logtrace("probe response from %s on %s\n",
1200 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1201 		    pii->pii_name);
1202 		return;
1203 	}
1204 
1205 	/*
1206 	 * This address is one of our own, so reject this address as a
1207 	 * valid probe target.
1208 	 */
1209 	af = pii->pii_af;
1210 	if (own_address(fromaddr))
1211 		return;
1212 
1213 	/*
1214 	 * If the phyint is part a named group, then add the address to all
1215 	 * members of the group.  Otherwise, add the address only to the
1216 	 * phyint itself, since other phyints in the anongroup may not be on
1217 	 * the same subnet.
1218 	 */
1219 	pi = pii->pii_phyint;
1220 	if (pi->pi_group == phyint_anongroup) {
1221 		target_add(pii, fromaddr, _B_FALSE);
1222 	} else {
1223 		pi = pi->pi_group->pg_phyint;
1224 		for (; pi != NULL; pi = pi->pi_pgnext)
1225 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1226 	}
1227 }
1228 
1229 /*
1230  * Compute CRTT given an existing scaled average, scaled deviation estimate
1231  * and a new rtt time.  The formula is from Jacobson and Karels'
1232  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1233  * are the same as those in Appendix A.2 of that paper.
1234  *
1235  * m = new measurement
1236  * sa = scaled RTT average (8 * average estimates)
1237  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1238  * crtt = Conservative round trip time. Used to determine whether probe
1239  * has timed out.
1240  *
1241  * New scaled average and deviation are passed back via sap and svp
1242  */
1243 static int
1244 compute_crtt(int *sap, int *svp, int m)
1245 {
1246 	int sa = *sap;
1247 	int sv = *svp;
1248 	int crtt;
1249 	int saved_m = m;
1250 
1251 	assert(*sap >= -1);
1252 	assert(*svp >= 0);
1253 
1254 	if (sa != -1) {
1255 		/*
1256 		 * Update average estimator:
1257 		 *	new rtt = old rtt + 1/8 Error
1258 		 *	    where Error = m - old rtt
1259 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1260 		 *	i.e. new sa =  old sa + Error
1261 		 */
1262 		m -= sa >> 3;		/* m is now Error in estimate. */
1263 		if ((sa += m) < 0) {
1264 			/* Don't allow the smoothed average to be negative. */
1265 			sa = 0;
1266 		}
1267 
1268 		/*
1269 		 * Update deviation estimator:
1270 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1271 		 *	i.e. 4 * new mdev = 4 * old mdev +
1272 		 *		(abs(Error) - old mdev)
1273 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1274 		 */
1275 		if (m < 0)
1276 			m = -m;
1277 		m -= sv >> 2;
1278 		sv += m;
1279 	} else {
1280 		/* Initialization. This is the first response received. */
1281 		sa = (m << 3);
1282 		sv = (m << 1);
1283 	}
1284 
1285 	crtt = (sa >> 3) + sv;
1286 
1287 	if (debug & D_PROBE) {
1288 		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
1289 		    "%d\n", saved_m, sa, sv, crtt);
1290 	}
1291 
1292 	*sap = sa;
1293 	*svp = sv;
1294 
1295 	/*
1296 	 * CRTT = average estimates  + 4 * deviation estimates
1297 	 *	= sa / 8 + sv
1298 	 */
1299 	return (crtt);
1300 }
1301 
1302 static void
1303 pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
1304 {
1305 	struct phyint_instance *pii = tg->tg_phyint_inst;
1306 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1307 	int sa = tg->tg_rtt_sa;
1308 	int sv = tg->tg_rtt_sd;
1309 	int new_crtt;
1310 	int i;
1311 
1312 	if (debug & D_PROBE)
1313 		logdebug("pi_set_crtt: target -  m %d\n", m);
1314 
1315 	/* store the round trip time, in case we need to defer computation */
1316 	tg->tg_deferred[tg->tg_num_deferred] = m;
1317 
1318 	new_crtt = compute_crtt(&sa, &sv, m);
1319 
1320 	/*
1321 	 * If this probe's round trip time would singlehandedly cause an
1322 	 * increase in the group's probe interval consider it suspect.
1323 	 */
1324 	if ((new_crtt > probe_interval) && is_probe_uni) {
1325 		if (debug & D_PROBE) {
1326 			logdebug("Received a suspect probe on %s, new_crtt ="
1327 			    " %d, probe_interval = %d, num_deferred = %d\n",
1328 			    pii->pii_probe_logint->li_name, new_crtt,
1329 			    probe_interval, tg->tg_num_deferred);
1330 		}
1331 
1332 		/*
1333 		 * If we've deferred as many rtts as we plan on deferring, then
1334 		 * assume the link really did slow down and process all queued
1335 		 * rtts
1336 		 */
1337 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1338 			if (debug & D_PROBE) {
1339 				logdebug("Received MAXDEFERREDRTT probes which "
1340 				    "would cause an increased probe_interval.  "
1341 				    "Integrating queued rtt data points.\n");
1342 			}
1343 
1344 			for (i = 0; i <= tg->tg_num_deferred; i++) {
1345 				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
1346 				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
1347 			}
1348 
1349 			tg->tg_num_deferred = 0;
1350 		} else {
1351 			tg->tg_num_deferred++;
1352 		}
1353 		return;
1354 	}
1355 
1356 	/*
1357 	 * If this is a normal probe, or an RTT probe that would lead to a
1358 	 * reduced CRTT, then update our CRTT data.  Further, if this was
1359 	 * a normal probe, pitch any deferred probes since our probes are
1360 	 * again being answered within our CRTT estimates.
1361 	 */
1362 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1363 		tg->tg_rtt_sa = sa;
1364 		tg->tg_rtt_sd = sv;
1365 		tg->tg_crtt = new_crtt;
1366 		if (is_probe_uni)
1367 			tg->tg_num_deferred = 0;
1368 	}
1369 }
1370 
/*
 * Look through the ancillary data of `msg' for an IPPROTO_IPV6 level object
 * of type `cmsg_type'. Returns a pointer to the data of the first such
 * object, or NULL if there is none.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == IPPROTO_IPV6 &&
		    hdr->cmsg_type == cmsg_type) {
			return (CMSG_DATA(hdr));
		}
		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
1389 
1390 /*
1391  * See if a previously failed interface has started working again.
1392  */
1393 void
1394 phyint_check_for_repair(struct phyint *pi)
1395 {
1396 	if (phyint_repaired(pi)) {
1397 		if (pi->pi_group == phyint_anongroup) {
1398 			logerr("NIC repair detected on %s\n", pi->pi_name);
1399 		} else {
1400 			logerr("NIC repair detected on %s of group %s\n",
1401 			    pi->pi_name, pi->pi_group->pg_name);
1402 		}
1403 
1404 		/*
1405 		 * If the interface is offline, just clear the FAILED flag,
1406 		 * delaying the state change and failback operation until it
1407 		 * is brought back online.
1408 		 */
1409 		if (pi->pi_state == PI_OFFLINE) {
1410 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1411 			return;
1412 		}
1413 
1414 		if (pi->pi_flags & IFF_STANDBY) {
1415 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1416 		} else {
1417 			if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) {
1418 				(void) change_lif_flags(pi,
1419 				    IFF_FAILED, _B_FALSE);
1420 				/* Per state diagram */
1421 				pi->pi_empty = 0;
1422 			}
1423 		}
1424 
1425 		phyint_chstate(pi, PI_RUNNING);
1426 
1427 		if (GROUP_FAILED(pi->pi_group)) {
1428 			/*
1429 			 * This is the 1st phyint to receive a response
1430 			 * after group failure.
1431 			 */
1432 			logerr("At least 1 interface (%s) of group %s has "
1433 			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
1434 			phyint_group_chstate(pi->pi_group, PG_RUNNING);
1435 		}
1436 	}
1437 }
1438 
1439 /*
1440  * See if a previously functioning interface has failed, or if the
1441  * whole group of interfaces has failed.
1442  */
1443 static void
1444 phyint_inst_check_for_failure(struct phyint_instance *pii)
1445 {
1446 	struct	phyint	*pi;
1447 	struct	phyint	*pi2;
1448 
1449 	pi = pii->pii_phyint;
1450 
1451 	switch (failure_state(pii)) {
1452 	case PHYINT_FAILURE:
1453 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1454 		if (pi->pi_group == phyint_anongroup) {
1455 			logerr("NIC failure detected on %s\n", pii->pii_name);
1456 		} else {
1457 			logerr("NIC failure detected on %s of group %s\n",
1458 			    pii->pii_name, pi->pi_group->pg_name);
1459 		}
1460 		/*
1461 		 * Do the failover, unless the interface is offline (in
1462 		 * which case we've already failed over).
1463 		 */
1464 		if (pi->pi_state != PI_OFFLINE) {
1465 			phyint_chstate(pi, PI_FAILED);
1466 			reset_crtt_all(pi);
1467 			if (!(pi->pi_flags & IFF_INACTIVE))
1468 				(void) try_failover(pi, FAILOVER_NORMAL);
1469 		}
1470 		break;
1471 
1472 	case GROUP_FAILURE:
1473 		logerr("All Interfaces in group %s have failed\n",
1474 		    pi->pi_group->pg_name);
1475 		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
1476 		    pi2 = pi2->pi_pgnext) {
1477 			if (pi2->pi_flags & IFF_OFFLINE)
1478 				continue;
1479 			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
1480 			reset_crtt_all(pi2);
1481 
1482 			/*
1483 			 * In the case of host targets, we
1484 			 * would have flushed the targets,
1485 			 * and gone to PI_NOTARGETS state.
1486 			 */
1487 			if (pi2->pi_state == PI_RUNNING)
1488 				phyint_chstate(pi2, PI_FAILED);
1489 
1490 			pi2->pi_empty = 0;
1491 			pi2->pi_full = 0;
1492 		}
1493 		break;
1494 
1495 	default:
1496 		break;
1497 	}
1498 }
1499 
1500 /*
1501  * Determines if any timeout event has occurred and returns the number of
1502  * milliseconds until the next timeout event for the phyint. Returns
1503  * TIMER_INFINITY for "never".
1504  */
1505 uint_t
1506 phyint_inst_timer(struct phyint_instance *pii)
1507 {
1508 	int 	pr_ndx;
1509 	uint_t	timeout;
1510 	struct	target	*cur_tg;
1511 	struct	probe_stats *pr_statp;
1512 	struct	phyint_instance *pii_other;
1513 	struct	phyint *pi;
1514 	int	valid_unack_count;
1515 	int	i;
1516 	int	interval;
1517 	uint_t	check_time;
1518 	uint_t	cur_time;
1519 	hrtime_t cur_hrtime;
1520 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1521 
1522 	cur_time = getcurrenttime();
1523 
1524 	if (debug & D_TIMER) {
1525 		logdebug("phyint_inst_timer(%s %s)\n",
1526 		    AF_STR(pii->pii_af), pii->pii_name);
1527 	}
1528 
1529 	pii_other = phyint_inst_other(pii);
1530 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1531 		/*
1532 		 * Check to see if we're here due to link up/down flapping; If
1533 		 * enough time has passed, then try to bring the interface
1534 		 * back up; otherwise, schedule a timer to bring it back up
1535 		 * when enough time *has* elapsed.
1536 		 */
1537 		pi = pii->pii_phyint;
1538 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1539 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1540 			if (check_time > cur_time)
1541 				return (check_time - cur_time);
1542 
1543 			phyint_check_for_repair(pi);
1544 		}
1545 	}
1546 
1547 	/*
1548 	 * If this phyint is not yet initialized for probes,
1549 	 * don't proceed further
1550 	 */
1551 	if (pii->pii_probe_sock == -1)
1552 		return (TIMER_INFINITY);
1553 
1554 	/*
1555 	 * If the timer has fired too soon, probably triggered
1556 	 * by some other phyint instance, return the remaining
1557 	 * time
1558 	 */
1559 	if (TIME_LT(cur_time, pii->pii_snxt_time))
1560 		return (pii->pii_snxt_time - cur_time);
1561 
1562 	/*
1563 	 * If the link is down, don't send any probes for now.
1564 	 */
1565 	if (LINK_DOWN(pii->pii_phyint))
1566 		return (TIMER_INFINITY);
1567 
1568 	/*
1569 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1570 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1571 	 * Base probe time is strictly periodic.
1572 	 */
1573 	interval = GET_RANDOM(
1574 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1575 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1576 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1577 
1578 	/*
1579 	 * Check if the current time > next time to probe. If so, we missed
1580 	 * sending 1 or more probes, probably due to heavy system load. At least
1581 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1582 	 * were scheduled. Make adjustments to the times, in multiples of
1583 	 * user_probe_interval.
1584 	 */
1585 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1586 		int n;
1587 
1588 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1589 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1590 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1591 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1592 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1593 		    pii->pii_snxt_basetime);
1594 
1595 		/* Collect statistics about missed probes */
1596 		probes_missed.pm_nprobes += n + 1;
1597 		probes_missed.pm_ntimes++;
1598 	}
1599 	pii->pii_snxt_basetime += user_probe_interval;
1600 	interval = pii->pii_snxt_time - cur_time;
1601 	if (debug & D_TARGET) {
1602 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1603 		    " interval %u\n", cur_time, pii->pii_snxt_time,
1604 		    pii->pii_snxt_basetime, interval);
1605 	}
1606 
1607 	/*
1608 	 * If no targets are known, we need to send an ICMP multicast. The
1609 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1610 	 * to see if we found a target.
1611 	 */
1612 	if (pii->pii_target_next == NULL) {
1613 		assert(pii->pii_ntargets == 0);
1614 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1615 		probe(pii, PROBE_MULTI, cur_time);
1616 		return (interval);
1617 	}
1618 
1619 	if ((user_probe_interval != probe_interval) &&
1620 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1621 		/*
1622 		 * the failure detection (fd) probe timer has not yet fired.
1623 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1624 		 */
1625 		probe(pii, PROBE_RTT, cur_time);
1626 		return (interval);
1627 	}
1628 	/*
1629 	 * the fd probe timer has fired. Need to do all failure
1630 	 * detection / recovery calculations, and then send an fd probe
1631 	 * of type PROBE_UNI.
1632 	 */
1633 	if (user_probe_interval == probe_interval) {
1634 		/*
1635 		 * We could have missed some probes, and then adjusted
1636 		 * pii_snxt_basetime above. Otherwise we could have
1637 		 * blindly added probe_interval to pii_fd_snxt_basetime.
1638 		 */
1639 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1640 	} else {
1641 		pii->pii_fd_snxt_basetime += probe_interval;
1642 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1643 			int n;
1644 
1645 			n = (cur_time - pii->pii_fd_snxt_basetime) /
1646 			    probe_interval;
1647 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1648 		}
1649 	}
1650 
1651 	/*
1652 	 * We can have at most, the latest 2 probes that we sent, in
1653 	 * the PR_UNACKED state. All previous probes sent, are either
1654 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1655 	 * timed out if the probe's time_sent + the CRTT < currenttime.
1656 	 * For each of the last 2 probes, examine whether it has timed
1657 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1658 	 */
1659 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1660 	valid_unack_count = 0;
1661 
1662 	for (i = 0; i < 2; i++) {
1663 		pr_statp = &pii->pii_probes[pr_ndx];
1664 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1665 		switch (pr_statp->pr_status) {
1666 		case PR_ACKED:
1667 			/*
1668 			 * We received back an ACK, so the switch clearly
1669 			 * is not dropping our traffic, and thus we can
1670 			 * enable failure detection immediately.
1671 			 */
1672 			if (pii->pii_fd_hrtime > gethrtime()) {
1673 				if (debug & D_PROBE) {
1674 					logdebug("successful probe on %s; "
1675 					    "ending quiet period\n",
1676 					    pii->pii_phyint->pi_name);
1677 				}
1678 				pii->pii_fd_hrtime = gethrtime();
1679 			}
1680 			break;
1681 
1682 		case PR_UNACKED:
1683 			assert(cur_tg != NULL);
1684 			/*
1685 			 * The crtt could be zero for some reason,
1686 			 * Eg. the phyint could be failed. If the crtt is
1687 			 * not available use group's probe interval,
1688 			 * which is a worst case estimate.
1689 			 */
1690 			if (cur_tg->tg_crtt != 0) {
1691 				timeout = pr_statp->pr_time_sent +
1692 				    cur_tg->tg_crtt;
1693 			} else {
1694 				timeout = pr_statp->pr_time_sent +
1695 				    probe_interval;
1696 			}
1697 			if (TIME_LT(timeout, cur_time)) {
1698 				pr_statp->pr_status = PR_LOST;
1699 				pr_statp->pr_time_lost = timeout;
1700 			} else if (i == 1) {
1701 				/*
1702 				 * We are forced to consider this probe
1703 				 * lost, as we can have at most 2 unack.
1704 				 * probes any time, and we will be sending a
1705 				 * probe at the end of this function.
1706 				 * Normally, we should not be here, but
1707 				 * this can happen if an incoming response
1708 				 * that was considered lost has increased
1709 				 * the crtt for this target, and also bumped
1710 				 * up the FDT. Note that we never cancel or
1711 				 * increase the current pii_time_left, so
1712 				 * when the timer fires, we find 2 valid
1713 				 * unacked probes, and they are yet to timeout
1714 				 */
1715 				pr_statp->pr_status = PR_LOST;
1716 				pr_statp->pr_time_lost = cur_time;
1717 			} else {
1718 				/*
1719 				 * Only the most recent probe can enter
1720 				 * this 'else' arm. The second most recent
1721 				 * probe must take either of the above arms,
1722 				 * if it is unacked.
1723 				 */
1724 				valid_unack_count++;
1725 			}
1726 			break;
1727 		}
1728 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1729 	}
1730 
1731 	/*
1732 	 * We send out 1 probe randomly in the interval between one half
1733 	 * and one probe interval for the group. Given that the CRTT is always
1734 	 * less than the group's probe interval, we can have at most 1
1735 	 * unacknowledged probe now.  All previous probes are either lost or
1736 	 * acked.
1737 	 */
1738 	assert(valid_unack_count == 0 || valid_unack_count == 1);
1739 
1740 	/*
1741 	 * The timer has fired. Take appropriate action depending
1742 	 * on the current state of the phyint.
1743 	 *
1744 	 * PI_RUNNING state 	- Failure detection and failover
1745 	 * PI_FAILED state 	- Repair detection and failback
1746 	 */
1747 	switch (pii->pii_phyint->pi_state) {
1748 	case PI_FAILED:
1749 		/*
1750 		 * If the most recent probe (excluding unacked probes that
1751 		 * are yet to time out) has been acked, check whether the
1752 		 * phyint is now repaired. If the phyint is repaired, then
1753 		 * attempt failback, unless it is an inactive standby.
1754 		 */
1755 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1756 			phyint_check_for_repair(pii->pii_phyint);
1757 		}
1758 		break;
1759 
1760 	case PI_RUNNING:
1761 		/*
1762 		 * It's possible our probes have been lost because of a
1763 		 * spanning-tree mandated quiet period on the switch.  If so,
1764 		 * ignore the lost probes and consider the interface to still
1765 		 * be functioning.
1766 		 */
1767 		cur_hrtime = gethrtime();
1768 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1769 			break;
1770 
1771 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1772 			/*
1773 			 * We have 1 or more failed probes (excluding unacked
1774 			 * probes that are yet to time out). Determine if the
1775 			 * phyint has failed. If so attempt a failover,
1776 			 * unless it is an inactive standby
1777 			 */
1778 			phyint_inst_check_for_failure(pii);
1779 		}
1780 		break;
1781 
1782 	default:
1783 		logerr("phyint_inst_timer: invalid state %d\n",
1784 		    pii->pii_phyint->pi_state);
1785 		abort();
1786 	}
1787 
1788 	/*
1789 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1790 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1791 	 * was called, the target list may be empty.
1792 	 */
1793 	if (pii->pii_target_next != NULL) {
1794 		probe(pii, PROBE_UNI, cur_time);
1795 		/*
1796 		 * If we have just the one probe target, and we're not using
1797 		 * router targets, try to find another as we presently have
1798 		 * no resilience.
1799 		 */
1800 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1801 			probe(pii, PROBE_MULTI, cur_time);
1802 	} else {
1803 		probe(pii, PROBE_MULTI, cur_time);
1804 	}
1805 	return (interval);
1806 }
1807 
1808 /*
1809  * Start the probe timer for an interface instance.
1810  */
1811 void
1812 start_timer(struct phyint_instance *pii)
1813 {
1814 	uint32_t interval;
1815 
1816 	/*
1817 	 * Spread the base probe times (pi_snxt_basetime) across phyints
1818 	 * uniformly over the (curtime..curtime + the group's probe_interval).
1819 	 * pi_snxt_basetime is strictly periodic with a frequency of
1820 	 * the group's probe interval. The actual probe time pi_snxt_time
1821 	 * adds some randomness to pi_snxt_basetime and happens in probe().
1822 	 * For the 1st probe on each phyint after the timer is started,
1823 	 * pi_snxt_time and pi_snxt_basetime are the same.
1824 	 */
1825 	interval = GET_RANDOM(0,
1826 	    (int)pii->pii_phyint->pi_group->pg_probeint);
1827 
1828 	pii->pii_snxt_basetime = getcurrenttime() + interval;
1829 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1830 	pii->pii_snxt_time = pii->pii_snxt_basetime;
1831 	timer_schedule(interval);
1832 }
1833 
1834 /*
1835  * Restart the probe timer on an interface instance.
1836  */
1837 static void
1838 restart_timer(struct phyint_instance *pii)
1839 {
1840 	/*
1841 	 * We don't need to restart the timer if it was never started in
1842 	 * the first place (pii->pii_basetime_inited not set), as the timer
1843 	 * won't have gone off yet.
1844 	 */
1845 	if (pii->pii_basetime_inited != 0) {
1846 
1847 		if (debug & D_LINKNOTE)
1848 			logdebug("restart timer: restarting timer on %s, "
1849 			    "address family %s\n", pii->pii_phyint->pi_name,
1850 			    AF_STR(pii->pii_af));
1851 
1852 		start_timer(pii);
1853 	}
1854 }
1855 
1856 static void
1857 process_link_state_down(struct phyint *pi)
1858 {
1859 	logerr("The link has gone down on %s\n", pi->pi_name);
1860 
1861 	/*
1862 	 * Clear the probe statistics arrays, we don't want the repair
1863 	 * detection logic relying on probes that were succesful prior
1864 	 *  to the link going down.
1865 	 */
1866 	if (PROBE_CAPABLE(pi->pi_v4))
1867 		clear_pii_probe_stats(pi->pi_v4);
1868 	if (PROBE_CAPABLE(pi->pi_v6))
1869 		clear_pii_probe_stats(pi->pi_v6);
1870 	/*
1871 	 * Check for interface failure.  Although we know the interface
1872 	 * has failed, we don't know if all the other interfaces in the
1873 	 * group have failed as well.
1874 	 */
1875 	if ((pi->pi_state == PI_RUNNING) ||
1876 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1877 		if (debug & D_LINKNOTE) {
1878 			logdebug("process_link_state_down:"
1879 			    " checking for failure on %s\n", pi->pi_name);
1880 		}
1881 
1882 		if (pi->pi_v4 != NULL)
1883 			phyint_inst_check_for_failure(pi->pi_v4);
1884 		else if (pi->pi_v6 != NULL)
1885 			phyint_inst_check_for_failure(pi->pi_v6);
1886 	}
1887 }
1888 
1889 static void
1890 process_link_state_up(struct phyint *pi)
1891 {
1892 	logerr("The link has come up on %s\n", pi->pi_name);
1893 
1894 	/*
1895 	 * We stopped any running timers on each instance when the link
1896 	 * went down, so restart them.
1897 	 */
1898 	if (pi->pi_v4)
1899 		restart_timer(pi->pi_v4);
1900 	if (pi->pi_v6)
1901 		restart_timer(pi->pi_v6);
1902 
1903 	phyint_check_for_repair(pi);
1904 
1905 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1906 	if (pi->pi_whendx == LINK_UP_PERMIN)
1907 		pi->pi_whendx = 0;
1908 }
1909 
1910 /*
1911  * Process any changes in link state passed up from the interfaces.
1912  */
1913 void
1914 process_link_state_changes(void)
1915 {
1916 	struct phyint *pi;
1917 
1918 	/* Look for interfaces where the link state has just changed */
1919 
1920 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1921 		boolean_t old_link_state_up = LINK_UP(pi);
1922 
1923 		/*
1924 		 * Except when the "phyint" structure is created, this is
1925 		 * the only place the link state is updated.  This allows
1926 		 * this routine to detect changes in link state, rather
1927 		 * than just the current state.
1928 		 */
1929 		UPDATE_LINK_STATE(pi);
1930 
1931 		if (LINK_DOWN(pi)) {
1932 			/*
1933 			 * Has link just gone down?
1934 			 */
1935 			if (old_link_state_up)
1936 				process_link_state_down(pi);
1937 		} else {
1938 			/*
1939 			 * Has link just gone back up?
1940 			 */
1941 			if (!old_link_state_up)
1942 				process_link_state_up(pi);
1943 		}
1944 	}
1945 }
1946 
1947 void
1948 reset_crtt_all(struct phyint *pi)
1949 {
1950 	struct phyint_instance *pii;
1951 	struct target *tg;
1952 
1953 	pii = pi->pi_v4;
1954 	if (pii != NULL) {
1955 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1956 			tg->tg_crtt = 0;
1957 			tg->tg_rtt_sa = -1;
1958 			tg->tg_rtt_sd = 0;
1959 		}
1960 	}
1961 
1962 	pii = pi->pi_v6;
1963 	if (pii != NULL) {
1964 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1965 			tg->tg_crtt = 0;
1966 			tg->tg_rtt_sa = -1;
1967 			tg->tg_rtt_sd = 0;
1968 		}
1969 	}
1970 }
1971 
1972 /*
1973  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
1974  * probes on both instances IPv4 and IPv6.
1975  * If the interface has failed, return the time of the first probe failure
1976  * in "tff".
1977  */
1978 static int
1979 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
1980 {
1981 	uint_t	pi_tff;
1982 	struct	target *cur_tg;
1983 	struct	probe_fail_count pfinfo;
1984 	struct	phyint_instance *pii_other;
1985 	int	pr_ndx;
1986 
1987 	/*
1988 	 * Get the number of consecutive failed probes on
1989 	 * this phyint across all targets. Also get the number
1990 	 * of consecutive failed probes on this target only
1991 	 */
1992 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1993 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
1994 	probe_fail_info(pii, cur_tg, &pfinfo);
1995 
1996 	/* Get the time of first failure, for later use */
1997 	pi_tff = pfinfo.pf_tff;
1998 
1999 	/*
2000 	 * If the current target has not responded to the
2001 	 * last NUM_PROBE_FAILS probes, and other targets are
2002 	 * responding delete this target. Dead gateway detection
2003 	 * will eventually remove this target (if router) from the
2004 	 * routing tables. If that does not occur, we may end
2005 	 * up adding this to our list again.
2006 	 */
2007 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2008 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2009 		if (pii->pii_targets_are_routers) {
2010 			if (cur_tg->tg_status == TG_ACTIVE)
2011 				pii->pii_ntargets--;
2012 			cur_tg->tg_status = TG_DEAD;
2013 			cur_tg->tg_crtt = 0;
2014 			cur_tg->tg_rtt_sa = -1;
2015 			cur_tg->tg_rtt_sd = 0;
2016 			if (pii->pii_target_next == cur_tg)
2017 				pii->pii_target_next = target_next(cur_tg);
2018 		} else {
2019 			target_delete(cur_tg);
2020 			probe(pii, PROBE_MULTI, getcurrenttime());
2021 		}
2022 		return (PHYINT_OK);
2023 	}
2024 
2025 	/*
2026 	 * If the phyint has lost NUM_PROBE_FAILS or more
2027 	 * consecutive probes, on both IPv4 and IPv6 protocol
2028 	 * instances of the phyint, then trigger failure
2029 	 * detection, else return false
2030 	 */
2031 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2032 		return (PHYINT_OK);
2033 
2034 	pii_other = phyint_inst_other(pii);
2035 	if (PROBE_CAPABLE(pii_other)) {
2036 		probe_fail_info(pii_other, NULL, &pfinfo);
2037 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2038 			/*
2039 			 * We have NUM_PROBE_FAILS or more failures
2040 			 * on both IPv4 and IPv6. Get the earliest
2041 			 * time when failure was detected on this
2042 			 * phyint across IPv4 and IPv6.
2043 			 */
2044 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2045 				pi_tff = pfinfo.pf_tff;
2046 		} else {
2047 			/*
2048 			 * This instance has < NUM_PROBE_FAILS failure.
2049 			 * So return false
2050 			 */
2051 			return (PHYINT_OK);
2052 		}
2053 	}
2054 	*tff = pi_tff;
2055 	return (PHYINT_FAILURE);
2056 }
2057 
2058 /*
2059  * Check if the link has gone down on this phyint, or it has failed the
2060  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2061  * Also look at other phyints of this group, for group failures.
2062  */
2063 int
2064 failure_state(struct phyint_instance *pii)
2065 {
2066 	struct	probe_success_count psinfo;
2067 	uint_t	pi2_tls;		/* time last success */
2068 	uint_t	pi_tff;			/* time first fail */
2069 	struct	phyint	*pi2;
2070 	struct	phyint *pi;
2071 	struct	phyint_instance *pii2;
2072 	struct  phyint_group *pg;
2073 	boolean_t alone;
2074 
2075 	if (debug & D_FAILOVER)
2076 		logdebug("phyint_failed(%s)\n", pii->pii_name);
2077 
2078 	pi = pii->pii_phyint;
2079 	pg = pi->pi_group;
2080 
2081 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2082 		PHYINT_OK)
2083 		return (PHYINT_OK);
2084 
2085 	/*
2086 	 * At this point, the link is down, or the phyint is suspect,
2087 	 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
2088 	 * does not belong to any group, or is the only member of the
2089 	 * group capable of being probed, return PHYINT_FAILURE.
2090 	 */
2091 	alone = _B_TRUE;
2092 	if (pg != phyint_anongroup) {
2093 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2094 			if (pi2 == pi)
2095 				continue;
2096 			if (PROBE_CAPABLE(pi2->pi_v4) ||
2097 			    PROBE_CAPABLE(pi2->pi_v6)) {
2098 				alone = _B_FALSE;
2099 				break;
2100 			}
2101 		}
2102 	}
2103 	if (alone)
2104 		return (PHYINT_FAILURE);
2105 
2106 	/*
2107 	 * Need to compare against other phyints of the same group
2108 	 * to exclude group failures. If the failure was detected via
2109 	 * probing, then if the time of last success (tls) of any
2110 	 * phyint is more recent than the time of first fail (tff) of the
2111 	 * phyint in question, and the link is up on the phyint,
2112 	 * then it is a phyint failure. Otherwise it is a group failure.
2113 	 * If failure was detected via a link down notification sent from
2114 	 * the driver to IP, we see if any phyints in the group are still
2115 	 * running and haven't received a link down notification.  We
2116 	 * will usually be processing the link down notification shortly
2117 	 * after it was received, so there is no point looking at the tls
2118 	 * of other phyints.
2119 	 */
2120 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2121 		/* Exclude ourself from comparison */
2122 		if (pi2 == pi)
2123 			continue;
2124 
2125 		if (LINK_DOWN(pi)) {
2126 			/*
2127 			 * We use FLAGS_TO_LINK_STATE() to test the
2128 			 * flags directly, rather then LINK_UP() or
2129 			 * LINK_DOWN(), as we may not have got round
2130 			 * to processing the link state for the other
2131 			 * phyints in the group yet.
2132 			 *
2133 			 * The check for PI_RUNNING and group
2134 			 * failure handles the case when the
2135 			 * group begins to recover.  The first
2136 			 * phyint to recover should not trigger
2137 			 * a failover from the soon-to-recover
2138 			 * other phyints to the first recovered
2139 			 * phyint. PI_RUNNING will be set, and
2140 			 * pg_groupfailed cleared only after
2141 			 * receipt of NUM_PROBE_REPAIRS, by
2142 			 * which time the other phyints should
2143 			 * have received at least 1 packet,
2144 			 * and so will not have NUM_PROBE_FAILS.
2145 			 */
2146 			if ((pi2->pi_state == PI_RUNNING) &&
2147 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
2148 				return (PHYINT_FAILURE);
2149 		} else {
2150 			/*
2151 			 * Need to compare against both IPv4 and
2152 			 * IPv6 instances.
2153 			 */
2154 			pii2 = pi2->pi_v4;
2155 			if (pii2 != NULL) {
2156 				probe_success_info(pii2, NULL, &psinfo);
2157 				if (psinfo.ps_tls_valid) {
2158 					pi2_tls = psinfo.ps_tls;
2159 					/*
2160 					 * See comment above regarding check
2161 					 * for PI_RUNNING and group failure.
2162 					 */
2163 					if (TIME_GT(pi2_tls, pi_tff) &&
2164 					    (pi2->pi_state == PI_RUNNING) &&
2165 					    !GROUP_FAILED(pg) &&
2166 					    FLAGS_TO_LINK_STATE(pi2))
2167 						return (PHYINT_FAILURE);
2168 				}
2169 			}
2170 
2171 			pii2 = pi2->pi_v6;
2172 			if (pii2 != NULL) {
2173 				probe_success_info(pii2, NULL, &psinfo);
2174 				if (psinfo.ps_tls_valid) {
2175 					pi2_tls = psinfo.ps_tls;
2176 					/*
2177 					 * See comment above regarding check
2178 					 * for PI_RUNNING and group failure.
2179 					 */
2180 					if (TIME_GT(pi2_tls, pi_tff) &&
2181 					    (pi2->pi_state == PI_RUNNING) &&
2182 					    !GROUP_FAILED(pg) &&
2183 					    FLAGS_TO_LINK_STATE(pi2))
2184 						return (PHYINT_FAILURE);
2185 				}
2186 			}
2187 		}
2188 	}
2189 
2190 	/*
2191 	 * Change the group state to PG_FAILED if it's not already.
2192 	 */
2193 	if (!GROUP_FAILED(pg))
2194 		phyint_group_chstate(pg, PG_FAILED);
2195 
2196 	return (GROUP_FAILURE);
2197 }
2198 
2199 /*
2200  * Return the information associated with consecutive probe successes
2201  * starting with the most recent probe. At most the last 2 probes can be
2202  * in the unacknowledged state. All previous probes have either failed
2203  * or succeeded.
2204  */
2205 static void
2206 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2207     struct probe_success_count *psinfo)
2208 {
2209 	uint_t	i;
2210 	struct probe_stats *pr_statp;
2211 	uint_t most_recent;
2212 	uint_t second_most_recent;
2213 	boolean_t pi_found_failure = _B_FALSE;
2214 	boolean_t tg_found_failure = _B_FALSE;
2215 	uint_t now;
2216 	uint_t timeout;
2217 	struct target *tg;
2218 
2219 	if (debug & D_FAILOVER)
2220 		logdebug("probe_success_info(%s)\n", pii->pii_name);
2221 
2222 	bzero(psinfo, sizeof (*psinfo));
2223 	now = getcurrenttime();
2224 
2225 	/*
2226 	 * Start with the most recent probe, and count the number
2227 	 * of consecutive probe successes. Latch the number of successes
2228 	 * on hitting a failure.
2229 	 */
2230 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2231 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2232 
2233 	for (i = most_recent; i != pii->pii_probe_next;
2234 	    i = PROBE_INDEX_PREV(i)) {
2235 		pr_statp = &pii->pii_probes[i];
2236 
2237 		switch (pr_statp->pr_status) {
2238 		case PR_UNACKED:
2239 			/*
2240 			 * Only the most recent 2 probes can be unacknowledged
2241 			 */
2242 			assert(i == most_recent || i == second_most_recent);
2243 
2244 			tg = pr_statp->pr_target;
2245 			assert(tg != NULL);
2246 			/*
2247 			 * The crtt could be zero for some reason,
2248 			 * Eg. the phyint could be failed. If the crtt is
2249 			 * not available use the value of the group's probe
2250 			 * interval which is a worst case estimate.
2251 			 */
2252 			if (tg->tg_crtt != 0) {
2253 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2254 			} else {
2255 				timeout = pr_statp->pr_time_sent +
2256 				    pii->pii_phyint->pi_group->pg_probeint;
2257 			}
2258 
2259 			if (TIME_LT(timeout, now)) {
2260 				/*
2261 				 * We hit a failure. Latch the total number of
2262 				 * recent consecutive successes.
2263 				 */
2264 				pr_statp->pr_time_lost = timeout;
2265 				pr_statp->pr_status = PR_LOST;
2266 				pi_found_failure = _B_TRUE;
2267 				if (cur_tg != NULL && tg == cur_tg) {
2268 					/*
2269 					 * We hit a failure for the desired
2270 					 * target. Latch the number of recent
2271 					 * consecutive successes for this target
2272 					 */
2273 					tg_found_failure = _B_TRUE;
2274 				}
2275 			}
2276 			break;
2277 
2278 		case PR_ACKED:
2279 			/*
2280 			 * Bump up the count of probe successes, if we
2281 			 * have not seen any failure so far.
2282 			 */
2283 			if (!pi_found_failure)
2284 				psinfo->ps_nsucc++;
2285 
2286 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2287 			    !tg_found_failure) {
2288 				psinfo->ps_nsucc_tg++;
2289 			}
2290 
2291 			/*
2292 			 * Record the time of last success, if this is
2293 			 * the most recent probe success.
2294 			 */
2295 			if (!psinfo->ps_tls_valid) {
2296 				psinfo->ps_tls = pr_statp->pr_time_acked;
2297 				psinfo->ps_tls_valid = _B_TRUE;
2298 			}
2299 			break;
2300 
2301 		case PR_LOST:
2302 			/*
2303 			 * We hit a failure. Latch the total number of
2304 			 * recent consecutive successes.
2305 			 */
2306 			pi_found_failure = _B_TRUE;
2307 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2308 				/*
2309 				 * We hit a failure for the desired target.
2310 				 * Latch the number of recent consecutive
2311 				 * successes for this target
2312 				 */
2313 				tg_found_failure = _B_TRUE;
2314 			}
2315 			break;
2316 
2317 		default:
2318 			return;
2319 
2320 		}
2321 	}
2322 }
2323 
2324 /*
2325  * Return the information associated with consecutive probe failures
2326  * starting with the most recent probe. Only the last 2 probes can be in the
2327  * unacknowledged state. All previous probes have either failed or succeeded.
2328  */
2329 static void
2330 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2331     struct probe_fail_count *pfinfo)
2332 {
2333 	int	i;
2334 	struct probe_stats *pr_statp;
2335 	boolean_t	tg_found_success = _B_FALSE;
2336 	boolean_t	pi_found_success = _B_FALSE;
2337 	int	most_recent;
2338 	int	second_most_recent;
2339 	uint_t	now;
2340 	uint_t	timeout;
2341 	struct	target *tg;
2342 
2343 	if (debug & D_FAILOVER)
2344 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2345 
2346 	bzero(pfinfo, sizeof (*pfinfo));
2347 	now = getcurrenttime();
2348 
2349 	/*
2350 	 * Start with the most recent probe, and count the number
2351 	 * of consecutive probe failures. Latch the number of failures
2352 	 * on hitting a probe success.
2353 	 */
2354 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2355 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2356 
2357 	for (i = most_recent; i != pii->pii_probe_next;
2358 	    i = PROBE_INDEX_PREV(i)) {
2359 		pr_statp = &pii->pii_probes[i];
2360 
2361 		assert(PR_STATUS_VALID(pr_statp->pr_status));
2362 
2363 		switch (pr_statp->pr_status) {
2364 		case PR_UNACKED:
2365 			/*
2366 			 * Only the most recent 2 probes can be unacknowledged
2367 			 */
2368 			assert(i == most_recent || i == second_most_recent);
2369 
2370 			tg = pr_statp->pr_target;
2371 			/*
2372 			 * Target is guaranteed to exist in the unack. state
2373 			 */
2374 			assert(tg != NULL);
2375 			/*
2376 			 * The crtt could be zero for some reason,
2377 			 * Eg. the phyint could be failed. If the crtt is
2378 			 * not available use the group's probe interval,
2379 			 * which is a worst case estimate.
2380 			 */
2381 			if (tg->tg_crtt != 0) {
2382 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2383 			} else {
2384 				timeout = pr_statp->pr_time_sent +
2385 				    pii->pii_phyint->pi_group->pg_probeint;
2386 			}
2387 
2388 			if (TIME_GT(timeout, now))
2389 				break;
2390 
2391 			pr_statp->pr_time_lost = timeout;
2392 			pr_statp->pr_status = PR_LOST;
2393 			/* FALLTHRU */
2394 
2395 		case PR_LOST:
2396 			if (!pi_found_success) {
2397 				pfinfo->pf_nfail++;
2398 				pfinfo->pf_tff = pr_statp->pr_time_lost;
2399 			}
2400 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2401 			    !tg_found_success)  {
2402 				pfinfo->pf_nfail_tg++;
2403 			}
2404 			break;
2405 
2406 		default:
2407 			/*
2408 			 * We hit a success or unused slot. Latch the
2409 			 * total number of recent consecutive failures.
2410 			 */
2411 			pi_found_success = _B_TRUE;
2412 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2413 				/*
2414 				 * We hit a success for the desired target.
2415 				 * Latch the number of recent consecutive
2416 				 * failures for this target
2417 				 */
2418 				tg_found_success = _B_TRUE;
2419 			}
2420 		}
2421 	}
2422 }
2423 
2424 /*
2425  * Check if the phyint has been repaired.  If no test address has been
2426  * configured, then consider the interface repaired if the link is up (unless
2427  * the link is flapping; see below).  Otherwise, look for proof of probes
2428  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2429  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2430  */
2431 static boolean_t
2432 phyint_repaired(struct phyint *pi)
2433 {
2434 	struct	probe_success_count psinfo;
2435 	struct	phyint_instance *pii;
2436 	struct	target *cur_tg;
2437 	int	pr_ndx;
2438 	uint_t	cur_time;
2439 
2440 	if (debug & D_FAILOVER)
2441 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2442 
2443 	if (LINK_DOWN(pi))
2444 		return (_B_FALSE);
2445 
2446 	/*
2447 	 * If we don't have any test addresses and the link is up, then
2448 	 * consider the interface repaired, unless we've received more than
2449 	 * LINK_UP_PERMIN link up notifications in the last minute, in
2450 	 * which case we keep the link down until we drop back below
2451 	 * the threshold.
2452 	 */
2453 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2454 		cur_time = getcurrenttime();
2455 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2456 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2457 			pi->pi_lfmsg_printed = 0;
2458 			return (_B_TRUE);
2459 		}
2460 		if (!pi->pi_lfmsg_printed) {
2461 			logerr("The link has come up on %s more than %d times "
2462 			    "in the last minute; disabling failback until it "
2463 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2464 			pi->pi_lfmsg_printed = 1;
2465 		}
2466 
2467 		return (_B_FALSE);
2468 	}
2469 
2470 	pii = pi->pi_v4;
2471 	if (PROBE_CAPABLE(pii)) {
2472 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2473 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2474 		probe_success_info(pii, cur_tg, &psinfo);
2475 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2476 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2477 			return (_B_TRUE);
2478 	}
2479 
2480 	pii = pi->pi_v6;
2481 	if (PROBE_CAPABLE(pii)) {
2482 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2483 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2484 		probe_success_info(pii, cur_tg, &psinfo);
2485 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2486 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2487 			return (_B_TRUE);
2488 	}
2489 
2490 	return (_B_FALSE);
2491 }
2492 
2493 /*
2494  * Try failover from phyint 'pi' to a suitable destination.
2495  */
2496 int
2497 try_failover(struct phyint *pi, int failover_type)
2498 {
2499 	struct phyint *dst;
2500 	int err;
2501 
2502 	if (debug & D_FAILOVER)
2503 		logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
2504 
2505 	/*
2506 	 * Attempt to find a failover destination 'dst'.
2507 	 * dst will be null if any of the following is true
2508 	 * Phyint is not part of a group  OR
2509 	 * Phyint is the only member of a group OR
2510 	 * No suitable failover dst was available
2511 	 */
2512 	dst = get_failover_dst(pi, failover_type);
2513 	if (dst == NULL)
2514 		return (IPMP_EMINRED);
2515 
2516 	dst->pi_empty = 0;			/* Per state diagram */
2517 	pi->pi_full = 0;			/* Per state diagram */
2518 
2519 	err = failover(pi, dst);
2520 
2521 	if (debug & D_FAILOVER) {
2522 		logdebug("failed over from %s to %s ret %d\n",
2523 		    pi->pi_name, dst->pi_name, err);
2524 	}
2525 	if (err == 0) {
2526 		pi->pi_empty = 1;		/* Per state diagram */
2527 		/*
2528 		 * we don't want to print out this message if a
2529 		 * phyint is leaving the group, nor for failover from
2530 		 * standby
2531 		 */
2532 		if (failover_type == FAILOVER_NORMAL) {
2533 			logerr("Successfully failed over from NIC %s to NIC "
2534 			    "%s\n", pi->pi_name, dst->pi_name);
2535 		}
2536 		return (0);
2537 	} else {
2538 		/*
2539 		 * The failover did not succeed. We must retry the failover
2540 		 * only after resyncing our state based on the kernel's.
2541 		 * For eg. either the src or the dst might have been unplumbed
2542 		 * causing this failure. initifs() will be called again,
2543 		 * from main, since full_scan_required has been set to true
2544 		 * by failover();
2545 		 */
2546 		return (IPMP_FAILURE);
2547 	}
2548 }
2549 
/*
 * global_errno captures the errno value of the failed ioctl, if failover()
 * or failback() fails; both store errno here before logging the error.
 * This is sent to if_mpadm(1M).
 */
int global_errno;
2555 
2556 /*
2557  * Attempt failover from phyint 'from' to phyint 'to'.
2558  * IP moves everything from phyint 'from' to phyint 'to'.
2559  */
2560 static int
2561 failover(struct phyint *from, struct phyint *to)
2562 {
2563 	struct	lifreq	lifr;
2564 	int 	ret;
2565 
2566 	if (debug & D_FAILOVER) {
2567 		logdebug("failing over from %s to %s\n",
2568 		    from->pi_name, to->pi_name);
2569 	}
2570 
2571 	/*
2572 	 * Perform the failover. Both IPv4 and IPv6 are failed over
2573 	 * using a single ioctl by passing in AF_UNSPEC family.
2574 	 */
2575 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2576 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2577 	lifr.lifr_movetoindex = to->pi_ifindex;
2578 
2579 	ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
2580 	if (ret < 0) {
2581 		global_errno = errno;
2582 		logperror("failover: ioctl (failover)");
2583 	}
2584 
2585 	/*
2586 	 * Set full_scan_required to true. This will make us read
2587 	 * the state from the kernel in initifs() and update our tables,
2588 	 * to reflect the current state after the failover. If the
2589 	 * failover has failed it will then reissue the failover.
2590 	 */
2591 	full_scan_required = _B_TRUE;
2592 	return (ret);
2593 }
2594 
2595 /*
2596  * phyint 'pi' has recovered. Attempt failback from every phyint in the same
2597  * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
2598  * Return values:
2599  * IPMP_SUCCESS:		Failback successful from each of the other
2600  *				phyints in the group.
2601  * IPMP_EFBPARTIAL: 		Failback successful from some of the other
2602  *				phyints in the group.
2603  * IPMP_FAILURE:		Failback syscall failed with some error.
2604  *
2605  * Note that failback is attempted regardless of the setting of the
2606  * failback_enabled flag.
2607  */
2608 int
2609 do_failback(struct phyint *pi, boolean_t check_only)
2610 {
2611 	struct  phyint *from;
2612 	boolean_t done;
2613 	boolean_t partial;
2614 	boolean_t attempted_failback = _B_FALSE;
2615 
2616 	if (debug & D_FAILOVER)
2617 		logdebug("do_failback(%s)\n", pi->pi_name);
2618 
2619 	/* If this phyint is not part of a named group, return. */
2620 	if (pi->pi_group == phyint_anongroup) {
2621 		pi->pi_full = 1;
2622 		return (IPMP_SUCCESS);
2623 	}
2624 
2625 	/*
2626 	 * Attempt failback from every phyint in the group to 'pi'.
2627 	 * The reason for doing this, instead of only from the
2628 	 * phyint to which we did the failover is given below.
2629 	 *
2630 	 * After 'pi' failed, if any app. tries to join on a multicast
2631 	 * address (IPv6), on the failed phyint, IP picks any arbitrary
2632 	 * non-failed phyint in the group, instead of the failed phyint,
2633 	 * in.mpathd is not aware of this. Thus failing back only from the
2634 	 * interface to which 'pi' failed over, will failback the ipif's
2635 	 * but not the ilm's. So we need to failback from all members of
2636 	 * the phyint group
2637 	 */
2638 	done = _B_TRUE;
2639 	partial = _B_FALSE;
2640 	for (from = pi->pi_group->pg_phyint; from != NULL;
2641 	    from = from->pi_pgnext) {
2642 		/* Exclude ourself as a failback src */
2643 		if (from == pi)
2644 			continue;
2645 
2646 		/*
2647 		 * If the 'from' phyint has IPv4 plumbed, the 'to'
2648 		 * phyint must also have IPv4 plumbed. Similar check
2649 		 * for IPv6. IP makes the same check. Otherwise the
2650 		 * failback will fail.
2651 		 */
2652 		if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
2653 		    (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
2654 			partial = _B_TRUE;
2655 			continue;
2656 		}
2657 
2658 		if (!check_only) {
2659 			pi->pi_empty = 0;	/* Per state diagram */
2660 			attempted_failback = _B_TRUE;
2661 			if (failback(from, pi) != 0) {
2662 				done = _B_FALSE;
2663 				break;
2664 			}
2665 		}
2666 	}
2667 
2668 	if (check_only) {
2669 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2670 	}
2671 
2672 	/*
2673 	 * We are done. No more phyint from which we can src the failback
2674 	 */
2675 	if (done) {
2676 		if (!partial)
2677 			pi->pi_full = 1;	/* Per state diagram */
2678 		/*
2679 		 * Don't print out a message unless there is a
2680 		 * transition from FAILED to RUNNING. For eg.
2681 		 * we don't want to print out this message if a
2682 		 * phyint is leaving the group, or at startup
2683 		 */
2684 		if (attempted_failback && (pi->pi_flags &
2685 		    (IFF_FAILED | IFF_OFFLINE))) {
2686 			logerr("Successfully failed back to NIC %s\n",
2687 			    pi->pi_name);
2688 		}
2689 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2690 	}
2691 
2692 	return (IPMP_FAILURE);
2693 }
2694 
2695 /*
2696  * This function is similar to do_failback() above, but respects the
2697  * failback_enabled flag for phyints in named groups.
2698  */
2699 int
2700 try_failback(struct phyint *pi, boolean_t check_only)
2701 {
2702 	if (debug & D_FAILOVER)
2703 		logdebug("try_failback(%s)\n", pi->pi_name);
2704 
2705 	if (pi->pi_group != phyint_anongroup && !failback_enabled)
2706 		return (IPMP_EFBDISABLED);
2707 
2708 	return (do_failback(pi, check_only));
2709 }
2710 
2711 /*
2712  * Failback everything from phyint 'from' that has the same ifindex
2713  * as phyint to's ifindex.
2714  */
2715 static int
2716 failback(struct phyint *from, struct phyint *to)
2717 {
2718 	struct lifreq lifr;
2719 	int ret;
2720 
2721 	if (debug & D_FAILOVER)
2722 		logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
2723 
2724 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2725 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2726 	lifr.lifr_movetoindex = to->pi_ifindex;
2727 
2728 	ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
2729 	if (ret < 0) {
2730 		global_errno = errno;
2731 		logperror("failback: ioctl (failback)");
2732 	}
2733 
2734 	/*
2735 	 * Set full_scan_required to true. This will make us read
2736 	 * the state from the kernel in initifs() and update our tables,
2737 	 * to reflect the current state after the failback. If the
2738 	 * failback has failed it will then reissue the failback.
2739 	 */
2740 	full_scan_required = _B_TRUE;
2741 
2742 	return (ret);
2743 }
2744 
2745 /*
2746  * Select a target phyint for failing over from 'pi'.
2747  * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
2748  * target phyint is chosen as follows,
2749  *	1. Pick any inactive standby interface.
2750  *	2. If no inactive standby is available, select any phyint in the
2751  *	   same group that has the least number of logints, (excluding
2752  *	   IFF_NOFAILOVER and !IFF_UP logints)
2753  * If we are failing over from a standby, failover_type is
2754  * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
2755  * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
2756  * and we won't return NULL, as long as there is at least 1 other phyint
2757  * in the group.
2758  */
2759 static struct phyint *
2760 get_failover_dst(struct phyint *pi, int failover_type)
2761 {
2762 	struct phyint	*maybe = NULL;
2763 	struct phyint	*pi2;
2764 	struct phyint 	*last_choice = NULL;
2765 
2766 	if (pi->pi_group == phyint_anongroup)
2767 		return (NULL);
2768 
2769 	/*
2770 	 * Loop thru the phyints in the group, and pick the preferred
2771 	 * phyint for the target.
2772 	 */
2773 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2774 		/* Exclude ourself and offlined interfaces */
2775 		if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
2776 			continue;
2777 
2778 		/*
2779 		 * The chosen target phyint must have IPv4 instance
2780 		 * plumbed, if the src phyint has IPv4 plumbed. Similarly
2781 		 * for IPv6.
2782 		 */
2783 		if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
2784 		    (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
2785 			continue;
2786 
2787 		/* The chosen target must be PI_RUNNING. */
2788 		if (pi2->pi_state != PI_RUNNING) {
2789 			last_choice = pi2;
2790 			continue;
2791 		}
2792 
2793 		if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) &&
2794 		    (failover_type != FAILOVER_TO_NONSTANDBY)) {
2795 			return (pi2);
2796 		} else {
2797 			if (maybe == NULL)
2798 				maybe = pi2;
2799 			else if (logint_upcount(pi2) < logint_upcount(maybe))
2800 				maybe = pi2;
2801 		}
2802 	}
2803 	if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
2804 		return (last_choice);
2805 	else
2806 		return (maybe);
2807 }
2808 
2809 /*
2810  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2811  */
2812 boolean_t
2813 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
2814 {
2815 	int ifsock;
2816 	struct lifreq lifr;
2817 
2818 	if (debug & D_FAILOVER) {
2819 		logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
2820 		    pi->pi_name, flags, (int)setfl);
2821 	}
2822 
2823 	if (pi->pi_v4 != NULL) {
2824 		ifsock = ifsock_v4;
2825 	} else  {
2826 		ifsock = ifsock_v6;
2827 	}
2828 
2829 	/*
2830 	 * Get the current flags from the kernel, and set/clear the
2831 	 * desired phyint flags. Since we set only phyint flags, we can
2832 	 * do it on either IPv4 or IPv6 instance.
2833 	 */
2834 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2835 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
2836 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2837 		if (errno != ENXIO)
2838 			logperror("change_lif_flags: ioctl (get flags)");
2839 		return (_B_FALSE);
2840 	}
2841 	if (setfl)
2842 		lifr.lifr_flags |= flags;
2843 	else
2844 		lifr.lifr_flags &= ~flags;
2845 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2846 		if (errno != ENXIO)
2847 			logperror("change_lif_flags: ioctl (set flags)");
2848 		return (_B_FALSE);
2849 	}
2850 
2851 	/*
2852 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2853 	 * phyint flags.
2854 	 */
2855 	if (setfl)
2856 		pi->pi_flags |= flags;
2857 	else
2858 		pi->pi_flags &= ~flags;
2859 
2860 	if (pi->pi_v4)
2861 		pi->pi_v4->pii_flags = pi->pi_flags;
2862 
2863 	if (pi->pi_v6)
2864 		pi->pi_v6->pii_flags = pi->pi_flags;
2865 
2866 	return (_B_TRUE);
2867 }
2868 
/*
 * icmp cksum computation for IPv4.
 *
 * Sums the 'len' bytes starting at 'addr' as 16 bit words (read in host
 * memory order), folds the carries back into the low 16 bits, and returns
 * the 16 bit one's complement of the result. If 'len' is odd, the final
 * byte is summed as a word whose second byte in memory is zero.
 *
 * The accumulator is unsigned so that large sums wrap in a well defined
 * way instead of overflowing a signed int.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	int		nleft = len;
	ushort_t	*w = addr;
	ushort_t	odd_byte = 0;
	ushort_t	answer;
	uint32_t	sum = 0;

	/*
	 *  Our algorithm is simple, using a 32 bit accumulator (sum),
	 *  we add sequential 16 bit words to it, and at the end, fold
	 *  back all the carry bits from the top 16 bits into the lower
	 *  16 bits.
	 */
	while (nleft > 1) {
		sum += *w++;
		nleft -= 2;
	}

	/* mop up an odd byte, if necessary */
	if (nleft == 1) {
		*(uchar_t *)(&odd_byte) = *(uchar_t *)w;
		sum += odd_byte;
	}

	/*
	 * add back carry outs from top 16 bits to low 16 bits
	 */
	sum = (sum >> 16) + (sum & 0xffff);	/* add hi 16 to low 16 */
	sum += (sum >> 16);			/* add carry */
	answer = (ushort_t)~sum;		/* truncate to 16 bits */
	return (answer);
}
2906 
2907 static void
2908 reset_snxt_basetimes(void)
2909 {
2910 	struct phyint_instance *pii;
2911 
2912 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2913 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2914 	}
2915 }
2916 
2917 /*
2918  * Is the address one of our own addresses? Unfortunately,
2919  * we cannot check our phyint tables to determine if the address
2920  * is our own. This is because, we don't track interfaces that
2921  * are not part of any group. We have to either use a 'bind' or
2922  * get the complete list of all interfaces using SIOCGLIFCONF,
2923  * to do this check. We could also use SIOCTMYADDR.
2924  * Bind fails for the local zone address, so we might include local zone
2925  * address as target address. If local zone address is a target address
2926  * and it is up, it is not possible to detect the interface failure.
2927  * SIOCTMYADDR also doesn't consider local zone address as own address.
2928  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2929  * are stored in laddr_list.
2930  */
2931 
2932 boolean_t
2933 own_address(struct in6_addr addr)
2934 {
2935 	struct local_addr *taddr = laddr_list;
2936 
2937 	for (; taddr != NULL; taddr = taddr->next) {
2938 		if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) {
2939 			return (_B_TRUE);
2940 		}
2941 	}
2942 	return (_B_FALSE);
2943 }
2944