1 /*
2  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Copyright (c) 1987 Regents of the University of California.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms are permitted
11  * provided that the above copyright notice and this paragraph are
12  * duplicated in all such forms and that any documentation,
13  * advertising materials, and other materials related to such
14  * distribution and use acknowledge that the software was developed
15  * by the University of California, Berkeley. The name of the
16  * University may not be used to endorse or promote products derived
17  * from this software without specific prior written permission.
18  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21  */
22 
23 #pragma ident	"%Z%%M%	%I%	%E% SMI"
24 
25 #include "mpd_defs.h"
26 #include "mpd_tables.h"
27 
/*
 * Message types understood by probe().  The chosen value is written into
 * the pr_icmp_mtype field of each outgoing probe and is used on receipt to
 * decide which reply handler gets the echo.
 */
#define	PROBE_UNI	0x1234	/* unicast probe to a known target */
#define	PROBE_MULTI	0x5678	/* multicast probe */
#define	PROBE_RTT	0x9ABC	/* probe that only measures the rtt */

/* Number of milliseconds in one minute */
#define	MSEC_PERMIN	(60 * MILLISEC)
36 
/*
 * On-the-wire layout of a probe and of its reply, i.e. of an ICMP Echo
 * request or ICMP Echo reply.  The same layout is used for IPv4 and IPv6.
 * The id, sequence number, timestamp and message type are placed in the
 * packet in network byte order by probe().
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* ICMP type */
	uint8_t		pr_icmp_code;		/* ICMP code */
	uint16_t	pr_icmp_cksum;		/* ICMP checksum */
	uint16_t	pr_icmp_id;		/* identifier of the sender */
	uint16_t	pr_icmp_seq;		/* probe sequence number */
	uint32_t	pr_icmp_timestamp;	/* time the probe was sent */
	uint32_t	pr_icmp_mtype;		/* one of the PROBE_* types */
};
51 
52 static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
53 				    0x0, 0x0, 0x0, 0x0,
54 				    0x0, 0x0, 0x0, 0x0,
55 				    0x0, 0x0, 0x0, 0x1 } };
56 
57 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
58 
59 static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
60 
61 static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
62 static void		pi_set_crtt(struct target *tg, int m,
63     boolean_t is_probe_uni);
64 static void		incoming_echo_reply(struct phyint_instance *pii,
65     struct pr_icmp *reply, struct in6_addr fromaddr);
66 static void		incoming_rtt_reply(struct phyint_instance *pii,
67     struct pr_icmp *reply, struct in6_addr fromaddr);
68 static void		incoming_mcast_reply(struct phyint_instance *pii,
69     struct pr_icmp *reply, struct in6_addr fromaddr);
70 
71 static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
72 static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
73 static boolean_t	check_exception_target(struct phyint_instance *pii,
74     struct target *target);
75 static void		probe_fail_info(struct phyint_instance *pii,
76     struct target *cur_tg, struct probe_fail_count *pfinfo);
77 static void		probe_success_info(struct phyint_instance *pii,
78     struct target *cur_tg, struct probe_success_count *psinfo);
79 static boolean_t	phyint_repaired(struct phyint *pi);
80 
81 static int		failover(struct phyint *from, struct phyint *to);
82 static int		failback(struct phyint *from, struct phyint *to);
83 static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);
84 
85 static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
86 static int 		in_cksum(ushort_t *addr, int len);
87 static void		reset_snxt_basetimes(void);
88 
89 /*
90  * CRTT - Conservative Round Trip Time Estimate
91  * Probe success - A matching probe reply received before CRTT ms has elapsed
92  *	after sending the probe.
93  * Probe failure - No probe reply received and more than CRTT ms has elapsed
94  *	after sending the probe.
95  *
96  * TLS - Time last success. Most recent probe ack received at this time.
97  * TFF - Time first fail. The time of the earliest probe failure in
98  *	a consecutive series of probe failures.
99  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
100  * 	before declaring phyint repair.
101  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
102  *	declare a phyint failure.
103  *
104  * 			Phyint state diagram
105  *
106  * The state of a phyint that is capable of being probed, is completely
107  * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
108  *
109  * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
110  * of the link (according to the driver).  If the phyint is also configured
111  * with a test address (the common case) and probe targets, then a phyint must
112  * also successfully be able to send and receive probes in order to remain in
113  * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
114  *
115  * Further, if a PI_RUNNING phyint is configured with a test address but is
116  * unable to find any probe targets, it will transition to the PI_NOTARGETS
117  * state, which indicates that the link is apparently functional but that
118  * in.mpathd is unable to send probes to verify functionality (in this case,
119  * in.mpathd makes the optimistic assumption that the interface is working
120  * correctly and thus does not perform a failover, but reports the interface
121  * as IPMP_IF_UNKNOWN through the async events and query interfaces).
122  *
123  * At any point, a phyint may be administratively marked offline via if_mpadm.
124  * In this case, the interface always transitions to PI_OFFLINE, regardless
125  * of its previous state.  When the interface is later brought back online,
126  * in.mpathd acts as if the interface is new (and thus it transitions to
127  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
128  * its probes, if probes are sent).
129  *
130  * pi_state -  PI_RUNNING or PI_FAILED
131  *	PI_RUNNING: The failure detection logic says the phyint is good.
132  *	PI_FAILED: The failure detection logic says the phyint has failed.
133  *
134  * pg_groupfailed  - Group failure, all interfaces in the group have failed.
135  *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
136  *	In the case of router targets, we assume that the current list of
137  *	targets obtained from the routing table, is still valid, so the
138  *	phyint state is PI_FAILED. In the case of host targets, we delete the
139  *	list of targets, and multicast to the all hosts, to reconstruct the
140  *	target list. So the phyints are in the PI_NOTARGETS state.
141  *
142  * I -	value of (pi_flags & IFF_INACTIVE)
143  *	IFF_INACTIVE: No failovers have been done to the standby, from
144  *		other phyints. This phyint is an inactive standby.
145  *
146  * pi_empty
147  *	This phyint has failed over successfully to another phyint, and
148  *	this phyint is currently "empty". It does not host any addresses or
149  *	multicast membership etc. This is the state of a phyint after a
150  *	failover from the phyint has completed successfully and no subsequent
151  *	'failover to' or 'failback to' has occurred on the phyint.
152  *	IP guarantees that no new logicals will be hosted nor any multicast
153  *	joins permitted on the phyint, since the phyint is either failed or
154  *	inactive. That is, pi_empty being set implies that the phyint is
155  *	either failed or inactive.
156  *
157  * pi_full
158  *	The phyint hosts all of its own addresses that it "owns". If the
159  *	phyint was previously failed or inactive, failbacks to the phyint
160  *	have completed successfully, i.e. no more failbacks to this phyint
161  *	can produce any change in system state whatsoever.
162  *
163  * Not all 32 possible combinations of the above 5-tuple are possible.
164  * Furthermore some of the above combinations are transient. They may occur
165  * only because the failover or failback did not complete successfully. The
166  * failover/failback will be retried and eventually a stable state will be
167  * reached.
168  *
169  * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
170  * The following are the state machines. 'from' and 'to' are the src and
171  * dst of the failover/failback, below
172  *
173  *			pi_empty state machine
174  * ---------------------------------------------------------------------------
175  *	Event				State	->	New State
176  * ---------------------------------------------------------------------------
177  *	successful completion 		from.pi_empty = 0 -> from.pi_empty = 1
178  *	of failover
179  *
180  *	Initiate failover 		to.pi_empty = X   -> to.pi_empty = 0
181  *
182  * 	Initiate failback 		to.pi_empty = X   -> to.pi_empty = 0
183  *
184  * 	group failure			pi_empty = X	  -> pi_empty = 0
185  * ---------------------------------------------------------------------------
186  *
187  *			pi_full state machine
188  * ---------------------------------------------------------------------------
189  *	Event				State		  -> New State
190  * ---------------------------------------------------------------------------
191  *	successful completion		to.pi_full = 0    -> to.pi_full = 1
192  *	of failback from
193  *	each of the other phyints
194  *
195  *	Initiate failover 		from.pi_full = X  -> from.pi_full = 0
196  *
197  *	group failure			pi_full = X	  -> pi_full = 0
198  * ---------------------------------------------------------------------------
199  *
200  *			pi_state state machine
201  * ---------------------------------------------------------------------------
202  *	Event			State			New State
203  *				Action:
204  * ---------------------------------------------------------------------------
205  *	NIC failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
206  *	detection		: set IFF_FAILED on this phyint
207  *				: failover from this phyint to another
208  *
209  *	NIC failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 1)
210  *	detection		: set IFF_FAILED on this phyint
211  *
212  *	NIC repair 		(PI_FAILED, I == 0)  ->	(PI_RUNNING, I == 0)
213  *	detection		: to.pi_empty = 0
214  *				: failback to this phyint if enabled
215  *				: clear IFF_FAILED on this phyint
216  *
217  *	NIC repair 		(PI_FAILED, I == 1)  ->	(PI_RUNNING, I == 1)
218  *	detection		: clear IFF_FAILED on this phyint
219  *
220  *	Group failure		(perform on all phyints in the group)
221  *	detection 		PI_RUNNING		PI_FAILED
222  *	(Router targets)	: set IFF_FAILED
223  *				: clear pi_empty and pi_full
224  *
225  *	Group failure		(perform on all phyints in the group)
226  *	detection 		PI_RUNNING		PI_NOTARGETS
227  *	(Host targets)		: set IFF_FAILED
228  *				: clear pi_empty and pi_full
229  *				: delete the target list on all phyints
230  * ---------------------------------------------------------------------------
231  *
232  *			I state machine
233  * ---------------------------------------------------------------------------
234  *	Event		State			Action:
235  * ---------------------------------------------------------------------------
236  *	Turn on I 	pi_empty == 0 		: failover from standby
237  *
238  *	Turn off I 	PI_RUNNING,		: pi_empty = 0
239  *			pi_full == 0		: failback to this if enabled
240  * ---------------------------------------------------------------------------
241  *
242  * Assertions: (Read '==>' as implies)
243  *
244  * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
245  * (pi_empty == 1) ==> (pi_full == 0)
246  * (pi_full  == 1) ==> (pi_empty == 0)
247  *
248  * Invariants
249  *
250  * pg_groupfailed = 0  &&
251  *   1. (I == 1, pi_empty == 0)		 ==> initiate failover from standby
252  *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
253  *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
254  *
255  * 1. says that an inactive standby, that is not empty, has to be failed
256  * over. For a standby to be truly inactive, it should not host any
257  * addresses. So we move them to some other phyint. Usually we catch the
258  * turn on of IFF_INACTIVE, and perform this action. However if the failover
259  * did not complete successfully, then subsequently we have lost the edge
260  * trigger, and this invariant kicks in and completes the action.
261  *
262  * 2. says that any failed phyint that is not empty must be failed over.
263  * Usually we do the failover when we detect NIC failure. However if the
264  * failover does not complete successfully, this invariant kicks in and
265  * completes the failover. We exclude inactive standby which is covered by 1.
266  *
267  * 3. says that any running phyint that is not full must be failed back.
268  * Usually we do the failback when we detect NIC repair. However if the
269  * failback does not complete successfully, this invariant kicks in and
270  * completes the failback. Note that we don't want to failback to an inactive
271  * standby.
272  *
273  * The invariants 1 - 3 and the actions are in initifs().
274  */
275 
/*
 * NOTE(review): struct probes_missed is declared outside the visible part of
 * this file and nothing shown here reads or writes this object.  Presumably
 * it records probes that could not be sent on schedule -- confirm against
 * its declaration and its users.
 */
276 struct probes_missed probes_missed;
277 
278 /*
279  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
280  * will be added on by the kernel.  The id field identifies this phyint.
281  * and the sequence number is an increasing (modulo 2^^16) integer. The data
282  * portion holds the time value when the packet is sent. On echo this is
283  * extracted to compute the round-trip time. Three different types of
284  * probe packets are used.
285  *
286  * PROBE_UNI: This type is used to do failure detection / failure recovery
287  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
288  *	not less than the current CRTT. pii_probes[] stores data
289  *	about these probes. These packets consume sequence number space.
290  *
291  * PROBE_RTT: This type is used to make only rtt measurments. Normally these
292  * 	are not used. Under heavy network load, the rtt may go up very high,
293  *	due to a spike, or may appear to go high, due to extreme scheduling
294  * 	delays. Once the network stress is removed, mpathd takes long time to
295  *	recover, because the probe_interval is already high, and it takes
296  *	a long time to send out sufficient number of probes to bring down the
297  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
298  *	user_probe_interval ms. and will cause only rtt updates. These packets
299  *	do not consume sequence number space nor is information about these
300  *	packets stored in the pii_probes[]
301  *
302  * PROBE_MULTI: This type is only used to construct a list of targets, when
303  *	no targets are known. The packet is multicast to the all hosts addr.
304  */
305 static void
306 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
307 {
308 	struct pr_icmp probe_pkt;	/* Probe packet */
309 	struct sockaddr_in6 whereto6; 	/* target address IPv6 */
310 	struct sockaddr_in whereto; 	/* target address IPv4 */
311 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
312 	boolean_t sent = _B_TRUE;
313 
314 	if (debug & D_TARGET) {
315 		logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
316 		    pii->pii_name, probe_type, cur_time);
317 	}
318 
319 	assert(pii->pii_probe_sock != -1);
320 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
321 	    probe_type == PROBE_RTT);
322 
323 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
324 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
325 	probe_pkt.pr_icmp_code = 0;
326 	probe_pkt.pr_icmp_cksum = 0;
327 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
328 
329 	/*
330 	 * Since there is no need to do arithmetic on the icmpid,
331 	 * (only equality check is done) pii_icmpid is stored in
332 	 * network byte order at initialization itself.
333 	 */
334 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
335 	probe_pkt.pr_icmp_timestamp = htonl(cur_time);
336 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
337 
338 	/*
339 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
340 	 * the all hosts address. Otherwise it is unicast to the next target.
341 	 */
342 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
343 	    pii->pii_rtt_target_next != NULL));
344 
345 	if (pii->pii_af == AF_INET6) {
346 		bzero(&whereto6, sizeof (whereto6));
347 		whereto6.sin6_family = AF_INET6;
348 		if (probe_type == PROBE_MULTI) {
349 			whereto6.sin6_addr = all_nodes_mcast_v6;
350 		} else if (probe_type == PROBE_UNI) {
351 			whereto6.sin6_addr = pii->pii_target_next->tg_address;
352 		} else  {
353 			/* type is PROBE_RTT */
354 			whereto6.sin6_addr =
355 			    pii->pii_rtt_target_next->tg_address;
356 		}
357 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
358 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
359 		    sizeof (whereto6)) != sizeof (probe_pkt)) {
360 			logperror_pii(pii, "probe: probe sendto");
361 			sent = _B_FALSE;
362 		}
363 	} else {
364 		bzero(&whereto, sizeof (whereto));
365 		whereto.sin_family = AF_INET;
366 		if (probe_type == PROBE_MULTI) {
367 			whereto.sin_addr = all_nodes_mcast_v4;
368 		} else if (probe_type == PROBE_UNI) {
369 			IN6_V4MAPPED_TO_INADDR(
370 			    &pii->pii_target_next->tg_address,
371 			    &whereto.sin_addr);
372 		} else {
373 			/* type is PROBE_RTT */
374 			IN6_V4MAPPED_TO_INADDR(
375 			    &pii->pii_rtt_target_next->tg_address,
376 			    &whereto.sin_addr);
377 		}
378 
379 		/*
380 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
381 		 */
382 		probe_pkt.pr_icmp_cksum =
383 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
384 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
385 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
386 		    sizeof (whereto)) != sizeof (probe_pkt)) {
387 			logperror_pii(pii, "probe: probe sendto");
388 			sent = _B_FALSE;
389 		}
390 	}
391 
392 	/*
393 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
394 	 * update our tables. We will need this info in processing the probe
395 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
396 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
397 	 * are only used to construct a list of targets. PROBE_RTT packets are
398 	 * used only for updating the rtt and not for failure detection.
399 	 */
400 	if (probe_type == PROBE_UNI && sent) {
401 		pr_ndx = pii->pii_probe_next;
402 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
403 
404 		/* Collect statistics, before we reuse the last slot. */
405 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
406 			pii->pii_cum_stats.lost++;
407 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
408 			pii->pii_cum_stats.acked++;
409 		pii->pii_cum_stats.sent++;
410 
411 		pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
412 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
413 		pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
414 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
415 		pii->pii_target_next = target_next(pii->pii_target_next);
416 		assert(pii->pii_target_next != NULL);
417 		/*
418 		 * If we have a single variable to denote the next target to
419 		 * probe for both rtt probes and failure detection probes, we
420 		 * could end up with a situation where the failure detection
421 		 * probe targets become disjoint from the rtt probe targets.
422 		 * Eg. if 2 targets and the actual fdt is double the user
423 		 * specified fdt. So we have 2 variables. In this scheme
424 		 * we also reset pii_rtt_target_next for every fdt probe,
425 		 * though that may not be necessary.
426 		 */
427 		pii->pii_rtt_target_next = pii->pii_target_next;
428 		pii->pii_snxt++;
429 	} else if (probe_type == PROBE_RTT) {
430 		pii->pii_rtt_target_next =
431 		    target_next(pii->pii_rtt_target_next);
432 		assert(pii->pii_rtt_target_next != NULL);
433 	}
434 }
435 
436 /*
437  * Incoming IPv4 data from wire, is received here. Called from main.
438  */
439 void
440 in_data(struct phyint_instance *pii)
441 {
442 	struct	sockaddr_in 	from;
443 	struct	in6_addr	fromaddr;
444 	uint_t	fromlen;
445 	static uint_t in_packet[(IP_MAXPACKET + 1)/4];
446 	struct ip *ip;
447 	int 	iphlen;
448 	int 	len;
449 	char 	abuf[INET_ADDRSTRLEN];
450 	struct	pr_icmp	*reply;
451 
452 	if (debug & D_PROBE) {
453 		logdebug("in_data(%s %s)\n",
454 		    AF_STR(pii->pii_af), pii->pii_name);
455 	}
456 
457 	/*
458 	 * Poll has already told us that a message is waiting,
459 	 * on this socket. Read it now. We should not block.
460 	 */
461 	fromlen = sizeof (from);
462 	len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
463 	    sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
464 	if (len < 0) {
465 		logperror_pii(pii, "in_data: recvfrom");
466 		return;
467 	}
468 
469 	/*
470 	 * If the NIC has indicated the link is down, don't go
471 	 * any further.
472 	 */
473 	if (LINK_DOWN(pii->pii_phyint))
474 		return;
475 
476 	/* Get the printable address for error reporting */
477 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
478 
479 	/* Make sure packet contains at least minimum ICMP header */
480 	ip = (struct ip *)in_packet;
481 	iphlen = ip->ip_hl << 2;
482 	if (len < iphlen + ICMP_MINLEN) {
483 		if (debug & D_PKTBAD) {
484 			logdebug("in_data: packet too short (%d bytes)"
485 			    " from %s\n", len, abuf);
486 		}
487 		return;
488 	}
489 
490 	/*
491 	 * Subtract the IP hdr length, 'len' will be length of the probe
492 	 * reply, starting from the icmp hdr.
493 	 */
494 	len -= iphlen;
495 	/* LINTED */
496 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
497 
498 	/* Probe replies are icmp echo replies. Ignore anything else */
499 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
500 		return;
501 
502 	/*
503 	 * The icmp id should match what we sent, which is stored
504 	 * in pi_icmpid. The icmp code for reply must be 0.
505 	 * The reply content must be a struct pr_icmp
506 	 */
507 	if (reply->pr_icmp_id != pii->pii_icmpid) {
508 		/* Not in response to our probe */
509 		return;
510 	}
511 
512 	if (reply->pr_icmp_code != 0) {
513 		logtrace("probe reply code %d from %s on %s\n",
514 		    reply->pr_icmp_code, abuf, pii->pii_name);
515 		return;
516 	}
517 
518 	if (len < sizeof (struct pr_icmp)) {
519 		logtrace("probe reply too short: %d bytes from %s on %s\n",
520 		    len, abuf, pii->pii_name);
521 		return;
522 	}
523 
524 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
525 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
526 		/* Unicast probe reply */
527 		incoming_echo_reply(pii, reply, fromaddr);
528 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
529 		/* Multicast reply */
530 		incoming_mcast_reply(pii, reply, fromaddr);
531 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
532 		incoming_rtt_reply(pii, reply, fromaddr);
533 	} else {
534 		/* Probably not in response to our probe */
535 		logtrace("probe reply type: %d from %s on %s\n",
536 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
537 		return;
538 	}
539 
540 }
541 
542 /*
543  * Incoming IPv6 data from wire is received here. Called from main.
544  */
545 void
546 in6_data(struct phyint_instance *pii)
547 {
548 	struct sockaddr_in6 from;
549 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
550 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
551 	int len;
552 	char abuf[INET6_ADDRSTRLEN];
553 	struct msghdr msg;
554 	struct iovec iov;
555 	uchar_t *opt;
556 	struct	pr_icmp *reply;
557 
558 	if (debug & D_PROBE) {
559 		logdebug("in6_data(%s %s)\n",
560 		    AF_STR(pii->pii_af), pii->pii_name);
561 	}
562 
563 	iov.iov_base = (char *)in_packet;
564 	iov.iov_len = sizeof (in_packet);
565 	msg.msg_iov = &iov;
566 	msg.msg_iovlen = 1;
567 	msg.msg_name = (struct sockaddr *)&from;
568 	msg.msg_namelen = sizeof (from);
569 	msg.msg_control = ancillary_data;
570 	msg.msg_controllen = sizeof (ancillary_data);
571 
572 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
573 		logperror_pii(pii, "in6_data: recvfrom");
574 		return;
575 	}
576 
577 	/*
578 	 * If the NIC has indicated that the link is down, don't go
579 	 * any further.
580 	 */
581 	if (LINK_DOWN(pii->pii_phyint))
582 		return;
583 
584 	/* Get the printable address for error reporting */
585 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
586 	if (len < ICMP_MINLEN) {
587 		if (debug & D_PKTBAD) {
588 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
589 			    msg.msg_flags, abuf);
590 		}
591 		return;
592 	}
593 	/* Ignore packets > 64k or control buffers that don't fit */
594 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
595 		if (debug & D_PKTBAD) {
596 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
597 			    msg.msg_flags, abuf);
598 		}
599 		return;
600 	}
601 
602 	reply = (struct pr_icmp *)in_packet;
603 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
604 		return;
605 
606 	if (reply->pr_icmp_id != pii->pii_icmpid) {
607 		/* Not in response to our probe */
608 		return;
609 	}
610 
611 	/*
612 	 * The kernel has already verified the the ICMP checksum.
613 	 */
614 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
615 		logtrace("ICMPv6 echo reply source address not linklocal from "
616 		    "%s on %s\n", abuf, pii->pii_name);
617 		return;
618 	}
619 	opt = find_ancillary(&msg, IPV6_RTHDR);
620 	if (opt != NULL) {
621 		/* Can't allow routing headers in probe replies  */
622 		logtrace("message with routing header from %s on %s\n",
623 		    abuf, pii->pii_name);
624 		return;
625 	}
626 	if (reply->pr_icmp_code != 0) {
627 		logtrace("probe reply code: %d from %s on %s\n",
628 		    reply->pr_icmp_code, abuf, pii->pii_name);
629 		return;
630 	}
631 	if (len < (sizeof (struct pr_icmp))) {
632 		logtrace("probe reply too short: %d bytes from %s on %s\n",
633 		    len, abuf, pii->pii_name);
634 		return;
635 	}
636 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
637 		incoming_echo_reply(pii, reply, from.sin6_addr);
638 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
639 		incoming_mcast_reply(pii, reply, from.sin6_addr);
640 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
641 		incoming_rtt_reply(pii, reply, from.sin6_addr);
642 	} else  {
643 		/* Probably not in response to our probe */
644 		logtrace("probe reply type: %d from %s on %s\n",
645 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
646 	}
647 }
648 
649 /*
650  * Process the incoming rtt reply, in response to our rtt probe.
651  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
652  * have any stored information about the probe we sent. So we don't log
653  * any errors if we receive bad replies.
654  */
655 static void
656 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
657     struct in6_addr fromaddr)
658 {
659 	int 	m;		/* rtt measurment in ms */
660 	uint32_t cur_time;	/* in ms from some arbitrary point */
661 	char	abuf[INET6_ADDRSTRLEN];
662 	struct	target	*target;
663 	uint32_t pr_icmp_timestamp;
664 	struct 	phyint_group *pg;
665 
666 	/* Get the printable address for error reporting */
667 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
668 
669 	if (debug & D_PROBE) {
670 		logdebug("incoming_rtt_reply: %s %s %s\n",
671 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
672 	}
673 
674 	/* Do we know this target ? */
675 	target = target_lookup(pii, fromaddr);
676 	if (target == NULL)
677 		return;
678 
679 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
680 	cur_time = getcurrenttime();
681 	m = (int)(cur_time - pr_icmp_timestamp);
682 
683 	/* Invalid rtt. It has wrapped around */
684 	if (m < 0)
685 		return;
686 
687 	/*
688 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
689 	 * The initial few responses after the interface is repaired may
690 	 * contain high rtt's because they could have been queued up waiting
691 	 * for ARP/NDP resolution on a failed interface.
692 	 */
693 	pg = pii->pii_phyint->pi_group;
694 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
695 		return;
696 
697 	/*
698 	 * Update rtt only if the new rtt is lower than the current rtt.
699 	 * (specified by the 3rd parameter to pi_set_crtt).
700 	 * If a spike has caused the current probe_interval to be >
701 	 * user_probe_interval, then this mechanism is used to bring down
702 	 * the rtt rapidly once the network stress is removed.
703 	 * If the new rtt is higher than the current rtt, we don't want to
704 	 * update the rtt. We are having more than 1 outstanding probe and
705 	 * the increase in rtt we are seeing is being unnecessarily weighted
706 	 * many times. The regular rtt update will be handled by
707 	 * incoming_echo_reply() and will take care of any rtt increase.
708 	 */
709 	pi_set_crtt(target, m, _B_FALSE);
710 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
711 	    (user_failure_detection_time < pg->pg_fdt) &&
712 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
713 		/*
714 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
715 		 * investigate if we can improve the failure detection time to
716 		 * meet whatever the user specified.
717 		 */
718 		if (check_pg_crtt_improved(pg)) {
719 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
720 			    user_failure_detection_time);
721 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
722 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
723 				logerr("Improved failure detection time %d ms "
724 				    "on (%s %s) for group \"%s\"\n",
725 				    pg->pg_fdt, AF_STR(pii->pii_af),
726 				    pii->pii_name,
727 				    pii->pii_phyint->pi_group->pg_name);
728 			}
729 			if (user_failure_detection_time == pg->pg_fdt) {
730 				/* Avoid any truncation or rounding errors */
731 				pg->pg_probeint = user_probe_interval;
732 				/*
733 				 * No more rtt probes will be sent. The actual
734 				 * fdt has dropped to the user specified value.
735 				 * pii_fd_snxt_basetime and pii_snxt_basetime
736 				 * will be in sync henceforth.
737 				 */
738 				reset_snxt_basetimes();
739 			}
740 		}
741 	}
742 }
743 
744 /*
745  * Process the incoming echo reply, in response to our unicast probe.
746  * Common for both IPv4 and IPv6
747  */
748 static void
749 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
750     struct in6_addr fromaddr)
751 {
752 	int 	m;		/* rtt measurment in ms */
753 	uint32_t cur_time;	/* in ms from some arbitrary point */
754 	char	abuf[INET6_ADDRSTRLEN];
755 	int	pr_ndx;
756 	struct	target	*target;
757 	boolean_t exception;
758 	uint32_t pr_icmp_timestamp;
759 	uint16_t pr_icmp_seq;
760 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
761 
762 	/* Get the printable address for error reporting */
763 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
764 
765 	if (debug & D_PROBE) {
766 		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
767 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
768 		    ntohs(reply->pr_icmp_seq));
769 	}
770 
771 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
772 	pr_icmp_seq  = ntohs(reply->pr_icmp_seq);
773 
774 	/* Reject out of window probe replies */
775 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
776 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
777 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
778 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
779 		pii->pii_cum_stats.unknown++;
780 		return;
781 	}
782 	cur_time = getcurrenttime();
783 	m = (int)(cur_time - pr_icmp_timestamp);
784 	if (m < 0) {
785 		/*
786 		 * This is a ridiculously high value of rtt. rtt has wrapped
787 		 * around. Log a message, and ignore the rtt.
788 		 */
789 		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
790 		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
791 	}
792 
793 	/*
794 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
795 	 * number in our pii->pii_probes[] array. The icmp sequence number
796 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
797 	 */
798 	pr_ndx = MOD_SUB(pii->pii_probe_next,
799 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
800 
801 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
802 
803 	target = pii->pii_probes[pr_ndx].pr_target;
804 
805 	/*
806 	 * Perform sanity checks, whether this probe reply that we
807 	 * have received is genuine
808 	 */
809 	if (target != NULL) {
810 		/*
811 		 * Compare the src. addr of the received ICMP or ICMPv6
812 		 * probe reply with the target address in our tables.
813 		 */
814 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
815 			/*
816 			 * We don't have any record of having sent a probe to
817 			 * this target. This is a fake probe reply. Log an error
818 			 */
819 			logtrace("probe status %d Fake probe reply seq %u "
820 			    "snxt %u on %s from %s\n",
821 			    pii->pii_probes[pr_ndx].pr_status,
822 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
823 			pii->pii_cum_stats.unknown++;
824 			return;
825 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
826 			/*
827 			 * The address matches, but our tables indicate that
828 			 * this probe reply has been acked already. So this
829 			 * is a duplicate probe reply. Log an error
830 			 */
831 			logtrace("probe status %d Duplicate probe reply seq %u "
832 			    "snxt %u on %s from %s\n",
833 			    pii->pii_probes[pr_ndx].pr_status,
834 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
835 			pii->pii_cum_stats.unknown++;
836 			return;
837 		}
838 	} else {
839 		/*
840 		 * Target must not be NULL in the PR_UNACKED state
841 		 */
842 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
843 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
844 			/*
845 			 * The probe stats slot is unused. So we didn't
846 			 * send out any probe to this target. This is a fake.
847 			 * Log an error.
848 			 */
849 			logtrace("probe status %d Fake probe reply seq %u "
850 			    "snxt %u on %s from %s\n",
851 			    pii->pii_probes[pr_ndx].pr_status,
852 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
853 		}
854 		pii->pii_cum_stats.unknown++;
855 		return;
856 	}
857 
858 	/*
859 	 * If the rtt does not appear to be right, don't update the
860 	 * rtt stats. This can happen if the system dropped into the
861 	 * debugger, or the system was hung or too busy for a
862 	 * substantial time that we didn't get a chance to run.
863 	 */
864 	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
865 		/*
866 		 * If the probe corresponding to this receieved response
867 		 * was truly sent 'm' ms. ago, then this response must
868 		 * have been rejected by the sequence number checks. The
869 		 * fact that it has passed the sequence number checks
870 		 * means that the measured rtt is wrong. We were probably
871 		 * scheduled long after the packet was received.
872 		 */
873 		goto out;
874 	}
875 
876 	/*
877 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
878 	 * The initial few responses after the interface is repaired may
879 	 * contain high rtt's because they could have been queued up waiting
880 	 * for ARP/NDP resolution on a failed interface.
881 	 */
882 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
883 		goto out;
884 
885 	/*
886 	 * Don't update the Conservative Round Trip Time estimate for this
887 	 * (phint, target) pair if this is the not the highest ack seq seen
888 	 * thus far on this target.
889 	 */
890 	if (!highest_ack_tg(pr_icmp_seq, target))
891 		goto out;
892 
893 	/*
894 	 * Always update the rtt. This is a failure detection probe
895 	 * and we want to measure both increase / decrease in rtt.
896 	 */
897 	pi_set_crtt(target, m, _B_TRUE);
898 
899 	/*
900 	 * If the crtt exceeds the average time between probes,
901 	 * investigate if this slow target is an exception. If so we
902 	 * can avoid this target and still meet the failure detection
903 	 * time. Otherwise we can't meet the failure detection time.
904 	 */
905 	if (target->tg_crtt > pg->pg_probeint) {
906 		exception = check_exception_target(pii, target);
907 		if (exception) {
908 			/*
909 			 * This target is exceptionally slow. Don't use it
910 			 * for future probes. check_exception_target() has
911 			 * made sure that we have at least MIN_PROBE_TARGETS
912 			 * other active targets
913 			 */
914 			if (pii->pii_targets_are_routers) {
915 				/*
916 				 * This is a slow router, mark it as slow
917 				 * and don't use it for further probes. We
918 				 * don't delete it, since it will be populated
919 				 * again when we do a router scan. Hence we
920 				 * need to maintain extra state (unlike the
921 				 * host case below).  Mark it as TG_SLOW.
922 				 */
923 				if (target->tg_status == TG_ACTIVE)
924 					pii->pii_ntargets--;
925 				target->tg_status = TG_SLOW;
926 				target->tg_latime = gethrtime();
927 				target->tg_rtt_sa = -1;
928 				target->tg_crtt = 0;
929 				target->tg_rtt_sd = 0;
930 				if (pii->pii_target_next == target) {
931 					pii->pii_target_next =
932 					    target_next(target);
933 				}
934 			} else {
935 				/*
936 				 * the slow target is not a router, we can
937 				 * just delete it. Send an icmp multicast and
938 				 * pick the fastest responder that is not
939 				 * already an active target. target_delete()
940 				 * adjusts pii->pii_target_next
941 				 */
942 				target_delete(target);
943 				probe(pii, PROBE_MULTI, cur_time);
944 			}
945 		} else {
946 			/*
947 			 * We can't meet the failure detection time.
948 			 * Log a message, and update the detection time to
949 			 * whatever we can achieve.
950 			 */
951 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
952 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
953 			last_fdt_bumpup_time = gethrtime();
954 			if (pg != phyint_anongroup) {
955 				logerr("Cannot meet requested failure detection"
956 				    " time of %d ms on (%s %s) new failure"
957 				    " detection time for group \"%s\" is %d"
958 				    " ms\n", user_failure_detection_time,
959 				    AF_STR(pii->pii_af), pii->pii_name,
960 				    pg->pg_name, pg->pg_fdt);
961 			}
962 		}
963 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
964 	    (user_failure_detection_time < pg->pg_fdt) &&
965 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
966 		/*
967 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
968 		 * investigate if we can improve the failure detection time to
969 		 * meet whatever the user specified.
970 		 */
971 		if (check_pg_crtt_improved(pg)) {
972 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
973 			    user_failure_detection_time);
974 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
975 			if (pg != phyint_anongroup) {
976 				logerr("Improved failure detection time %d ms "
977 				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
978 				    AF_STR(pii->pii_af), pii->pii_name,
979 				    pg->pg_name);
980 			}
981 			if (user_failure_detection_time == pg->pg_fdt) {
982 				/* Avoid any truncation or rounding errors */
983 				pg->pg_probeint = user_probe_interval;
984 				/*
985 				 * No more rtt probes will be sent. The actual
986 				 * fdt has dropped to the user specified value.
987 				 * pii_fd_snxt_basetime and pii_snxt_basetime
988 				 * will be in sync henceforth.
989 				 */
990 				reset_snxt_basetimes();
991 			}
992 		}
993 	}
994 out:
995 	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
996 	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;
997 
998 	/*
999 	 * Update pii->pii_rack, i.e. the sequence number of the last received
1000 	 * probe response, based on the echo reply we have received now, if
1001 	 * either of the following conditions are satisfied.
1002 	 * a. pii_rack is outside the current receive window of
1003 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
1004 	 *    This means we have not received probe responses for a
1005 	 *    long time, and the sequence number has wrapped around.
1006 	 * b. pii_rack is within the current receive window and this echo
1007 	 *    reply corresponds to the highest sequence number we have seen
1008 	 *    so far.
1009 	 */
1010 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
1011 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
1012 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
1013 		pii->pii_rack = pr_icmp_seq;
1014 	}
1015 }
1016 
1017 /*
1018  * Returns true if seq is the highest unacknowledged seq for target tg
1019  * else returns false
1020  */
1021 static boolean_t
1022 highest_ack_tg(uint16_t seq, struct target *tg)
1023 {
1024 	struct phyint_instance *pii;
1025 	int	 pr_ndx;
1026 	uint16_t pr_seq;
1027 
1028 	pii = tg->tg_phyint_inst;
1029 
1030 	/*
1031 	 * Get the seq number of the most recent probe sent so far,
1032 	 * and also get the corresponding probe index in the probe stats
1033 	 * array.
1034 	 */
1035 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1036 	pr_seq = pii->pii_snxt;
1037 	pr_seq--;
1038 
1039 	/*
1040 	 * Start from the most recent probe and walk back, trying to find
1041 	 * an acked probe corresponding to target tg.
1042 	 */
1043 	for (; pr_ndx != pii->pii_probe_next;
1044 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1045 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
1046 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1047 			if (SEQ_GT(pr_seq, seq))
1048 				return (_B_FALSE);
1049 		}
1050 	}
1051 	return (_B_TRUE);
1052 }
1053 
1054 /*
1055  * Check whether the crtt for the group has improved by a factor of
1056  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1057  * detection time flapping in the face of small crtt changes.
1058  */
1059 static boolean_t
1060 check_pg_crtt_improved(struct phyint_group *pg)
1061 {
1062 	struct	phyint *pi;
1063 
1064 	if (debug & D_PROBE)
1065 		logdebug("check_pg_crtt_improved()\n");
1066 
1067 	/*
1068 	 * The crtt for the group is only improved if each phyint_instance
1069 	 * for both ipv4 and ipv6 is improved.
1070 	 */
1071 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1072 		if (!check_pii_crtt_improved(pi->pi_v4) ||
1073 		    !check_pii_crtt_improved(pi->pi_v6))
1074 			return (_B_FALSE);
1075 	}
1076 
1077 	return (_B_TRUE);
1078 }
1079 
1080 /*
1081  * Check whether the crtt has improved substantially on this phyint_instance.
1082  * Returns _B_TRUE if there's no crtt information available, because pii
1083  * is NULL or the phyint_instance is not capable of probing.
1084  */
1085 boolean_t
1086 check_pii_crtt_improved(struct phyint_instance *pii) {
1087 	struct 	target *tg;
1088 
1089 	if (pii == NULL)
1090 		return (_B_TRUE);
1091 
1092 	if (!PROBE_CAPABLE(pii) ||
1093 	    pii->pii_phyint->pi_state == PI_FAILED)
1094 		return (_B_TRUE);
1095 
1096 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1097 		if (tg->tg_status != TG_ACTIVE)
1098 			continue;
1099 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1100 		    LOWER_FDT_TRIGGER)) {
1101 			return (_B_FALSE);
1102 		}
1103 	}
1104 
1105 	return (_B_TRUE);
1106 }
1107 
1108 /*
1109  * This target responds very slowly to probes. The target's crtt exceeds
1110  * the probe interval of its group. Compare against other targets
1111  * and determine if this target is an exception, if so return true, else false
1112  */
1113 static boolean_t
1114 check_exception_target(struct phyint_instance *pii, struct target *target)
1115 {
1116 	struct	target *tg;
1117 	char abuf[INET6_ADDRSTRLEN];
1118 
1119 	if (debug & D_PROBE) {
1120 		logdebug("check_exception_target(%s %s target %s)\n",
1121 		    AF_STR(pii->pii_af), pii->pii_name,
1122 		    pr_addr(pii->pii_af, target->tg_address,
1123 			abuf, sizeof (abuf)));
1124 	}
1125 
1126 	/*
1127 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1128 	 * to make a good judgement. Otherwise don't drop this target.
1129 	 */
1130 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1131 		return (_B_FALSE);
1132 
1133 	/*
1134 	 * Determine whether only this particular target is slow.
1135 	 * We know that this target's crtt exceeds the group's probe interval.
1136 	 * If all other active targets have a
1137 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1138 	 * then this target is considered slow.
1139 	 */
1140 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1141 		if (tg != target && tg->tg_status == TG_ACTIVE) {
1142 			if (tg->tg_crtt >
1143 			    pii->pii_phyint->pi_group->pg_probeint /
1144 			    EXCEPTION_FACTOR) {
1145 				return (_B_FALSE);
1146 			}
1147 		}
1148 	}
1149 
1150 	return (_B_TRUE);
1151 }
1152 
1153 /*
1154  * Update the target list. The icmp all hosts multicast has given us
1155  * some host to which we can send probes. If we already have sufficient
1156  * targets, discard it.
1157  */
1158 static void
1159 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1160     struct in6_addr fromaddr)
1161 /* ARGSUSED */
1162 {
1163 	int af;
1164 	char abuf[INET6_ADDRSTRLEN];
1165 	struct phyint *pi;
1166 
1167 	if (debug & D_PROBE) {
1168 		logdebug("incoming_mcast_reply(%s %s %s)\n",
1169 		    AF_STR(pii->pii_af), pii->pii_name,
1170 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1171 	}
1172 
1173 	/*
1174 	 * Using host targets is a fallback mechanism. If we have
1175 	 * found a router, don't add this host target. If we already
1176 	 * know MAX_PROBE_TARGETS, don't add another target.
1177 	 */
1178 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1179 	if (pii->pii_targets != NULL) {
1180 		if (pii->pii_targets_are_routers ||
1181 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1182 			return;
1183 		}
1184 	}
1185 
1186 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1187 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1188 		/*
1189 		 * Guard against response from 0.0.0.0
1190 		 * and ::. Log a trace message
1191 		 */
1192 		logtrace("probe response from %s on %s\n",
1193 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1194 		    pii->pii_name);
1195 		return;
1196 	}
1197 
1198 	/*
1199 	 * This address is one of our own, so reject this address as a
1200 	 * valid probe target.
1201 	 */
1202 	af = pii->pii_af;
1203 	if (own_address(af, fromaddr))
1204 		return;
1205 
1206 	/*
1207 	 * If the phyint is part a named group, then add the address to all
1208 	 * members of the group.  Otherwise, add the address only to the
1209 	 * phyint itself, since other phyints in the anongroup may not be on
1210 	 * the same subnet.
1211 	 */
1212 	pi = pii->pii_phyint;
1213 	if (pi->pi_group == phyint_anongroup) {
1214 		target_add(pii, fromaddr, _B_FALSE);
1215 	} else {
1216 		pi = pi->pi_group->pg_phyint;
1217 		for (; pi != NULL; pi = pi->pi_pgnext)
1218 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1219 	}
1220 }
1221 
1222 /*
1223  * Compute CRTT given an existing scaled average, scaled deviation estimate
1224  * and a new rtt time.  The formula is from Jacobson and Karels'
1225  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1226  * are the same as those in Appendix A.2 of that paper.
1227  *
1228  * m = new measurement
1229  * sa = scaled RTT average (8 * average estimates)
1230  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1231  * crtt = Conservative round trip time. Used to determine whether probe
1232  * has timed out.
1233  *
1234  * New scaled average and deviation are passed back via sap and svp
1235  */
1236 static int
1237 compute_crtt(int *sap, int *svp, int m)
1238 {
1239 	int sa = *sap;
1240 	int sv = *svp;
1241 	int crtt;
1242 	int saved_m = m;
1243 
1244 	assert(*sap >= -1);
1245 	assert(*svp >= 0);
1246 
1247 	if (sa != -1) {
1248 		/*
1249 		 * Update average estimator:
1250 		 *	new rtt = old rtt + 1/8 Error
1251 		 *	    where Error = m - old rtt
1252 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1253 		 *	i.e. new sa =  old sa + Error
1254 		 */
1255 		m -= sa >> 3;		/* m is now Error in estimate. */
1256 		if ((sa += m) < 0) {
1257 			/* Don't allow the smoothed average to be negative. */
1258 			sa = 0;
1259 		}
1260 
1261 		/*
1262 		 * Update deviation estimator:
1263 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1264 		 *	i.e. 4 * new mdev = 4 * old mdev +
1265 		 *		(abs(Error) - old mdev)
1266 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1267 		 */
1268 		if (m < 0)
1269 			m = -m;
1270 		m -= sv >> 2;
1271 		sv += m;
1272 	} else {
1273 		/* Initialization. This is the first response received. */
1274 		sa = (m << 3);
1275 		sv = (m << 1);
1276 	}
1277 
1278 	crtt = (sa >> 3) + sv;
1279 
1280 	if (debug & D_PROBE) {
1281 		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
1282 		    "%d\n", saved_m, sa, sv, crtt);
1283 	}
1284 
1285 	*sap = sa;
1286 	*svp = sv;
1287 
1288 	/*
1289 	 * CRTT = average estimates  + 4 * deviation estimates
1290 	 *	= sa / 8 + sv
1291 	 */
1292 	return (crtt);
1293 }
1294 
1295 static void
1296 pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
1297 {
1298 	struct phyint_instance *pii = tg->tg_phyint_inst;
1299 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1300 	int sa = tg->tg_rtt_sa;
1301 	int sv = tg->tg_rtt_sd;
1302 	int new_crtt;
1303 	int i;
1304 
1305 	if (debug & D_PROBE)
1306 		logdebug("pi_set_crtt: target -  m %d\n", m);
1307 
1308 	/* store the round trip time, in case we need to defer computation */
1309 	tg->tg_deferred[tg->tg_num_deferred] = m;
1310 
1311 	new_crtt = compute_crtt(&sa, &sv, m);
1312 
1313 	/*
1314 	 * If this probe's round trip time would singlehandedly cause an
1315 	 * increase in the group's probe interval consider it suspect.
1316 	 */
1317 	if ((new_crtt > probe_interval) && is_probe_uni) {
1318 		if (debug & D_PROBE) {
1319 			logdebug("Received a suspect probe on %s, new_crtt ="
1320 			    " %d, probe_interval = %d, num_deferred = %d\n",
1321 			    pii->pii_probe_logint->li_name, new_crtt,
1322 			    probe_interval, tg->tg_num_deferred);
1323 		}
1324 
1325 		/*
1326 		 * If we've deferred as many rtts as we plan on deferring, then
1327 		 * assume the link really did slow down and process all queued
1328 		 * rtts
1329 		 */
1330 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1331 			if (debug & D_PROBE) {
1332 				logdebug("Received MAXDEFERREDRTT probes which "
1333 				    "would cause an increased probe_interval.  "
1334 				    "Integrating queued rtt data points.\n");
1335 			}
1336 
1337 			for (i = 0; i <= tg->tg_num_deferred; i++) {
1338 				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
1339 				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
1340 			}
1341 
1342 			tg->tg_num_deferred = 0;
1343 		} else {
1344 			tg->tg_num_deferred++;
1345 		}
1346 		return;
1347 	}
1348 
1349 	/*
1350 	 * If this is a normal probe, or an RTT probe that would lead to a
1351 	 * reduced CRTT, then update our CRTT data.  Further, if this was
1352 	 * a normal probe, pitch any deferred probes since our probes are
1353 	 * again being answered within our CRTT estimates.
1354 	 */
1355 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1356 		tg->tg_rtt_sa = sa;
1357 		tg->tg_rtt_sd = sv;
1358 		tg->tg_crtt = new_crtt;
1359 		if (is_probe_uni)
1360 			tg->tg_num_deferred = 0;
1361 	}
1362 }
1363 
1364 /*
1365  * Return a pointer to the specified option buffer.
1366  * If not found return NULL.
1367  */
1368 static void *
1369 find_ancillary(struct msghdr *msg, int cmsg_type)
1370 {
1371 	struct cmsghdr *cmsg;
1372 
1373 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
1374 	    cmsg = CMSG_NXTHDR(msg, cmsg)) {
1375 		if (cmsg->cmsg_level == IPPROTO_IPV6 &&
1376 		    cmsg->cmsg_type == cmsg_type) {
1377 			return (CMSG_DATA(cmsg));
1378 		}
1379 	}
1380 	return (NULL);
1381 }
1382 
1383 /*
1384  * See if a previously failed interface has started working again.
1385  */
1386 void
1387 phyint_check_for_repair(struct phyint *pi)
1388 {
1389 	if (phyint_repaired(pi)) {
1390 		if (pi->pi_group == phyint_anongroup) {
1391 			logerr("NIC repair detected on %s\n", pi->pi_name);
1392 		} else {
1393 			logerr("NIC repair detected on %s of group %s\n",
1394 			    pi->pi_name, pi->pi_group->pg_name);
1395 		}
1396 
1397 		/*
1398 		 * If the interface is offline, just clear the FAILED flag,
1399 		 * delaying the state change and failback operation until it
1400 		 * is brought back online.
1401 		 */
1402 		if (pi->pi_state == PI_OFFLINE) {
1403 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1404 			return;
1405 		}
1406 
1407 		if (pi->pi_flags & IFF_INACTIVE) {
1408 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1409 		} else {
1410 			if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) {
1411 				(void) change_lif_flags(pi,
1412 				    IFF_FAILED, _B_FALSE);
1413 				/* Per state diagram */
1414 				pi->pi_empty = 0;
1415 			}
1416 		}
1417 
1418 		phyint_chstate(pi, PI_RUNNING);
1419 
1420 		if (GROUP_FAILED(pi->pi_group)) {
1421 			/*
1422 			 * This is the 1st phyint to receive a response
1423 			 * after group failure.
1424 			 */
1425 			logerr("At least 1 interface (%s) of group %s has "
1426 			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
1427 			phyint_group_chstate(pi->pi_group, PG_RUNNING);
1428 		}
1429 	}
1430 }
1431 
1432 /*
1433  * See if a previously functioning interface has failed, or if the
1434  * whole group of interfaces has failed.
1435  */
1436 static void
1437 phyint_inst_check_for_failure(struct phyint_instance *pii)
1438 {
1439 	struct	phyint	*pi;
1440 	struct	phyint	*pi2;
1441 
1442 	pi = pii->pii_phyint;
1443 
1444 	switch (failure_state(pii)) {
1445 	case PHYINT_FAILURE:
1446 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1447 		if (pi->pi_group == phyint_anongroup) {
1448 			logerr("NIC failure detected on %s\n", pii->pii_name);
1449 		} else {
1450 			logerr("NIC failure detected on %s of group %s\n",
1451 			    pii->pii_name, pi->pi_group->pg_name);
1452 		}
1453 		/*
1454 		 * Do the failover, unless the interface is offline (in
1455 		 * which case we've already failed over).
1456 		 */
1457 		if (pi->pi_state != PI_OFFLINE) {
1458 			phyint_chstate(pi, PI_FAILED);
1459 			reset_crtt_all(pi);
1460 			if (!(pi->pi_flags & IFF_INACTIVE))
1461 				(void) try_failover(pi, FAILOVER_NORMAL);
1462 		}
1463 		break;
1464 
1465 	case GROUP_FAILURE:
1466 		logerr("All Interfaces in group %s have failed\n",
1467 		    pi->pi_group->pg_name);
1468 		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
1469 		    pi2 = pi2->pi_pgnext) {
1470 			if (pi2->pi_flags & IFF_OFFLINE)
1471 				continue;
1472 			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
1473 			reset_crtt_all(pi2);
1474 
1475 			/*
1476 			 * In the case of host targets, we
1477 			 * would have flushed the targets,
1478 			 * and gone to PI_NOTARGETS state.
1479 			 */
1480 			if (pi2->pi_state == PI_RUNNING)
1481 				phyint_chstate(pi, PI_FAILED);
1482 
1483 			pi2->pi_empty = 0;
1484 			pi2->pi_full = 0;
1485 		}
1486 		break;
1487 
1488 	default:
1489 		break;
1490 	}
1491 }
1492 
1493 /*
1494  * Determines if any timeout event has occurred and returns the number of
1495  * milliseconds until the next timeout event for the phyint. Returns
1496  * TIMER_INFINITY for "never".
1497  */
1498 uint_t
1499 phyint_inst_timer(struct phyint_instance *pii)
1500 {
1501 	int 	pr_ndx;
1502 	uint_t	timeout;
1503 	struct	target	*cur_tg;
1504 	struct	probe_stats *pr_statp;
1505 	struct	phyint_instance *pii_other;
1506 	struct	phyint *pi;
1507 	int	valid_unack_count;
1508 	int	i;
1509 	int	interval;
1510 	uint_t	check_time;
1511 	uint_t	cur_time;
1512 	hrtime_t cur_hrtime;
1513 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1514 
1515 	cur_time = getcurrenttime();
1516 
1517 	if (debug & D_TIMER) {
1518 		logdebug("phyint_inst_timer(%s %s)\n",
1519 		    AF_STR(pii->pii_af), pii->pii_name);
1520 	}
1521 
1522 	pii_other = phyint_inst_other(pii);
1523 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1524 		/*
1525 		 * Check to see if we're here due to link up/down flapping; If
1526 		 * enough time has passed, then try to bring the interface
1527 		 * back up; otherwise, schedule a timer to bring it back up
1528 		 * when enough time *has* elapsed.
1529 		 */
1530 		pi = pii->pii_phyint;
1531 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1532 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1533 			if (check_time > cur_time)
1534 				return (check_time - cur_time);
1535 
1536 			phyint_check_for_repair(pi);
1537 		}
1538 	}
1539 
1540 	/*
1541 	 * If this phyint is not yet initialized for probes,
1542 	 * don't proceed further
1543 	 */
1544 	if (pii->pii_probe_sock == -1)
1545 		return (TIMER_INFINITY);
1546 
1547 	/*
1548 	 * If the timer has fired too soon, probably triggered
1549 	 * by some other phyint instance, return the remaining
1550 	 * time
1551 	 */
1552 	if (TIME_LT(cur_time, pii->pii_snxt_time))
1553 		return (pii->pii_snxt_time - cur_time);
1554 
1555 	/*
1556 	 * If the link is down, don't send any probes for now.
1557 	 */
1558 	if (LINK_DOWN(pii->pii_phyint))
1559 		return (TIMER_INFINITY);
1560 
1561 	/*
1562 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1563 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1564 	 * Base probe time is strictly periodic.
1565 	 */
1566 	interval = GET_RANDOM(
1567 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1568 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1569 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1570 
1571 	/*
1572 	 * Check if the current time > next time to probe. If so, we missed
1573 	 * sending 1 or more probes, probably due to heavy system load. At least
1574 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1575 	 * were scheduled. Make adjustments to the times, in multiples of
1576 	 * user_probe_interval.
1577 	 */
1578 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1579 		int n;
1580 
1581 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1582 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1583 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1584 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1585 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1586 		    pii->pii_snxt_basetime);
1587 
1588 		/* Collect statistics about missed probes */
1589 		probes_missed.pm_nprobes += n + 1;
1590 		probes_missed.pm_ntimes++;
1591 	}
1592 	pii->pii_snxt_basetime += user_probe_interval;
1593 	interval = pii->pii_snxt_time - cur_time;
1594 	if (debug & D_TARGET) {
1595 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1596 		    " interval %u\n", cur_time, pii->pii_snxt_time,
1597 		    pii->pii_snxt_basetime, interval);
1598 	}
1599 
1600 	/*
1601 	 * If no targets are known, we need to send an ICMP multicast. The
1602 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1603 	 * to see if we found a target.
1604 	 */
1605 	if (pii->pii_target_next == NULL) {
1606 		assert(pii->pii_ntargets == 0);
1607 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1608 		probe(pii, PROBE_MULTI, cur_time);
1609 		return (interval);
1610 	}
1611 
1612 	if ((user_probe_interval != probe_interval) &&
1613 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1614 		/*
1615 		 * the failure detection (fd) probe timer has not yet fired.
1616 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1617 		 */
1618 		probe(pii, PROBE_RTT, cur_time);
1619 		return (interval);
1620 	}
1621 	/*
1622 	 * the fd probe timer has fired. Need to do all failure
1623 	 * detection / recovery calculations, and then send an fd probe
1624 	 * of type PROBE_UNI.
1625 	 */
1626 	if (user_probe_interval == probe_interval) {
1627 		/*
1628 		 * We could have missed some probes, and then adjusted
1629 		 * pii_snxt_basetime above. Otherwise we could have
1630 		 * blindly added probe_interval to pii_fd_snxt_basetime.
1631 		 */
1632 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1633 	} else {
1634 		pii->pii_fd_snxt_basetime += probe_interval;
1635 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1636 			int n;
1637 
1638 			n = (cur_time - pii->pii_fd_snxt_basetime) /
1639 			    probe_interval;
1640 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1641 		}
1642 	}
1643 
1644 	/*
1645 	 * We can have at most, the latest 2 probes that we sent, in
1646 	 * the PR_UNACKED state. All previous probes sent, are either
1647 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1648 	 * timed out if the probe's time_sent + the CRTT < currenttime.
1649 	 * For each of the last 2 probes, examine whether it has timed
1650 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1651 	 */
1652 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1653 	valid_unack_count = 0;
1654 
1655 	for (i = 0; i < 2; i++) {
1656 		pr_statp = &pii->pii_probes[pr_ndx];
1657 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1658 		switch (pr_statp->pr_status) {
1659 		case PR_ACKED:
1660 			/*
1661 			 * We received back an ACK, so the switch clearly
1662 			 * is not dropping our traffic, and thus we can
1663 			 * enable failure detection immediately.
1664 			 */
1665 			if (pii->pii_fd_hrtime > gethrtime()) {
1666 				if (debug & D_PROBE) {
1667 					logdebug("successful probe on %s; "
1668 					    "ending quiet period\n",
1669 					    pii->pii_phyint->pi_name);
1670 				}
1671 				pii->pii_fd_hrtime = gethrtime();
1672 			}
1673 			break;
1674 
1675 		case PR_UNACKED:
1676 			assert(cur_tg != NULL);
1677 			/*
1678 			 * The crtt could be zero for some reason,
1679 			 * Eg. the phyint could be failed. If the crtt is
1680 			 * not available use group's probe interval,
1681 			 * which is a worst case estimate.
1682 			 */
1683 			if (cur_tg->tg_crtt != 0) {
1684 				timeout = pr_statp->pr_time_sent +
1685 				    cur_tg->tg_crtt;
1686 			} else {
1687 				timeout = pr_statp->pr_time_sent +
1688 				    probe_interval;
1689 			}
1690 			if (TIME_LT(timeout, cur_time)) {
1691 				pr_statp->pr_status = PR_LOST;
1692 				pr_statp->pr_time_lost = timeout;
1693 			} else if (i == 1) {
1694 				/*
1695 				 * We are forced to consider this probe
1696 				 * lost, as we can have at most 2 unack.
1697 				 * probes any time, and we will be sending a
1698 				 * probe at the end of this function.
1699 				 * Normally, we should not be here, but
1700 				 * this can happen if an incoming response
1701 				 * that was considered lost has increased
1702 				 * the crtt for this target, and also bumped
1703 				 * up the FDT. Note that we never cancel or
1704 				 * increase the current pii_time_left, so
1705 				 * when the timer fires, we find 2 valid
1706 				 * unacked probes, and they are yet to timeout
1707 				 */
1708 				pr_statp->pr_status = PR_LOST;
1709 				pr_statp->pr_time_lost = cur_time;
1710 			} else {
1711 				/*
1712 				 * Only the most recent probe can enter
1713 				 * this 'else' arm. The second most recent
1714 				 * probe must take either of the above arms,
1715 				 * if it is unacked.
1716 				 */
1717 				valid_unack_count++;
1718 			}
1719 			break;
1720 		}
1721 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1722 	}
1723 
1724 	/*
1725 	 * We send out 1 probe randomly in the interval between one half
1726 	 * and one probe interval for the group. Given that the CRTT is always
1727 	 * less than the group's probe interval, we can have at most 1
1728 	 * unacknowledged probe now.  All previous probes are either lost or
1729 	 * acked.
1730 	 */
1731 	assert(valid_unack_count == 0 || valid_unack_count == 1);
1732 
1733 	/*
1734 	 * The timer has fired. Take appropriate action depending
1735 	 * on the current state of the phyint.
1736 	 *
1737 	 * PI_RUNNING state 	- Failure detection and failover
1738 	 * PI_FAILED state 	- Repair detection and failback
1739 	 */
1740 	switch (pii->pii_phyint->pi_state) {
1741 	case PI_FAILED:
1742 		/*
1743 		 * If the most recent probe (excluding unacked probes that
1744 		 * are yet to time out) has been acked, check whether the
1745 		 * phyint is now repaired. If the phyint is repaired, then
1746 		 * attempt failback, unless it is an inactive standby.
1747 		 */
1748 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1749 			phyint_check_for_repair(pii->pii_phyint);
1750 		}
1751 		break;
1752 
1753 	case PI_RUNNING:
1754 		/*
1755 		 * It's possible our probes have been lost because of a
1756 		 * spanning-tree mandated quiet period on the switch.  If so,
1757 		 * ignore the lost probes and consider the interface to still
1758 		 * be functioning.
1759 		 */
1760 		cur_hrtime = gethrtime();
1761 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1762 			break;
1763 
1764 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1765 			/*
1766 			 * We have 1 or more failed probes (excluding unacked
1767 			 * probes that are yet to time out). Determine if the
1768 			 * phyint has failed. If so attempt a failover,
1769 			 * unless it is an inactive standby
1770 			 */
1771 			phyint_inst_check_for_failure(pii);
1772 		}
1773 		break;
1774 
1775 	default:
1776 		logerr("phyint_inst_timer: invalid state %d\n",
1777 		    pii->pii_phyint->pi_state);
1778 		abort();
1779 	}
1780 
1781 	/*
1782 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1783 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1784 	 * was called, the target list may be empty.
1785 	 */
1786 	if (pii->pii_target_next != NULL) {
1787 		probe(pii, PROBE_UNI, cur_time);
1788 		/*
1789 		 * If we have just the one probe target, and we're not using
1790 		 * router targets, try to find another as we presently have
1791 		 * no resilience.
1792 		 */
1793 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1794 			probe(pii, PROBE_MULTI, cur_time);
1795 	} else {
1796 		probe(pii, PROBE_MULTI, cur_time);
1797 	}
1798 	return (interval);
1799 }
1800 
1801 /*
1802  * Start the probe timer for an interface instance.
1803  */
1804 void
1805 start_timer(struct phyint_instance *pii)
1806 {
1807 	uint32_t interval;
1808 
1809 	/*
1810 	 * Spread the base probe times (pi_snxt_basetime) across phyints
1811 	 * uniformly over the (curtime..curtime + the group's probe_interval).
1812 	 * pi_snxt_basetime is strictly periodic with a frequency of
1813 	 * the group's probe interval. The actual probe time pi_snxt_time
1814 	 * adds some randomness to pi_snxt_basetime and happens in probe().
1815 	 * For the 1st probe on each phyint after the timer is started,
1816 	 * pi_snxt_time and pi_snxt_basetime are the same.
1817 	 */
1818 	interval = GET_RANDOM(0,
1819 	    (int)pii->pii_phyint->pi_group->pg_probeint);
1820 
1821 	pii->pii_snxt_basetime = getcurrenttime() + interval;
1822 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1823 	pii->pii_snxt_time = pii->pii_snxt_basetime;
1824 	timer_schedule(interval);
1825 }
1826 
1827 /*
1828  * Restart the probe timer on an interface instance.
1829  */
1830 static void
1831 restart_timer(struct phyint_instance *pii)
1832 {
1833 	/*
1834 	 * We don't need to restart the timer if it was never started in
1835 	 * the first place (pii->pii_basetime_inited not set), as the timer
1836 	 * won't have gone off yet.
1837 	 */
1838 	if (pii->pii_basetime_inited != 0) {
1839 
1840 		if (debug & D_LINKNOTE)
1841 			logdebug("restart timer: restarting timer on %s, "
1842 			    "address family %s\n", pii->pii_phyint->pi_name,
1843 			    AF_STR(pii->pii_af));
1844 
1845 		start_timer(pii);
1846 	}
1847 }
1848 
1849 static void
1850 process_link_state_down(struct phyint *pi)
1851 {
1852 	logerr("The link has gone down on %s\n", pi->pi_name);
1853 
1854 	/*
1855 	 * Clear the probe statistics arrays, we don't want the repair
1856 	 * detection logic relying on probes that were succesful prior
1857 	 *  to the link going down.
1858 	 */
1859 	if (PROBE_CAPABLE(pi->pi_v4))
1860 		clear_pii_probe_stats(pi->pi_v4);
1861 	if (PROBE_CAPABLE(pi->pi_v6))
1862 		clear_pii_probe_stats(pi->pi_v6);
1863 	/*
1864 	 * Check for interface failure.  Although we know the interface
1865 	 * has failed, we don't know if all the other interfaces in the
1866 	 * group have failed as well.
1867 	 */
1868 	if ((pi->pi_state == PI_RUNNING) ||
1869 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1870 		if (debug & D_LINKNOTE) {
1871 			logdebug("process_link_state_down:"
1872 			    " checking for failure on %s\n", pi->pi_name);
1873 		}
1874 
1875 		if (pi->pi_v4 != NULL)
1876 			phyint_inst_check_for_failure(pi->pi_v4);
1877 		else if (pi->pi_v6 != NULL)
1878 			phyint_inst_check_for_failure(pi->pi_v6);
1879 	}
1880 }
1881 
1882 static void
1883 process_link_state_up(struct phyint *pi)
1884 {
1885 	logerr("The link has come up on %s\n", pi->pi_name);
1886 
1887 	/*
1888 	 * We stopped any running timers on each instance when the link
1889 	 * went down, so restart them.
1890 	 */
1891 	if (pi->pi_v4)
1892 		restart_timer(pi->pi_v4);
1893 	if (pi->pi_v6)
1894 		restart_timer(pi->pi_v6);
1895 
1896 	phyint_check_for_repair(pi);
1897 
1898 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1899 	if (pi->pi_whendx == LINK_UP_PERMIN)
1900 		pi->pi_whendx = 0;
1901 }
1902 
1903 /*
1904  * Process any changes in link state passed up from the interfaces.
1905  */
1906 void
1907 process_link_state_changes(void)
1908 {
1909 	struct phyint *pi;
1910 
1911 	/* Look for interfaces where the link state has just changed */
1912 
1913 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1914 		boolean_t old_link_state_up = LINK_UP(pi);
1915 
1916 		/*
1917 		 * Except when the "phyint" structure is created, this is
1918 		 * the only place the link state is updated.  This allows
1919 		 * this routine to detect changes in link state, rather
1920 		 * than just the current state.
1921 		 */
1922 		UPDATE_LINK_STATE(pi);
1923 
1924 		if (LINK_DOWN(pi)) {
1925 			/*
1926 			 * Has link just gone down?
1927 			 */
1928 			if (old_link_state_up)
1929 				process_link_state_down(pi);
1930 		} else {
1931 			/*
1932 			 * Has link just gone back up?
1933 			 */
1934 			if (!old_link_state_up)
1935 				process_link_state_up(pi);
1936 		}
1937 	}
1938 }
1939 
1940 void
1941 reset_crtt_all(struct phyint *pi)
1942 {
1943 	struct phyint_instance *pii;
1944 	struct target *tg;
1945 
1946 	pii = pi->pi_v4;
1947 	if (pii != NULL) {
1948 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1949 			tg->tg_crtt = 0;
1950 			tg->tg_rtt_sa = -1;
1951 			tg->tg_rtt_sd = 0;
1952 		}
1953 	}
1954 
1955 	pii = pi->pi_v6;
1956 	if (pii != NULL) {
1957 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1958 			tg->tg_crtt = 0;
1959 			tg->tg_rtt_sa = -1;
1960 			tg->tg_rtt_sd = 0;
1961 		}
1962 	}
1963 }
1964 
1965 /*
1966  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
1967  * probes on both instances IPv4 and IPv6.
1968  * If the interface has failed, return the time of the first probe failure
1969  * in "tff".
1970  */
1971 static int
1972 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
1973 {
1974 	uint_t	pi_tff;
1975 	struct	target *cur_tg;
1976 	struct	probe_fail_count pfinfo;
1977 	struct	phyint_instance *pii_other;
1978 	int	pr_ndx;
1979 
1980 	/*
1981 	 * Get the number of consecutive failed probes on
1982 	 * this phyint across all targets. Also get the number
1983 	 * of consecutive failed probes on this target only
1984 	 */
1985 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1986 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
1987 	probe_fail_info(pii, cur_tg, &pfinfo);
1988 
1989 	/* Get the time of first failure, for later use */
1990 	pi_tff = pfinfo.pf_tff;
1991 
1992 	/*
1993 	 * If the current target has not responded to the
1994 	 * last NUM_PROBE_FAILS probes, and other targets are
1995 	 * responding delete this target. Dead gateway detection
1996 	 * will eventually remove this target (if router) from the
1997 	 * routing tables. If that does not occur, we may end
1998 	 * up adding this to our list again.
1999 	 */
2000 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2001 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2002 		if (pii->pii_targets_are_routers) {
2003 			if (cur_tg->tg_status == TG_ACTIVE)
2004 				pii->pii_ntargets--;
2005 			cur_tg->tg_status = TG_DEAD;
2006 			cur_tg->tg_crtt = 0;
2007 			cur_tg->tg_rtt_sa = -1;
2008 			cur_tg->tg_rtt_sd = 0;
2009 			if (pii->pii_target_next == cur_tg)
2010 				pii->pii_target_next = target_next(cur_tg);
2011 		} else {
2012 			target_delete(cur_tg);
2013 			probe(pii, PROBE_MULTI, getcurrenttime());
2014 		}
2015 		return (PHYINT_OK);
2016 	}
2017 
2018 	/*
2019 	 * If the phyint has lost NUM_PROBE_FAILS or more
2020 	 * consecutive probes, on both IPv4 and IPv6 protocol
2021 	 * instances of the phyint, then trigger failure
2022 	 * detection, else return false
2023 	 */
2024 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2025 		return (PHYINT_OK);
2026 
2027 	pii_other = phyint_inst_other(pii);
2028 	if (PROBE_CAPABLE(pii_other)) {
2029 		probe_fail_info(pii_other, NULL, &pfinfo);
2030 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2031 			/*
2032 			 * We have NUM_PROBE_FAILS or more failures
2033 			 * on both IPv4 and IPv6. Get the earliest
2034 			 * time when failure was detected on this
2035 			 * phyint across IPv4 and IPv6.
2036 			 */
2037 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2038 				pi_tff = pfinfo.pf_tff;
2039 		} else {
2040 			/*
2041 			 * This instance has < NUM_PROBE_FAILS failure.
2042 			 * So return false
2043 			 */
2044 			return (PHYINT_OK);
2045 		}
2046 	}
2047 	*tff = pi_tff;
2048 	return (PHYINT_FAILURE);
2049 }
2050 
2051 /*
2052  * Check if the link has gone down on this phyint, or it has failed the
2053  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2054  * Also look at other phyints of this group, for group failures.
2055  */
2056 int
2057 failure_state(struct phyint_instance *pii)
2058 {
2059 	struct	probe_success_count psinfo;
2060 	uint_t	pi2_tls;		/* time last success */
2061 	uint_t	pi_tff;			/* time first fail */
2062 	struct	phyint	*pi2;
2063 	struct	phyint *pi;
2064 	struct	phyint_instance *pii2;
2065 	struct  phyint_group *pg;
2066 	boolean_t alone;
2067 
2068 	if (debug & D_FAILOVER)
2069 		logdebug("phyint_failed(%s)\n", pii->pii_name);
2070 
2071 	pi = pii->pii_phyint;
2072 	pg = pi->pi_group;
2073 
2074 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2075 		PHYINT_OK)
2076 		return (PHYINT_OK);
2077 
2078 	/*
2079 	 * At this point, the link is down, or the phyint is suspect,
2080 	 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
2081 	 * does not belong to any group, or is the only member of the
2082 	 * group capable of being probed, return PHYINT_FAILURE.
2083 	 */
2084 	alone = _B_TRUE;
2085 	if (pg != phyint_anongroup) {
2086 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2087 			if (pi2 == pi)
2088 				continue;
2089 			if (PROBE_CAPABLE(pi2->pi_v4) ||
2090 			    PROBE_CAPABLE(pi2->pi_v6)) {
2091 				alone = _B_FALSE;
2092 				break;
2093 			}
2094 		}
2095 	}
2096 	if (alone)
2097 		return (PHYINT_FAILURE);
2098 
2099 	/*
2100 	 * Need to compare against other phyints of the same group
2101 	 * to exclude group failures. If the failure was detected via
2102 	 * probing, then if the time of last success (tls) of any
2103 	 * phyint is more recent than the time of first fail (tff) of the
2104 	 * phyint in question, and the link is up on the phyint,
2105 	 * then it is a phyint failure. Otherwise it is a group failure.
2106 	 * If failure was detected via a link down notification sent from
2107 	 * the driver to IP, we see if any phyints in the group are still
2108 	 * running and haven't received a link down notification.  We
2109 	 * will usually be processing the link down notification shortly
2110 	 * after it was received, so there is no point looking at the tls
2111 	 * of other phyints.
2112 	 */
2113 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2114 		/* Exclude ourself from comparison */
2115 		if (pi2 == pi)
2116 			continue;
2117 
2118 		if (LINK_DOWN(pi)) {
2119 			/*
2120 			 * We use FLAGS_TO_LINK_STATE() to test the
2121 			 * flags directly, rather then LINK_UP() or
2122 			 * LINK_DOWN(), as we may not have got round
2123 			 * to processing the link state for the other
2124 			 * phyints in the group yet.
2125 			 *
2126 			 * The check for PI_RUNNING and group
2127 			 * failure handles the case when the
2128 			 * group begins to recover.  The first
2129 			 * phyint to recover should not trigger
2130 			 * a failover from the soon-to-recover
2131 			 * other phyints to the first recovered
2132 			 * phyint. PI_RUNNING will be set, and
2133 			 * pg_groupfailed cleared only after
2134 			 * receipt of NUM_PROBE_REPAIRS, by
2135 			 * which time the other phyints should
2136 			 * have received at least 1 packet,
2137 			 * and so will not have NUM_PROBE_FAILS.
2138 			 */
2139 			if ((pi2->pi_state == PI_RUNNING) &&
2140 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
2141 				return (PHYINT_FAILURE);
2142 		} else {
2143 			/*
2144 			 * Need to compare against both IPv4 and
2145 			 * IPv6 instances.
2146 			 */
2147 			pii2 = pi2->pi_v4;
2148 			if (pii2 != NULL) {
2149 				probe_success_info(pii2, NULL, &psinfo);
2150 				if (psinfo.ps_tls_valid) {
2151 					pi2_tls = psinfo.ps_tls;
2152 					/*
2153 					 * See comment above regarding check
2154 					 * for PI_RUNNING and group failure.
2155 					 */
2156 					if (TIME_GT(pi2_tls, pi_tff) &&
2157 					    (pi2->pi_state == PI_RUNNING) &&
2158 					    !GROUP_FAILED(pg) &&
2159 					    FLAGS_TO_LINK_STATE(pi2))
2160 						return (PHYINT_FAILURE);
2161 				}
2162 			}
2163 
2164 			pii2 = pi2->pi_v6;
2165 			if (pii2 != NULL) {
2166 				probe_success_info(pii2, NULL, &psinfo);
2167 				if (psinfo.ps_tls_valid) {
2168 					pi2_tls = psinfo.ps_tls;
2169 					/*
2170 					 * See comment above regarding check
2171 					 * for PI_RUNNING and group failure.
2172 					 */
2173 					if (TIME_GT(pi2_tls, pi_tff) &&
2174 					    (pi2->pi_state == PI_RUNNING) &&
2175 					    !GROUP_FAILED(pg) &&
2176 					    FLAGS_TO_LINK_STATE(pi2))
2177 						return (PHYINT_FAILURE);
2178 				}
2179 			}
2180 		}
2181 	}
2182 
2183 	/*
2184 	 * Change the group state to PG_FAILED if it's not already.
2185 	 */
2186 	if (!GROUP_FAILED(pg))
2187 		phyint_group_chstate(pg, PG_FAILED);
2188 
2189 	return (GROUP_FAILURE);
2190 }
2191 
2192 /*
2193  * Return the information associated with consecutive probe successes
2194  * starting with the most recent probe. At most the last 2 probes can be
2195  * in the unacknowledged state. All previous probes have either failed
2196  * or succeeded.
2197  */
2198 static void
2199 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2200     struct probe_success_count *psinfo)
2201 {
2202 	uint_t	i;
2203 	struct probe_stats *pr_statp;
2204 	uint_t most_recent;
2205 	uint_t second_most_recent;
2206 	boolean_t pi_found_failure = _B_FALSE;
2207 	boolean_t tg_found_failure = _B_FALSE;
2208 	uint_t now;
2209 	uint_t timeout;
2210 	struct target *tg;
2211 
2212 	if (debug & D_FAILOVER)
2213 		logdebug("probe_success_info(%s)\n", pii->pii_name);
2214 
2215 	bzero(psinfo, sizeof (*psinfo));
2216 	now = getcurrenttime();
2217 
2218 	/*
2219 	 * Start with the most recent probe, and count the number
2220 	 * of consecutive probe successes. Latch the number of successes
2221 	 * on hitting a failure.
2222 	 */
2223 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2224 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2225 
2226 	for (i = most_recent; i != pii->pii_probe_next;
2227 	    i = PROBE_INDEX_PREV(i)) {
2228 		pr_statp = &pii->pii_probes[i];
2229 
2230 		switch (pr_statp->pr_status) {
2231 		case PR_UNACKED:
2232 			/*
2233 			 * Only the most recent 2 probes can be unacknowledged
2234 			 */
2235 			assert(i == most_recent || i == second_most_recent);
2236 
2237 			tg = pr_statp->pr_target;
2238 			assert(tg != NULL);
2239 			/*
2240 			 * The crtt could be zero for some reason,
2241 			 * Eg. the phyint could be failed. If the crtt is
2242 			 * not available use the value of the group's probe
2243 			 * interval which is a worst case estimate.
2244 			 */
2245 			if (tg->tg_crtt != 0) {
2246 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2247 			} else {
2248 				timeout = pr_statp->pr_time_sent +
2249 				    pii->pii_phyint->pi_group->pg_probeint;
2250 			}
2251 
2252 			if (TIME_LT(timeout, now)) {
2253 				/*
2254 				 * We hit a failure. Latch the total number of
2255 				 * recent consecutive successes.
2256 				 */
2257 				pr_statp->pr_time_lost = timeout;
2258 				pr_statp->pr_status = PR_LOST;
2259 				pi_found_failure = _B_TRUE;
2260 				if (cur_tg != NULL && tg == cur_tg) {
2261 					/*
2262 					 * We hit a failure for the desired
2263 					 * target. Latch the number of recent
2264 					 * consecutive successes for this target
2265 					 */
2266 					tg_found_failure = _B_TRUE;
2267 				}
2268 			}
2269 			break;
2270 
2271 		case PR_ACKED:
2272 			/*
2273 			 * Bump up the count of probe successes, if we
2274 			 * have not seen any failure so far.
2275 			 */
2276 			if (!pi_found_failure)
2277 				psinfo->ps_nsucc++;
2278 
2279 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2280 			    !tg_found_failure) {
2281 				psinfo->ps_nsucc_tg++;
2282 			}
2283 
2284 			/*
2285 			 * Record the time of last success, if this is
2286 			 * the most recent probe success.
2287 			 */
2288 			if (!psinfo->ps_tls_valid) {
2289 				psinfo->ps_tls = pr_statp->pr_time_acked;
2290 				psinfo->ps_tls_valid = _B_TRUE;
2291 			}
2292 			break;
2293 
2294 		case PR_LOST:
2295 			/*
2296 			 * We hit a failure. Latch the total number of
2297 			 * recent consecutive successes.
2298 			 */
2299 			pi_found_failure = _B_TRUE;
2300 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2301 				/*
2302 				 * We hit a failure for the desired target.
2303 				 * Latch the number of recent consecutive
2304 				 * successes for this target
2305 				 */
2306 				tg_found_failure = _B_TRUE;
2307 			}
2308 			break;
2309 
2310 		default:
2311 			return;
2312 
2313 		}
2314 	}
2315 }
2316 
2317 /*
2318  * Return the information associated with consecutive probe failures
2319  * starting with the most recent probe. Only the last 2 probes can be in the
2320  * unacknowledged state. All previous probes have either failed or succeeded.
2321  */
2322 static void
2323 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2324     struct probe_fail_count *pfinfo)
2325 {
2326 	int	i;
2327 	struct probe_stats *pr_statp;
2328 	boolean_t	tg_found_success = _B_FALSE;
2329 	boolean_t	pi_found_success = _B_FALSE;
2330 	int	most_recent;
2331 	int	second_most_recent;
2332 	uint_t	now;
2333 	uint_t	timeout;
2334 	struct	target *tg;
2335 
2336 	if (debug & D_FAILOVER)
2337 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2338 
2339 	bzero(pfinfo, sizeof (*pfinfo));
2340 	now = getcurrenttime();
2341 
2342 	/*
2343 	 * Start with the most recent probe, and count the number
2344 	 * of consecutive probe failures. Latch the number of failures
2345 	 * on hitting a probe success.
2346 	 */
2347 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2348 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2349 
2350 	for (i = most_recent; i != pii->pii_probe_next;
2351 	    i = PROBE_INDEX_PREV(i)) {
2352 		pr_statp = &pii->pii_probes[i];
2353 
2354 		assert(PR_STATUS_VALID(pr_statp->pr_status));
2355 
2356 		switch (pr_statp->pr_status) {
2357 		case PR_UNACKED:
2358 			/*
2359 			 * Only the most recent 2 probes can be unacknowledged
2360 			 */
2361 			assert(i == most_recent || i == second_most_recent);
2362 
2363 			tg = pr_statp->pr_target;
2364 			/*
2365 			 * Target is guaranteed to exist in the unack. state
2366 			 */
2367 			assert(tg != NULL);
2368 			/*
2369 			 * The crtt could be zero for some reason,
2370 			 * Eg. the phyint could be failed. If the crtt is
2371 			 * not available use the group's probe interval,
2372 			 * which is a worst case estimate.
2373 			 */
2374 			if (tg->tg_crtt != 0) {
2375 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2376 			} else {
2377 				timeout = pr_statp->pr_time_sent +
2378 				    pii->pii_phyint->pi_group->pg_probeint;
2379 			}
2380 
2381 			if (TIME_GT(timeout, now))
2382 				break;
2383 
2384 			pr_statp->pr_time_lost = timeout;
2385 			pr_statp->pr_status = PR_LOST;
2386 			/* FALLTHRU */
2387 
2388 		case PR_LOST:
2389 			if (!pi_found_success) {
2390 				pfinfo->pf_nfail++;
2391 				pfinfo->pf_tff = pr_statp->pr_time_lost;
2392 			}
2393 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2394 			    !tg_found_success)  {
2395 				pfinfo->pf_nfail_tg++;
2396 			}
2397 			break;
2398 
2399 		default:
2400 			/*
2401 			 * We hit a success or unused slot. Latch the
2402 			 * total number of recent consecutive failures.
2403 			 */
2404 			pi_found_success = _B_TRUE;
2405 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2406 				/*
2407 				 * We hit a success for the desired target.
2408 				 * Latch the number of recent consecutive
2409 				 * failures for this target
2410 				 */
2411 				tg_found_success = _B_TRUE;
2412 			}
2413 		}
2414 	}
2415 }
2416 
2417 /*
2418  * Check if the phyint has been repaired.  If no test address has been
2419  * configured, then consider the interface repaired if the link is up (unless
2420  * the link is flapping; see below).  Otherwise, look for proof of probes
2421  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2422  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2423  */
2424 static boolean_t
2425 phyint_repaired(struct phyint *pi)
2426 {
2427 	struct	probe_success_count psinfo;
2428 	struct	phyint_instance *pii;
2429 	struct	target *cur_tg;
2430 	int	pr_ndx;
2431 	uint_t	cur_time;
2432 
2433 	if (debug & D_FAILOVER)
2434 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2435 
2436 	if (LINK_DOWN(pi))
2437 		return (_B_FALSE);
2438 
2439 	/*
2440 	 * If we don't have any test addresses and the link is up, then
2441 	 * consider the interface repaired, unless we've received more than
2442 	 * LINK_UP_PERMIN link up notifications in the last minute, in
2443 	 * which case we keep the link down until we drop back below
2444 	 * the threshold.
2445 	 */
2446 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2447 		cur_time = getcurrenttime();
2448 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2449 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2450 			pi->pi_lfmsg_printed = 0;
2451 			return (_B_TRUE);
2452 		}
2453 		if (!pi->pi_lfmsg_printed) {
2454 			logerr("The link has come up on %s more than %d times "
2455 			    "in the last minute; disabling failback until it "
2456 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2457 			pi->pi_lfmsg_printed = 1;
2458 		}
2459 
2460 		return (_B_FALSE);
2461 	}
2462 
2463 	pii = pi->pi_v4;
2464 	if (PROBE_CAPABLE(pii)) {
2465 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2466 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2467 		probe_success_info(pii, cur_tg, &psinfo);
2468 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2469 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2470 			return (_B_TRUE);
2471 	}
2472 
2473 	pii = pi->pi_v6;
2474 	if (PROBE_CAPABLE(pii)) {
2475 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2476 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2477 		probe_success_info(pii, cur_tg, &psinfo);
2478 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2479 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2480 			return (_B_TRUE);
2481 	}
2482 
2483 	return (_B_FALSE);
2484 }
2485 
2486 /*
2487  * Try failover from phyint 'pi' to a suitable destination.
2488  */
2489 int
2490 try_failover(struct phyint *pi, int failover_type)
2491 {
2492 	struct phyint *dst;
2493 	int err;
2494 
2495 	if (debug & D_FAILOVER)
2496 		logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
2497 
2498 	/*
2499 	 * Attempt to find a failover destination 'dst'.
2500 	 * dst will be null if any of the following is true
2501 	 * Phyint is not part of a group  OR
2502 	 * Phyint is the only member of a group OR
2503 	 * No suitable failover dst was available
2504 	 */
2505 	dst = get_failover_dst(pi, failover_type);
2506 	if (dst == NULL)
2507 		return (IPMP_EMINRED);
2508 
2509 	dst->pi_empty = 0;			/* Per state diagram */
2510 	pi->pi_full = 0;			/* Per state diagram */
2511 
2512 	err = failover(pi, dst);
2513 
2514 	if (debug & D_FAILOVER) {
2515 		logdebug("failed over from %s to %s ret %d\n",
2516 		    pi->pi_name, dst->pi_name, err);
2517 	}
2518 	if (err == 0) {
2519 		pi->pi_empty = 1;		/* Per state diagram */
2520 		/*
2521 		 * we don't want to print out this message if a
2522 		 * phyint is leaving the group, nor for failover from
2523 		 * standby
2524 		 */
2525 		if (failover_type == FAILOVER_NORMAL) {
2526 			logerr("Successfully failed over from NIC %s to NIC "
2527 			    "%s\n", pi->pi_name, dst->pi_name);
2528 		}
2529 		return (0);
2530 	} else {
2531 		/*
2532 		 * The failover did not succeed. We must retry the failover
2533 		 * only after resyncing our state based on the kernel's.
2534 		 * For eg. either the src or the dst might have been unplumbed
2535 		 * causing this failure. initifs() will be called again,
2536 		 * from main, since full_scan_required has been set to true
2537 		 * by failover();
2538 		 */
2539 		return (IPMP_FAILURE);
2540 	}
2541 }
2542 
/*
 * global_errno captures the errno value, if failover() or failback()
 * fails. This is sent to if_mpadm(1M).
 *
 * Both functions store errno here immediately after their ioctl() reports
 * an error; neither of them resets it when the ioctl() succeeds, so the
 * value is only meaningful right after one of them has returned failure.
 */
int global_errno;
2548 
2549 /*
2550  * Attempt failover from phyint 'from' to phyint 'to'.
2551  * IP moves everything from phyint 'from' to phyint 'to'.
2552  */
2553 static int
2554 failover(struct phyint *from, struct phyint *to)
2555 {
2556 	struct	lifreq	lifr;
2557 	int 	ret;
2558 
2559 	if (debug & D_FAILOVER) {
2560 		logdebug("failing over from %s to %s\n",
2561 		    from->pi_name, to->pi_name);
2562 	}
2563 
2564 	/*
2565 	 * Perform the failover. Both IPv4 and IPv6 are failed over
2566 	 * using a single ioctl by passing in AF_UNSPEC family.
2567 	 */
2568 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2569 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2570 	lifr.lifr_movetoindex = to->pi_ifindex;
2571 
2572 	ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
2573 	if (ret < 0) {
2574 		global_errno = errno;
2575 		logperror("failover: ioctl (failover)");
2576 	}
2577 
2578 	/*
2579 	 * Set full_scan_required to true. This will make us read
2580 	 * the state from the kernel in initifs() and update our tables,
2581 	 * to reflect the current state after the failover. If the
2582 	 * failover has failed it will then reissue the failover.
2583 	 */
2584 	full_scan_required = _B_TRUE;
2585 	return (ret);
2586 }
2587 
2588 /*
2589  * phyint 'pi' has recovered. Attempt failback from every phyint in the same
2590  * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
2591  * Return values:
2592  * IPMP_SUCCESS:		Failback successful from each of the other
2593  *				phyints in the group.
2594  * IPMP_EFBPARTIAL: 		Failback successful from some of the other
2595  *				phyints in the group.
2596  * IPMP_FAILURE:		Failback syscall failed with some error.
2597  *
2598  * Note that failback is attempted regardless of the setting of the
2599  * failback_enabled flag.
2600  */
2601 int
2602 do_failback(struct phyint *pi, boolean_t check_only)
2603 {
2604 	struct  phyint *from;
2605 	boolean_t done;
2606 	boolean_t partial;
2607 	boolean_t attempted_failback = _B_FALSE;
2608 
2609 	if (debug & D_FAILOVER)
2610 		logdebug("do_failback(%s)\n", pi->pi_name);
2611 
2612 	/* If this phyint is not part of a named group, return. */
2613 	if (pi->pi_group == phyint_anongroup) {
2614 		pi->pi_full = 1;
2615 		return (IPMP_SUCCESS);
2616 	}
2617 
2618 	/*
2619 	 * Attempt failback from every phyint in the group to 'pi'.
2620 	 * The reason for doing this, instead of only from the
2621 	 * phyint to which we did the failover is given below.
2622 	 *
2623 	 * After 'pi' failed, if any app. tries to join on a multicast
2624 	 * address (IPv6), on the failed phyint, IP picks any arbitrary
2625 	 * non-failed phyint in the group, instead of the failed phyint,
2626 	 * in.mpathd is not aware of this. Thus failing back only from the
2627 	 * interface to which 'pi' failed over, will failback the ipif's
2628 	 * but not the ilm's. So we need to failback from all members of
2629 	 * the phyint group
2630 	 */
2631 	done = _B_TRUE;
2632 	partial = _B_FALSE;
2633 	for (from = pi->pi_group->pg_phyint; from != NULL;
2634 	    from = from->pi_pgnext) {
2635 		/* Exclude ourself as a failback src */
2636 		if (from == pi)
2637 			continue;
2638 
2639 		/*
2640 		 * If the 'from' phyint has IPv4 plumbed, the 'to'
2641 		 * phyint must also have IPv4 plumbed. Similar check
2642 		 * for IPv6. IP makes the same check. Otherwise the
2643 		 * failback will fail.
2644 		 */
2645 		if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
2646 		    (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
2647 			partial = _B_TRUE;
2648 			continue;
2649 		}
2650 
2651 		if (!check_only) {
2652 			pi->pi_empty = 0;	/* Per state diagram */
2653 			attempted_failback = _B_TRUE;
2654 			if (failback(from, pi) != 0) {
2655 				done = _B_FALSE;
2656 				break;
2657 			}
2658 		}
2659 	}
2660 
2661 	if (check_only) {
2662 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2663 	}
2664 
2665 	/*
2666 	 * We are done. No more phyint from which we can src the failback
2667 	 */
2668 	if (done) {
2669 		if (!partial)
2670 			pi->pi_full = 1;	/* Per state diagram */
2671 		/*
2672 		 * Don't print out a message unless there is a
2673 		 * transition from FAILED to RUNNING. For eg.
2674 		 * we don't want to print out this message if a
2675 		 * phyint is leaving the group, or at startup
2676 		 */
2677 		if (attempted_failback && (pi->pi_flags &
2678 		    (IFF_FAILED | IFF_OFFLINE))) {
2679 			logerr("Successfully failed back to NIC %s\n",
2680 			    pi->pi_name);
2681 		}
2682 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2683 	}
2684 
2685 	return (IPMP_FAILURE);
2686 }
2687 
2688 /*
2689  * This function is similar to do_failback() above, but respects the
2690  * failback_enabled flag for phyints in named groups.
2691  */
2692 int
2693 try_failback(struct phyint *pi, boolean_t check_only)
2694 {
2695 	if (debug & D_FAILOVER)
2696 		logdebug("try_failback(%s)\n", pi->pi_name);
2697 
2698 	if (pi->pi_group != phyint_anongroup && !failback_enabled)
2699 		return (IPMP_EFBDISABLED);
2700 
2701 	return (do_failback(pi, check_only));
2702 }
2703 
2704 /*
2705  * Failback everything from phyint 'from' that has the same ifindex
2706  * as phyint to's ifindex.
2707  */
2708 static int
2709 failback(struct phyint *from, struct phyint *to)
2710 {
2711 	struct lifreq lifr;
2712 	int ret;
2713 
2714 	if (debug & D_FAILOVER)
2715 		logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
2716 
2717 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2718 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2719 	lifr.lifr_movetoindex = to->pi_ifindex;
2720 
2721 	ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
2722 	if (ret < 0) {
2723 		global_errno = errno;
2724 		logperror("failback: ioctl (failback)");
2725 	}
2726 
2727 	/*
2728 	 * Set full_scan_required to true. This will make us read
2729 	 * the state from the kernel in initifs() and update our tables,
2730 	 * to reflect the current state after the failback. If the
2731 	 * failback has failed it will then reissue the failback.
2732 	 */
2733 	full_scan_required = _B_TRUE;
2734 
2735 	return (ret);
2736 }
2737 
2738 /*
2739  * Select a target phyint for failing over from 'pi'.
2740  * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
2741  * target phyint is chosen as follows,
2742  *	1. Pick any inactive standby interface.
2743  *	2. If no inactive standby is available, select any phyint in the
2744  *	   same group that has the least number of logints, (excluding
2745  *	   IFF_NOFAILOVER and !IFF_UP logints)
2746  * If we are failing over from a standby, failover_type is
2747  * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
2748  * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
2749  * and we won't return NULL, as long as there is at least 1 other phyint
2750  * in the group.
2751  */
2752 static struct phyint *
2753 get_failover_dst(struct phyint *pi, int failover_type)
2754 {
2755 	struct phyint	*maybe = NULL;
2756 	struct phyint	*pi2;
2757 	struct phyint 	*last_choice = NULL;
2758 
2759 	if (pi->pi_group == phyint_anongroup)
2760 		return (NULL);
2761 
2762 	/*
2763 	 * Loop thru the phyints in the group, and pick the preferred
2764 	 * phyint for the target.
2765 	 */
2766 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2767 		/* Exclude ourself and offlined interfaces */
2768 		if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
2769 			continue;
2770 
2771 		/*
2772 		 * The chosen target phyint must have IPv4 instance
2773 		 * plumbed, if the src phyint has IPv4 plumbed. Similarly
2774 		 * for IPv6.
2775 		 */
2776 		if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
2777 		    (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
2778 			continue;
2779 
2780 		/* The chosen target must be PI_RUNNING. */
2781 		if (pi2->pi_state != PI_RUNNING) {
2782 			last_choice = pi2;
2783 			continue;
2784 		}
2785 
2786 		if ((pi2->pi_flags & IFF_INACTIVE) &&
2787 		    (failover_type != FAILOVER_TO_NONSTANDBY)) {
2788 			return (pi2);
2789 		} else {
2790 			if (maybe == NULL)
2791 				maybe = pi2;
2792 			else if (logint_upcount(pi2) < logint_upcount(maybe))
2793 				maybe = pi2;
2794 		}
2795 	}
2796 	if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
2797 		return (last_choice);
2798 	else
2799 		return (maybe);
2800 }
2801 
2802 /*
2803  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2804  */
2805 boolean_t
2806 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
2807 {
2808 	int ifsock;
2809 	struct lifreq lifr;
2810 
2811 	if (debug & D_FAILOVER) {
2812 		logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
2813 		    pi->pi_name, flags, (int)setfl);
2814 	}
2815 
2816 	if (pi->pi_v4 != NULL) {
2817 		ifsock = ifsock_v4;
2818 	} else  {
2819 		ifsock = ifsock_v6;
2820 	}
2821 
2822 	/*
2823 	 * Get the current flags from the kernel, and set/clear the
2824 	 * desired phyint flags. Since we set only phyint flags, we can
2825 	 * do it on either IPv4 or IPv6 instance.
2826 	 */
2827 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2828 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
2829 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2830 		if (errno != ENXIO)
2831 			logperror("change_lif_flags: ioctl (get flags)");
2832 		return (_B_FALSE);
2833 	}
2834 	if (setfl)
2835 		lifr.lifr_flags |= flags;
2836 	else
2837 		lifr.lifr_flags &= ~flags;
2838 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2839 		if (errno != ENXIO)
2840 			logperror("change_lif_flags: ioctl (set flags)");
2841 		return (_B_FALSE);
2842 	}
2843 
2844 	/*
2845 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2846 	 * phyint flags.
2847 	 */
2848 	if (setfl)
2849 		pi->pi_flags |= flags;
2850 	else
2851 		pi->pi_flags &= ~flags;
2852 
2853 	if (pi->pi_v4)
2854 		pi->pi_v4->pii_flags = pi->pi_flags;
2855 
2856 	if (pi->pi_v6)
2857 		pi->pi_v6->pii_flags = pi->pi_flags;
2858 
2859 	return (_B_TRUE);
2860 }
2861 
/*
 * icmp cksum computation for IPv4.
 *
 * Computes the 16-bit one's complement of the one's complement sum of
 * the 'len' bytes starting at 'addr'. The data is read as host-order
 * 16-bit words; a trailing odd byte is treated as the first byte of a
 * final word whose second byte is zero.
 *
 * Returns the checksum in the low 16 bits of the result (0 .. 0xffff).
 * A 'len' of zero or less yields 0xffff.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	ushort_t	*wordp = addr;
	ushort_t	odd_byte = 0;
	uint64_t	sum = 0;
	int		nleft = len;

	/*
	 * Accumulate sequential 16 bit words. A 64 bit unsigned
	 * accumulator cannot overflow for any 'len' an int can hold, so
	 * no carries are lost however large the buffer is.
	 */
	while (nleft > 1) {
		sum += *wordp++;
		nleft -= 2;
	}

	/* mop up an odd byte, if necessary */
	if (nleft == 1) {
		*(uchar_t *)(&odd_byte) = *(uchar_t *)wordp;
		sum += odd_byte;
	}

	/*
	 * Fold the carries above bit 15 back into the low 16 bits until
	 * none remain (end-around carry).
	 */
	while ((sum >> 16) != 0)
		sum = (sum >> 16) + (sum & 0xffff);

	/* one's complement, truncated to 16 bits */
	return ((ushort_t)~sum);
}
2899 
2900 static void
2901 reset_snxt_basetimes(void)
2902 {
2903 	struct phyint_instance *pii;
2904 
2905 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2906 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2907 	}
2908 }
2909 
2910 /*
2911  * Is the address one of our own addresses? Unfortunately,
2912  * we cannot check our phyint tables to determine if the address
2913  * is our own. This is because, we don't track interfaces that
2914  * are not part of any group. We have to either use a 'bind' or
2915  * get the complete list of all interfaces using SIOCGLIFCONF,
2916  * to do this check. We choose to use 'bind'. We could use
2917  * SIOCTMYADDR, but bind is preferred, since it is stronger.
2918  * SIOCTMYADDR excludes down interfaces, while bind includes even
2919  * down interfaces.
2920  */
2921 boolean_t
2922 own_address(int af, struct in6_addr addr)
2923 {
2924 	int sock;
2925 	boolean_t ours = _B_TRUE;
2926 
2927 	sock = socket(AF_INET6, SOCK_DGRAM, 0);
2928 	if (sock  == -1) {
2929 		logperror("own_address: socket");
2930 		/*
2931 		 * If the socket call fails, err on the side of caution,
2932 		 * and return true.
2933 		 */
2934 	} else {
2935 		struct sockaddr_in6 sin6;
2936 
2937 		(void) memset(&sin6, 0, sizeof (struct sockaddr_in6));
2938 		sin6.sin6_family = AF_INET6;
2939 		sin6.sin6_addr = addr;
2940 		/*
2941 		 * If the bind succeeds, then this address is one of our
2942 		 * addresses.
2943 		 * If bind returns error EADDRNOTAVAIL, the address is
2944 		 * not one of ours.
2945 		 * If bind returns an error other than EADDRNOTAVAIL, err
2946 		 * on the side of caution and report the address as one of
2947 		 * our own.
2948 		 */
2949 		if (bind(sock, (struct sockaddr *)&sin6,
2950 		    sizeof (struct sockaddr_in6)) == -1) {
2951 			if (errno == EADDRNOTAVAIL)
2952 				ours = _B_FALSE;
2953 			else
2954 				logperror("own_address: bind");
2955 		}
2956 		(void) close(sock);
2957 	}
2958 	if (debug & D_TARGET) {
2959 		char abuf[INET6_ADDRSTRLEN];
2960 
2961 		logdebug("own_address: addr %s is %s ours\n",
2962 		    pr_addr(af, addr, abuf, sizeof (abuf)),
2963 		    ours ? "one of" : "not");
2964 	}
2965 	return (ours);
2966 }
2967