1 /*
2  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Copyright (c) 1987 Regents of the University of California.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms are permitted
11  * provided that the above copyright notice and this paragraph are
12  * duplicated in all such forms and that any documentation,
13  * advertising materials, and other materials related to such
14  * distribution and use acknowledge that the software was developed
15  * by the University of California, Berkeley. The name of the
16  * University may not be used to endorse or promote products derived
17  * from this software without specific prior written permission.
18  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21  */
22 
#include <strings.h>

#include "mpd_defs.h"
#include "mpd_tables.h"
25 
/*
 * Probe types for probe().  probe() writes the selected value, converted
 * with htonl(), into the pr_icmp_mtype field of each outgoing packet.
 * in_data() and in6_data() compare the same field of every incoming echo
 * reply against these values to choose the reply handler.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

/*
 * NOTE(review): MILLISEC is defined outside this section; the comment below
 * assumes it is the number of milliseconds in one second -- confirm.
 */
#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */
34 
/*
 * Layout of a probe and of a probe response.  On the wire this is an ICMP
 * Echo request or an ICMP Echo reply; the identical layout is used for both
 * IPv4 and IPv6.  probe() stores the multi-byte fields in network byte
 * order (pr_icmp_id is already kept in network order by its owner).
 */
struct pr_icmp {
	/* Standard ICMP echo header */
	uint8_t		pr_icmp_type;		/* type field */
	uint8_t		pr_icmp_code;		/* code field */
	uint16_t	pr_icmp_cksum;		/* checksum field */
	uint16_t	pr_icmp_id;		/* Identification */
	uint16_t	pr_icmp_seq;		/* sequence number */

	/* Echo payload defined by this daemon */
	uint64_t	pr_icmp_timestamp;	/* Time stamp (in ns) */
	uint32_t	pr_icmp_mtype;		/* Message type (PROBE_*) */
};
49 
50 static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
51 				    0x0, 0x0, 0x0, 0x0,
52 				    0x0, 0x0, 0x0, 0x0,
53 				    0x0, 0x0, 0x0, 0x1 } };
54 
55 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
56 
57 static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
58 
59 static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
60     int cmsg_type);
61 static void		pi_set_crtt(struct target *tg, int64_t m,
62     boolean_t is_probe_uni);
63 static void		incoming_echo_reply(struct phyint_instance *pii,
64     struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
65 static void		incoming_rtt_reply(struct phyint_instance *pii,
66     struct pr_icmp *reply, struct in6_addr fromaddr);
67 static void		incoming_mcast_reply(struct phyint_instance *pii,
68     struct pr_icmp *reply, struct in6_addr fromaddr);
69 
70 static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
71 static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
72 static boolean_t	check_exception_target(struct phyint_instance *pii,
73     struct target *target);
74 static void		probe_fail_info(struct phyint_instance *pii,
75     struct target *cur_tg, struct probe_fail_count *pfinfo);
76 static void		probe_success_info(struct phyint_instance *pii,
77     struct target *cur_tg, struct probe_success_count *psinfo);
78 static boolean_t	phyint_repaired(struct phyint *pi);
79 
80 static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
81 static int 		in_cksum(ushort_t *addr, int len);
82 static void		reset_snxt_basetimes(void);
83 static int		ns2ms(int64_t ns);
84 static int64_t		tv2ns(struct timeval *);
85 
86 /*
87  * CRTT - Conservative Round Trip Time Estimate
88  * Probe success - A matching probe reply received before CRTT ms has elapsed
89  *	after sending the probe.
90  * Probe failure - No probe reply received and more than CRTT ms has elapsed
91  *	after sending the probe.
92  *
93  * TLS - Time last success. Most recent probe ack received at this time.
94  * TFF - Time first fail. The time of the earliest probe failure in
95  *	a consecutive series of probe failures.
96  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
97  * 	before declaring phyint repair.
98  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
99  *	declare a phyint failure.
100  *
101  * 			Phyint state diagram
102  *
103  * The state of a phyint that is capable of being probed, is completely
104  * specified by the 3-tuple <pi_state, pg_state, I>.
105  *
106  * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
107  * IFF_OFFLINE is set.  If the phyint is also configured with a test address
108  * (the common case) and probe targets, then a phyint must also successfully
109  * be able to send and receive probes in order to remain in the PI_RUNNING
110  * state (otherwise, it transitions to PI_FAILED).
111  *
112  * Further, if a PI_RUNNING phyint is configured with a test address but is
113  * unable to find any probe targets, it will transition to the PI_NOTARGETS
114  * state, which indicates that the link is apparently functional but that
115  * in.mpathd is unable to send probes to verify functionality (in this case,
116  * in.mpathd makes the optimistic assumption that the interface is working
117  * correctly and thus does not mark the interface FAILED, but reports it as
118  * IPMP_IF_UNKNOWN through the async events and query interfaces).
119  *
120  * At any point, a phyint may be administratively marked offline via if_mpadm.
121  * In this case, the interface always transitions to PI_OFFLINE, regardless
122  * of its previous state.  When the interface is later brought back online,
123  * in.mpathd acts as if the interface is new (and thus it transitions to
124  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
125  * its probes, if probes are sent).
126  *
127  * pi_state -  PI_RUNNING or PI_FAILED
128  *	PI_RUNNING: The failure detection logic says the phyint is good.
129  *	PI_FAILED: The failure detection logic says the phyint has failed.
130  *
131  * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
132  *	PG_OK: All interfaces in the group are OK.
133  *	PG_DEGRADED: Some interfaces in the group are unusable.
134  *	PG_FAILED: All interfaces in the group are unusable.
135  *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts, to reconstruct the
 *	target list. So the phyints are in the PI_NOTARGETS state.
141  *
142  * I -	value of (pi_flags & IFF_INACTIVE)
143  *	IFF_INACTIVE: This phyint will not send or receive packets.
144  *	Usually, inactive is tied to standby interfaces that are not yet
145  *	needed (e.g., no non-standby interfaces in the group have failed).
146  *	When failback has been disabled (FAILBACK=no configured), phyint can
147  *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
148  *	subsequently recovers after a failure.
149  *
150  * Not all 9 possible combinations of the above 3-tuple are possible.
151  *
152  * I is tracked by IP. pi_state is tracked by mpathd.
153  *
154  *			pi_state state machine
155  * ---------------------------------------------------------------------------
156  *	Event			State			New State
157  *				Action:
158  * ---------------------------------------------------------------------------
159  *	IP interface failure	(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
160  *	detection		: set IFF_FAILED on this phyint
161  *
162  *	IP interface failure	(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
163  *	detection		: set IFF_FAILED on this phyint
164  *
165  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=yes)
166  *	detection				     -> (PI_RUNNING, I == 0)
167  *				: clear IFF_FAILED on this phyint
168  *
169  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=no)
170  *	detection				     ->	(PI_RUNNING, I == 1)
171  *				: clear IFF_FAILED on this phyint
172  *				: if failback is disabled set I == 1
173  *
174  *	Group failure		(perform on all phyints in the group)
175  *	detection 		PI_RUNNING		PI_FAILED
176  *	(Router targets)	: set IFF_FAILED
177  *
178  *	Group failure		(perform on all phyints in the group)
179  *	detection 		PI_RUNNING		PI_NOTARGETS
180  *	(Host targets)		: set IFF_FAILED
181  *				: delete the target list on all phyints
182  * ---------------------------------------------------------------------------
183  */
184 
/*
 * Global, externally visible instance of struct probes_missed.  Being a
 * file-scope object without an initializer, it starts out zero-filled.
 * NOTE(review): the structure's definition and all of its users lie outside
 * this section of the file -- presumably it records probes that could not
 * be sent on schedule; confirm against the project headers.
 */
struct probes_missed probes_missed;
186 
187 /*
188  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
189  * will be added on by the kernel.  The id field identifies this phyint.
190  * and the sequence number is an increasing (modulo 2^^16) integer. The data
191  * portion holds the time value when the packet is sent. On echo this is
192  * extracted to compute the round-trip time. Three different types of
193  * probe packets are used.
194  *
195  * PROBE_UNI: This type is used to do failure detection / failure recovery
196  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
197  *	not less than the current CRTT. pii_probes[] stores data
198  *	about these probes. These packets consume sequence number space.
199  *
200  * PROBE_RTT: This type is used to make only rtt measurements. Normally these
201  * 	are not used. Under heavy network load, the rtt may go up very high,
202  *	due to a spike, or may appear to go high, due to extreme scheduling
203  * 	delays. Once the network stress is removed, mpathd takes long time to
204  *	recover, because the probe_interval is already high, and it takes
205  *	a long time to send out sufficient number of probes to bring down the
206  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
207  *	user_probe_interval ms. and will cause only rtt updates. These packets
208  *	do not consume sequence number space nor is information about these
209  *	packets stored in the pii_probes[]
210  *
211  * PROBE_MULTI: This type is only used to construct a list of targets, when
212  *	no targets are known. The packet is multicast to the all hosts addr.
213  */
214 static void
215 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
216 {
217 	hrtime_t sent_hrtime;
218 	struct timeval sent_tv;
219 	struct pr_icmp probe_pkt;	/* Probe packet */
220 	struct sockaddr_storage targ;	/* target address */
221 	uint_t	targaddrlen;		/* targed address length */
222 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
223 	boolean_t sent = _B_FALSE;
224 	int	rval;
225 
226 	if (debug & D_TARGET) {
227 		logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
228 		    pii->pii_name, probe_type, start_hrtime);
229 	}
230 
231 	assert(pii->pii_probe_sock != -1);
232 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
233 	    probe_type == PROBE_RTT);
234 
235 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
236 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
237 	probe_pkt.pr_icmp_code = 0;
238 	probe_pkt.pr_icmp_cksum = 0;
239 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
240 
241 	/*
242 	 * Since there is no need to do arithmetic on the icmpid,
243 	 * (only equality check is done) pii_icmpid is stored in
244 	 * network byte order at initialization itself.
245 	 */
246 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
247 	probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
248 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
249 
250 	/*
251 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
252 	 * the all hosts address. Otherwise it is unicast to the next target.
253 	 */
254 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
255 	    pii->pii_rtt_target_next != NULL));
256 
257 	bzero(&targ, sizeof (targ));
258 	targ.ss_family = pii->pii_af;
259 
260 	if (pii->pii_af == AF_INET6) {
261 		struct in6_addr *addr6;
262 
263 		addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
264 		targaddrlen = sizeof (struct sockaddr_in6);
265 		if (probe_type == PROBE_MULTI) {
266 			*addr6 = all_nodes_mcast_v6;
267 		} else if (probe_type == PROBE_UNI) {
268 			*addr6 = pii->pii_target_next->tg_address;
269 		} else { /* type is PROBE_RTT */
270 			*addr6 = pii->pii_rtt_target_next->tg_address;
271 		}
272 	} else {
273 		struct in_addr *addr4;
274 
275 		addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
276 		targaddrlen = sizeof (struct sockaddr_in);
277 		if (probe_type == PROBE_MULTI) {
278 			*addr4 = all_nodes_mcast_v4;
279 		} else if (probe_type == PROBE_UNI) {
280 			IN6_V4MAPPED_TO_INADDR(
281 			    &pii->pii_target_next->tg_address, addr4);
282 		} else { /* type is PROBE_RTT */
283 			IN6_V4MAPPED_TO_INADDR(
284 			    &pii->pii_rtt_target_next->tg_address, addr4);
285 		}
286 
287 		/*
288 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
289 		 */
290 		probe_pkt.pr_icmp_cksum =
291 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
292 	}
293 
294 	/*
295 	 * Use the current time as the time we sent.  Not atomic, but the best
296 	 * we can do from here.
297 	 */
298 	sent_hrtime = gethrtime();
299 	(void) gettimeofday(&sent_tv, NULL);
300 	rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
301 	    (struct sockaddr *)&targ, targaddrlen);
302 	/*
303 	 * If the send would block, this may either be transient or a hang in a
304 	 * lower layer. We pretend the probe was actually sent, the daemon will
305 	 * not see a reply to the probe and will fail the interface if normal
306 	 * failure detection criteria are met.
307 	 */
308 	if (rval == sizeof (probe_pkt) ||
309 	    (rval == -1 && errno == EWOULDBLOCK)) {
310 		sent = _B_TRUE;
311 	} else {
312 		logperror_pii(pii, "probe: probe sendto");
313 	}
314 
315 	/*
316 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
317 	 * update our tables. We will need this info in processing the probe
318 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
319 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
320 	 * are only used to construct a list of targets. PROBE_RTT packets are
321 	 * used only for updating the rtt and not for failure detection.
322 	 */
323 	if (probe_type == PROBE_UNI && sent) {
324 		pr_ndx = pii->pii_probe_next;
325 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
326 
327 		/* Collect statistics, before we reuse the last slot. */
328 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
329 			pii->pii_cum_stats.lost++;
330 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
331 			pii->pii_cum_stats.acked++;
332 		pii->pii_cum_stats.sent++;
333 
334 		pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
335 		pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
336 		pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
337 		pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
338 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
339 		probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
340 
341 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
342 		pii->pii_target_next = target_next(pii->pii_target_next);
343 		assert(pii->pii_target_next != NULL);
344 		/*
345 		 * If we have a single variable to denote the next target to
346 		 * probe for both rtt probes and failure detection probes, we
347 		 * could end up with a situation where the failure detection
348 		 * probe targets become disjoint from the rtt probe targets.
349 		 * Eg. if 2 targets and the actual fdt is double the user
350 		 * specified fdt. So we have 2 variables. In this scheme
351 		 * we also reset pii_rtt_target_next for every fdt probe,
352 		 * though that may not be necessary.
353 		 */
354 		pii->pii_rtt_target_next = pii->pii_target_next;
355 		pii->pii_snxt++;
356 	} else if (probe_type == PROBE_RTT) {
357 		pii->pii_rtt_target_next =
358 		    target_next(pii->pii_rtt_target_next);
359 		assert(pii->pii_rtt_target_next != NULL);
360 	}
361 }
362 
363 /*
364  * Incoming IPv4 data from wire, is received here. Called from main.
365  */
366 void
367 in_data(struct phyint_instance *pii)
368 {
369 	struct	sockaddr_in 	from;
370 	struct	in6_addr	fromaddr;
371 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
372 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
373 	struct ip *ip;
374 	int 	iphlen;
375 	int 	len;
376 	char 	abuf[INET_ADDRSTRLEN];
377 	struct msghdr msg;
378 	struct iovec iov;
379 	struct pr_icmp *reply;
380 	struct timeval *recv_tvp;
381 
382 	if (debug & D_PROBE) {
383 		logdebug("in_data(%s %s)\n",
384 		    AF_STR(pii->pii_af), pii->pii_name);
385 	}
386 
387 	iov.iov_base = (char *)in_packet;
388 	iov.iov_len = sizeof (in_packet);
389 	msg.msg_iov = &iov;
390 	msg.msg_iovlen = 1;
391 	msg.msg_name = (struct sockaddr *)&from;
392 	msg.msg_namelen = sizeof (from);
393 	msg.msg_control = ancillary_data;
394 	msg.msg_controllen = sizeof (ancillary_data);
395 
396 	/*
397 	 * Poll has already told us that a message is waiting,
398 	 * on this socket. Read it now. We should not block.
399 	 */
400 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
401 		logperror_pii(pii, "in_data: recvmsg");
402 		return;
403 	}
404 
405 	/*
406 	 * If the datalink has indicated the link is down, don't go
407 	 * any further.
408 	 */
409 	if (LINK_DOWN(pii->pii_phyint))
410 		return;
411 
412 	/* Get the printable address for error reporting */
413 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
414 
415 	/* Ignore packets > 64k or control buffers that don't fit */
416 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
417 		if (debug & D_PKTBAD) {
418 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
419 			    msg.msg_flags, abuf);
420 		}
421 		return;
422 	}
423 
424 	/* Make sure packet contains at least minimum ICMP header */
425 	ip = (struct ip *)in_packet;
426 	iphlen = ip->ip_hl << 2;
427 	if (len < iphlen + ICMP_MINLEN) {
428 		if (debug & D_PKTBAD) {
429 			logdebug("in_data: packet too short (%d bytes)"
430 			    " from %s\n", len, abuf);
431 		}
432 		return;
433 	}
434 
435 	/*
436 	 * Subtract the IP hdr length, 'len' will be length of the probe
437 	 * reply, starting from the icmp hdr.
438 	 */
439 	len -= iphlen;
440 	/* LINTED */
441 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
442 
443 	/* Probe replies are icmp echo replies. Ignore anything else */
444 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
445 		return;
446 
447 	/*
448 	 * The icmp id should match what we sent, which is stored
449 	 * in pi_icmpid. The icmp code for reply must be 0.
450 	 * The reply content must be a struct pr_icmp
451 	 */
452 	if (reply->pr_icmp_id != pii->pii_icmpid) {
453 		/* Not in response to our probe */
454 		return;
455 	}
456 
457 	if (reply->pr_icmp_code != 0) {
458 		logtrace("probe reply code %d from %s on %s\n",
459 		    reply->pr_icmp_code, abuf, pii->pii_name);
460 		return;
461 	}
462 
463 	if (len < sizeof (struct pr_icmp)) {
464 		logtrace("probe reply too short: %d bytes from %s on %s\n",
465 		    len, abuf, pii->pii_name);
466 		return;
467 	}
468 
469 	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
470 	if (recv_tvp == NULL) {
471 		logtrace("message without timestamp from %s on %s\n",
472 		    abuf, pii->pii_name);
473 		return;
474 	}
475 
476 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
477 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
478 		/* Unicast probe reply */
479 		incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
480 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
481 		/* Multicast reply */
482 		incoming_mcast_reply(pii, reply, fromaddr);
483 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
484 		incoming_rtt_reply(pii, reply, fromaddr);
485 	} else {
486 		/* Probably not in response to our probe */
487 		logtrace("probe reply type: %d from %s on %s\n",
488 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
489 		return;
490 	}
491 }
492 
493 /*
494  * Incoming IPv6 data from wire is received here. Called from main.
495  */
496 void
497 in6_data(struct phyint_instance *pii)
498 {
499 	struct sockaddr_in6 from;
500 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
501 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
502 	int len;
503 	char abuf[INET6_ADDRSTRLEN];
504 	struct msghdr msg;
505 	struct iovec iov;
506 	void	*opt;
507 	struct	pr_icmp *reply;
508 	struct	timeval *recv_tvp;
509 
510 	if (debug & D_PROBE) {
511 		logdebug("in6_data(%s %s)\n",
512 		    AF_STR(pii->pii_af), pii->pii_name);
513 	}
514 
515 	iov.iov_base = (char *)in_packet;
516 	iov.iov_len = sizeof (in_packet);
517 	msg.msg_iov = &iov;
518 	msg.msg_iovlen = 1;
519 	msg.msg_name = (struct sockaddr *)&from;
520 	msg.msg_namelen = sizeof (from);
521 	msg.msg_control = ancillary_data;
522 	msg.msg_controllen = sizeof (ancillary_data);
523 
524 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
525 		logperror_pii(pii, "in6_data: recvmsg");
526 		return;
527 	}
528 
529 	/*
530 	 * If the datalink has indicated that the link is down, don't go
531 	 * any further.
532 	 */
533 	if (LINK_DOWN(pii->pii_phyint))
534 		return;
535 
536 	/* Get the printable address for error reporting */
537 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
538 	if (len < ICMP_MINLEN) {
539 		if (debug & D_PKTBAD) {
540 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
541 			    msg.msg_flags, abuf);
542 		}
543 		return;
544 	}
545 	/* Ignore packets > 64k or control buffers that don't fit */
546 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
547 		if (debug & D_PKTBAD) {
548 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
549 			    msg.msg_flags, abuf);
550 		}
551 		return;
552 	}
553 
554 	reply = (struct pr_icmp *)in_packet;
555 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
556 		return;
557 
558 	if (reply->pr_icmp_id != pii->pii_icmpid) {
559 		/* Not in response to our probe */
560 		return;
561 	}
562 
563 	/*
564 	 * The kernel has already verified the the ICMP checksum.
565 	 */
566 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
567 		logtrace("ICMPv6 echo reply source address not linklocal from "
568 		    "%s on %s\n", abuf, pii->pii_name);
569 		return;
570 	}
571 	opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
572 	if (opt != NULL) {
573 		/* Can't allow routing headers in probe replies  */
574 		logtrace("message with routing header from %s on %s\n",
575 		    abuf, pii->pii_name);
576 		return;
577 	}
578 
579 	if (reply->pr_icmp_code != 0) {
580 		logtrace("probe reply code: %d from %s on %s\n",
581 		    reply->pr_icmp_code, abuf, pii->pii_name);
582 		return;
583 	}
584 	if (len < (sizeof (struct pr_icmp))) {
585 		logtrace("probe reply too short: %d bytes from %s on %s\n",
586 		    len, abuf, pii->pii_name);
587 		return;
588 	}
589 
590 	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
591 	if (recv_tvp == NULL) {
592 		logtrace("message without timestamp from %s on %s\n",
593 		    abuf, pii->pii_name);
594 		return;
595 	}
596 
597 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
598 		incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
599 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
600 		incoming_mcast_reply(pii, reply, from.sin6_addr);
601 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
602 		incoming_rtt_reply(pii, reply, from.sin6_addr);
603 	} else  {
604 		/* Probably not in response to our probe */
605 		logtrace("probe reply type: %d from %s on %s\n",
606 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
607 	}
608 }
609 
610 /*
611  * Process the incoming rtt reply, in response to our rtt probe.
612  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
613  * have any stored information about the probe we sent. So we don't log
614  * any errors if we receive bad replies.
615  */
616 static void
617 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
618     struct in6_addr fromaddr)
619 {
620 	int64_t	m;		/* rtt measurement in ns */
621 	char	abuf[INET6_ADDRSTRLEN];
622 	struct	target	*target;
623 	struct 	phyint_group *pg;
624 
625 	/* Get the printable address for error reporting */
626 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
627 
628 	if (debug & D_PROBE) {
629 		logdebug("incoming_rtt_reply: %s %s %s\n",
630 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
631 	}
632 
633 	/* Do we know this target ? */
634 	target = target_lookup(pii, fromaddr);
635 	if (target == NULL)
636 		return;
637 
638 	m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
639 	/* Invalid rtt. It has wrapped around */
640 	if (m < 0)
641 		return;
642 
643 	/*
644 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
645 	 * The initial few responses after the interface is repaired may
646 	 * contain high rtt's because they could have been queued up waiting
647 	 * for ARP/NDP resolution on a failed interface.
648 	 */
649 	pg = pii->pii_phyint->pi_group;
650 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
651 		return;
652 
653 	/*
654 	 * Update rtt only if the new rtt is lower than the current rtt.
655 	 * (specified by the 3rd parameter to pi_set_crtt).
656 	 * If a spike has caused the current probe_interval to be >
657 	 * user_probe_interval, then this mechanism is used to bring down
658 	 * the rtt rapidly once the network stress is removed.
659 	 * If the new rtt is higher than the current rtt, we don't want to
660 	 * update the rtt. We are having more than 1 outstanding probe and
661 	 * the increase in rtt we are seeing is being unnecessarily weighted
662 	 * many times. The regular rtt update will be handled by
663 	 * incoming_echo_reply() and will take care of any rtt increase.
664 	 */
665 	pi_set_crtt(target, m, _B_FALSE);
666 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
667 	    (user_failure_detection_time < pg->pg_fdt) &&
668 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
669 		/*
670 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
671 		 * investigate if we can improve the failure detection time to
672 		 * meet whatever the user specified.
673 		 */
674 		if (check_pg_crtt_improved(pg)) {
675 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
676 			    user_failure_detection_time);
677 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
678 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
679 				logerr("Improved failure detection time %d ms "
680 				    "on (%s %s) for group \"%s\"\n",
681 				    pg->pg_fdt, AF_STR(pii->pii_af),
682 				    pii->pii_name,
683 				    pii->pii_phyint->pi_group->pg_name);
684 			}
685 			if (user_failure_detection_time == pg->pg_fdt) {
686 				/* Avoid any truncation or rounding errors */
687 				pg->pg_probeint = user_probe_interval;
688 				/*
689 				 * No more rtt probes will be sent. The actual
690 				 * fdt has dropped to the user specified value.
691 				 * pii_fd_snxt_basetime and pii_snxt_basetime
692 				 * will be in sync henceforth.
693 				 */
694 				reset_snxt_basetimes();
695 			}
696 		}
697 	}
698 }
699 
700 /*
701  * Process the incoming echo reply, in response to our unicast probe.
702  * Common for both IPv4 and IPv6
703  */
704 static void
705 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
706     struct in6_addr fromaddr, struct timeval *recv_tvp)
707 {
708 	int64_t	m;		/* rtt measurement in ns */
709 	hrtime_t cur_hrtime;	/* in ns from some arbitrary point */
710 	char	abuf[INET6_ADDRSTRLEN];
711 	int	pr_ndx;
712 	struct	target	*target;
713 	boolean_t exception;
714 	uint64_t pr_icmp_timestamp;
715 	uint16_t pr_icmp_seq;
716 	struct	probe_stats *pr_statp;
717 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
718 
719 	/* Get the printable address for error reporting */
720 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
721 
722 	if (debug & D_PROBE) {
723 		logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
724 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
725 		    ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
726 	}
727 
728 	pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
729 	pr_icmp_seq = ntohs(reply->pr_icmp_seq);
730 
731 	/* Reject out of window probe replies */
732 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
733 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
734 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
735 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
736 		pii->pii_cum_stats.unknown++;
737 		return;
738 	}
739 
740 	cur_hrtime = gethrtime();
741 	m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
742 	if (m < 0) {
743 		/*
744 		 * This is a ridiculously high value of rtt. rtt has wrapped
745 		 * around. Log a message, and ignore the rtt.
746 		 */
747 		logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
748 		    "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
749 	}
750 
751 	/*
752 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
753 	 * number in our pii->pii_probes[] array. The icmp sequence number
754 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
755 	 */
756 	pr_ndx = MOD_SUB(pii->pii_probe_next,
757 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
758 
759 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
760 
761 	target = pii->pii_probes[pr_ndx].pr_target;
762 
763 	/*
764 	 * Perform sanity checks, whether this probe reply that we
765 	 * have received is genuine
766 	 */
767 	if (target != NULL) {
768 		/*
769 		 * Compare the src. addr of the received ICMP or ICMPv6
770 		 * probe reply with the target address in our tables.
771 		 */
772 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
773 			/*
774 			 * We don't have any record of having sent a probe to
775 			 * this target. This is a fake probe reply. Log an error
776 			 */
777 			logtrace("probe status %d Fake probe reply seq %u "
778 			    "snxt %u on %s from %s\n",
779 			    pii->pii_probes[pr_ndx].pr_status,
780 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
781 			pii->pii_cum_stats.unknown++;
782 			return;
783 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
784 			/*
785 			 * The address matches, but our tables indicate that
786 			 * this probe reply has been acked already. So this
787 			 * is a duplicate probe reply. Log an error
788 			 */
789 			logtrace("probe status %d Duplicate probe reply seq %u "
790 			    "snxt %u on %s from %s\n",
791 			    pii->pii_probes[pr_ndx].pr_status,
792 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
793 			pii->pii_cum_stats.unknown++;
794 			return;
795 		}
796 	} else {
797 		/*
798 		 * Target must not be NULL in the PR_UNACKED state
799 		 */
800 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
801 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
802 			/*
803 			 * The probe stats slot is unused. So we didn't
804 			 * send out any probe to this target. This is a fake.
805 			 * Log an error.
806 			 */
807 			logtrace("probe status %d Fake probe reply seq %u "
808 			    "snxt %u on %s from %s\n",
809 			    pii->pii_probes[pr_ndx].pr_status,
810 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
811 		}
812 		pii->pii_cum_stats.unknown++;
813 		return;
814 	}
815 
816 	/*
817 	 * If the rtt does not appear to be right, don't update the
818 	 * rtt stats. This can happen if the system dropped into the
819 	 * debugger, or the system was hung or too busy for a
820 	 * substantial time that we didn't get a chance to run.
821 	 */
822 	if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
823 		/*
824 		 * If the probe corresponding to this received response
825 		 * was truly sent 'm' ns. ago, then this response must
826 		 * have been rejected by the sequence number checks. The
827 		 * fact that it has passed the sequence number checks
828 		 * means that the measured rtt is wrong. We were probably
829 		 * scheduled long after the packet was received.
830 		 */
831 		goto out;
832 	}
833 
834 	/*
835 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
836 	 * The initial few responses after the interface is repaired may
837 	 * contain high rtt's because they could have been queued up waiting
838 	 * for ARP/NDP resolution on a failed interface.
839 	 */
840 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
841 		goto out;
842 
	/*
	 * Don't update the Conservative Round Trip Time estimate for this
	 * (phyint, target) pair if this is not the highest ack seq seen
	 * thus far on this target.
	 */
848 	if (!highest_ack_tg(pr_icmp_seq, target))
849 		goto out;
850 
851 	/*
852 	 * Always update the rtt. This is a failure detection probe
853 	 * and we want to measure both increase / decrease in rtt.
854 	 */
855 	pi_set_crtt(target, m, _B_TRUE);
856 
857 	/*
858 	 * If the crtt exceeds the average time between probes,
859 	 * investigate if this slow target is an exception. If so we
860 	 * can avoid this target and still meet the failure detection
861 	 * time. Otherwise we can't meet the failure detection time.
862 	 */
863 	if (target->tg_crtt > pg->pg_probeint) {
864 		exception = check_exception_target(pii, target);
865 		if (exception) {
866 			/*
867 			 * This target is exceptionally slow. Don't use it
868 			 * for future probes. check_exception_target() has
869 			 * made sure that we have at least MIN_PROBE_TARGETS
870 			 * other active targets
871 			 */
872 			if (pii->pii_targets_are_routers) {
873 				/*
874 				 * This is a slow router, mark it as slow
875 				 * and don't use it for further probes. We
876 				 * don't delete it, since it will be populated
877 				 * again when we do a router scan. Hence we
878 				 * need to maintain extra state (unlike the
879 				 * host case below).  Mark it as TG_SLOW.
880 				 */
881 				if (target->tg_status == TG_ACTIVE)
882 					pii->pii_ntargets--;
883 				target->tg_status = TG_SLOW;
884 				target->tg_latime = gethrtime();
885 				target->tg_rtt_sa = -1;
886 				target->tg_crtt = 0;
887 				target->tg_rtt_sd = 0;
888 				if (pii->pii_target_next == target) {
889 					pii->pii_target_next =
890 					    target_next(target);
891 				}
892 			} else {
893 				/*
894 				 * the slow target is not a router, we can
895 				 * just delete it. Send an icmp multicast and
896 				 * pick the fastest responder that is not
897 				 * already an active target. target_delete()
898 				 * adjusts pii->pii_target_next
899 				 */
900 				target_delete(target);
901 				probe(pii, PROBE_MULTI, cur_hrtime);
902 			}
903 		} else {
904 			/*
905 			 * We can't meet the failure detection time.
906 			 * Log a message, and update the detection time to
907 			 * whatever we can achieve.
908 			 */
909 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
910 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
911 			last_fdt_bumpup_time = gethrtime();
912 			if (pg != phyint_anongroup) {
913 				logerr("Cannot meet requested failure detection"
914 				    " time of %d ms on (%s %s) new failure"
915 				    " detection time for group \"%s\" is %d"
916 				    " ms\n", user_failure_detection_time,
917 				    AF_STR(pii->pii_af), pii->pii_name,
918 				    pg->pg_name, pg->pg_fdt);
919 			}
920 		}
921 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
922 	    (user_failure_detection_time < pg->pg_fdt) &&
923 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
924 		/*
925 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
926 		 * investigate if we can improve the failure detection time to
927 		 * meet whatever the user specified.
928 		 */
929 		if (check_pg_crtt_improved(pg)) {
930 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
931 			    user_failure_detection_time);
932 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
933 			if (pg != phyint_anongroup) {
934 				logerr("Improved failure detection time %d ms "
935 				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
936 				    AF_STR(pii->pii_af), pii->pii_name,
937 				    pg->pg_name);
938 			}
939 			if (user_failure_detection_time == pg->pg_fdt) {
940 				/* Avoid any truncation or rounding errors */
941 				pg->pg_probeint = user_probe_interval;
942 				/*
943 				 * No more rtt probes will be sent. The actual
944 				 * fdt has dropped to the user specified value.
945 				 * pii_fd_snxt_basetime and pii_snxt_basetime
946 				 * will be in sync henceforth.
947 				 */
948 				reset_snxt_basetimes();
949 			}
950 		}
951 	}
952 out:
953 	pr_statp = &pii->pii_probes[pr_ndx];
954 	pr_statp->pr_hrtime_ackproc = cur_hrtime;
955 	pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
956 	    (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
957 
958 	probe_chstate(pr_statp, pii, PR_ACKED);
959 
960 	/*
961 	 * Update pii->pii_rack, i.e. the sequence number of the last received
962 	 * probe response, based on the echo reply we have received now, if
963 	 * either of the following conditions are satisfied.
964 	 * a. pii_rack is outside the current receive window of
965 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
966 	 *    This means we have not received probe responses for a
967 	 *    long time, and the sequence number has wrapped around.
968 	 * b. pii_rack is within the current receive window and this echo
969 	 *    reply corresponds to the highest sequence number we have seen
970 	 *    so far.
971 	 */
972 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
973 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
974 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
975 		pii->pii_rack = pr_icmp_seq;
976 	}
977 }
978 
979 /*
980  * Returns true if seq is the highest unacknowledged seq for target tg
981  * else returns false
982  */
983 static boolean_t
984 highest_ack_tg(uint16_t seq, struct target *tg)
985 {
986 	struct phyint_instance *pii;
987 	int	 pr_ndx;
988 	uint16_t pr_seq;
989 
990 	pii = tg->tg_phyint_inst;
991 
992 	/*
993 	 * Get the seq number of the most recent probe sent so far,
994 	 * and also get the corresponding probe index in the probe stats
995 	 * array.
996 	 */
997 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
998 	pr_seq = pii->pii_snxt;
999 	pr_seq--;
1000 
1001 	/*
1002 	 * Start from the most recent probe and walk back, trying to find
1003 	 * an acked probe corresponding to target tg.
1004 	 */
1005 	for (; pr_ndx != pii->pii_probe_next;
1006 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1007 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
1008 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1009 			if (SEQ_GT(pr_seq, seq))
1010 				return (_B_FALSE);
1011 		}
1012 	}
1013 	return (_B_TRUE);
1014 }
1015 
1016 /*
1017  * Check whether the crtt for the group has improved by a factor of
1018  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1019  * detection time flapping in the face of small crtt changes.
1020  */
1021 static boolean_t
1022 check_pg_crtt_improved(struct phyint_group *pg)
1023 {
1024 	struct	phyint *pi;
1025 
1026 	if (debug & D_PROBE)
1027 		logdebug("check_pg_crtt_improved()\n");
1028 
1029 	/*
1030 	 * The crtt for the group is only improved if each phyint_instance
1031 	 * for both ipv4 and ipv6 is improved.
1032 	 */
1033 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1034 		if (!check_pii_crtt_improved(pi->pi_v4) ||
1035 		    !check_pii_crtt_improved(pi->pi_v6))
1036 			return (_B_FALSE);
1037 	}
1038 
1039 	return (_B_TRUE);
1040 }
1041 
1042 /*
1043  * Check whether the crtt has improved substantially on this phyint_instance.
1044  * Returns _B_TRUE if there's no crtt information available, because pii
1045  * is NULL or the phyint_instance is not capable of probing.
1046  */
1047 boolean_t
1048 check_pii_crtt_improved(struct phyint_instance *pii) {
1049 	struct 	target *tg;
1050 
1051 	if (pii == NULL)
1052 		return (_B_TRUE);
1053 
1054 	if (!PROBE_CAPABLE(pii) ||
1055 	    pii->pii_phyint->pi_state == PI_FAILED)
1056 		return (_B_TRUE);
1057 
1058 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1059 		if (tg->tg_status != TG_ACTIVE)
1060 			continue;
1061 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1062 		    LOWER_FDT_TRIGGER)) {
1063 			return (_B_FALSE);
1064 		}
1065 	}
1066 
1067 	return (_B_TRUE);
1068 }
1069 
1070 /*
1071  * This target responds very slowly to probes. The target's crtt exceeds
1072  * the probe interval of its group. Compare against other targets
1073  * and determine if this target is an exception, if so return true, else false
1074  */
1075 static boolean_t
1076 check_exception_target(struct phyint_instance *pii, struct target *target)
1077 {
1078 	struct	target *tg;
1079 	char abuf[INET6_ADDRSTRLEN];
1080 
1081 	if (debug & D_PROBE) {
1082 		logdebug("check_exception_target(%s %s target %s)\n",
1083 		    AF_STR(pii->pii_af), pii->pii_name,
1084 		    pr_addr(pii->pii_af, target->tg_address,
1085 		    abuf, sizeof (abuf)));
1086 	}
1087 
1088 	/*
1089 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1090 	 * to make a good judgement. Otherwise don't drop this target.
1091 	 */
1092 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1093 		return (_B_FALSE);
1094 
1095 	/*
1096 	 * Determine whether only this particular target is slow.
1097 	 * We know that this target's crtt exceeds the group's probe interval.
1098 	 * If all other active targets have a
1099 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1100 	 * then this target is considered slow.
1101 	 */
1102 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1103 		if (tg != target && tg->tg_status == TG_ACTIVE) {
1104 			if (tg->tg_crtt >
1105 			    pii->pii_phyint->pi_group->pg_probeint /
1106 			    EXCEPTION_FACTOR) {
1107 				return (_B_FALSE);
1108 			}
1109 		}
1110 	}
1111 
1112 	return (_B_TRUE);
1113 }
1114 
1115 /*
1116  * Update the target list. The icmp all hosts multicast has given us
1117  * some host to which we can send probes. If we already have sufficient
1118  * targets, discard it.
1119  */
1120 static void
1121 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1122     struct in6_addr fromaddr)
1123 /* ARGSUSED */
1124 {
1125 	int af;
1126 	char abuf[INET6_ADDRSTRLEN];
1127 	struct phyint *pi;
1128 
1129 	if (debug & D_PROBE) {
1130 		logdebug("incoming_mcast_reply(%s %s %s)\n",
1131 		    AF_STR(pii->pii_af), pii->pii_name,
1132 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1133 	}
1134 
1135 	/*
1136 	 * Using host targets is a fallback mechanism. If we have
1137 	 * found a router, don't add this host target. If we already
1138 	 * know MAX_PROBE_TARGETS, don't add another target.
1139 	 */
1140 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1141 	if (pii->pii_targets != NULL) {
1142 		if (pii->pii_targets_are_routers ||
1143 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1144 			return;
1145 		}
1146 	}
1147 
1148 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1149 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1150 		/*
1151 		 * Guard against response from 0.0.0.0
1152 		 * and ::. Log a trace message
1153 		 */
1154 		logtrace("probe response from %s on %s\n",
1155 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1156 		    pii->pii_name);
1157 		return;
1158 	}
1159 
1160 	/*
1161 	 * This address is one of our own, so reject this address as a
1162 	 * valid probe target.
1163 	 */
1164 	af = pii->pii_af;
1165 	if (own_address(fromaddr))
1166 		return;
1167 
1168 	/*
1169 	 * If the phyint is part a named group, then add the address to all
1170 	 * members of the group.  Otherwise, add the address only to the
1171 	 * phyint itself, since other phyints in the anongroup may not be on
1172 	 * the same subnet.
1173 	 */
1174 	pi = pii->pii_phyint;
1175 	if (pi->pi_group == phyint_anongroup) {
1176 		target_add(pii, fromaddr, _B_FALSE);
1177 	} else {
1178 		pi = pi->pi_group->pg_phyint;
1179 		for (; pi != NULL; pi = pi->pi_pgnext)
1180 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1181 	}
1182 }
1183 
1184 /*
1185  * Compute CRTT given an existing scaled average, scaled deviation estimate
1186  * and a new rtt time.  The formula is from Jacobson and Karels'
1187  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1188  * are the same as those in Appendix A.2 of that paper.
1189  *
1190  * m = new measurement
1191  * sa = scaled RTT average (8 * average estimates)
1192  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1193  * crtt = Conservative round trip time. Used to determine whether probe
1194  * has timed out.
1195  *
1196  * New scaled average and deviation are passed back via sap and svp
1197  */
1198 static int64_t
1199 compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
1200 {
1201 	int64_t sa = *sap;
1202 	int64_t sv = *svp;
1203 	int64_t crtt;
1204 	int64_t saved_m = m;
1205 
1206 	assert(*sap >= -1);
1207 	assert(*svp >= 0);
1208 
1209 	if (sa != -1) {
1210 		/*
1211 		 * Update average estimator:
1212 		 *	new rtt = old rtt + 1/8 Error
1213 		 *	    where Error = m - old rtt
1214 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1215 		 *	i.e. new sa =  old sa + Error
1216 		 */
1217 		m -= sa >> 3;		/* m is now Error in estimate. */
1218 		if ((sa += m) < 0) {
1219 			/* Don't allow the smoothed average to be negative. */
1220 			sa = 0;
1221 		}
1222 
1223 		/*
1224 		 * Update deviation estimator:
1225 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1226 		 *	i.e. 4 * new mdev = 4 * old mdev +
1227 		 *		(abs(Error) - old mdev)
1228 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1229 		 */
1230 		if (m < 0)
1231 			m = -m;
1232 		m -= sv >> 2;
1233 		sv += m;
1234 	} else {
1235 		/* Initialization. This is the first response received. */
1236 		sa = (m << 3);
1237 		sv = (m << 1);
1238 	}
1239 
1240 	crtt = (sa >> 3) + sv;
1241 
1242 	if (debug & D_PROBE) {
1243 		logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
1244 		    "crtt = %lld\n", saved_m, sa, sv, crtt);
1245 	}
1246 
1247 	*sap = sa;
1248 	*svp = sv;
1249 
1250 	/*
1251 	 * CRTT = average estimates  + 4 * deviation estimates
1252 	 *	= sa / 8 + sv
1253 	 */
1254 	return (crtt);
1255 }
1256 
1257 static void
1258 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
1259 {
1260 	struct phyint_instance *pii = tg->tg_phyint_inst;
1261 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1262 	int64_t sa = tg->tg_rtt_sa;
1263 	int64_t sv = tg->tg_rtt_sd;
1264 	int new_crtt;
1265 	int i;
1266 
1267 	if (debug & D_PROBE)
1268 		logdebug("pi_set_crtt: target -  m %lld\n", m);
1269 
1270 	/* store the round trip time, in case we need to defer computation */
1271 	tg->tg_deferred[tg->tg_num_deferred] = m;
1272 
1273 	new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
1274 
1275 	/*
1276 	 * If this probe's round trip time would singlehandedly cause an
1277 	 * increase in the group's probe interval consider it suspect.
1278 	 */
1279 	if ((new_crtt > probe_interval) && is_probe_uni) {
1280 		if (debug & D_PROBE) {
1281 			logdebug("Received a suspect probe on %s, new_crtt ="
1282 			    " %d, probe_interval = %d, num_deferred = %d\n",
1283 			    pii->pii_probe_logint->li_name, new_crtt,
1284 			    probe_interval, tg->tg_num_deferred);
1285 		}
1286 
1287 		/*
1288 		 * If we've deferred as many rtts as we plan on deferring, then
1289 		 * assume the link really did slow down and process all queued
1290 		 * rtts
1291 		 */
1292 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1293 			if (debug & D_PROBE) {
1294 				logdebug("Received MAXDEFERREDRTT probes which "
1295 				    "would cause an increased probe_interval.  "
1296 				    "Integrating queued rtt data points.\n");
1297 			}
1298 
1299 			for (i = 0; i <= tg->tg_num_deferred; i++) {
1300 				tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
1301 				    &tg->tg_rtt_sd, tg->tg_deferred[i]));
1302 			}
1303 
1304 			tg->tg_num_deferred = 0;
1305 		} else {
1306 			tg->tg_num_deferred++;
1307 		}
1308 		return;
1309 	}
1310 
1311 	/*
1312 	 * If this is a normal probe, or an RTT probe that would lead to a
1313 	 * reduced CRTT, then update our CRTT data.  Further, if this was
1314 	 * a normal probe, pitch any deferred probes since our probes are
1315 	 * again being answered within our CRTT estimates.
1316 	 */
1317 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1318 		tg->tg_rtt_sa = sa;
1319 		tg->tg_rtt_sd = sv;
1320 		tg->tg_crtt = new_crtt;
1321 		if (is_probe_uni)
1322 			tg->tg_num_deferred = 0;
1323 	}
1324 }
1325 
/*
 * Search the ancillary data of `msg' for the first control message whose
 * level and type equal `cmsg_level' and `cmsg_type'.  Returns a pointer to
 * that message's data, or NULL if no such control message is present.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == cmsg_level &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));
		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
1344 
1345 /*
1346  * Try to activate another INACTIVE interface in the same group as `pi'.
1347  * Prefer STANDBY INACTIVE to just INACTIVE.
1348  */
1349 void
1350 phyint_activate_another(struct phyint *pi)
1351 {
1352 	struct phyint *pi2;
1353 	struct phyint *inactivepi = NULL;
1354 
1355 	if (pi->pi_group == phyint_anongroup)
1356 		return;
1357 
1358 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1359 		if (pi == pi2 || pi2->pi_state != PI_RUNNING ||
1360 		    !(pi2->pi_flags & IFF_INACTIVE))
1361 			continue;
1362 
1363 		inactivepi = pi2;
1364 		if (pi2->pi_flags & IFF_STANDBY)
1365 			break;
1366 	}
1367 
1368 	if (inactivepi != NULL)
1369 		(void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
1370 }
1371 
1372 /*
1373  * Transition a phyint to PI_RUNNING.  The caller must ensure that the
1374  * transition is appropriate.  Clears IFF_OFFLINE or IFF_FAILED if
1375  * appropriate.  Also sets IFF_INACTIVE on this or other interfaces as
1376  * appropriate (see comment below).  Finally, also updates the phyint's group
1377  * state to account for the change.
1378  */
1379 void
1380 phyint_transition_to_running(struct phyint *pi)
1381 {
1382 	struct phyint *pi2;
1383 	struct phyint *actstandbypi = NULL;
1384 	uint_t nactive = 0, nnonstandby = 0;
1385 	boolean_t onlining = (pi->pi_state == PI_OFFLINE);
1386 	boolean_t initial = (pi->pi_state == PI_INIT);
1387 	uint64_t set, clear;
1388 
1389 	/*
1390 	 * The interface is running again, but should it or another interface
1391 	 * in the group end up INACTIVE?  There are three cases:
1392 	 *
1393 	 * 1. If it's a STANDBY interface, it should be end up INACTIVE if
1394 	 *    the group is operating at capacity (i.e., there are at least as
1395 	 *    many active interfaces as non-STANDBY interfaces in the group).
1396 	 *    No other interfaces should be changed.
1397 	 *
1398 	 * 2. If it's a non-STANDBY interface and we're onlining it or
1399 	 *    FAILBACK is enabled, then it should *not* end up INACTIVE.
1400 	 *    Further, if the group is above capacity as a result of this
1401 	 *    interface, then an active STANDBY interface in the group should
1402 	 *    end up INACTIVE.
1403 	 *
1404 	 * 3. If it's a non-STANDBY interface, we're repairing it, and
1405 	 *    FAILBACK is disabled, then it should end up INACTIVE *unless*
1406 	 *    the group was failed (in which case we have no choice but to
1407 	 *    use it).  No other interfaces should be changed.
1408 	 */
1409 	if (pi->pi_group != phyint_anongroup) {
1410 		pi2 = pi->pi_group->pg_phyint;
1411 		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1412 			if (!(pi2->pi_flags & IFF_STANDBY))
1413 				nnonstandby++;
1414 
1415 			if (pi2->pi_state == PI_RUNNING) {
1416 				if (!(pi2->pi_flags & IFF_INACTIVE)) {
1417 					nactive++;
1418 					if (pi2->pi_flags & IFF_STANDBY)
1419 						actstandbypi = pi2;
1420 				}
1421 			}
1422 		}
1423 	}
1424 
1425 	set = 0;
1426 	clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
1427 
1428 	if (pi->pi_flags & IFF_STANDBY) {			/* case 1 */
1429 		if (nactive >= nnonstandby)
1430 			set |= IFF_INACTIVE;
1431 		else
1432 			clear |= IFF_INACTIVE;
1433 	} else if (onlining || failback_enabled) {		/* case 2 */
1434 		if (nactive >= nnonstandby && actstandbypi != NULL)
1435 			(void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
1436 	} else if (!initial && !GROUP_FAILED(pi->pi_group)) {	/* case 3 */
1437 		set |= IFF_INACTIVE;
1438 	}
1439 	(void) change_pif_flags(pi, set, clear);
1440 
1441 	phyint_chstate(pi, PI_RUNNING);
1442 
1443 	/*
1444 	 * Update the group state to account for the change.
1445 	 */
1446 	phyint_group_refresh_state(pi->pi_group);
1447 }
1448 
1449 /*
1450  * See if a previously failed interface has started working again.
1451  */
1452 void
1453 phyint_check_for_repair(struct phyint *pi)
1454 {
1455 	if (!phyint_repaired(pi))
1456 		return;
1457 
1458 	if (pi->pi_group == phyint_anongroup) {
1459 		logerr("IP interface repair detected on %s\n", pi->pi_name);
1460 	} else {
1461 		logerr("IP interface repair detected on %s of group %s\n",
1462 		    pi->pi_name, pi->pi_group->pg_name);
1463 	}
1464 
1465 	/*
1466 	 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
1467 	 * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
1468 	 * until it is brought back online.
1469 	 */
1470 	if (pi->pi_state == PI_OFFLINE) {
1471 		(void) change_pif_flags(pi, 0, IFF_FAILED);
1472 		return;
1473 	}
1474 
1475 	phyint_transition_to_running(pi);	/* calls phyint_chstate() */
1476 }
1477 
1478 /*
1479  * See if an interface has failed, or if the whole group of interfaces has
1480  * failed.
1481  */
1482 static void
1483 phyint_inst_check_for_failure(struct phyint_instance *pii)
1484 {
1485 	struct phyint	*pi = pii->pii_phyint;
1486 	struct phyint	*pi2;
1487 	boolean_t	was_active;
1488 
1489 	switch (failure_state(pii)) {
1490 	case PHYINT_FAILURE:
1491 		was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1492 
1493 		(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1494 		if (pi->pi_group == phyint_anongroup) {
1495 			logerr("IP interface failure detected on %s\n",
1496 			    pii->pii_name);
1497 		} else {
1498 			logerr("IP interface failure detected on %s of group"
1499 			    " %s\n", pii->pii_name, pi->pi_group->pg_name);
1500 		}
1501 
1502 		/*
1503 		 * If the failed interface was active, activate another
1504 		 * INACTIVE interface in the group if possible.
1505 		 */
1506 		if (was_active)
1507 			phyint_activate_another(pi);
1508 
1509 		/*
1510 		 * If the interface is offline, the state change will be
1511 		 * noted when it comes back online.
1512 		 */
1513 		if (pi->pi_state != PI_OFFLINE) {
1514 			phyint_chstate(pi, PI_FAILED);
1515 			reset_crtt_all(pi);
1516 		}
1517 		break;
1518 
1519 	case GROUP_FAILURE:
1520 		pi2 = pi->pi_group->pg_phyint;
1521 		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1522 			(void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
1523 			if (pi2->pi_state == PI_OFFLINE) /* see comment above */
1524 				continue;
1525 
1526 			reset_crtt_all(pi2);
1527 			/*
1528 			 * In the case of host targets, we would have flushed
1529 			 * the targets, and gone to PI_NOTARGETS state.
1530 			 */
1531 			if (pi2->pi_state == PI_RUNNING)
1532 				phyint_chstate(pi2, PI_FAILED);
1533 		}
1534 		break;
1535 
1536 	default:
1537 		break;
1538 	}
1539 }
1540 
1541 /*
1542  * Determines if any timeout event has occurred and returns the number of
1543  * milliseconds until the next timeout event for the phyint. Returns
1544  * TIMER_INFINITY for "never".
1545  */
1546 uint_t
1547 phyint_inst_timer(struct phyint_instance *pii)
1548 {
1549 	int 	pr_ndx;
1550 	uint_t	timeout;
1551 	struct	target	*cur_tg;
1552 	struct	probe_stats *pr_statp;
1553 	struct	phyint_instance *pii_other;
1554 	struct	phyint *pi;
1555 	int	valid_unack_count;
1556 	int	i;
1557 	int	interval;
1558 	uint_t	check_time;
1559 	uint_t	cur_time;
1560 	hrtime_t cur_hrtime;
1561 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1562 
1563 	cur_hrtime = gethrtime();
1564 	cur_time = ns2ms(cur_hrtime);
1565 
1566 	if (debug & D_TIMER) {
1567 		logdebug("phyint_inst_timer(%s %s)\n",
1568 		    AF_STR(pii->pii_af), pii->pii_name);
1569 	}
1570 
1571 	pii_other = phyint_inst_other(pii);
1572 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1573 		/*
1574 		 * Check to see if we're here due to link up/down flapping; If
1575 		 * enough time has passed, then try to bring the interface
1576 		 * back up; otherwise, schedule a timer to bring it back up
1577 		 * when enough time *has* elapsed.
1578 		 */
1579 		pi = pii->pii_phyint;
1580 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1581 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1582 			if (check_time > cur_time)
1583 				return (check_time - cur_time);
1584 
1585 			phyint_check_for_repair(pi);
1586 		}
1587 	}
1588 
1589 	/*
1590 	 * If probing is not enabled on this phyint instance, don't proceed.
1591 	 */
1592 	if (!PROBE_ENABLED(pii))
1593 		return (TIMER_INFINITY);
1594 
1595 	/*
1596 	 * If the timer has fired too soon, probably triggered
1597 	 * by some other phyint instance, return the remaining
1598 	 * time
1599 	 */
1600 	if (TIME_LT(cur_time, pii->pii_snxt_time))
1601 		return (pii->pii_snxt_time - cur_time);
1602 
1603 	/*
1604 	 * If the link is down, don't send any probes for now.
1605 	 */
1606 	if (LINK_DOWN(pii->pii_phyint))
1607 		return (TIMER_INFINITY);
1608 
1609 	/*
1610 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1611 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1612 	 * Base probe time is strictly periodic.
1613 	 */
1614 	interval = GET_RANDOM(
1615 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1616 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1617 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1618 
1619 	/*
1620 	 * Check if the current time > next time to probe. If so, we missed
1621 	 * sending 1 or more probes, probably due to heavy system load. At least
1622 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1623 	 * were scheduled. Make adjustments to the times, in multiples of
1624 	 * user_probe_interval.
1625 	 */
1626 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1627 		int n;
1628 
1629 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1630 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1631 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1632 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1633 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1634 		    pii->pii_snxt_basetime);
1635 
1636 		/* Collect statistics about missed probes */
1637 		probes_missed.pm_nprobes += n + 1;
1638 		probes_missed.pm_ntimes++;
1639 	}
1640 	pii->pii_snxt_basetime += user_probe_interval;
1641 	interval = pii->pii_snxt_time - cur_time;
1642 	if (debug & D_TARGET) {
1643 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1644 		    " interval %u\n", cur_time, pii->pii_snxt_time,
1645 		    pii->pii_snxt_basetime, interval);
1646 	}
1647 
1648 	/*
1649 	 * If no targets are known, we need to send an ICMP multicast. The
1650 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1651 	 * to see if we found a target.
1652 	 */
1653 	if (pii->pii_target_next == NULL) {
1654 		assert(pii->pii_ntargets == 0);
1655 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1656 		probe(pii, PROBE_MULTI, cur_time);
1657 		return (interval);
1658 	}
1659 
1660 	if ((user_probe_interval != probe_interval) &&
1661 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1662 		/*
1663 		 * the failure detection (fd) probe timer has not yet fired.
1664 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1665 		 */
1666 		probe(pii, PROBE_RTT, cur_hrtime);
1667 		return (interval);
1668 	}
1669 	/*
1670 	 * the fd probe timer has fired. Need to do all failure
1671 	 * detection / recovery calculations, and then send an fd probe
1672 	 * of type PROBE_UNI.
1673 	 */
1674 	if (user_probe_interval == probe_interval) {
1675 		/*
1676 		 * We could have missed some probes, and then adjusted
1677 		 * pii_snxt_basetime above. Otherwise we could have
1678 		 * blindly added probe_interval to pii_fd_snxt_basetime.
1679 		 */
1680 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1681 	} else {
1682 		pii->pii_fd_snxt_basetime += probe_interval;
1683 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1684 			int n;
1685 
1686 			n = (cur_time - pii->pii_fd_snxt_basetime) /
1687 			    probe_interval;
1688 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1689 		}
1690 	}
1691 
1692 	/*
1693 	 * We can have at most, the latest 2 probes that we sent, in
1694 	 * the PR_UNACKED state. All previous probes sent, are either
1695 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1696 	 * timed out if the probe's time_start + the CRTT < currenttime.
1697 	 * For each of the last 2 probes, examine whether it has timed
1698 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1699 	 */
1700 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1701 	valid_unack_count = 0;
1702 
1703 	for (i = 0; i < 2; i++) {
1704 		pr_statp = &pii->pii_probes[pr_ndx];
1705 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1706 		switch (pr_statp->pr_status) {
1707 		case PR_ACKED:
1708 			/*
1709 			 * We received back an ACK, so the switch clearly
1710 			 * is not dropping our traffic, and thus we can
1711 			 * enable failure detection immediately.
1712 			 */
1713 			if (pii->pii_fd_hrtime > gethrtime()) {
1714 				if (debug & D_PROBE) {
1715 					logdebug("successful probe on %s; "
1716 					    "ending quiet period\n",
1717 					    pii->pii_phyint->pi_name);
1718 				}
1719 				pii->pii_fd_hrtime = gethrtime();
1720 			}
1721 			break;
1722 
1723 		case PR_UNACKED:
1724 			assert(cur_tg != NULL);
1725 			/*
1726 			 * The crtt could be zero for some reason,
1727 			 * Eg. the phyint could be failed. If the crtt is
1728 			 * not available use group's probe interval,
1729 			 * which is a worst case estimate.
1730 			 */
1731 			timeout = ns2ms(pr_statp->pr_hrtime_start);
1732 			if (cur_tg->tg_crtt != 0) {
1733 				timeout += cur_tg->tg_crtt;
1734 			} else {
1735 				timeout += probe_interval;
1736 			}
1737 			if (TIME_LT(timeout, cur_time)) {
1738 				pr_statp->pr_time_lost = timeout;
1739 				probe_chstate(pr_statp, pii, PR_LOST);
1740 			} else if (i == 1) {
1741 				/*
1742 				 * We are forced to consider this probe
1743 				 * lost, as we can have at most 2 unack.
1744 				 * probes any time, and we will be sending a
1745 				 * probe at the end of this function.
1746 				 * Normally, we should not be here, but
1747 				 * this can happen if an incoming response
1748 				 * that was considered lost has increased
1749 				 * the crtt for this target, and also bumped
1750 				 * up the FDT. Note that we never cancel or
1751 				 * increase the current pii_time_left, so
1752 				 * when the timer fires, we find 2 valid
1753 				 * unacked probes, and they are yet to timeout
1754 				 */
1755 				pr_statp->pr_time_lost = cur_time;
1756 				probe_chstate(pr_statp, pii, PR_LOST);
1757 			} else {
1758 				/*
1759 				 * Only the most recent probe can enter
1760 				 * this 'else' arm. The second most recent
1761 				 * probe must take either of the above arms,
1762 				 * if it is unacked.
1763 				 */
1764 				valid_unack_count++;
1765 			}
1766 			break;
1767 		}
1768 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1769 	}
1770 
1771 	/*
1772 	 * We send out 1 probe randomly in the interval between one half
1773 	 * and one probe interval for the group. Given that the CRTT is always
1774 	 * less than the group's probe interval, we can have at most 1
1775 	 * unacknowledged probe now.  All previous probes are either lost or
1776 	 * acked.
1777 	 */
1778 	assert(valid_unack_count == 0 || valid_unack_count == 1);
1779 
1780 	/*
1781 	 * The timer has fired. Take appropriate action depending
1782 	 * on the current state of the phyint.
1783 	 *
1784 	 * PI_RUNNING state 	- Failure detection
1785 	 * PI_FAILED state 	- Repair detection
1786 	 */
1787 	switch (pii->pii_phyint->pi_state) {
1788 	case PI_FAILED:
1789 		/*
1790 		 * If the most recent probe (excluding unacked probes that
1791 		 * are yet to time out) has been acked, check whether the
1792 		 * phyint is now repaired.
1793 		 */
1794 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1795 			phyint_check_for_repair(pii->pii_phyint);
1796 		}
1797 		break;
1798 
1799 	case PI_RUNNING:
1800 		/*
1801 		 * It's possible our probes have been lost because of a
1802 		 * spanning-tree mandated quiet period on the switch.  If so,
1803 		 * ignore the lost probes.
1804 		 */
1805 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1806 			break;
1807 
1808 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1809 			/*
1810 			 * We have 1 or more failed probes (excluding unacked
1811 			 * probes that are yet to time out). Determine if the
1812 			 * phyint has failed.
1813 			 */
1814 			phyint_inst_check_for_failure(pii);
1815 		}
1816 		break;
1817 
1818 	default:
1819 		logerr("phyint_inst_timer: invalid state %d\n",
1820 		    pii->pii_phyint->pi_state);
1821 		abort();
1822 	}
1823 
1824 	/*
1825 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1826 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1827 	 * was called, the target list may be empty.
1828 	 */
1829 	if (pii->pii_target_next != NULL) {
1830 		probe(pii, PROBE_UNI, cur_hrtime);
1831 		/*
1832 		 * If we have just the one probe target, and we're not using
1833 		 * router targets, try to find another as we presently have
1834 		 * no resilience.
1835 		 */
1836 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1837 			probe(pii, PROBE_MULTI, cur_hrtime);
1838 	} else {
1839 		probe(pii, PROBE_MULTI, cur_hrtime);
1840 	}
1841 	return (interval);
1842 }
1843 
1844 /*
1845  * Start the probe timer for an interface instance.
1846  */
1847 void
1848 start_timer(struct phyint_instance *pii)
1849 {
1850 	uint32_t interval;
1851 
1852 	/*
1853 	 * Spread the base probe times (pi_snxt_basetime) across phyints
1854 	 * uniformly over the (curtime..curtime + the group's probe_interval).
1855 	 * pi_snxt_basetime is strictly periodic with a frequency of
1856 	 * the group's probe interval. The actual probe time pi_snxt_time
1857 	 * adds some randomness to pi_snxt_basetime and happens in probe().
1858 	 * For the 1st probe on each phyint after the timer is started,
1859 	 * pi_snxt_time and pi_snxt_basetime are the same.
1860 	 */
1861 	interval = GET_RANDOM(0,
1862 	    (int)pii->pii_phyint->pi_group->pg_probeint);
1863 
1864 	pii->pii_snxt_basetime = getcurrenttime() + interval;
1865 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1866 	pii->pii_snxt_time = pii->pii_snxt_basetime;
1867 	timer_schedule(interval);
1868 }
1869 
1870 /*
1871  * Restart the probe timer on an interface instance.
1872  */
1873 static void
1874 restart_timer(struct phyint_instance *pii)
1875 {
1876 	/*
1877 	 * We don't need to restart the timer if it was never started in
1878 	 * the first place (pii->pii_basetime_inited not set), as the timer
1879 	 * won't have gone off yet.
1880 	 */
1881 	if (pii->pii_basetime_inited != 0) {
1882 
1883 		if (debug & D_LINKNOTE)
1884 			logdebug("restart timer: restarting timer on %s, "
1885 			    "address family %s\n", pii->pii_phyint->pi_name,
1886 			    AF_STR(pii->pii_af));
1887 
1888 		start_timer(pii);
1889 	}
1890 }
1891 
1892 static void
1893 process_link_state_down(struct phyint *pi)
1894 {
1895 	logerr("The link has gone down on %s\n", pi->pi_name);
1896 
1897 	/*
1898 	 * Clear the probe statistics arrays, we don't want the repair
1899 	 * detection logic relying on probes that were successful prior
1900 	 * to the link going down.
1901 	 */
1902 	if (PROBE_CAPABLE(pi->pi_v4))
1903 		clear_pii_probe_stats(pi->pi_v4);
1904 	if (PROBE_CAPABLE(pi->pi_v6))
1905 		clear_pii_probe_stats(pi->pi_v6);
1906 	/*
1907 	 * Check for interface failure.  Although we know the interface
1908 	 * has failed, we don't know if all the other interfaces in the
1909 	 * group have failed as well.
1910 	 */
1911 	if ((pi->pi_state == PI_RUNNING) ||
1912 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1913 		if (debug & D_LINKNOTE) {
1914 			logdebug("process_link_state_down:"
1915 			    " checking for failure on %s\n", pi->pi_name);
1916 		}
1917 
1918 		if (pi->pi_v4 != NULL)
1919 			phyint_inst_check_for_failure(pi->pi_v4);
1920 		else if (pi->pi_v6 != NULL)
1921 			phyint_inst_check_for_failure(pi->pi_v6);
1922 	}
1923 }
1924 
1925 static void
1926 process_link_state_up(struct phyint *pi)
1927 {
1928 	logerr("The link has come up on %s\n", pi->pi_name);
1929 
1930 	/*
1931 	 * We stopped any running timers on each instance when the link
1932 	 * went down, so restart them.
1933 	 */
1934 	if (pi->pi_v4)
1935 		restart_timer(pi->pi_v4);
1936 	if (pi->pi_v6)
1937 		restart_timer(pi->pi_v6);
1938 
1939 	phyint_check_for_repair(pi);
1940 
1941 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1942 	if (pi->pi_whendx == LINK_UP_PERMIN)
1943 		pi->pi_whendx = 0;
1944 }
1945 
1946 /*
1947  * Process any changes in link state passed up from the interfaces.
1948  */
1949 void
1950 process_link_state_changes(void)
1951 {
1952 	struct phyint *pi;
1953 
1954 	/* Look for interfaces where the link state has just changed */
1955 
1956 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1957 		boolean_t old_link_state_up = LINK_UP(pi);
1958 
1959 		/*
1960 		 * Except when the "phyint" structure is created, this is
1961 		 * the only place the link state is updated.  This allows
1962 		 * this routine to detect changes in link state, rather
1963 		 * than just the current state.
1964 		 */
1965 		UPDATE_LINK_STATE(pi);
1966 
1967 		if (LINK_DOWN(pi)) {
1968 			/*
1969 			 * Has link just gone down?
1970 			 */
1971 			if (old_link_state_up)
1972 				process_link_state_down(pi);
1973 		} else {
1974 			/*
1975 			 * Has link just gone back up?
1976 			 */
1977 			if (!old_link_state_up)
1978 				process_link_state_up(pi);
1979 		}
1980 	}
1981 }
1982 
1983 void
1984 reset_crtt_all(struct phyint *pi)
1985 {
1986 	struct phyint_instance *pii;
1987 	struct target *tg;
1988 
1989 	pii = pi->pi_v4;
1990 	if (pii != NULL) {
1991 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1992 			tg->tg_crtt = 0;
1993 			tg->tg_rtt_sa = -1;
1994 			tg->tg_rtt_sd = 0;
1995 		}
1996 	}
1997 
1998 	pii = pi->pi_v6;
1999 	if (pii != NULL) {
2000 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2001 			tg->tg_crtt = 0;
2002 			tg->tg_rtt_sa = -1;
2003 			tg->tg_rtt_sd = 0;
2004 		}
2005 	}
2006 }
2007 
2008 /*
2009  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
2010  * probes on both instances IPv4 and IPv6.
2011  * If the interface has failed, return the time of the first probe failure
2012  * in "tff".
2013  */
2014 static int
2015 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
2016 {
2017 	uint_t	pi_tff;
2018 	struct	target *cur_tg;
2019 	struct	probe_fail_count pfinfo;
2020 	struct	phyint_instance *pii_other;
2021 	int	pr_ndx;
2022 
2023 	/*
2024 	 * Get the number of consecutive failed probes on
2025 	 * this phyint across all targets. Also get the number
2026 	 * of consecutive failed probes on this target only
2027 	 */
2028 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2029 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
2030 	probe_fail_info(pii, cur_tg, &pfinfo);
2031 
2032 	/* Get the time of first failure, for later use */
2033 	pi_tff = pfinfo.pf_tff;
2034 
2035 	/*
2036 	 * If the current target has not responded to the
2037 	 * last NUM_PROBE_FAILS probes, and other targets are
2038 	 * responding delete this target. Dead gateway detection
2039 	 * will eventually remove this target (if router) from the
2040 	 * routing tables. If that does not occur, we may end
2041 	 * up adding this to our list again.
2042 	 */
2043 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2044 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2045 		if (pii->pii_targets_are_routers) {
2046 			if (cur_tg->tg_status == TG_ACTIVE)
2047 				pii->pii_ntargets--;
2048 			cur_tg->tg_status = TG_DEAD;
2049 			cur_tg->tg_crtt = 0;
2050 			cur_tg->tg_rtt_sa = -1;
2051 			cur_tg->tg_rtt_sd = 0;
2052 			if (pii->pii_target_next == cur_tg)
2053 				pii->pii_target_next = target_next(cur_tg);
2054 		} else {
2055 			target_delete(cur_tg);
2056 			probe(pii, PROBE_MULTI, gethrtime());
2057 		}
2058 		return (PHYINT_OK);
2059 	}
2060 
2061 	/*
2062 	 * If the phyint has lost NUM_PROBE_FAILS or more
2063 	 * consecutive probes, on both IPv4 and IPv6 protocol
2064 	 * instances of the phyint, then trigger failure
2065 	 * detection, else return false
2066 	 */
2067 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2068 		return (PHYINT_OK);
2069 
2070 	pii_other = phyint_inst_other(pii);
2071 	if (PROBE_CAPABLE(pii_other)) {
2072 		probe_fail_info(pii_other, NULL, &pfinfo);
2073 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2074 			/*
2075 			 * We have NUM_PROBE_FAILS or more failures
2076 			 * on both IPv4 and IPv6. Get the earliest
2077 			 * time when failure was detected on this
2078 			 * phyint across IPv4 and IPv6.
2079 			 */
2080 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2081 				pi_tff = pfinfo.pf_tff;
2082 		} else {
2083 			/*
2084 			 * This instance has < NUM_PROBE_FAILS failure.
2085 			 * So return false
2086 			 */
2087 			return (PHYINT_OK);
2088 		}
2089 	}
2090 	*tff = pi_tff;
2091 	return (PHYINT_FAILURE);
2092 }
2093 
2094 /*
2095  * Check if the link has gone down on this phyint, or it has failed the
2096  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2097  * Also look at other phyints of this group, for group failures.
2098  */
2099 int
2100 failure_state(struct phyint_instance *pii)
2101 {
2102 	struct	probe_success_count psinfo;
2103 	uint_t	pi2_tls;		/* time last success */
2104 	uint_t	pi_tff;			/* time first fail */
2105 	struct	phyint *pi2;
2106 	struct	phyint *pi;
2107 	struct	phyint_instance *pii2;
2108 	struct  phyint_group *pg;
2109 	int	retval;
2110 
2111 	if (debug & D_FAILREP)
2112 		logdebug("phyint_failed(%s)\n", pii->pii_name);
2113 
2114 	pi = pii->pii_phyint;
2115 	pg = pi->pi_group;
2116 
2117 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2118 	    PHYINT_OK)
2119 		return (PHYINT_OK);
2120 
2121 	/*
2122 	 * At this point, the link is down, or the phyint is suspect, as it
2123 	 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
2124 	 * belong to any group, this is a PHYINT_FAILURE.  Otherwise, continue
2125 	 * on to determine whether this should be considered a PHYINT_FAILURE
2126 	 * or GROUP_FAILURE.
2127 	 */
2128 	if (pg == phyint_anongroup)
2129 		return (PHYINT_FAILURE);
2130 
2131 	/*
2132 	 * Need to compare against other phyints of the same group
2133 	 * to exclude group failures. If the failure was detected via
2134 	 * probing, then if the time of last success (tls) of any
2135 	 * phyint is more recent than the time of first fail (tff) of the
2136 	 * phyint in question, and the link is up on the phyint,
2137 	 * then it is a phyint failure. Otherwise it is a group failure.
2138 	 * If failure was detected via a link down notification sent from
2139 	 * the driver to IP, we see if any phyints in the group are still
2140 	 * running and haven't received a link down notification.  We
2141 	 * will usually be processing the link down notification shortly
2142 	 * after it was received, so there is no point looking at the tls
2143 	 * of other phyints.
2144 	 */
2145 	retval = GROUP_FAILURE;
2146 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2147 		/* Exclude ourself from comparison */
2148 		if (pi2 == pi)
2149 			continue;
2150 
2151 		if (LINK_DOWN(pi)) {
2152 			/*
2153 			 * We use FLAGS_TO_LINK_STATE() to test the flags
2154 			 * directly, rather then LINK_UP() or LINK_DOWN(), as
2155 			 * we may not have got round to processing the link
2156 			 * state for the other phyints in the group yet.
2157 			 *
2158 			 * The check for PI_RUNNING and group failure handles
2159 			 * the case when the group begins to recover.
2160 			 * PI_RUNNING will be set, and group failure cleared
2161 			 * only after receipt of NUM_PROBE_REPAIRS, by which
2162 			 * time the other phyints should have received at
2163 			 * least 1 packet, and so will not have NUM_PROBE_FAILS.
2164 			 */
2165 			if ((pi2->pi_state == PI_RUNNING) &&
2166 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
2167 				retval = PHYINT_FAILURE;
2168 				break;
2169 			}
2170 			continue;
2171 		}
2172 
2173 		if (LINK_DOWN(pi2))
2174 			continue;
2175 
2176 		/*
2177 		 * If there's no probe-based failure detection on this
2178 		 * interface, and its link is still up, then it's still
2179 		 * working and thus the group has not failed.
2180 		 */
2181 		if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
2182 			retval = PHYINT_FAILURE;
2183 			break;
2184 		}
2185 
2186 		/*
2187 		 * Need to compare against both IPv4 and IPv6 instances.
2188 		 */
2189 		pii2 = pi2->pi_v4;
2190 		if (pii2 != NULL) {
2191 			probe_success_info(pii2, NULL, &psinfo);
2192 			if (psinfo.ps_tls_valid) {
2193 				pi2_tls = psinfo.ps_tls;
2194 				/*
2195 				 * See comment above regarding check
2196 				 * for PI_RUNNING and group failure.
2197 				 */
2198 				if (TIME_GT(pi2_tls, pi_tff) &&
2199 				    (pi2->pi_state == PI_RUNNING) &&
2200 				    !GROUP_FAILED(pg) &&
2201 				    FLAGS_TO_LINK_STATE(pi2)) {
2202 					retval = PHYINT_FAILURE;
2203 					break;
2204 				}
2205 			}
2206 		}
2207 
2208 		pii2 = pi2->pi_v6;
2209 		if (pii2 != NULL) {
2210 			probe_success_info(pii2, NULL, &psinfo);
2211 			if (psinfo.ps_tls_valid) {
2212 				pi2_tls = psinfo.ps_tls;
2213 				/*
2214 				 * See comment above regarding check
2215 				 * for PI_RUNNING and group failure.
2216 				 */
2217 				if (TIME_GT(pi2_tls, pi_tff) &&
2218 				    (pi2->pi_state == PI_RUNNING) &&
2219 				    !GROUP_FAILED(pg) &&
2220 				    FLAGS_TO_LINK_STATE(pi2)) {
2221 					retval = PHYINT_FAILURE;
2222 					break;
2223 				}
2224 			}
2225 		}
2226 	}
2227 
2228 	/*
2229 	 * Update the group state to account for the changes.
2230 	 */
2231 	phyint_group_refresh_state(pg);
2232 	return (retval);
2233 }
2234 
2235 /*
2236  * Return the information associated with consecutive probe successes
2237  * starting with the most recent probe. At most the last 2 probes can be
2238  * in the unacknowledged state. All previous probes have either failed
2239  * or succeeded.
2240  */
2241 static void
2242 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2243     struct probe_success_count *psinfo)
2244 {
2245 	uint_t	i;
2246 	struct probe_stats *pr_statp;
2247 	uint_t most_recent;
2248 	uint_t second_most_recent;
2249 	boolean_t pi_found_failure = _B_FALSE;
2250 	boolean_t tg_found_failure = _B_FALSE;
2251 	uint_t now;
2252 	uint_t timeout;
2253 	struct target *tg;
2254 
2255 	if (debug & D_FAILREP)
2256 		logdebug("probe_success_info(%s)\n", pii->pii_name);
2257 
2258 	bzero(psinfo, sizeof (*psinfo));
2259 	now = getcurrenttime();
2260 
2261 	/*
2262 	 * Start with the most recent probe, and count the number
2263 	 * of consecutive probe successes. Latch the number of successes
2264 	 * on hitting a failure.
2265 	 */
2266 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2267 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2268 
2269 	for (i = most_recent; i != pii->pii_probe_next;
2270 	    i = PROBE_INDEX_PREV(i)) {
2271 		pr_statp = &pii->pii_probes[i];
2272 
2273 		switch (pr_statp->pr_status) {
2274 		case PR_UNACKED:
2275 			/*
2276 			 * Only the most recent 2 probes can be unacknowledged
2277 			 */
2278 			assert(i == most_recent || i == second_most_recent);
2279 
2280 			tg = pr_statp->pr_target;
2281 			assert(tg != NULL);
2282 			/*
2283 			 * The crtt could be zero for some reason,
2284 			 * Eg. the phyint could be failed. If the crtt is
2285 			 * not available use the value of the group's probe
2286 			 * interval which is a worst case estimate.
2287 			 */
2288 			timeout = ns2ms(pr_statp->pr_hrtime_start);
2289 			if (tg->tg_crtt != 0) {
2290 				timeout += tg->tg_crtt;
2291 			} else {
2292 				timeout +=
2293 				    pii->pii_phyint->pi_group->pg_probeint;
2294 			}
2295 
2296 			if (TIME_LT(timeout, now)) {
2297 				/*
2298 				 * We hit a failure. Latch the total number of
2299 				 * recent consecutive successes.
2300 				 */
2301 				pr_statp->pr_time_lost = timeout;
2302 				probe_chstate(pr_statp, pii, PR_LOST);
2303 				pi_found_failure = _B_TRUE;
2304 				if (cur_tg != NULL && tg == cur_tg) {
2305 					/*
2306 					 * We hit a failure for the desired
2307 					 * target. Latch the number of recent
2308 					 * consecutive successes for this target
2309 					 */
2310 					tg_found_failure = _B_TRUE;
2311 				}
2312 			}
2313 			break;
2314 
2315 		case PR_ACKED:
2316 			/*
2317 			 * Bump up the count of probe successes, if we
2318 			 * have not seen any failure so far.
2319 			 */
2320 			if (!pi_found_failure)
2321 				psinfo->ps_nsucc++;
2322 
2323 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2324 			    !tg_found_failure) {
2325 				psinfo->ps_nsucc_tg++;
2326 			}
2327 
2328 			/*
2329 			 * Record the time of last success, if this is
2330 			 * the most recent probe success.
2331 			 */
2332 			if (!psinfo->ps_tls_valid) {
2333 				psinfo->ps_tls =
2334 				    ns2ms(pr_statp->pr_hrtime_ackproc);
2335 				psinfo->ps_tls_valid = _B_TRUE;
2336 			}
2337 			break;
2338 
2339 		case PR_LOST:
2340 			/*
2341 			 * We hit a failure. Latch the total number of
2342 			 * recent consecutive successes.
2343 			 */
2344 			pi_found_failure = _B_TRUE;
2345 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2346 				/*
2347 				 * We hit a failure for the desired target.
2348 				 * Latch the number of recent consecutive
2349 				 * successes for this target
2350 				 */
2351 				tg_found_failure = _B_TRUE;
2352 			}
2353 			break;
2354 
2355 		default:
2356 			return;
2357 
2358 		}
2359 	}
2360 }
2361 
2362 /*
2363  * Return the information associated with consecutive probe failures
2364  * starting with the most recent probe. Only the last 2 probes can be in the
2365  * unacknowledged state. All previous probes have either failed or succeeded.
2366  */
2367 static void
2368 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2369     struct probe_fail_count *pfinfo)
2370 {
2371 	int	i;
2372 	struct probe_stats *pr_statp;
2373 	boolean_t	tg_found_success = _B_FALSE;
2374 	boolean_t	pi_found_success = _B_FALSE;
2375 	int	most_recent;
2376 	int	second_most_recent;
2377 	uint_t	now;
2378 	uint_t	timeout;
2379 	struct	target *tg;
2380 
2381 	if (debug & D_FAILREP)
2382 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2383 
2384 	bzero(pfinfo, sizeof (*pfinfo));
2385 	now = getcurrenttime();
2386 
2387 	/*
2388 	 * Start with the most recent probe, and count the number
2389 	 * of consecutive probe failures. Latch the number of failures
2390 	 * on hitting a probe success.
2391 	 */
2392 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2393 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2394 
2395 	for (i = most_recent; i != pii->pii_probe_next;
2396 	    i = PROBE_INDEX_PREV(i)) {
2397 		pr_statp = &pii->pii_probes[i];
2398 
2399 		assert(PR_STATUS_VALID(pr_statp->pr_status));
2400 
2401 		switch (pr_statp->pr_status) {
2402 		case PR_UNACKED:
2403 			/*
2404 			 * Only the most recent 2 probes can be unacknowledged
2405 			 */
2406 			assert(i == most_recent || i == second_most_recent);
2407 
2408 			tg = pr_statp->pr_target;
2409 			/*
2410 			 * Target is guaranteed to exist in the unack. state
2411 			 */
2412 			assert(tg != NULL);
2413 			/*
2414 			 * The crtt could be zero for some reason,
2415 			 * Eg. the phyint could be failed. If the crtt is
2416 			 * not available use the group's probe interval,
2417 			 * which is a worst case estimate.
2418 			 */
2419 			timeout = ns2ms(pr_statp->pr_hrtime_start);
2420 			if (tg->tg_crtt != 0) {
2421 				timeout += tg->tg_crtt;
2422 			} else {
2423 				timeout +=
2424 				    pii->pii_phyint->pi_group->pg_probeint;
2425 			}
2426 
2427 			if (TIME_GT(timeout, now))
2428 				break;
2429 
2430 			pr_statp->pr_time_lost = timeout;
2431 			probe_chstate(pr_statp, pii, PR_LOST);
2432 			/* FALLTHRU */
2433 
2434 		case PR_LOST:
2435 			if (!pi_found_success) {
2436 				pfinfo->pf_nfail++;
2437 				pfinfo->pf_tff = pr_statp->pr_time_lost;
2438 			}
2439 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2440 			    !tg_found_success)  {
2441 				pfinfo->pf_nfail_tg++;
2442 			}
2443 			break;
2444 
2445 		default:
2446 			/*
2447 			 * We hit a success or unused slot. Latch the
2448 			 * total number of recent consecutive failures.
2449 			 */
2450 			pi_found_success = _B_TRUE;
2451 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2452 				/*
2453 				 * We hit a success for the desired target.
2454 				 * Latch the number of recent consecutive
2455 				 * failures for this target
2456 				 */
2457 				tg_found_success = _B_TRUE;
2458 			}
2459 		}
2460 	}
2461 }
2462 
2463 /*
2464  * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
2465  */
2466 void
2467 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
2468 {
2469 	if (pr->pr_status == state)
2470 		return;
2471 
2472 	pr->pr_status = state;
2473 	(void) probe_state_event(pr, pii);
2474 }
2475 
2476 /*
2477  * Check if the phyint has been repaired.  If no test address has been
2478  * configured, then consider the interface repaired if the link is up (unless
2479  * the link is flapping; see below).  Otherwise, look for proof of probes
2480  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2481  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2482  */
2483 static boolean_t
2484 phyint_repaired(struct phyint *pi)
2485 {
2486 	struct	probe_success_count psinfo;
2487 	struct	phyint_instance *pii;
2488 	struct	target *cur_tg;
2489 	int	pr_ndx;
2490 	uint_t	cur_time;
2491 
2492 	if (debug & D_FAILREP)
2493 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2494 
2495 	if (LINK_DOWN(pi))
2496 		return (_B_FALSE);
2497 
2498 	/*
2499 	 * If we don't have any test addresses and the link is up, then
2500 	 * consider the interface repaired, unless we've received more than
2501 	 * LINK_UP_PERMIN link up notifications in the last minute, in
2502 	 * which case we keep the link down until we drop back below
2503 	 * the threshold.
2504 	 */
2505 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2506 		cur_time = getcurrenttime();
2507 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2508 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2509 			pi->pi_lfmsg_printed = 0;
2510 			return (_B_TRUE);
2511 		}
2512 		if (!pi->pi_lfmsg_printed) {
2513 			logerr("The link has come up on %s more than %d times "
2514 			    "in the last minute; disabling repair until it "
2515 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2516 			pi->pi_lfmsg_printed = 1;
2517 		}
2518 
2519 		return (_B_FALSE);
2520 	}
2521 
2522 	pii = pi->pi_v4;
2523 	if (PROBE_CAPABLE(pii)) {
2524 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2525 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2526 		probe_success_info(pii, cur_tg, &psinfo);
2527 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2528 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2529 			return (_B_TRUE);
2530 	}
2531 
2532 	pii = pi->pi_v6;
2533 	if (PROBE_CAPABLE(pii)) {
2534 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2535 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2536 		probe_success_info(pii, cur_tg, &psinfo);
2537 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2538 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2539 			return (_B_TRUE);
2540 	}
2541 
2542 	return (_B_FALSE);
2543 }
2544 
2545 /*
2546  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2547  */
2548 boolean_t
2549 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
2550 {
2551 	int ifsock;
2552 	struct lifreq lifr;
2553 	uint64_t old_flags;
2554 
2555 	if (debug & D_FAILREP) {
2556 		logdebug("change_pif_flags(%s): set %llx clear %llx\n",
2557 		    pi->pi_name, set, clear);
2558 	}
2559 
2560 	if (pi->pi_v4 != NULL)
2561 		ifsock = ifsock_v4;
2562 	else
2563 		ifsock = ifsock_v6;
2564 
2565 	/*
2566 	 * Get the current flags from the kernel, and set/clear the
2567 	 * desired phyint flags. Since we set only phyint flags, we can
2568 	 * do it on either IPv4 or IPv6 instance.
2569 	 */
2570 	(void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2571 
2572 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2573 		if (errno != ENXIO)
2574 			logperror("change_pif_flags: ioctl (get flags)");
2575 		return (_B_FALSE);
2576 	}
2577 
2578 	old_flags = lifr.lifr_flags;
2579 	lifr.lifr_flags |= set;
2580 	lifr.lifr_flags &= ~clear;
2581 
2582 	if (old_flags == lifr.lifr_flags) {
2583 		/* No change in the flags. No need to send ioctl */
2584 		return (_B_TRUE);
2585 	}
2586 
2587 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2588 		if (errno != ENXIO)
2589 			logperror("change_pif_flags: ioctl (set flags)");
2590 		return (_B_FALSE);
2591 	}
2592 
2593 	/*
2594 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2595 	 * phyint flags.
2596 	 */
2597 	pi->pi_flags |= set;
2598 	pi->pi_flags &= ~clear;
2599 
2600 	if (pi->pi_v4 != NULL)
2601 		pi->pi_v4->pii_flags = pi->pi_flags;
2602 
2603 	if (pi->pi_v6 != NULL)
2604 		pi->pi_v6->pii_flags = pi->pi_flags;
2605 
2606 	return (_B_TRUE);
2607 }
2608 
/*
 * Compute the 16-bit one's complement Internet checksum (as used by
 * ICMP over IPv4) of the `len' bytes starting at `addr'.
 *
 * The data is summed as native 16-bit words; an odd trailing byte is
 * treated as if it were followed by a zero byte.  The return value is the
 * complemented sum, always in the range 0 - 0xffff.  A `len' of zero or
 * less yields 0xffff.
 *
 * The sum is accumulated in an unsigned 64-bit variable so that no carry
 * can be lost and no signed overflow can occur, whatever the value of
 * `len'.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	const ushort_t	*w = addr;
	uint64_t	sum = 0;
	ushort_t	odd_word = 0;
	int		nleft = len;

	/* Add up all complete 16-bit words. */
	while (nleft > 1) {
		sum += *w++;
		nleft -= 2;
	}

	/*
	 * Mop up an odd byte, if necessary.  It is stored into the first
	 * byte (in memory order) of a zeroed word, which pads it with a
	 * trailing zero byte regardless of the host's endianness.
	 */
	if (nleft == 1) {
		*(uchar_t *)&odd_word = *(const uchar_t *)w;
		sum += odd_word;
	}

	/* Fold the carries back into the low 16 bits until none remain. */
	while ((sum >> 16) != 0)
		sum = (sum >> 16) + (sum & 0xffff);

	/* Complement and truncate to 16 bits. */
	return ((ushort_t)~sum);
}
2646 
2647 static void
2648 reset_snxt_basetimes(void)
2649 {
2650 	struct phyint_instance *pii;
2651 
2652 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2653 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2654 	}
2655 }
2656 
2657 /*
2658  * Is the address one of our own addresses? Unfortunately,
2659  * we cannot check our phyint tables to determine if the address
2660  * is our own. This is because, we don't track interfaces that
2661  * are not part of any group. We have to either use a 'bind' or
2662  * get the complete list of all interfaces using SIOCGLIFCONF,
2663  * to do this check. We could also use SIOCTMYADDR.
2664  * Bind fails for the local zone address, so we might include local zone
2665  * address as target address. If local zone address is a target address
2666  * and it is up, it is not possible to detect the interface failure.
2667  * SIOCTMYADDR also doesn't consider local zone address as own address.
2668  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2669  * are stored in `localaddrs'
2670  */
2671 boolean_t
2672 own_address(struct in6_addr addr)
2673 {
2674 	addrlist_t *addrp;
2675 	struct sockaddr_storage ss;
2676 	int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
2677 
2678 	addr2storage(af, &addr, &ss);
2679 	for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
2680 		if (sockaddrcmp(&ss, &addrp->al_addr))
2681 			return (_B_TRUE);
2682 	}
2683 	return (_B_FALSE);
2684 }
2685 
/*
 * Convert the nanosecond value `ns' to milliseconds, discarding the
 * fractional part.  The result is returned as an int.
 */
static int
ns2ms(int64_t ns)
{
	const int64_t ns_per_ms = NANOSEC / MILLISEC;
	int64_t ms = ns / ns_per_ms;

	return (ms);
}
2691 
/*
 * Convert the timeval pointed to by `tvp' to nanoseconds.
 *
 * Both fields are widened to int64_t before being scaled, so that the
 * arithmetic is carried out in 64 bits even where tv_sec is a 32-bit
 * quantity and irrespective of the integer type of the NANOSEC constant.
 */
static int64_t
tv2ns(struct timeval *tvp)
{
	int64_t sec_ns = (int64_t)tvp->tv_sec * NANOSEC;
	int64_t usec_ns = (int64_t)tvp->tv_usec * 1000;

	return (sec_ns + usec_ns);
}
2697