1/*
2 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
3 * Use is subject to license terms.
4 */
5
6/*
7 * Copyright (c) 1987 Regents of the University of California.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms are permitted
11 * provided that the above copyright notice and this paragraph are
12 * duplicated in all such forms and that any documentation,
13 * advertising materials, and other materials related to such
14 * distribution and use acknowledge that the software was developed
15 * by the University of California, Berkeley. The name of the
16 * University may not be used to endorse or promote products derived
17 * from this software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21 */
22
23#include "mpd_defs.h"
24#include "mpd_tables.h"
25
26/*
27 * Probe types for probe()
28 */
29#define	PROBE_UNI	0x1234		/* Unicast probe packet */
30#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
31#define	PROBE_RTT	0x9abc		/* RTT only probe packet */
32
33#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */
34
/*
 * Layout of a probe and of a probe response as carried in the packet.
 * A probe is an ICMP Echo request and the response is an ICMP Echo reply.
 * The same layout is used for IPv4 and for IPv6.  The fields up to and
 * including pr_icmp_seq form the ICMP echo header; the remaining two
 * fields are payload defined by this daemon.
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* ICMP type */
	uint8_t		pr_icmp_code;		/* ICMP code */
	uint16_t	pr_icmp_cksum;		/* ICMP checksum */
	uint16_t	pr_icmp_id;		/* echo identifier */
	uint16_t	pr_icmp_seq;		/* echo sequence number */
	uint64_t	pr_icmp_timestamp;	/* send time stamp, in ns */
	uint32_t	pr_icmp_mtype;		/* probe type (PROBE_*) */
};
49
50static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
51				    0x0, 0x0, 0x0, 0x0,
52				    0x0, 0x0, 0x0, 0x0,
53				    0x0, 0x0, 0x0, 0x1 } };
54
55static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
56
57static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
58
59static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
60    int cmsg_type);
61static void		pi_set_crtt(struct target *tg, int64_t m,
62    boolean_t is_probe_uni);
63static void		incoming_echo_reply(struct phyint_instance *pii,
64    struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
65static void		incoming_rtt_reply(struct phyint_instance *pii,
66    struct pr_icmp *reply, struct in6_addr fromaddr);
67static void		incoming_mcast_reply(struct phyint_instance *pii,
68    struct pr_icmp *reply, struct in6_addr fromaddr);
69
70static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
71static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
72static boolean_t	check_exception_target(struct phyint_instance *pii,
73    struct target *target);
74static void		probe_fail_info(struct phyint_instance *pii,
75    struct target *cur_tg, struct probe_fail_count *pfinfo);
76static void		probe_success_info(struct phyint_instance *pii,
77    struct target *cur_tg, struct probe_success_count *psinfo);
78static boolean_t	phyint_repaired(struct phyint *pi);
79
80static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
81static int 		in_cksum(ushort_t *addr, int len);
82static void		reset_snxt_basetimes(void);
83static int		ns2ms(int64_t ns);
84static int64_t		tv2ns(struct timeval *);
85
86/*
87 * CRTT - Conservative Round Trip Time Estimate
88 * Probe success - A matching probe reply received before CRTT ms has elapsed
89 *	after sending the probe.
90 * Probe failure - No probe reply received and more than CRTT ms has elapsed
91 *	after sending the probe.
92 *
93 * TLS - Time last success. Most recent probe ack received at this time.
94 * TFF - Time first fail. The time of the earliest probe failure in
95 *	a consecutive series of probe failures.
96 * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
97 * 	before declaring phyint repair.
98 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
99 *	declare a phyint failure.
100 *
101 * 			Phyint state diagram
102 *
103 * The state of a phyint that is capable of being probed, is completely
104 * specified by the 3-tuple <pi_state, pg_state, I>.
105 *
106 * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
107 * IFF_OFFLINE is set.  If the phyint is also configured with a test address
108 * (the common case) and probe targets, then a phyint must also successfully
109 * be able to send and receive probes in order to remain in the PI_RUNNING
110 * state (otherwise, it transitions to PI_FAILED).
111 *
112 * Further, if a PI_RUNNING phyint is configured with a test address but is
113 * unable to find any probe targets, it will transition to the PI_NOTARGETS
114 * state, which indicates that the link is apparently functional but that
115 * in.mpathd is unable to send probes to verify functionality (in this case,
116 * in.mpathd makes the optimistic assumption that the interface is working
117 * correctly and thus does not mark the interface FAILED, but reports it as
118 * IPMP_IF_UNKNOWN through the async events and query interfaces).
119 *
120 * At any point, a phyint may be administratively marked offline via if_mpadm.
121 * In this case, the interface always transitions to PI_OFFLINE, regardless
122 * of its previous state.  When the interface is later brought back online,
123 * in.mpathd acts as if the interface is new (and thus it transitions to
124 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
125 * its probes, if probes are sent).
126 *
127 * pi_state -  PI_RUNNING or PI_FAILED
128 *	PI_RUNNING: The failure detection logic says the phyint is good.
129 *	PI_FAILED: The failure detection logic says the phyint has failed.
130 *
131 * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
132 *	PG_OK: All interfaces in the group are OK.
133 *	PG_DEGRADED: Some interfaces in the group are unusable.
134 *	PG_FAILED: All interfaces in the group are unusable.
135 *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts, to reconstruct the
 *	target list. So the phyints are in the PI_NOTARGETS state.
141 *
142 * I -	value of (pi_flags & IFF_INACTIVE)
143 *	IFF_INACTIVE: This phyint will not send or receive packets.
144 *	Usually, inactive is tied to standby interfaces that are not yet
145 *	needed (e.g., no non-standby interfaces in the group have failed).
146 *	When failback has been disabled (FAILBACK=no configured), phyint can
147 *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
148 *	subsequently recovers after a failure.
149 *
150 * Not all 9 possible combinations of the above 3-tuple are possible.
151 *
152 * I is tracked by IP. pi_state is tracked by mpathd.
153 *
154 *			pi_state state machine
155 * ---------------------------------------------------------------------------
156 *	Event			State			New State
157 *				Action:
158 * ---------------------------------------------------------------------------
159 *	IP interface failure	(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
160 *	detection		: set IFF_FAILED on this phyint
161 *
162 *	IP interface failure	(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
163 *	detection		: set IFF_FAILED on this phyint
164 *
165 *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=yes)
166 *	detection				     -> (PI_RUNNING, I == 0)
167 *				: clear IFF_FAILED on this phyint
168 *
169 *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=no)
170 *	detection				     ->	(PI_RUNNING, I == 1)
171 *				: clear IFF_FAILED on this phyint
172 *				: if failback is disabled set I == 1
173 *
174 *	Group failure		(perform on all phyints in the group)
175 *	detection 		PI_RUNNING		PI_FAILED
176 *	(Router targets)	: set IFF_FAILED
177 *
178 *	Group failure		(perform on all phyints in the group)
179 *	detection 		PI_RUNNING		PI_NOTARGETS
180 *	(Host targets)		: set IFF_FAILED
181 *				: delete the target list on all phyints
182 * ---------------------------------------------------------------------------
183 */
184
/*
 * Instance of struct probes_missed with external linkage (not static), so
 * it is visible to other translation units.
 * NOTE(review): struct probes_missed is declared outside this view
 * (presumably in mpd_defs.h / mpd_tables.h); confirm its fields and who
 * updates it before relying on it.
 */
struct probes_missed probes_missed;
186
187/*
188 * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
189 * will be added on by the kernel.  The id field identifies this phyint.
190 * and the sequence number is an increasing (modulo 2^^16) integer. The data
191 * portion holds the time value when the packet is sent. On echo this is
192 * extracted to compute the round-trip time. Three different types of
193 * probe packets are used.
194 *
195 * PROBE_UNI: This type is used to do failure detection / failure recovery
196 *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
197 *	not less than the current CRTT. pii_probes[] stores data
198 *	about these probes. These packets consume sequence number space.
199 *
200 * PROBE_RTT: This type is used to make only rtt measurements. Normally these
201 * 	are not used. Under heavy network load, the rtt may go up very high,
202 *	due to a spike, or may appear to go high, due to extreme scheduling
203 * 	delays. Once the network stress is removed, mpathd takes long time to
204 *	recover, because the probe_interval is already high, and it takes
205 *	a long time to send out sufficient number of probes to bring down the
206 *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
207 *	user_probe_interval ms. and will cause only rtt updates. These packets
208 *	do not consume sequence number space nor is information about these
209 *	packets stored in the pii_probes[]
210 *
211 * PROBE_MULTI: This type is only used to construct a list of targets, when
212 *	no targets are known. The packet is multicast to the all hosts addr.
213 */
214static void
215probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
216{
217	hrtime_t sent_hrtime;
218	struct timeval sent_tv;
219	struct pr_icmp probe_pkt;	/* Probe packet */
220	struct sockaddr_storage targ;	/* target address */
221	uint_t	targaddrlen;		/* targed address length */
222	int	pr_ndx;			/* probe index in pii->pii_probes[] */
223	boolean_t sent = _B_FALSE;
224	int	rval;
225
226	if (debug & D_TARGET) {
227		logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
228		    pii->pii_name, probe_type, start_hrtime);
229	}
230
231	assert(pii->pii_probe_sock != -1);
232	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
233	    probe_type == PROBE_RTT);
234
235	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
236	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
237	probe_pkt.pr_icmp_code = 0;
238	probe_pkt.pr_icmp_cksum = 0;
239	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
240
241	/*
242	 * Since there is no need to do arithmetic on the icmpid,
243	 * (only equality check is done) pii_icmpid is stored in
244	 * network byte order at initialization itself.
245	 */
246	probe_pkt.pr_icmp_id = pii->pii_icmpid;
247	probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
248	probe_pkt.pr_icmp_mtype = htonl(probe_type);
249
250	/*
251	 * If probe_type is PROBE_MULTI, this packet will be multicast to
252	 * the all hosts address. Otherwise it is unicast to the next target.
253	 */
254	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
255	    pii->pii_rtt_target_next != NULL));
256
257	bzero(&targ, sizeof (targ));
258	targ.ss_family = pii->pii_af;
259
260	if (pii->pii_af == AF_INET6) {
261		struct in6_addr *addr6;
262
263		addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
264		targaddrlen = sizeof (struct sockaddr_in6);
265		if (probe_type == PROBE_MULTI) {
266			*addr6 = all_nodes_mcast_v6;
267		} else if (probe_type == PROBE_UNI) {
268			*addr6 = pii->pii_target_next->tg_address;
269		} else { /* type is PROBE_RTT */
270			*addr6 = pii->pii_rtt_target_next->tg_address;
271		}
272	} else {
273		struct in_addr *addr4;
274
275		addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
276		targaddrlen = sizeof (struct sockaddr_in);
277		if (probe_type == PROBE_MULTI) {
278			*addr4 = all_nodes_mcast_v4;
279		} else if (probe_type == PROBE_UNI) {
280			IN6_V4MAPPED_TO_INADDR(
281			    &pii->pii_target_next->tg_address, addr4);
282		} else { /* type is PROBE_RTT */
283			IN6_V4MAPPED_TO_INADDR(
284			    &pii->pii_rtt_target_next->tg_address, addr4);
285		}
286
287		/*
288		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
289		 */
290		probe_pkt.pr_icmp_cksum =
291		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
292	}
293
294	/*
295	 * Use the current time as the time we sent.  Not atomic, but the best
296	 * we can do from here.
297	 */
298	sent_hrtime = gethrtime();
299	(void) gettimeofday(&sent_tv, NULL);
300	rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
301	    (struct sockaddr *)&targ, targaddrlen);
302	/*
303	 * If the send would block, this may either be transient or a hang in a
304	 * lower layer. We pretend the probe was actually sent, the daemon will
305	 * not see a reply to the probe and will fail the interface if normal
306	 * failure detection criteria are met.
307	 */
308	if (rval == sizeof (probe_pkt) ||
309	    (rval == -1 && errno == EWOULDBLOCK)) {
310		sent = _B_TRUE;
311	} else {
312		logperror_pii(pii, "probe: probe sendto");
313	}
314
315	/*
316	 * If this is a PROBE_UNI probe packet being unicast to a target, then
317	 * update our tables. We will need this info in processing the probe
318	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
319	 * the purpose of failure or recovery detection. PROBE_MULTI packets
320	 * are only used to construct a list of targets. PROBE_RTT packets are
321	 * used only for updating the rtt and not for failure detection.
322	 */
323	if (probe_type == PROBE_UNI && sent) {
324		pr_ndx = pii->pii_probe_next;
325		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
326
327		/* Collect statistics, before we reuse the last slot. */
328		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
329			pii->pii_cum_stats.lost++;
330		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
331			pii->pii_cum_stats.acked++;
332		pii->pii_cum_stats.sent++;
333
334		pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
335		pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
336		pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
337		pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
338		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
339		probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
340
341		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
342		pii->pii_target_next = target_next(pii->pii_target_next);
343		assert(pii->pii_target_next != NULL);
344		/*
345		 * If we have a single variable to denote the next target to
346		 * probe for both rtt probes and failure detection probes, we
347		 * could end up with a situation where the failure detection
348		 * probe targets become disjoint from the rtt probe targets.
349		 * Eg. if 2 targets and the actual fdt is double the user
350		 * specified fdt. So we have 2 variables. In this scheme
351		 * we also reset pii_rtt_target_next for every fdt probe,
352		 * though that may not be necessary.
353		 */
354		pii->pii_rtt_target_next = pii->pii_target_next;
355		pii->pii_snxt++;
356	} else if (probe_type == PROBE_RTT) {
357		pii->pii_rtt_target_next =
358		    target_next(pii->pii_rtt_target_next);
359		assert(pii->pii_rtt_target_next != NULL);
360	}
361}
362
363/*
364 * Incoming IPv4 data from wire, is received here. Called from main.
365 */
366void
367in_data(struct phyint_instance *pii)
368{
369	struct	sockaddr_in 	from;
370	struct	in6_addr	fromaddr;
371	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
372	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
373	struct ip *ip;
374	int 	iphlen;
375	int 	len;
376	char 	abuf[INET_ADDRSTRLEN];
377	struct msghdr msg;
378	struct iovec iov;
379	struct pr_icmp *reply;
380	struct timeval *recv_tvp;
381
382	if (debug & D_PROBE) {
383		logdebug("in_data(%s %s)\n",
384		    AF_STR(pii->pii_af), pii->pii_name);
385	}
386
387	iov.iov_base = (char *)in_packet;
388	iov.iov_len = sizeof (in_packet);
389	msg.msg_iov = &iov;
390	msg.msg_iovlen = 1;
391	msg.msg_name = (struct sockaddr *)&from;
392	msg.msg_namelen = sizeof (from);
393	msg.msg_control = ancillary_data;
394	msg.msg_controllen = sizeof (ancillary_data);
395
396	/*
397	 * Poll has already told us that a message is waiting,
398	 * on this socket. Read it now. We should not block.
399	 */
400	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
401		logperror_pii(pii, "in_data: recvmsg");
402		return;
403	}
404
405	/*
406	 * If the datalink has indicated the link is down, don't go
407	 * any further.
408	 */
409	if (LINK_DOWN(pii->pii_phyint))
410		return;
411
412	/* Get the printable address for error reporting */
413	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
414
415	/* Ignore packets > 64k or control buffers that don't fit */
416	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
417		if (debug & D_PKTBAD) {
418			logdebug("Truncated message: msg_flags 0x%x from %s\n",
419			    msg.msg_flags, abuf);
420		}
421		return;
422	}
423
424	/* Make sure packet contains at least minimum ICMP header */
425	ip = (struct ip *)in_packet;
426	iphlen = ip->ip_hl << 2;
427	if (len < iphlen + ICMP_MINLEN) {
428		if (debug & D_PKTBAD) {
429			logdebug("in_data: packet too short (%d bytes)"
430			    " from %s\n", len, abuf);
431		}
432		return;
433	}
434
435	/*
436	 * Subtract the IP hdr length, 'len' will be length of the probe
437	 * reply, starting from the icmp hdr.
438	 */
439	len -= iphlen;
440	/* LINTED */
441	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
442
443	/* Probe replies are icmp echo replies. Ignore anything else */
444	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
445		return;
446
447	/*
448	 * The icmp id should match what we sent, which is stored
449	 * in pi_icmpid. The icmp code for reply must be 0.
450	 * The reply content must be a struct pr_icmp
451	 */
452	if (reply->pr_icmp_id != pii->pii_icmpid) {
453		/* Not in response to our probe */
454		return;
455	}
456
457	if (reply->pr_icmp_code != 0) {
458		logtrace("probe reply code %d from %s on %s\n",
459		    reply->pr_icmp_code, abuf, pii->pii_name);
460		return;
461	}
462
463	if (len < sizeof (struct pr_icmp)) {
464		logtrace("probe reply too short: %d bytes from %s on %s\n",
465		    len, abuf, pii->pii_name);
466		return;
467	}
468
469	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
470	if (recv_tvp == NULL) {
471		logtrace("message without timestamp from %s on %s\n",
472		    abuf, pii->pii_name);
473		return;
474	}
475
476	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
477	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
478		/* Unicast probe reply */
479		incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
480	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
481		/* Multicast reply */
482		incoming_mcast_reply(pii, reply, fromaddr);
483	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
484		incoming_rtt_reply(pii, reply, fromaddr);
485	} else {
486		/* Probably not in response to our probe */
487		logtrace("probe reply type: %d from %s on %s\n",
488		    reply->pr_icmp_mtype, abuf, pii->pii_name);
489		return;
490	}
491}
492
493/*
494 * Incoming IPv6 data from wire is received here. Called from main.
495 */
496void
497in6_data(struct phyint_instance *pii)
498{
499	struct sockaddr_in6 from;
500	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
501	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
502	int len;
503	char abuf[INET6_ADDRSTRLEN];
504	struct msghdr msg;
505	struct iovec iov;
506	void	*opt;
507	struct	pr_icmp *reply;
508	struct	timeval *recv_tvp;
509
510	if (debug & D_PROBE) {
511		logdebug("in6_data(%s %s)\n",
512		    AF_STR(pii->pii_af), pii->pii_name);
513	}
514
515	iov.iov_base = (char *)in_packet;
516	iov.iov_len = sizeof (in_packet);
517	msg.msg_iov = &iov;
518	msg.msg_iovlen = 1;
519	msg.msg_name = (struct sockaddr *)&from;
520	msg.msg_namelen = sizeof (from);
521	msg.msg_control = ancillary_data;
522	msg.msg_controllen = sizeof (ancillary_data);
523
524	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
525		logperror_pii(pii, "in6_data: recvmsg");
526		return;
527	}
528
529	/*
530	 * If the datalink has indicated that the link is down, don't go
531	 * any further.
532	 */
533	if (LINK_DOWN(pii->pii_phyint))
534		return;
535
536	/* Get the printable address for error reporting */
537	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
538	if (len < ICMP_MINLEN) {
539		if (debug & D_PKTBAD) {
540			logdebug("Truncated message: msg_flags 0x%x from %s\n",
541			    msg.msg_flags, abuf);
542		}
543		return;
544	}
545	/* Ignore packets > 64k or control buffers that don't fit */
546	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
547		if (debug & D_PKTBAD) {
548			logdebug("Truncated message: msg_flags 0x%x from %s\n",
549			    msg.msg_flags, abuf);
550		}
551		return;
552	}
553
554	reply = (struct pr_icmp *)in_packet;
555	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
556		return;
557
558	if (reply->pr_icmp_id != pii->pii_icmpid) {
559		/* Not in response to our probe */
560		return;
561	}
562
563	/*
564	 * The kernel has already verified the the ICMP checksum.
565	 */
566	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
567		logtrace("ICMPv6 echo reply source address not linklocal from "
568		    "%s on %s\n", abuf, pii->pii_name);
569		return;
570	}
571	opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
572	if (opt != NULL) {
573		/* Can't allow routing headers in probe replies  */
574		logtrace("message with routing header from %s on %s\n",
575		    abuf, pii->pii_name);
576		return;
577	}
578
579	if (reply->pr_icmp_code != 0) {
580		logtrace("probe reply code: %d from %s on %s\n",
581		    reply->pr_icmp_code, abuf, pii->pii_name);
582		return;
583	}
584	if (len < (sizeof (struct pr_icmp))) {
585		logtrace("probe reply too short: %d bytes from %s on %s\n",
586		    len, abuf, pii->pii_name);
587		return;
588	}
589
590	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
591	if (recv_tvp == NULL) {
592		logtrace("message without timestamp from %s on %s\n",
593		    abuf, pii->pii_name);
594		return;
595	}
596
597	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
598		incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
599	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
600		incoming_mcast_reply(pii, reply, from.sin6_addr);
601	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
602		incoming_rtt_reply(pii, reply, from.sin6_addr);
603	} else  {
604		/* Probably not in response to our probe */
605		logtrace("probe reply type: %d from %s on %s\n",
606		    reply->pr_icmp_mtype, abuf, pii->pii_name);
607	}
608}
609
610/*
611 * Process the incoming rtt reply, in response to our rtt probe.
612 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
613 * have any stored information about the probe we sent. So we don't log
614 * any errors if we receive bad replies.
615 */
616static void
617incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
618    struct in6_addr fromaddr)
619{
620	int64_t	m;		/* rtt measurement in ns */
621	char	abuf[INET6_ADDRSTRLEN];
622	struct	target	*target;
623	struct 	phyint_group *pg;
624
625	/* Get the printable address for error reporting */
626	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
627
628	if (debug & D_PROBE) {
629		logdebug("incoming_rtt_reply: %s %s %s\n",
630		    AF_STR(pii->pii_af), pii->pii_name, abuf);
631	}
632
633	/* Do we know this target ? */
634	target = target_lookup(pii, fromaddr);
635	if (target == NULL)
636		return;
637
638	m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
639	/* Invalid rtt. It has wrapped around */
640	if (m < 0)
641		return;
642
643	/*
644	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
645	 * The initial few responses after the interface is repaired may
646	 * contain high rtt's because they could have been queued up waiting
647	 * for ARP/NDP resolution on a failed interface.
648	 */
649	pg = pii->pii_phyint->pi_group;
650	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
651		return;
652
653	/*
654	 * Update rtt only if the new rtt is lower than the current rtt.
655	 * (specified by the 3rd parameter to pi_set_crtt).
656	 * If a spike has caused the current probe_interval to be >
657	 * user_probe_interval, then this mechanism is used to bring down
658	 * the rtt rapidly once the network stress is removed.
659	 * If the new rtt is higher than the current rtt, we don't want to
660	 * update the rtt. We are having more than 1 outstanding probe and
661	 * the increase in rtt we are seeing is being unnecessarily weighted
662	 * many times. The regular rtt update will be handled by
663	 * incoming_echo_reply() and will take care of any rtt increase.
664	 */
665	pi_set_crtt(target, m, _B_FALSE);
666	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
667	    (user_failure_detection_time < pg->pg_fdt) &&
668	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
669		/*
670		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
671		 * investigate if we can improve the failure detection time to
672		 * meet whatever the user specified.
673		 */
674		if (check_pg_crtt_improved(pg)) {
675			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
676			    user_failure_detection_time);
677			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
678			if (pii->pii_phyint->pi_group != phyint_anongroup) {
679				logerr("Improved failure detection time %d ms "
680				    "on (%s %s) for group \"%s\"\n",
681				    pg->pg_fdt, AF_STR(pii->pii_af),
682				    pii->pii_name,
683				    pii->pii_phyint->pi_group->pg_name);
684			}
685			if (user_failure_detection_time == pg->pg_fdt) {
686				/* Avoid any truncation or rounding errors */
687				pg->pg_probeint = user_probe_interval;
688				/*
689				 * No more rtt probes will be sent. The actual
690				 * fdt has dropped to the user specified value.
691				 * pii_fd_snxt_basetime and pii_snxt_basetime
692				 * will be in sync henceforth.
693				 */
694				reset_snxt_basetimes();
695			}
696		}
697	}
698}
699
700/*
701 * Process the incoming echo reply, in response to our unicast probe.
702 * Common for both IPv4 and IPv6
703 */
704static void
705incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
706    struct in6_addr fromaddr, struct timeval *recv_tvp)
707{
708	int64_t	m;		/* rtt measurement in ns */
709	hrtime_t cur_hrtime;	/* in ns from some arbitrary point */
710	char	abuf[INET6_ADDRSTRLEN];
711	int	pr_ndx;
712	struct	target	*target;
713	boolean_t exception;
714	uint64_t pr_icmp_timestamp;
715	uint16_t pr_icmp_seq;
716	struct	probe_stats *pr_statp;
717	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
718
719	/* Get the printable address for error reporting */
720	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
721
722	if (debug & D_PROBE) {
723		logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
724		    AF_STR(pii->pii_af), pii->pii_name, abuf,
725		    ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
726	}
727
728	pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
729	pr_icmp_seq = ntohs(reply->pr_icmp_seq);
730
731	/* Reject out of window probe replies */
732	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
733	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
734		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
735		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
736		pii->pii_cum_stats.unknown++;
737		return;
738	}
739
740	cur_hrtime = gethrtime();
741	m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
742	if (m < 0) {
743		/*
744		 * This is a ridiculously high value of rtt. rtt has wrapped
745		 * around. Log a message, and ignore the rtt.
746		 */
747		logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
748		    "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
749	}
750
751	/*
752	 * Get the probe index pr_ndx corresponding to the received icmp seq.
753	 * number in our pii->pii_probes[] array. The icmp sequence number
754	 * pii_snxt corresponds to the probe index pii->pii_probe_next
755	 */
756	pr_ndx = MOD_SUB(pii->pii_probe_next,
757	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
758
759	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
760
761	target = pii->pii_probes[pr_ndx].pr_target;
762
763	/*
764	 * Perform sanity checks, whether this probe reply that we
765	 * have received is genuine
766	 */
767	if (target != NULL) {
768		/*
769		 * Compare the src. addr of the received ICMP or ICMPv6
770		 * probe reply with the target address in our tables.
771		 */
772		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
773			/*
774			 * We don't have any record of having sent a probe to
775			 * this target. This is a fake probe reply. Log an error
776			 */
777			logtrace("probe status %d Fake probe reply seq %u "
778			    "snxt %u on %s from %s\n",
779			    pii->pii_probes[pr_ndx].pr_status,
780			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
781			pii->pii_cum_stats.unknown++;
782			return;
783		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
784			/*
785			 * The address matches, but our tables indicate that
786			 * this probe reply has been acked already. So this
787			 * is a duplicate probe reply. Log an error
788			 */
789			logtrace("probe status %d Duplicate probe reply seq %u "
790			    "snxt %u on %s from %s\n",
791			    pii->pii_probes[pr_ndx].pr_status,
792			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
793			pii->pii_cum_stats.unknown++;
794			return;
795		}
796	} else {
797		/*
798		 * Target must not be NULL in the PR_UNACKED state
799		 */
800		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
801		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
802			/*
803			 * The probe stats slot is unused. So we didn't
804			 * send out any probe to this target. This is a fake.
805			 * Log an error.
806			 */
807			logtrace("probe status %d Fake probe reply seq %u "
808			    "snxt %u on %s from %s\n",
809			    pii->pii_probes[pr_ndx].pr_status,
810			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
811		}
812		pii->pii_cum_stats.unknown++;
813		return;
814	}
815
816	/*
817	 * If the rtt does not appear to be right, don't update the
818	 * rtt stats. This can happen if the system dropped into the
819	 * debugger, or the system was hung or too busy for a
820	 * substantial time that we didn't get a chance to run.
821	 */
822	if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
823		/*
824		 * If the probe corresponding to this received response
825		 * was truly sent 'm' ns. ago, then this response must
826		 * have been rejected by the sequence number checks. The
827		 * fact that it has passed the sequence number checks
828		 * means that the measured rtt is wrong. We were probably
829		 * scheduled long after the packet was received.
830		 */
831		goto out;
832	}
833
834	/*
835	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
836	 * The initial few responses after the interface is repaired may
837	 * contain high rtt's because they could have been queued up waiting
838	 * for ARP/NDP resolution on a failed interface.
839	 */
840	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
841		goto out;
842
843	/*
844	 * Don't update the Conservative Round Trip Time estimate for this
845	 * (phint, target) pair if this is the not the highest ack seq seen
846	 * thus far on this target.
847	 */
848	if (!highest_ack_tg(pr_icmp_seq, target))
849		goto out;
850
851	/*
852	 * Always update the rtt. This is a failure detection probe
853	 * and we want to measure both increase / decrease in rtt.
854	 */
855	pi_set_crtt(target, m, _B_TRUE);
856
857	/*
858	 * If the crtt exceeds the average time between probes,
859	 * investigate if this slow target is an exception. If so we
860	 * can avoid this target and still meet the failure detection
861	 * time. Otherwise we can't meet the failure detection time.
862	 */
863	if (target->tg_crtt > pg->pg_probeint) {
864		exception = check_exception_target(pii, target);
865		if (exception) {
866			/*
867			 * This target is exceptionally slow. Don't use it
868			 * for future probes. check_exception_target() has
869			 * made sure that we have at least MIN_PROBE_TARGETS
870			 * other active targets
871			 */
872			if (pii->pii_targets_are_routers) {
873				/*
874				 * This is a slow router, mark it as slow
875				 * and don't use it for further probes. We
876				 * don't delete it, since it will be populated
877				 * again when we do a router scan. Hence we
878				 * need to maintain extra state (unlike the
879				 * host case below).  Mark it as TG_SLOW.
880				 */
881				if (target->tg_status == TG_ACTIVE)
882					pii->pii_ntargets--;
883				target->tg_status = TG_SLOW;
884				target->tg_latime = gethrtime();
885				target->tg_rtt_sa = -1;
886				target->tg_crtt = 0;
887				target->tg_rtt_sd = 0;
888				if (pii->pii_target_next == target) {
889					pii->pii_target_next =
890					    target_next(target);
891				}
892			} else {
893				/*
894				 * the slow target is not a router, we can
895				 * just delete it. Send an icmp multicast and
896				 * pick the fastest responder that is not
897				 * already an active target. target_delete()
898				 * adjusts pii->pii_target_next
899				 */
900				target_delete(target);
901				probe(pii, PROBE_MULTI, cur_hrtime);
902			}
903		} else {
904			/*
905			 * We can't meet the failure detection time.
906			 * Log a message, and update the detection time to
907			 * whatever we can achieve.
908			 */
909			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
910			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
911			last_fdt_bumpup_time = gethrtime();
912			if (pg != phyint_anongroup) {
913				logtrace("Cannot meet requested failure"
914				    " detection time of %d ms on (%s %s) new"
915				    " failure detection time for group \"%s\""
916				    " is %d ms\n", user_failure_detection_time,
917				    AF_STR(pii->pii_af), pii->pii_name,
918				    pg->pg_name, pg->pg_fdt);
919			}
920		}
921	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
922	    (user_failure_detection_time < pg->pg_fdt) &&
923	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
924		/*
925		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
926		 * investigate if we can improve the failure detection time to
927		 * meet whatever the user specified.
928		 */
929		if (check_pg_crtt_improved(pg)) {
930			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
931			    user_failure_detection_time);
932			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
933			if (pg != phyint_anongroup) {
934				logtrace("Improved failure detection time %d ms"
935				    " on (%s %s) for group \"%s\"\n",
936				    pg->pg_fdt, AF_STR(pii->pii_af),
937				    pii->pii_name, pg->pg_name);
938			}
939			if (user_failure_detection_time == pg->pg_fdt) {
940				/* Avoid any truncation or rounding errors */
941				pg->pg_probeint = user_probe_interval;
942				/*
943				 * No more rtt probes will be sent. The actual
944				 * fdt has dropped to the user specified value.
945				 * pii_fd_snxt_basetime and pii_snxt_basetime
946				 * will be in sync henceforth.
947				 */
948				reset_snxt_basetimes();
949			}
950		}
951	}
952out:
953	pr_statp = &pii->pii_probes[pr_ndx];
954	pr_statp->pr_hrtime_ackproc = cur_hrtime;
955	pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
956	    (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
957
958	probe_chstate(pr_statp, pii, PR_ACKED);
959
960	/*
961	 * Update pii->pii_rack, i.e. the sequence number of the last received
962	 * probe response, based on the echo reply we have received now, if
963	 * either of the following conditions are satisfied.
964	 * a. pii_rack is outside the current receive window of
965	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
966	 *    This means we have not received probe responses for a
967	 *    long time, and the sequence number has wrapped around.
968	 * b. pii_rack is within the current receive window and this echo
969	 *    reply corresponds to the highest sequence number we have seen
970	 *    so far.
971	 */
972	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
973	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
974	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
975		pii->pii_rack = pr_icmp_seq;
976	}
977}
978
979/*
980 * Returns true if seq is the highest unacknowledged seq for target tg
981 * else returns false
982 */
983static boolean_t
984highest_ack_tg(uint16_t seq, struct target *tg)
985{
986	struct phyint_instance *pii;
987	int	 pr_ndx;
988	uint16_t pr_seq;
989
990	pii = tg->tg_phyint_inst;
991
992	/*
993	 * Get the seq number of the most recent probe sent so far,
994	 * and also get the corresponding probe index in the probe stats
995	 * array.
996	 */
997	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
998	pr_seq = pii->pii_snxt;
999	pr_seq--;
1000
1001	/*
1002	 * Start from the most recent probe and walk back, trying to find
1003	 * an acked probe corresponding to target tg.
1004	 */
1005	for (; pr_ndx != pii->pii_probe_next;
1006	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1007		if (pii->pii_probes[pr_ndx].pr_target == tg &&
1008		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1009			if (SEQ_GT(pr_seq, seq))
1010				return (_B_FALSE);
1011		}
1012	}
1013	return (_B_TRUE);
1014}
1015
1016/*
1017 * Check whether the crtt for the group has improved by a factor of
1018 * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1019 * detection time flapping in the face of small crtt changes.
1020 */
1021static boolean_t
1022check_pg_crtt_improved(struct phyint_group *pg)
1023{
1024	struct	phyint *pi;
1025
1026	if (debug & D_PROBE)
1027		logdebug("check_pg_crtt_improved()\n");
1028
1029	/*
1030	 * The crtt for the group is only improved if each phyint_instance
1031	 * for both ipv4 and ipv6 is improved.
1032	 */
1033	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1034		if (!check_pii_crtt_improved(pi->pi_v4) ||
1035		    !check_pii_crtt_improved(pi->pi_v6))
1036			return (_B_FALSE);
1037	}
1038
1039	return (_B_TRUE);
1040}
1041
1042/*
1043 * Check whether the crtt has improved substantially on this phyint_instance.
1044 * Returns _B_TRUE if there's no crtt information available, because pii
1045 * is NULL or the phyint_instance is not capable of probing.
1046 */
1047boolean_t
1048check_pii_crtt_improved(struct phyint_instance *pii) {
1049	struct 	target *tg;
1050
1051	if (pii == NULL)
1052		return (_B_TRUE);
1053
1054	if (!PROBE_CAPABLE(pii) ||
1055	    pii->pii_phyint->pi_state == PI_FAILED)
1056		return (_B_TRUE);
1057
1058	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1059		if (tg->tg_status != TG_ACTIVE)
1060			continue;
1061		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1062		    LOWER_FDT_TRIGGER)) {
1063			return (_B_FALSE);
1064		}
1065	}
1066
1067	return (_B_TRUE);
1068}
1069
1070/*
1071 * This target responds very slowly to probes. The target's crtt exceeds
1072 * the probe interval of its group. Compare against other targets
1073 * and determine if this target is an exception, if so return true, else false
1074 */
1075static boolean_t
1076check_exception_target(struct phyint_instance *pii, struct target *target)
1077{
1078	struct	target *tg;
1079	char abuf[INET6_ADDRSTRLEN];
1080
1081	if (debug & D_PROBE) {
1082		logdebug("check_exception_target(%s %s target %s)\n",
1083		    AF_STR(pii->pii_af), pii->pii_name,
1084		    pr_addr(pii->pii_af, target->tg_address,
1085		    abuf, sizeof (abuf)));
1086	}
1087
1088	/*
1089	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1090	 * to make a good judgement. Otherwise don't drop this target.
1091	 */
1092	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1093		return (_B_FALSE);
1094
1095	/*
1096	 * Determine whether only this particular target is slow.
1097	 * We know that this target's crtt exceeds the group's probe interval.
1098	 * If all other active targets have a
1099	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1100	 * then this target is considered slow.
1101	 */
1102	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1103		if (tg != target && tg->tg_status == TG_ACTIVE) {
1104			if (tg->tg_crtt >
1105			    pii->pii_phyint->pi_group->pg_probeint /
1106			    EXCEPTION_FACTOR) {
1107				return (_B_FALSE);
1108			}
1109		}
1110	}
1111
1112	return (_B_TRUE);
1113}
1114
1115/*
1116 * Update the target list. The icmp all hosts multicast has given us
1117 * some host to which we can send probes. If we already have sufficient
1118 * targets, discard it.
1119 */
1120static void
1121incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1122    struct in6_addr fromaddr)
1123/* ARGSUSED */
1124{
1125	int af;
1126	char abuf[INET6_ADDRSTRLEN];
1127	struct phyint *pi;
1128
1129	if (debug & D_PROBE) {
1130		logdebug("incoming_mcast_reply(%s %s %s)\n",
1131		    AF_STR(pii->pii_af), pii->pii_name,
1132		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1133	}
1134
1135	/*
1136	 * Using host targets is a fallback mechanism. If we have
1137	 * found a router, don't add this host target. If we already
1138	 * know MAX_PROBE_TARGETS, don't add another target.
1139	 */
1140	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1141	if (pii->pii_targets != NULL) {
1142		if (pii->pii_targets_are_routers ||
1143		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1144			return;
1145		}
1146	}
1147
1148	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1149	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1150		/*
1151		 * Guard against response from 0.0.0.0
1152		 * and ::. Log a trace message
1153		 */
1154		logtrace("probe response from %s on %s\n",
1155		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1156		    pii->pii_name);
1157		return;
1158	}
1159
1160	/*
1161	 * This address is one of our own, so reject this address as a
1162	 * valid probe target.
1163	 */
1164	af = pii->pii_af;
1165	if (own_address(fromaddr))
1166		return;
1167
1168	/*
1169	 * If the phyint is part a named group, then add the address to all
1170	 * members of the group.  Otherwise, add the address only to the
1171	 * phyint itself, since other phyints in the anongroup may not be on
1172	 * the same subnet.
1173	 */
1174	pi = pii->pii_phyint;
1175	if (pi->pi_group == phyint_anongroup) {
1176		target_add(pii, fromaddr, _B_FALSE);
1177	} else {
1178		pi = pi->pi_group->pg_phyint;
1179		for (; pi != NULL; pi = pi->pi_pgnext)
1180			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1181	}
1182}
1183
1184/*
1185 * Compute CRTT given an existing scaled average, scaled deviation estimate
1186 * and a new rtt time.  The formula is from Jacobson and Karels'
1187 * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1188 * are the same as those in Appendix A.2 of that paper.
1189 *
1190 * m = new measurement
1191 * sa = scaled RTT average (8 * average estimates)
1192 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1193 * crtt = Conservative round trip time. Used to determine whether probe
1194 * has timed out.
1195 *
1196 * New scaled average and deviation are passed back via sap and svp
1197 */
1198static int64_t
1199compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
1200{
1201	int64_t sa = *sap;
1202	int64_t sv = *svp;
1203	int64_t crtt;
1204	int64_t saved_m = m;
1205
1206	assert(*sap >= -1);
1207	assert(*svp >= 0);
1208
1209	if (sa != -1) {
1210		/*
1211		 * Update average estimator:
1212		 *	new rtt = old rtt + 1/8 Error
1213		 *	    where Error = m - old rtt
1214		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1215		 *	i.e. new sa =  old sa + Error
1216		 */
1217		m -= sa >> 3;		/* m is now Error in estimate. */
1218		if ((sa += m) < 0) {
1219			/* Don't allow the smoothed average to be negative. */
1220			sa = 0;
1221		}
1222
1223		/*
1224		 * Update deviation estimator:
1225		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1226		 *	i.e. 4 * new mdev = 4 * old mdev +
1227		 *		(abs(Error) - old mdev)
1228		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1229		 */
1230		if (m < 0)
1231			m = -m;
1232		m -= sv >> 2;
1233		sv += m;
1234	} else {
1235		/* Initialization. This is the first response received. */
1236		sa = (m << 3);
1237		sv = (m << 1);
1238	}
1239
1240	crtt = (sa >> 3) + sv;
1241
1242	if (debug & D_PROBE) {
1243		logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
1244		    "crtt = %lld\n", saved_m, sa, sv, crtt);
1245	}
1246
1247	*sap = sa;
1248	*svp = sv;
1249
1250	/*
1251	 * CRTT = average estimates  + 4 * deviation estimates
1252	 *	= sa / 8 + sv
1253	 */
1254	return (crtt);
1255}
1256
1257static void
1258pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
1259{
1260	struct phyint_instance *pii = tg->tg_phyint_inst;
1261	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1262	int64_t sa = tg->tg_rtt_sa;
1263	int64_t sv = tg->tg_rtt_sd;
1264	int new_crtt;
1265	int i;
1266
1267	if (debug & D_PROBE)
1268		logdebug("pi_set_crtt: target -  m %lld\n", m);
1269
1270	/* store the round trip time, in case we need to defer computation */
1271	tg->tg_deferred[tg->tg_num_deferred] = m;
1272
1273	new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
1274
1275	/*
1276	 * If this probe's round trip time would singlehandedly cause an
1277	 * increase in the group's probe interval consider it suspect.
1278	 */
1279	if ((new_crtt > probe_interval) && is_probe_uni) {
1280		if (debug & D_PROBE) {
1281			logdebug("Received a suspect probe on %s, new_crtt ="
1282			    " %d, probe_interval = %d, num_deferred = %d\n",
1283			    pii->pii_probe_logint->li_name, new_crtt,
1284			    probe_interval, tg->tg_num_deferred);
1285		}
1286
1287		/*
1288		 * If we've deferred as many rtts as we plan on deferring, then
1289		 * assume the link really did slow down and process all queued
1290		 * rtts
1291		 */
1292		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1293			if (debug & D_PROBE) {
1294				logdebug("Received MAXDEFERREDRTT probes which "
1295				    "would cause an increased probe_interval.  "
1296				    "Integrating queued rtt data points.\n");
1297			}
1298
1299			for (i = 0; i <= tg->tg_num_deferred; i++) {
1300				tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
1301				    &tg->tg_rtt_sd, tg->tg_deferred[i]));
1302			}
1303
1304			tg->tg_num_deferred = 0;
1305		} else {
1306			tg->tg_num_deferred++;
1307		}
1308		return;
1309	}
1310
1311	/*
1312	 * If this is a normal probe, or an RTT probe that would lead to a
1313	 * reduced CRTT, then update our CRTT data.  Further, if this was
1314	 * a normal probe, pitch any deferred probes since our probes are
1315	 * again being answered within our CRTT estimates.
1316	 */
1317	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1318		tg->tg_rtt_sa = sa;
1319		tg->tg_rtt_sd = sv;
1320		tg->tg_crtt = new_crtt;
1321		if (is_probe_uni)
1322			tg->tg_num_deferred = 0;
1323	}
1324}
1325
/*
 * Search the ancillary data of `msg' for the first control message whose
 * level and type equal `cmsg_level' and `cmsg_type'.  Returns a pointer to
 * that message's data, or NULL if no such control message is present.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_type == cmsg_type &&
		    hdr->cmsg_level == cmsg_level) {
			return (CMSG_DATA(hdr));
		}
		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
1344
1345/*
1346 * Try to activate another INACTIVE interface in the same group as `pi'.
1347 * Prefer STANDBY INACTIVE to just INACTIVE.
1348 */
1349void
1350phyint_activate_another(struct phyint *pi)
1351{
1352	struct phyint *pi2;
1353	struct phyint *inactivepi = NULL;
1354
1355	if (pi->pi_group == phyint_anongroup)
1356		return;
1357
1358	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1359		if (pi == pi2 || !phyint_is_functioning(pi2) ||
1360		    !(pi2->pi_flags & IFF_INACTIVE))
1361			continue;
1362
1363		inactivepi = pi2;
1364		if (pi2->pi_flags & IFF_STANDBY)
1365			break;
1366	}
1367
1368	if (inactivepi != NULL)
1369		(void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
1370}
1371
1372/*
1373 * Transition a phyint to PI_RUNNING.  The caller must ensure that the
1374 * transition is appropriate.  Clears IFF_OFFLINE or IFF_FAILED if
1375 * appropriate.  Also sets IFF_INACTIVE on this or other interfaces as
1376 * appropriate (see comment below).  Finally, also updates the phyint's group
1377 * state to account for the change.
1378 */
1379void
1380phyint_transition_to_running(struct phyint *pi)
1381{
1382	struct phyint *pi2;
1383	struct phyint *actstandbypi = NULL;
1384	uint_t nactive = 0, nnonstandby = 0;
1385	boolean_t onlining = (pi->pi_state == PI_OFFLINE);
1386	boolean_t initial = (pi->pi_state == PI_INIT);
1387	uint64_t set, clear;
1388
1389	/*
1390	 * The interface is running again, but should it or another interface
1391	 * in the group end up INACTIVE?  There are three cases:
1392	 *
1393	 * 1. If it's a STANDBY interface, it should be end up INACTIVE if
1394	 *    the group is operating at capacity (i.e., there are at least as
1395	 *    many active interfaces as non-STANDBY interfaces in the group).
1396	 *    No other interfaces should be changed.
1397	 *
1398	 * 2. If it's a non-STANDBY interface and we're onlining it or
1399	 *    FAILBACK is enabled, then it should *not* end up INACTIVE.
1400	 *    Further, if the group is above capacity as a result of this
1401	 *    interface, then an active STANDBY interface in the group should
1402	 *    end up INACTIVE.
1403	 *
1404	 * 3. If it's a non-STANDBY interface, we're repairing it, and
1405	 *    FAILBACK is disabled, then it should end up INACTIVE *unless*
1406	 *    the group was failed (in which case we have no choice but to
1407	 *    use it).  No other interfaces should be changed.
1408	 */
1409	if (pi->pi_group != phyint_anongroup) {
1410		pi2 = pi->pi_group->pg_phyint;
1411		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1412			if (!(pi2->pi_flags & IFF_STANDBY))
1413				nnonstandby++;
1414
1415			if (phyint_is_functioning(pi2) &&
1416			    !(pi2->pi_flags & IFF_INACTIVE)) {
1417				nactive++;
1418				if (pi2->pi_flags & IFF_STANDBY)
1419					actstandbypi = pi2;
1420			}
1421		}
1422	}
1423
1424	set = 0;
1425	clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
1426
1427	if (pi->pi_flags & IFF_STANDBY) {			/* case 1 */
1428		if (nactive >= nnonstandby)
1429			set |= IFF_INACTIVE;
1430		else
1431			clear |= IFF_INACTIVE;
1432	} else if (onlining || failback_enabled) {		/* case 2 */
1433		if (nactive >= nnonstandby && actstandbypi != NULL)
1434			(void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
1435	} else if (!initial && !GROUP_FAILED(pi->pi_group)) {	/* case 3 */
1436		set |= IFF_INACTIVE;
1437	}
1438	(void) change_pif_flags(pi, set, clear);
1439
1440	phyint_chstate(pi, PI_RUNNING);
1441
1442	/*
1443	 * Update the group state to account for the change.
1444	 */
1445	phyint_group_refresh_state(pi->pi_group);
1446}
1447
1448/*
1449 * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
1450 * to have at least one active interface and as many active interfaces as
1451 * non-standby interfaces.
1452 */
1453void
1454phyint_standby_refresh_inactive(struct phyint *pi)
1455{
1456	struct phyint *pi2;
1457	uint_t nactive = 0, nnonstandby = 0;
1458
1459	/*
1460	 * All phyints in the anonymous group are effectively in their own
1461	 * group and thus active regardless of whether they're marked standby.
1462	 */
1463	if (pi->pi_group == phyint_anongroup) {
1464		(void) change_pif_flags(pi, 0, IFF_INACTIVE);
1465		return;
1466	}
1467
1468	/*
1469	 * If the phyint isn't functioning we can't consider it.
1470	 */
1471	if (!phyint_is_functioning(pi))
1472		return;
1473
1474	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1475		if (!(pi2->pi_flags & IFF_STANDBY))
1476			nnonstandby++;
1477
1478		if (phyint_is_functioning(pi2) &&
1479		    !(pi2->pi_flags & IFF_INACTIVE))
1480			nactive++;
1481	}
1482
1483	if (nactive == 0 || nactive < nnonstandby)
1484		(void) change_pif_flags(pi, 0, IFF_INACTIVE);
1485	else if (nactive > nnonstandby)
1486		(void) change_pif_flags(pi, IFF_INACTIVE, 0);
1487}
1488
1489/*
1490 * See if a previously failed interface has started working again.
1491 */
1492void
1493phyint_check_for_repair(struct phyint *pi)
1494{
1495	if (!phyint_repaired(pi))
1496		return;
1497
1498	if (pi->pi_group == phyint_anongroup) {
1499		logerr("IP interface repair detected on %s\n", pi->pi_name);
1500	} else {
1501		logerr("IP interface repair detected on %s of group %s\n",
1502		    pi->pi_name, pi->pi_group->pg_name);
1503	}
1504
1505	/*
1506	 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
1507	 * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
1508	 * until it is brought back online.
1509	 */
1510	if (pi->pi_state == PI_OFFLINE) {
1511		(void) change_pif_flags(pi, 0, IFF_FAILED);
1512		return;
1513	}
1514
1515	phyint_transition_to_running(pi);	/* calls phyint_chstate() */
1516}
1517
1518/*
1519 * See if an interface has failed, or if the whole group of interfaces has
1520 * failed.
1521 */
1522static void
1523phyint_inst_check_for_failure(struct phyint_instance *pii)
1524{
1525	struct phyint	*pi = pii->pii_phyint;
1526	struct phyint	*pi2;
1527	boolean_t	was_active;
1528
1529	switch (failure_state(pii)) {
1530	case PHYINT_FAILURE:
1531		was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1532
1533		(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1534		if (pi->pi_group == phyint_anongroup) {
1535			logerr("IP interface failure detected on %s\n",
1536			    pii->pii_name);
1537		} else {
1538			logerr("IP interface failure detected on %s of group"
1539			    " %s\n", pii->pii_name, pi->pi_group->pg_name);
1540		}
1541
1542		/*
1543		 * If the failed interface was active, activate another
1544		 * INACTIVE interface in the group if possible.
1545		 */
1546		if (was_active)
1547			phyint_activate_another(pi);
1548
1549		/*
1550		 * If the interface is offline, the state change will be
1551		 * noted when it comes back online.
1552		 */
1553		if (pi->pi_state != PI_OFFLINE) {
1554			phyint_chstate(pi, PI_FAILED);
1555			reset_crtt_all(pi);
1556		}
1557		break;
1558
1559	case GROUP_FAILURE:
1560		pi2 = pi->pi_group->pg_phyint;
1561		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1562			(void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
1563			if (pi2->pi_state == PI_OFFLINE) /* see comment above */
1564				continue;
1565
1566			reset_crtt_all(pi2);
1567			/*
1568			 * In the case of host targets, we would have flushed
1569			 * the targets, and gone to PI_NOTARGETS state.
1570			 */
1571			if (pi2->pi_state == PI_RUNNING)
1572				phyint_chstate(pi2, PI_FAILED);
1573		}
1574		break;
1575
1576	default:
1577		break;
1578	}
1579}
1580
1581/*
1582 * Determines if any timeout event has occurred and returns the number of
1583 * milliseconds until the next timeout event for the phyint. Returns
1584 * TIMER_INFINITY for "never".
1585 */
1586uint_t
1587phyint_inst_timer(struct phyint_instance *pii)
1588{
1589	int 	pr_ndx;
1590	uint_t	timeout;
1591	struct	target	*cur_tg;
1592	struct	probe_stats *pr_statp;
1593	struct	phyint_instance *pii_other;
1594	struct	phyint *pi;
1595	int	valid_unack_count;
1596	int	i;
1597	int	interval;
1598	uint_t	check_time;
1599	uint_t	cur_time;
1600	hrtime_t cur_hrtime;
1601	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1602
1603	cur_hrtime = gethrtime();
1604	cur_time = ns2ms(cur_hrtime);
1605
1606	if (debug & D_TIMER) {
1607		logdebug("phyint_inst_timer(%s %s)\n",
1608		    AF_STR(pii->pii_af), pii->pii_name);
1609	}
1610
1611	pii_other = phyint_inst_other(pii);
1612	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1613		/*
1614		 * Check to see if we're here due to link up/down flapping; If
1615		 * enough time has passed, then try to bring the interface
1616		 * back up; otherwise, schedule a timer to bring it back up
1617		 * when enough time *has* elapsed.
1618		 */
1619		pi = pii->pii_phyint;
1620		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1621			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1622			if (check_time > cur_time)
1623				return (check_time - cur_time);
1624
1625			phyint_check_for_repair(pi);
1626		}
1627	}
1628
1629	/*
1630	 * If probing is not enabled on this phyint instance, don't proceed.
1631	 */
1632	if (!PROBE_ENABLED(pii))
1633		return (TIMER_INFINITY);
1634
1635	/*
1636	 * If the timer has fired too soon, probably triggered
1637	 * by some other phyint instance, return the remaining
1638	 * time
1639	 */
1640	if (TIME_LT(cur_time, pii->pii_snxt_time))
1641		return (pii->pii_snxt_time - cur_time);
1642
1643	/*
1644	 * If the link is down, don't send any probes for now.
1645	 */
1646	if (LINK_DOWN(pii->pii_phyint))
1647		return (TIMER_INFINITY);
1648
1649	/*
1650	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1651	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1652	 * Base probe time is strictly periodic.
1653	 */
1654	interval = GET_RANDOM(
1655	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1656	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1657	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1658
1659	/*
1660	 * Check if the current time > next time to probe. If so, we missed
1661	 * sending 1 or more probes, probably due to heavy system load. At least
1662	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1663	 * were scheduled. Make adjustments to the times, in multiples of
1664	 * user_probe_interval.
1665	 */
1666	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1667		int n;
1668
1669		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1670		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1671		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1672		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1673		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1674		    pii->pii_snxt_basetime);
1675
1676		/* Collect statistics about missed probes */
1677		probes_missed.pm_nprobes += n + 1;
1678		probes_missed.pm_ntimes++;
1679	}
1680	pii->pii_snxt_basetime += user_probe_interval;
1681	interval = pii->pii_snxt_time - cur_time;
1682	if (debug & D_TARGET) {
1683		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1684		    " interval %u\n", cur_time, pii->pii_snxt_time,
1685		    pii->pii_snxt_basetime, interval);
1686	}
1687
1688	/*
1689	 * If no targets are known, we need to send an ICMP multicast. The
1690	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1691	 * to see if we found a target.
1692	 */
1693	if (pii->pii_target_next == NULL) {
1694		assert(pii->pii_ntargets == 0);
1695		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1696		probe(pii, PROBE_MULTI, cur_time);
1697		return (interval);
1698	}
1699
1700	if ((user_probe_interval != probe_interval) &&
1701	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1702		/*
1703		 * the failure detection (fd) probe timer has not yet fired.
1704		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1705		 */
1706		probe(pii, PROBE_RTT, cur_hrtime);
1707		return (interval);
1708	}
1709	/*
1710	 * the fd probe timer has fired. Need to do all failure
1711	 * detection / recovery calculations, and then send an fd probe
1712	 * of type PROBE_UNI.
1713	 */
1714	if (user_probe_interval == probe_interval) {
1715		/*
1716		 * We could have missed some probes, and then adjusted
1717		 * pii_snxt_basetime above. Otherwise we could have
1718		 * blindly added probe_interval to pii_fd_snxt_basetime.
1719		 */
1720		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1721	} else {
1722		pii->pii_fd_snxt_basetime += probe_interval;
1723		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1724			int n;
1725
1726			n = (cur_time - pii->pii_fd_snxt_basetime) /
1727			    probe_interval;
1728			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1729		}
1730	}
1731
1732	/*
1733	 * We can have at most, the latest 2 probes that we sent, in
1734	 * the PR_UNACKED state. All previous probes sent, are either
1735	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1736	 * timed out if the probe's time_start + the CRTT < currenttime.
1737	 * For each of the last 2 probes, examine whether it has timed
1738	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1739	 */
1740	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1741	valid_unack_count = 0;
1742
1743	for (i = 0; i < 2; i++) {
1744		pr_statp = &pii->pii_probes[pr_ndx];
1745		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1746		switch (pr_statp->pr_status) {
1747		case PR_ACKED:
1748			/*
1749			 * We received back an ACK, so the switch clearly
1750			 * is not dropping our traffic, and thus we can
1751			 * enable failure detection immediately.
1752			 */
1753			if (pii->pii_fd_hrtime > gethrtime()) {
1754				if (debug & D_PROBE) {
1755					logdebug("successful probe on %s; "
1756					    "ending quiet period\n",
1757					    pii->pii_phyint->pi_name);
1758				}
1759				pii->pii_fd_hrtime = gethrtime();
1760			}
1761			break;
1762
1763		case PR_UNACKED:
1764			assert(cur_tg != NULL);
1765			/*
1766			 * The crtt could be zero for some reason,
1767			 * Eg. the phyint could be failed. If the crtt is
1768			 * not available use group's probe interval,
1769			 * which is a worst case estimate.
1770			 */
1771			timeout = ns2ms(pr_statp->pr_hrtime_start);
1772			if (cur_tg->tg_crtt != 0) {
1773				timeout += cur_tg->tg_crtt;
1774			} else {
1775				timeout += probe_interval;
1776			}
1777			if (TIME_LT(timeout, cur_time)) {
1778				pr_statp->pr_time_lost = timeout;
1779				probe_chstate(pr_statp, pii, PR_LOST);
1780			} else if (i == 1) {
1781				/*
1782				 * We are forced to consider this probe
1783				 * lost, as we can have at most 2 unack.
1784				 * probes any time, and we will be sending a
1785				 * probe at the end of this function.
1786				 * Normally, we should not be here, but
1787				 * this can happen if an incoming response
1788				 * that was considered lost has increased
1789				 * the crtt for this target, and also bumped
1790				 * up the FDT. Note that we never cancel or
1791				 * increase the current pii_time_left, so
1792				 * when the timer fires, we find 2 valid
1793				 * unacked probes, and they are yet to timeout
1794				 */
1795				pr_statp->pr_time_lost = cur_time;
1796				probe_chstate(pr_statp, pii, PR_LOST);
1797			} else {
1798				/*
1799				 * Only the most recent probe can enter
1800				 * this 'else' arm. The second most recent
1801				 * probe must take either of the above arms,
1802				 * if it is unacked.
1803				 */
1804				valid_unack_count++;
1805			}
1806			break;
1807		}
1808		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1809	}
1810
1811	/*
1812	 * We send out 1 probe randomly in the interval between one half
1813	 * and one probe interval for the group. Given that the CRTT is always
1814	 * less than the group's probe interval, we can have at most 1
1815	 * unacknowledged probe now.  All previous probes are either lost or
1816	 * acked.
1817	 */
1818	assert(valid_unack_count == 0 || valid_unack_count == 1);
1819
1820	/*
1821	 * The timer has fired. Take appropriate action depending
1822	 * on the current state of the phyint.
1823	 *
1824	 * PI_RUNNING state 	- Failure detection
1825	 * PI_FAILED state 	- Repair detection
1826	 */
1827	switch (pii->pii_phyint->pi_state) {
1828	case PI_FAILED:
1829		/*
1830		 * If the most recent probe (excluding unacked probes that
1831		 * are yet to time out) has been acked, check whether the
1832		 * phyint is now repaired.
1833		 */
1834		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1835			phyint_check_for_repair(pii->pii_phyint);
1836		}
1837		break;
1838
1839	case PI_RUNNING:
1840		/*
1841		 * It's possible our probes have been lost because of a
1842		 * spanning-tree mandated quiet period on the switch.  If so,
1843		 * ignore the lost probes.
1844		 */
1845		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1846			break;
1847
1848		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1849			/*
1850			 * We have 1 or more failed probes (excluding unacked
1851			 * probes that are yet to time out). Determine if the
1852			 * phyint has failed.
1853			 */
1854			phyint_inst_check_for_failure(pii);
1855		}
1856		break;
1857
1858	default:
1859		logerr("phyint_inst_timer: invalid state %d\n",
1860		    pii->pii_phyint->pi_state);
1861		abort();
1862	}
1863
1864	/*
1865	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1866	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1867	 * was called, the target list may be empty.
1868	 */
1869	if (pii->pii_target_next != NULL) {
1870		probe(pii, PROBE_UNI, cur_hrtime);
1871		/*
1872		 * If we have just the one probe target, and we're not using
1873		 * router targets, try to find another as we presently have
1874		 * no resilience.
1875		 */
1876		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1877			probe(pii, PROBE_MULTI, cur_hrtime);
1878	} else {
1879		probe(pii, PROBE_MULTI, cur_hrtime);
1880	}
1881	return (interval);
1882}
1883
1884/*
1885 * Start the probe timer for an interface instance.
1886 */
1887void
1888start_timer(struct phyint_instance *pii)
1889{
1890	uint32_t interval;
1891
1892	/*
1893	 * Spread the base probe times (pi_snxt_basetime) across phyints
1894	 * uniformly over the (curtime..curtime + the group's probe_interval).
1895	 * pi_snxt_basetime is strictly periodic with a frequency of
1896	 * the group's probe interval. The actual probe time pi_snxt_time
1897	 * adds some randomness to pi_snxt_basetime and happens in probe().
1898	 * For the 1st probe on each phyint after the timer is started,
1899	 * pi_snxt_time and pi_snxt_basetime are the same.
1900	 */
1901	interval = GET_RANDOM(0,
1902	    (int)pii->pii_phyint->pi_group->pg_probeint);
1903
1904	pii->pii_snxt_basetime = getcurrenttime() + interval;
1905	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1906	pii->pii_snxt_time = pii->pii_snxt_basetime;
1907	timer_schedule(interval);
1908}
1909
1910/*
1911 * Restart the probe timer on an interface instance.
1912 */
1913static void
1914restart_timer(struct phyint_instance *pii)
1915{
1916	/*
1917	 * We don't need to restart the timer if it was never started in
1918	 * the first place (pii->pii_basetime_inited not set), as the timer
1919	 * won't have gone off yet.
1920	 */
1921	if (pii->pii_basetime_inited != 0) {
1922
1923		if (debug & D_LINKNOTE)
1924			logdebug("restart timer: restarting timer on %s, "
1925			    "address family %s\n", pii->pii_phyint->pi_name,
1926			    AF_STR(pii->pii_af));
1927
1928		start_timer(pii);
1929	}
1930}
1931
1932static void
1933process_link_state_down(struct phyint *pi)
1934{
1935	logerr("The link has gone down on %s\n", pi->pi_name);
1936
1937	/*
1938	 * Clear the probe statistics arrays, we don't want the repair
1939	 * detection logic relying on probes that were successful prior
1940	 * to the link going down.
1941	 */
1942	if (PROBE_CAPABLE(pi->pi_v4))
1943		clear_pii_probe_stats(pi->pi_v4);
1944	if (PROBE_CAPABLE(pi->pi_v6))
1945		clear_pii_probe_stats(pi->pi_v6);
1946	/*
1947	 * Check for interface failure.  Although we know the interface
1948	 * has failed, we don't know if all the other interfaces in the
1949	 * group have failed as well.
1950	 */
1951	if ((pi->pi_state == PI_RUNNING) ||
1952	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1953		if (debug & D_LINKNOTE) {
1954			logdebug("process_link_state_down:"
1955			    " checking for failure on %s\n", pi->pi_name);
1956		}
1957
1958		if (pi->pi_v4 != NULL)
1959			phyint_inst_check_for_failure(pi->pi_v4);
1960		else if (pi->pi_v6 != NULL)
1961			phyint_inst_check_for_failure(pi->pi_v6);
1962	}
1963}
1964
1965static void
1966process_link_state_up(struct phyint *pi)
1967{
1968	logerr("The link has come up on %s\n", pi->pi_name);
1969
1970	/*
1971	 * We stopped any running timers on each instance when the link
1972	 * went down, so restart them.
1973	 */
1974	if (pi->pi_v4)
1975		restart_timer(pi->pi_v4);
1976	if (pi->pi_v6)
1977		restart_timer(pi->pi_v6);
1978
1979	phyint_check_for_repair(pi);
1980
1981	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1982	if (pi->pi_whendx == LINK_UP_PERMIN)
1983		pi->pi_whendx = 0;
1984}
1985
1986/*
1987 * Process any changes in link state passed up from the interfaces.
1988 */
1989void
1990process_link_state_changes(void)
1991{
1992	struct phyint *pi;
1993
1994	/* Look for interfaces where the link state has just changed */
1995
1996	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1997		boolean_t old_link_state_up = LINK_UP(pi);
1998
1999		/*
2000		 * Except when the "phyint" structure is created, this is
2001		 * the only place the link state is updated.  This allows
2002		 * this routine to detect changes in link state, rather
2003		 * than just the current state.
2004		 */
2005		UPDATE_LINK_STATE(pi);
2006
2007		if (LINK_DOWN(pi)) {
2008			/*
2009			 * Has link just gone down?
2010			 */
2011			if (old_link_state_up)
2012				process_link_state_down(pi);
2013		} else {
2014			/*
2015			 * Has link just gone back up?
2016			 */
2017			if (!old_link_state_up)
2018				process_link_state_up(pi);
2019		}
2020	}
2021}
2022
2023void
2024reset_crtt_all(struct phyint *pi)
2025{
2026	struct phyint_instance *pii;
2027	struct target *tg;
2028
2029	pii = pi->pi_v4;
2030	if (pii != NULL) {
2031		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2032			tg->tg_crtt = 0;
2033			tg->tg_rtt_sa = -1;
2034			tg->tg_rtt_sd = 0;
2035		}
2036	}
2037
2038	pii = pi->pi_v6;
2039	if (pii != NULL) {
2040		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2041			tg->tg_crtt = 0;
2042			tg->tg_rtt_sa = -1;
2043			tg->tg_rtt_sd = 0;
2044		}
2045	}
2046}
2047
2048/*
2049 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
2050 * probes on both instances IPv4 and IPv6.
2051 * If the interface has failed, return the time of the first probe failure
2052 * in "tff".
2053 */
2054static int
2055phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
2056{
2057	uint_t	pi_tff;
2058	struct	target *cur_tg;
2059	struct	probe_fail_count pfinfo;
2060	struct	phyint_instance *pii_other;
2061	int	pr_ndx;
2062
2063	/*
2064	 * Get the number of consecutive failed probes on
2065	 * this phyint across all targets. Also get the number
2066	 * of consecutive failed probes on this target only
2067	 */
2068	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2069	cur_tg = pii->pii_probes[pr_ndx].pr_target;
2070	probe_fail_info(pii, cur_tg, &pfinfo);
2071
2072	/* Get the time of first failure, for later use */
2073	pi_tff = pfinfo.pf_tff;
2074
2075	/*
2076	 * If the current target has not responded to the
2077	 * last NUM_PROBE_FAILS probes, and other targets are
2078	 * responding delete this target. Dead gateway detection
2079	 * will eventually remove this target (if router) from the
2080	 * routing tables. If that does not occur, we may end
2081	 * up adding this to our list again.
2082	 */
2083	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2084	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2085		if (pii->pii_targets_are_routers) {
2086			if (cur_tg->tg_status == TG_ACTIVE)
2087				pii->pii_ntargets--;
2088			cur_tg->tg_status = TG_DEAD;
2089			cur_tg->tg_crtt = 0;
2090			cur_tg->tg_rtt_sa = -1;
2091			cur_tg->tg_rtt_sd = 0;
2092			if (pii->pii_target_next == cur_tg)
2093				pii->pii_target_next = target_next(cur_tg);
2094		} else {
2095			target_delete(cur_tg);
2096			probe(pii, PROBE_MULTI, gethrtime());
2097		}
2098		return (PHYINT_OK);
2099	}
2100
2101	/*
2102	 * If the phyint has lost NUM_PROBE_FAILS or more
2103	 * consecutive probes, on both IPv4 and IPv6 protocol
2104	 * instances of the phyint, then trigger failure
2105	 * detection, else return false
2106	 */
2107	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2108		return (PHYINT_OK);
2109
2110	pii_other = phyint_inst_other(pii);
2111	if (PROBE_CAPABLE(pii_other)) {
2112		probe_fail_info(pii_other, NULL, &pfinfo);
2113		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2114			/*
2115			 * We have NUM_PROBE_FAILS or more failures
2116			 * on both IPv4 and IPv6. Get the earliest
2117			 * time when failure was detected on this
2118			 * phyint across IPv4 and IPv6.
2119			 */
2120			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2121				pi_tff = pfinfo.pf_tff;
2122		} else {
2123			/*
2124			 * This instance has < NUM_PROBE_FAILS failure.
2125			 * So return false
2126			 */
2127			return (PHYINT_OK);
2128		}
2129	}
2130	*tff = pi_tff;
2131	return (PHYINT_FAILURE);
2132}
2133
2134/*
2135 * Check if the link has gone down on this phyint, or it has failed the
2136 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2137 * Also look at other phyints of this group, for group failures.
2138 */
2139int
2140failure_state(struct phyint_instance *pii)
2141{
2142	struct	probe_success_count psinfo;
2143	uint_t	pi2_tls;		/* time last success */
2144	uint_t	pi_tff;			/* time first fail */
2145	struct	phyint *pi2;
2146	struct	phyint *pi;
2147	struct	phyint_instance *pii2;
2148	struct  phyint_group *pg;
2149	int	retval;
2150
2151	if (debug & D_FAILREP)
2152		logdebug("phyint_failed(%s)\n", pii->pii_name);
2153
2154	pi = pii->pii_phyint;
2155	pg = pi->pi_group;
2156
2157	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2158	    PHYINT_OK)
2159		return (PHYINT_OK);
2160
2161	/*
2162	 * At this point, the link is down, or the phyint is suspect, as it
2163	 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
2164	 * belong to any group, this is a PHYINT_FAILURE.  Otherwise, continue
2165	 * on to determine whether this should be considered a PHYINT_FAILURE
2166	 * or GROUP_FAILURE.
2167	 */
2168	if (pg == phyint_anongroup)
2169		return (PHYINT_FAILURE);
2170
2171	/*
2172	 * Need to compare against other phyints of the same group
2173	 * to exclude group failures. If the failure was detected via
2174	 * probing, then if the time of last success (tls) of any
2175	 * phyint is more recent than the time of first fail (tff) of the
2176	 * phyint in question, and the link is up on the phyint,
2177	 * then it is a phyint failure. Otherwise it is a group failure.
2178	 * If failure was detected via a link down notification sent from
2179	 * the driver to IP, we see if any phyints in the group are still
2180	 * running and haven't received a link down notification.  We
2181	 * will usually be processing the link down notification shortly
2182	 * after it was received, so there is no point looking at the tls
2183	 * of other phyints.
2184	 */
2185	retval = GROUP_FAILURE;
2186	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2187		/* Exclude ourself from comparison */
2188		if (pi2 == pi)
2189			continue;
2190
2191		if (LINK_DOWN(pi)) {
2192			/*
2193			 * We use FLAGS_TO_LINK_STATE() to test the flags
2194			 * directly, rather then LINK_UP() or LINK_DOWN(), as
2195			 * we may not have got round to processing the link
2196			 * state for the other phyints in the group yet.
2197			 *
2198			 * The check for PI_RUNNING and group failure handles
2199			 * the case when the group begins to recover.
2200			 * PI_RUNNING will be set, and group failure cleared
2201			 * only after receipt of NUM_PROBE_REPAIRS, by which
2202			 * time the other phyints should have received at
2203			 * least 1 packet, and so will not have NUM_PROBE_FAILS.
2204			 */
2205			if ((pi2->pi_state == PI_RUNNING) &&
2206			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
2207				retval = PHYINT_FAILURE;
2208				break;
2209			}
2210			continue;
2211		}
2212
2213		if (LINK_DOWN(pi2))
2214			continue;
2215
2216		/*
2217		 * If there's no probe-based failure detection on this
2218		 * interface, and its link is still up, then it's still
2219		 * working and thus the group has not failed.
2220		 */
2221		if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
2222			retval = PHYINT_FAILURE;
2223			break;
2224		}
2225
2226		/*
2227		 * Need to compare against both IPv4 and IPv6 instances.
2228		 */
2229		pii2 = pi2->pi_v4;
2230		if (pii2 != NULL) {
2231			probe_success_info(pii2, NULL, &psinfo);
2232			if (psinfo.ps_tls_valid) {
2233				pi2_tls = psinfo.ps_tls;
2234				/*
2235				 * See comment above regarding check
2236				 * for PI_RUNNING and group failure.
2237				 */
2238				if (TIME_GT(pi2_tls, pi_tff) &&
2239				    (pi2->pi_state == PI_RUNNING) &&
2240				    !GROUP_FAILED(pg) &&
2241				    FLAGS_TO_LINK_STATE(pi2)) {
2242					retval = PHYINT_FAILURE;
2243					break;
2244				}
2245			}
2246		}
2247
2248		pii2 = pi2->pi_v6;
2249		if (pii2 != NULL) {
2250			probe_success_info(pii2, NULL, &psinfo);
2251			if (psinfo.ps_tls_valid) {
2252				pi2_tls = psinfo.ps_tls;
2253				/*
2254				 * See comment above regarding check
2255				 * for PI_RUNNING and group failure.
2256				 */
2257				if (TIME_GT(pi2_tls, pi_tff) &&
2258				    (pi2->pi_state == PI_RUNNING) &&
2259				    !GROUP_FAILED(pg) &&
2260				    FLAGS_TO_LINK_STATE(pi2)) {
2261					retval = PHYINT_FAILURE;
2262					break;
2263				}
2264			}
2265		}
2266	}
2267
2268	/*
2269	 * Update the group state to account for the changes.
2270	 */
2271	phyint_group_refresh_state(pg);
2272	return (retval);
2273}
2274
2275/*
2276 * Return the information associated with consecutive probe successes
2277 * starting with the most recent probe. At most the last 2 probes can be
2278 * in the unacknowledged state. All previous probes have either failed
2279 * or succeeded.
2280 */
2281static void
2282probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2283    struct probe_success_count *psinfo)
2284{
2285	uint_t	i;
2286	struct probe_stats *pr_statp;
2287	uint_t most_recent;
2288	uint_t second_most_recent;
2289	boolean_t pi_found_failure = _B_FALSE;
2290	boolean_t tg_found_failure = _B_FALSE;
2291	uint_t now;
2292	uint_t timeout;
2293	struct target *tg;
2294
2295	if (debug & D_FAILREP)
2296		logdebug("probe_success_info(%s)\n", pii->pii_name);
2297
2298	bzero(psinfo, sizeof (*psinfo));
2299	now = getcurrenttime();
2300
2301	/*
2302	 * Start with the most recent probe, and count the number
2303	 * of consecutive probe successes. Latch the number of successes
2304	 * on hitting a failure.
2305	 */
2306	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2307	second_most_recent = PROBE_INDEX_PREV(most_recent);
2308
2309	for (i = most_recent; i != pii->pii_probe_next;
2310	    i = PROBE_INDEX_PREV(i)) {
2311		pr_statp = &pii->pii_probes[i];
2312
2313		switch (pr_statp->pr_status) {
2314		case PR_UNACKED:
2315			/*
2316			 * Only the most recent 2 probes can be unacknowledged
2317			 */
2318			assert(i == most_recent || i == second_most_recent);
2319
2320			tg = pr_statp->pr_target;
2321			assert(tg != NULL);
2322			/*
2323			 * The crtt could be zero for some reason,
2324			 * Eg. the phyint could be failed. If the crtt is
2325			 * not available use the value of the group's probe
2326			 * interval which is a worst case estimate.
2327			 */
2328			timeout = ns2ms(pr_statp->pr_hrtime_start);
2329			if (tg->tg_crtt != 0) {
2330				timeout += tg->tg_crtt;
2331			} else {
2332				timeout +=
2333				    pii->pii_phyint->pi_group->pg_probeint;
2334			}
2335
2336			if (TIME_LT(timeout, now)) {
2337				/*
2338				 * We hit a failure. Latch the total number of
2339				 * recent consecutive successes.
2340				 */
2341				pr_statp->pr_time_lost = timeout;
2342				probe_chstate(pr_statp, pii, PR_LOST);
2343				pi_found_failure = _B_TRUE;
2344				if (cur_tg != NULL && tg == cur_tg) {
2345					/*
2346					 * We hit a failure for the desired
2347					 * target. Latch the number of recent
2348					 * consecutive successes for this target
2349					 */
2350					tg_found_failure = _B_TRUE;
2351				}
2352			}
2353			break;
2354
2355		case PR_ACKED:
2356			/*
2357			 * Bump up the count of probe successes, if we
2358			 * have not seen any failure so far.
2359			 */
2360			if (!pi_found_failure)
2361				psinfo->ps_nsucc++;
2362
2363			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2364			    !tg_found_failure) {
2365				psinfo->ps_nsucc_tg++;
2366			}
2367
2368			/*
2369			 * Record the time of last success, if this is
2370			 * the most recent probe success.
2371			 */
2372			if (!psinfo->ps_tls_valid) {
2373				psinfo->ps_tls =
2374				    ns2ms(pr_statp->pr_hrtime_ackproc);
2375				psinfo->ps_tls_valid = _B_TRUE;
2376			}
2377			break;
2378
2379		case PR_LOST:
2380			/*
2381			 * We hit a failure. Latch the total number of
2382			 * recent consecutive successes.
2383			 */
2384			pi_found_failure = _B_TRUE;
2385			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2386				/*
2387				 * We hit a failure for the desired target.
2388				 * Latch the number of recent consecutive
2389				 * successes for this target
2390				 */
2391				tg_found_failure = _B_TRUE;
2392			}
2393			break;
2394
2395		default:
2396			return;
2397
2398		}
2399	}
2400}
2401
2402/*
2403 * Return the information associated with consecutive probe failures
2404 * starting with the most recent probe. Only the last 2 probes can be in the
2405 * unacknowledged state. All previous probes have either failed or succeeded.
2406 */
2407static void
2408probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2409    struct probe_fail_count *pfinfo)
2410{
2411	int	i;
2412	struct probe_stats *pr_statp;
2413	boolean_t	tg_found_success = _B_FALSE;
2414	boolean_t	pi_found_success = _B_FALSE;
2415	int	most_recent;
2416	int	second_most_recent;
2417	uint_t	now;
2418	uint_t	timeout;
2419	struct	target *tg;
2420
2421	if (debug & D_FAILREP)
2422		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2423
2424	bzero(pfinfo, sizeof (*pfinfo));
2425	now = getcurrenttime();
2426
2427	/*
2428	 * Start with the most recent probe, and count the number
2429	 * of consecutive probe failures. Latch the number of failures
2430	 * on hitting a probe success.
2431	 */
2432	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2433	second_most_recent = PROBE_INDEX_PREV(most_recent);
2434
2435	for (i = most_recent; i != pii->pii_probe_next;
2436	    i = PROBE_INDEX_PREV(i)) {
2437		pr_statp = &pii->pii_probes[i];
2438
2439		assert(PR_STATUS_VALID(pr_statp->pr_status));
2440
2441		switch (pr_statp->pr_status) {
2442		case PR_UNACKED:
2443			/*
2444			 * Only the most recent 2 probes can be unacknowledged
2445			 */
2446			assert(i == most_recent || i == second_most_recent);
2447
2448			tg = pr_statp->pr_target;
2449			/*
2450			 * Target is guaranteed to exist in the unack. state
2451			 */
2452			assert(tg != NULL);
2453			/*
2454			 * The crtt could be zero for some reason,
2455			 * Eg. the phyint could be failed. If the crtt is
2456			 * not available use the group's probe interval,
2457			 * which is a worst case estimate.
2458			 */
2459			timeout = ns2ms(pr_statp->pr_hrtime_start);
2460			if (tg->tg_crtt != 0) {
2461				timeout += tg->tg_crtt;
2462			} else {
2463				timeout +=
2464				    pii->pii_phyint->pi_group->pg_probeint;
2465			}
2466
2467			if (TIME_GT(timeout, now))
2468				break;
2469
2470			pr_statp->pr_time_lost = timeout;
2471			probe_chstate(pr_statp, pii, PR_LOST);
2472			/* FALLTHRU */
2473
2474		case PR_LOST:
2475			if (!pi_found_success) {
2476				pfinfo->pf_nfail++;
2477				pfinfo->pf_tff = pr_statp->pr_time_lost;
2478			}
2479			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2480			    !tg_found_success)  {
2481				pfinfo->pf_nfail_tg++;
2482			}
2483			break;
2484
2485		default:
2486			/*
2487			 * We hit a success or unused slot. Latch the
2488			 * total number of recent consecutive failures.
2489			 */
2490			pi_found_success = _B_TRUE;
2491			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2492				/*
2493				 * We hit a success for the desired target.
2494				 * Latch the number of recent consecutive
2495				 * failures for this target
2496				 */
2497				tg_found_success = _B_TRUE;
2498			}
2499		}
2500	}
2501}
2502
2503/*
2504 * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
2505 */
2506void
2507probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
2508{
2509	if (pr->pr_status == state)
2510		return;
2511
2512	pr->pr_status = state;
2513	(void) probe_state_event(pr, pii);
2514}
2515
2516/*
2517 * Check if the phyint has been repaired.  If no test address has been
2518 * configured, then consider the interface repaired if the link is up (unless
2519 * the link is flapping; see below).  Otherwise, look for proof of probes
2520 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2521 * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2522 */
2523static boolean_t
2524phyint_repaired(struct phyint *pi)
2525{
2526	struct	probe_success_count psinfo;
2527	struct	phyint_instance *pii;
2528	struct	target *cur_tg;
2529	int	pr_ndx;
2530	uint_t	cur_time;
2531
2532	if (debug & D_FAILREP)
2533		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2534
2535	if (LINK_DOWN(pi))
2536		return (_B_FALSE);
2537
2538	/*
2539	 * If we don't have any test addresses and the link is up, then
2540	 * consider the interface repaired, unless we've received more than
2541	 * LINK_UP_PERMIN link up notifications in the last minute, in
2542	 * which case we keep the link down until we drop back below
2543	 * the threshold.
2544	 */
2545	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2546		cur_time = getcurrenttime();
2547		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2548		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2549			pi->pi_lfmsg_printed = 0;
2550			return (_B_TRUE);
2551		}
2552		if (!pi->pi_lfmsg_printed) {
2553			logerr("The link has come up on %s more than %d times "
2554			    "in the last minute; disabling repair until it "
2555			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2556			pi->pi_lfmsg_printed = 1;
2557		}
2558
2559		return (_B_FALSE);
2560	}
2561
2562	pii = pi->pi_v4;
2563	if (PROBE_CAPABLE(pii)) {
2564		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2565		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2566		probe_success_info(pii, cur_tg, &psinfo);
2567		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2568		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2569			return (_B_TRUE);
2570	}
2571
2572	pii = pi->pi_v6;
2573	if (PROBE_CAPABLE(pii)) {
2574		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2575		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2576		probe_success_info(pii, cur_tg, &psinfo);
2577		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2578		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2579			return (_B_TRUE);
2580	}
2581
2582	return (_B_FALSE);
2583}
2584
2585/*
2586 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2587 */
2588boolean_t
2589change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
2590{
2591	int ifsock;
2592	struct lifreq lifr;
2593	uint64_t old_flags;
2594
2595	if (debug & D_FAILREP) {
2596		logdebug("change_pif_flags(%s): set %llx clear %llx\n",
2597		    pi->pi_name, set, clear);
2598	}
2599
2600	if (pi->pi_v4 != NULL)
2601		ifsock = ifsock_v4;
2602	else
2603		ifsock = ifsock_v6;
2604
2605	/*
2606	 * Get the current flags from the kernel, and set/clear the
2607	 * desired phyint flags. Since we set only phyint flags, we can
2608	 * do it on either IPv4 or IPv6 instance.
2609	 */
2610	(void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2611
2612	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2613		if (errno != ENXIO)
2614			logperror("change_pif_flags: ioctl (get flags)");
2615		return (_B_FALSE);
2616	}
2617
2618	old_flags = lifr.lifr_flags;
2619	lifr.lifr_flags |= set;
2620	lifr.lifr_flags &= ~clear;
2621
2622	if (old_flags == lifr.lifr_flags) {
2623		/* No change in the flags. No need to send ioctl */
2624		return (_B_TRUE);
2625	}
2626
2627	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2628		if (errno != ENXIO)
2629			logperror("change_pif_flags: ioctl (set flags)");
2630		return (_B_FALSE);
2631	}
2632
2633	/*
2634	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2635	 * phyint flags.
2636	 */
2637	pi->pi_flags |= set;
2638	pi->pi_flags &= ~clear;
2639
2640	if (pi->pi_v4 != NULL)
2641		pi->pi_v4->pii_flags = pi->pi_flags;
2642
2643	if (pi->pi_v6 != NULL)
2644		pi->pi_v6->pii_flags = pi->pi_flags;
2645
2646	return (_B_TRUE);
2647}
2648
2649/*
2650 * icmp cksum computation for IPv4.
2651 */
2652static int
2653in_cksum(ushort_t *addr, int len)
2654{
2655	register int nleft = len;
2656	register ushort_t *w = addr;
2657	register ushort_t answer;
2658	ushort_t odd_byte = 0;
2659	register int sum = 0;
2660
2661	/*
2662	 *  Our algorithm is simple, using a 32 bit accumulator (sum),
2663	 *  we add sequential 16 bit words to it, and at the end, fold
2664	 *  back all the carry bits from the top 16 bits into the lower
2665	 *  16 bits.
2666	 */
2667	while (nleft > 1)  {
2668		sum += *w++;
2669		nleft -= 2;
2670	}
2671
2672	/* mop up an odd byte, if necessary */
2673	if (nleft == 1) {
2674		*(uchar_t *)(&odd_byte) = *(uchar_t *)w;
2675		sum += odd_byte;
2676	}
2677
2678	/*
2679	 * add back carry outs from top 16 bits to low 16 bits
2680	 */
2681	sum = (sum >> 16) + (sum & 0xffff);	/* add hi 16 to low 16 */
2682	sum += (sum >> 16);			/* add carry */
2683	answer = ~sum;				/* truncate to 16 bits */
2684	return (answer);
2685}
2686
2687static void
2688reset_snxt_basetimes(void)
2689{
2690	struct phyint_instance *pii;
2691
2692	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2693		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2694	}
2695}
2696
2697/*
2698 * Is the address one of our own addresses? Unfortunately,
2699 * we cannot check our phyint tables to determine if the address
2700 * is our own. This is because, we don't track interfaces that
2701 * are not part of any group. We have to either use a 'bind' or
2702 * get the complete list of all interfaces using SIOCGLIFCONF,
2703 * to do this check. We could also use SIOCTMYADDR.
2704 * Bind fails for the local zone address, so we might include local zone
2705 * address as target address. If local zone address is a target address
2706 * and it is up, it is not possible to detect the interface failure.
2707 * SIOCTMYADDR also doesn't consider local zone address as own address.
2708 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2709 * are stored in `localaddrs'
2710 */
2711boolean_t
2712own_address(struct in6_addr addr)
2713{
2714	addrlist_t *addrp;
2715	struct sockaddr_storage ss;
2716	int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
2717
2718	addr2storage(af, &addr, &ss);
2719	for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
2720		if (sockaddrcmp(&ss, &addrp->al_addr))
2721			return (_B_TRUE);
2722	}
2723	return (_B_FALSE);
2724}
2725
/*
 * Convert the nanosecond count `ns' to milliseconds using NSEC2MSEC() and
 * return the result narrowed to an int.
 */
static int
ns2ms(int64_t ns)
{
	int ms;

	ms = NSEC2MSEC(ns);
	return (ms);
}
2731
2732static int64_t
2733tv2ns(struct timeval *tvp)
2734{
2735	return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000);
2736}
2737