17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
37c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
47c478bd9Sstevel@tonic-gate  */
57c478bd9Sstevel@tonic-gate 
67c478bd9Sstevel@tonic-gate /*
77c478bd9Sstevel@tonic-gate  * Copyright (c) 1987 Regents of the University of California.
87c478bd9Sstevel@tonic-gate  * All rights reserved.
97c478bd9Sstevel@tonic-gate  *
107c478bd9Sstevel@tonic-gate  * Redistribution and use in source and binary forms are permitted
117c478bd9Sstevel@tonic-gate  * provided that the above copyright notice and this paragraph are
127c478bd9Sstevel@tonic-gate  * duplicated in all such forms and that any documentation,
137c478bd9Sstevel@tonic-gate  * advertising materials, and other materials related to such
147c478bd9Sstevel@tonic-gate  * distribution and use acknowledge that the software was developed
157c478bd9Sstevel@tonic-gate  * by the University of California, Berkeley. The name of the
167c478bd9Sstevel@tonic-gate  * University may not be used to endorse or promote products derived
177c478bd9Sstevel@tonic-gate  * from this software without specific prior written permission.
187c478bd9Sstevel@tonic-gate  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
197c478bd9Sstevel@tonic-gate  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
207c478bd9Sstevel@tonic-gate  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
217c478bd9Sstevel@tonic-gate  */
227c478bd9Sstevel@tonic-gate 
237c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
247c478bd9Sstevel@tonic-gate 
257c478bd9Sstevel@tonic-gate #include "mpd_defs.h"
267c478bd9Sstevel@tonic-gate #include "mpd_tables.h"
277c478bd9Sstevel@tonic-gate 
/*
 * Values accepted by probe() as its probe_type argument. The selected value
 * is also sent in the pr_icmp_mtype field of the probe packet.
 */
#define	PROBE_UNI	0x1234	/* unicast probe packet */
#define	PROBE_MULTI	0x5678	/* multicast probe packet */
#define	PROBE_RTT	0x9abc	/* probe packet used for RTT only */

/* Number of milliseconds in one minute. */
#define	MSEC_PERMIN	(60 * MILLISEC)
367c478bd9Sstevel@tonic-gate 
/*
 * Layout of a probe and of a probe response, i.e. of an ICMP Echo request
 * or an ICMP Echo reply. IPv4 and IPv6 share the same packet format.
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* ICMP type */
	uint8_t		pr_icmp_code;		/* ICMP code */
	uint16_t	pr_icmp_cksum;		/* ICMP checksum */
	uint16_t	pr_icmp_id;		/* identification */
	uint16_t	pr_icmp_seq;		/* sequence number */
	uint32_t	pr_icmp_timestamp;	/* time stamp */
	uint32_t	pr_icmp_mtype;		/* message type */
};
517c478bd9Sstevel@tonic-gate 
527c478bd9Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
537c478bd9Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
547c478bd9Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
557c478bd9Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x1 } };
567c478bd9Sstevel@tonic-gate 
577c478bd9Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
587c478bd9Sstevel@tonic-gate 
597c478bd9Sstevel@tonic-gate static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
607c478bd9Sstevel@tonic-gate 
617c478bd9Sstevel@tonic-gate static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
627c478bd9Sstevel@tonic-gate static void		pi_set_crtt(struct target *tg, int m,
637c478bd9Sstevel@tonic-gate     boolean_t is_probe_uni);
647c478bd9Sstevel@tonic-gate static void		incoming_echo_reply(struct phyint_instance *pii,
657c478bd9Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
667c478bd9Sstevel@tonic-gate static void		incoming_rtt_reply(struct phyint_instance *pii,
677c478bd9Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
687c478bd9Sstevel@tonic-gate static void		incoming_mcast_reply(struct phyint_instance *pii,
697c478bd9Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
707c478bd9Sstevel@tonic-gate 
717c478bd9Sstevel@tonic-gate static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
727c478bd9Sstevel@tonic-gate static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
737c478bd9Sstevel@tonic-gate static boolean_t	check_exception_target(struct phyint_instance *pii,
747c478bd9Sstevel@tonic-gate     struct target *target);
757c478bd9Sstevel@tonic-gate static void		probe_fail_info(struct phyint_instance *pii,
767c478bd9Sstevel@tonic-gate     struct target *cur_tg, struct probe_fail_count *pfinfo);
777c478bd9Sstevel@tonic-gate static void		probe_success_info(struct phyint_instance *pii,
787c478bd9Sstevel@tonic-gate     struct target *cur_tg, struct probe_success_count *psinfo);
797c478bd9Sstevel@tonic-gate static boolean_t	phyint_repaired(struct phyint *pi);
807c478bd9Sstevel@tonic-gate 
817c478bd9Sstevel@tonic-gate static int		failover(struct phyint *from, struct phyint *to);
827c478bd9Sstevel@tonic-gate static int		failback(struct phyint *from, struct phyint *to);
837c478bd9Sstevel@tonic-gate static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);
847c478bd9Sstevel@tonic-gate 
857c478bd9Sstevel@tonic-gate static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
867c478bd9Sstevel@tonic-gate static int 		in_cksum(ushort_t *addr, int len);
877c478bd9Sstevel@tonic-gate static void		reset_snxt_basetimes(void);
887c478bd9Sstevel@tonic-gate 
897c478bd9Sstevel@tonic-gate /*
907c478bd9Sstevel@tonic-gate  * CRTT - Conservative Round Trip Time Estimate
917c478bd9Sstevel@tonic-gate  * Probe success - A matching probe reply received before CRTT ms has elapsed
927c478bd9Sstevel@tonic-gate  *	after sending the probe.
937c478bd9Sstevel@tonic-gate  * Probe failure - No probe reply received and more than CRTT ms has elapsed
947c478bd9Sstevel@tonic-gate  *	after sending the probe.
957c478bd9Sstevel@tonic-gate  *
967c478bd9Sstevel@tonic-gate  * TLS - Time last success. Most recent probe ack received at this time.
977c478bd9Sstevel@tonic-gate  * TFF - Time first fail. The time of the earliest probe failure in
987c478bd9Sstevel@tonic-gate  *	a consecutive series of probe failures.
997c478bd9Sstevel@tonic-gate  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
1007c478bd9Sstevel@tonic-gate  * 	before declaring phyint repair.
1017c478bd9Sstevel@tonic-gate  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
1027c478bd9Sstevel@tonic-gate  *	declare a phyint failure.
1037c478bd9Sstevel@tonic-gate  *
1047c478bd9Sstevel@tonic-gate  * 			Phyint state diagram
1057c478bd9Sstevel@tonic-gate  *
1067c478bd9Sstevel@tonic-gate  * The state of a phyint that is capable of being probed, is completely
1077c478bd9Sstevel@tonic-gate  * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
1087c478bd9Sstevel@tonic-gate  *
1097c478bd9Sstevel@tonic-gate  * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
1107c478bd9Sstevel@tonic-gate  * of the link (according to the driver).  If the phyint is also configured
1117c478bd9Sstevel@tonic-gate  * with a test address (the common case) and probe targets, then a phyint must
1127c478bd9Sstevel@tonic-gate  * also successfully be able to send and receive probes in order to remain in
1137c478bd9Sstevel@tonic-gate  * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
1147c478bd9Sstevel@tonic-gate  *
1157c478bd9Sstevel@tonic-gate  * Further, if a PI_RUNNING phyint is configured with a test address but is
1167c478bd9Sstevel@tonic-gate  * unable to find any probe targets, it will transition to the PI_NOTARGETS
1177c478bd9Sstevel@tonic-gate  * state, which indicates that the link is apparently functional but that
1187c478bd9Sstevel@tonic-gate  * in.mpathd is unable to send probes to verify functionality (in this case,
1197c478bd9Sstevel@tonic-gate  * in.mpathd makes the optimistic assumption that the interface is working
1207c478bd9Sstevel@tonic-gate  * correctly and thus does not perform a failover, but reports the interface
1217c478bd9Sstevel@tonic-gate  * as IPMP_IF_UNKNOWN through the async events and query interfaces).
1227c478bd9Sstevel@tonic-gate  *
1237c478bd9Sstevel@tonic-gate  * At any point, a phyint may be administratively marked offline via if_mpadm.
1247c478bd9Sstevel@tonic-gate  * In this case, the interface always transitions to PI_OFFLINE, regardless
1257c478bd9Sstevel@tonic-gate  * of its previous state.  When the interface is later brought back online,
1267c478bd9Sstevel@tonic-gate  * in.mpathd acts as if the interface is new (and thus it transitions to
1277c478bd9Sstevel@tonic-gate  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
1287c478bd9Sstevel@tonic-gate  * its probes, if probes are sent).
1297c478bd9Sstevel@tonic-gate  *
1307c478bd9Sstevel@tonic-gate  * pi_state -  PI_RUNNING or PI_FAILED
1317c478bd9Sstevel@tonic-gate  *	PI_RUNNING: The failure detection logic says the phyint is good.
1327c478bd9Sstevel@tonic-gate  *	PI_FAILED: The failure detection logic says the phyint has failed.
1337c478bd9Sstevel@tonic-gate  *
1347c478bd9Sstevel@tonic-gate  * pg_groupfailed  - Group failure, all interfaces in the group have failed.
1357c478bd9Sstevel@tonic-gate  *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets and multicast to the all hosts address to reconstruct
 *	the target list, so the phyints are in the PI_NOTARGETS state.
1417c478bd9Sstevel@tonic-gate  *
1427c478bd9Sstevel@tonic-gate  * I -	value of (pi_flags & IFF_INACTIVE)
143*49df4566Sethindra  *	IFF_INACTIVE: No failovers have been done to this phyint, from
144*49df4566Sethindra  *		other phyints. This phyint is inactive. Phyint can be a Standby.
145*49df4566Sethindra  *		When failback has been disabled (FAILOVER=no configured),
146*49df4566Sethindra  *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
147*49df4566Sethindra  *		is set when phyint subsequently recovers after a failure.
1487c478bd9Sstevel@tonic-gate  *
1497c478bd9Sstevel@tonic-gate  * pi_empty
1507c478bd9Sstevel@tonic-gate  *	This phyint has failed over successfully to another phyint, and
1517c478bd9Sstevel@tonic-gate  *	this phyint is currently "empty". It does not host any addresses or
1527c478bd9Sstevel@tonic-gate  *	multicast membership etc. This is the state of a phyint after a
1537c478bd9Sstevel@tonic-gate  *	failover from the phyint has completed successfully and no subsequent
1547c478bd9Sstevel@tonic-gate  *	'failover to' or 'failback to' has occurred on the phyint.
1557c478bd9Sstevel@tonic-gate  *	IP guarantees that no new logicals will be hosted nor any multicast
1567c478bd9Sstevel@tonic-gate  *	joins permitted on the phyint, since the phyint is either failed or
 *	inactive. That is, pi_empty being set implies that the phyint is
 *	either failed or inactive.
1597c478bd9Sstevel@tonic-gate  *
1607c478bd9Sstevel@tonic-gate  * pi_full
 *	The phyint hosts all of the addresses that it "owns". If the
 *	phyint was previously failed or inactive, failbacks to the phyint
 *	have completed successfully, i.e. no more failbacks to this phyint
 *	can produce any change in system state whatsoever.
1657c478bd9Sstevel@tonic-gate  *
1667c478bd9Sstevel@tonic-gate  * Not all 32 possible combinations of the above 5-tuple are possible.
1677c478bd9Sstevel@tonic-gate  * Furthermore some of the above combinations are transient. They may occur
1687c478bd9Sstevel@tonic-gate  * only because the failover or failback did not complete successfully. The
1697c478bd9Sstevel@tonic-gate  * failover/failback will be retried and eventually a stable state will be
1707c478bd9Sstevel@tonic-gate  * reached.
1717c478bd9Sstevel@tonic-gate  *
1727c478bd9Sstevel@tonic-gate  * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
1737c478bd9Sstevel@tonic-gate  * The following are the state machines. 'from' and 'to' are the src and
1747c478bd9Sstevel@tonic-gate  * dst of the failover/failback, below
1757c478bd9Sstevel@tonic-gate  *
1767c478bd9Sstevel@tonic-gate  *			pi_empty state machine
1777c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1787c478bd9Sstevel@tonic-gate  *	Event				State	->	New State
1797c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1807c478bd9Sstevel@tonic-gate  *	successful completion 		from.pi_empty = 0 -> from.pi_empty = 1
1817c478bd9Sstevel@tonic-gate  *	of failover
1827c478bd9Sstevel@tonic-gate  *
1837c478bd9Sstevel@tonic-gate  *	Initiate failover 		to.pi_empty = X   -> to.pi_empty = 0
1847c478bd9Sstevel@tonic-gate  *
1857c478bd9Sstevel@tonic-gate  * 	Initiate failback 		to.pi_empty = X   -> to.pi_empty = 0
1867c478bd9Sstevel@tonic-gate  *
1877c478bd9Sstevel@tonic-gate  * 	group failure			pi_empty = X	  -> pi_empty = 0
1887c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1897c478bd9Sstevel@tonic-gate  *
1907c478bd9Sstevel@tonic-gate  *			pi_full state machine
1917c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1927c478bd9Sstevel@tonic-gate  *	Event				State		  -> New State
1937c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1947c478bd9Sstevel@tonic-gate  *	successful completion		to.pi_full = 0    -> to.pi_full = 1
1957c478bd9Sstevel@tonic-gate  *	of failback from
1967c478bd9Sstevel@tonic-gate  *	each of the other phyints
1977c478bd9Sstevel@tonic-gate  *
1987c478bd9Sstevel@tonic-gate  *	Initiate failover 		from.pi_full = X  -> from.pi_full = 0
1997c478bd9Sstevel@tonic-gate  *
2007c478bd9Sstevel@tonic-gate  *	group failure			pi_full = X	  -> pi_full = 0
2017c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2027c478bd9Sstevel@tonic-gate  *
2037c478bd9Sstevel@tonic-gate  *			pi_state state machine
2047c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2057c478bd9Sstevel@tonic-gate  *	Event			State			New State
2067c478bd9Sstevel@tonic-gate  *				Action:
2077c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2087c478bd9Sstevel@tonic-gate  *	NIC failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
2097c478bd9Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
2107c478bd9Sstevel@tonic-gate  *				: failover from this phyint to another
2117c478bd9Sstevel@tonic-gate  *
212*49df4566Sethindra  *	NIC failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
2137c478bd9Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
2147c478bd9Sstevel@tonic-gate  *
215*49df4566Sethindra  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=yes)
216*49df4566Sethindra  *	detection				     -> (PI_RUNNING, I == 0)
217*49df4566Sethindra  *				: to.pi_empty = 0
2187c478bd9Sstevel@tonic-gate  *				: clear IFF_FAILED on this phyint
219*49df4566Sethindra  *				: failback to this phyint if enabled
2207c478bd9Sstevel@tonic-gate  *
221*49df4566Sethindra  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=no)
222*49df4566Sethindra  *	detection				     ->	(PI_RUNNING, I == 1)
223*49df4566Sethindra  *				: to.pi_empty = 0
224*49df4566Sethindra  *				: clear IFF_FAILED on this phyint
225*49df4566Sethindra  *				: if failback is disabled set I == 1
2267c478bd9Sstevel@tonic-gate  *
2277c478bd9Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
2287c478bd9Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_FAILED
2297c478bd9Sstevel@tonic-gate  *	(Router targets)	: set IFF_FAILED
2307c478bd9Sstevel@tonic-gate  *				: clear pi_empty and pi_full
2317c478bd9Sstevel@tonic-gate  *
2327c478bd9Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
2337c478bd9Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_NOTARGETS
2347c478bd9Sstevel@tonic-gate  *	(Host targets)		: set IFF_FAILED
2357c478bd9Sstevel@tonic-gate  *				: clear pi_empty and pi_full
2367c478bd9Sstevel@tonic-gate  *				: delete the target list on all phyints
2377c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2387c478bd9Sstevel@tonic-gate  *
2397c478bd9Sstevel@tonic-gate  *			I state machine
2407c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2417c478bd9Sstevel@tonic-gate  *	Event		State			Action:
2427c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
243*49df4566Sethindra  *	Turn on I 	pi_empty == 0, STANDBY 	: failover from standby
2447c478bd9Sstevel@tonic-gate  *
245*49df4566Sethindra  *	Turn off I 	PI_RUNNING, STANDBY	: pi_empty = 0
2467c478bd9Sstevel@tonic-gate  *			pi_full == 0		: failback to this if enabled
2477c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2487c478bd9Sstevel@tonic-gate  *
2497c478bd9Sstevel@tonic-gate  * Assertions: (Read '==>' as implies)
2507c478bd9Sstevel@tonic-gate  *
2517c478bd9Sstevel@tonic-gate  * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
2527c478bd9Sstevel@tonic-gate  * (pi_empty == 1) ==> (pi_full == 0)
2537c478bd9Sstevel@tonic-gate  * (pi_full  == 1) ==> (pi_empty == 0)
2547c478bd9Sstevel@tonic-gate  *
2557c478bd9Sstevel@tonic-gate  * Invariants
2567c478bd9Sstevel@tonic-gate  *
2577c478bd9Sstevel@tonic-gate  * pg_groupfailed = 0  &&
258*49df4566Sethindra  *   1. (I == 1, pi_empty == 0)		   ==> initiate failover from standby
2597c478bd9Sstevel@tonic-gate  *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
2607c478bd9Sstevel@tonic-gate  *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
2617c478bd9Sstevel@tonic-gate  *
2627c478bd9Sstevel@tonic-gate  * 1. says that an inactive standby, that is not empty, has to be failed
2637c478bd9Sstevel@tonic-gate  * over. For a standby to be truly inactive, it should not host any
2647c478bd9Sstevel@tonic-gate  * addresses. So we move them to some other phyint. Usually we catch the
2657c478bd9Sstevel@tonic-gate  * turn on of IFF_INACTIVE, and perform this action. However if the failover
2667c478bd9Sstevel@tonic-gate  * did not complete successfully, then subsequently we have lost the edge
2677c478bd9Sstevel@tonic-gate  * trigger, and this invariant kicks in and completes the action.
2687c478bd9Sstevel@tonic-gate  *
2697c478bd9Sstevel@tonic-gate  * 2. says that any failed phyint that is not empty must be failed over.
2707c478bd9Sstevel@tonic-gate  * Usually we do the failover when we detect NIC failure. However if the
2717c478bd9Sstevel@tonic-gate  * failover does not complete successfully, this invariant kicks in and
2727c478bd9Sstevel@tonic-gate  * completes the failover. We exclude inactive standby which is covered by 1.
2737c478bd9Sstevel@tonic-gate  *
2747c478bd9Sstevel@tonic-gate  * 3. says that any running phyint that is not full must be failed back.
2757c478bd9Sstevel@tonic-gate  * Usually we do the failback when we detect NIC repair. However if the
2767c478bd9Sstevel@tonic-gate  * failback does not complete successfully, this invariant kicks in and
2777c478bd9Sstevel@tonic-gate  * completes the failback. Note that we don't want to failback to an inactive
2787c478bd9Sstevel@tonic-gate  * standby.
2797c478bd9Sstevel@tonic-gate  *
2807c478bd9Sstevel@tonic-gate  * The invariants 1 - 3 and the actions are in initifs().
2817c478bd9Sstevel@tonic-gate  */
2827c478bd9Sstevel@tonic-gate 
2837c478bd9Sstevel@tonic-gate struct probes_missed probes_missed;
2847c478bd9Sstevel@tonic-gate 
2857c478bd9Sstevel@tonic-gate /*
2867c478bd9Sstevel@tonic-gate  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
2877c478bd9Sstevel@tonic-gate  * will be added on by the kernel.  The id field identifies this phyint.
2887c478bd9Sstevel@tonic-gate  * and the sequence number is an increasing (modulo 2^^16) integer. The data
2897c478bd9Sstevel@tonic-gate  * portion holds the time value when the packet is sent. On echo this is
2907c478bd9Sstevel@tonic-gate  * extracted to compute the round-trip time. Three different types of
2917c478bd9Sstevel@tonic-gate  * probe packets are used.
2927c478bd9Sstevel@tonic-gate  *
2937c478bd9Sstevel@tonic-gate  * PROBE_UNI: This type is used to do failure detection / failure recovery
2947c478bd9Sstevel@tonic-gate  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
2957c478bd9Sstevel@tonic-gate  *	not less than the current CRTT. pii_probes[] stores data
2967c478bd9Sstevel@tonic-gate  *	about these probes. These packets consume sequence number space.
2977c478bd9Sstevel@tonic-gate  *
2987c478bd9Sstevel@tonic-gate  * PROBE_RTT: This type is used to make only rtt measurments. Normally these
2997c478bd9Sstevel@tonic-gate  * 	are not used. Under heavy network load, the rtt may go up very high,
3007c478bd9Sstevel@tonic-gate  *	due to a spike, or may appear to go high, due to extreme scheduling
3017c478bd9Sstevel@tonic-gate  * 	delays. Once the network stress is removed, mpathd takes long time to
3027c478bd9Sstevel@tonic-gate  *	recover, because the probe_interval is already high, and it takes
3037c478bd9Sstevel@tonic-gate  *	a long time to send out sufficient number of probes to bring down the
3047c478bd9Sstevel@tonic-gate  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
3057c478bd9Sstevel@tonic-gate  *	user_probe_interval ms. and will cause only rtt updates. These packets
3067c478bd9Sstevel@tonic-gate  *	do not consume sequence number space nor is information about these
3077c478bd9Sstevel@tonic-gate  *	packets stored in the pii_probes[]
3087c478bd9Sstevel@tonic-gate  *
3097c478bd9Sstevel@tonic-gate  * PROBE_MULTI: This type is only used to construct a list of targets, when
3107c478bd9Sstevel@tonic-gate  *	no targets are known. The packet is multicast to the all hosts addr.
3117c478bd9Sstevel@tonic-gate  */
3127c478bd9Sstevel@tonic-gate static void
3137c478bd9Sstevel@tonic-gate probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
3147c478bd9Sstevel@tonic-gate {
3157c478bd9Sstevel@tonic-gate 	struct pr_icmp probe_pkt;	/* Probe packet */
3167c478bd9Sstevel@tonic-gate 	struct sockaddr_in6 whereto6; 	/* target address IPv6 */
3177c478bd9Sstevel@tonic-gate 	struct sockaddr_in whereto; 	/* target address IPv4 */
3187c478bd9Sstevel@tonic-gate 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
3197c478bd9Sstevel@tonic-gate 	boolean_t sent = _B_TRUE;
3207c478bd9Sstevel@tonic-gate 
3217c478bd9Sstevel@tonic-gate 	if (debug & D_TARGET) {
3227c478bd9Sstevel@tonic-gate 		logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
3237c478bd9Sstevel@tonic-gate 		    pii->pii_name, probe_type, cur_time);
3247c478bd9Sstevel@tonic-gate 	}
3257c478bd9Sstevel@tonic-gate 
3267c478bd9Sstevel@tonic-gate 	assert(pii->pii_probe_sock != -1);
3277c478bd9Sstevel@tonic-gate 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
3287c478bd9Sstevel@tonic-gate 	    probe_type == PROBE_RTT);
3297c478bd9Sstevel@tonic-gate 
3307c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
3317c478bd9Sstevel@tonic-gate 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
3327c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_code = 0;
3337c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_cksum = 0;
3347c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
3357c478bd9Sstevel@tonic-gate 
3367c478bd9Sstevel@tonic-gate 	/*
3377c478bd9Sstevel@tonic-gate 	 * Since there is no need to do arithmetic on the icmpid,
3387c478bd9Sstevel@tonic-gate 	 * (only equality check is done) pii_icmpid is stored in
3397c478bd9Sstevel@tonic-gate 	 * network byte order at initialization itself.
3407c478bd9Sstevel@tonic-gate 	 */
3417c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
3427c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_timestamp = htonl(cur_time);
3437c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
3447c478bd9Sstevel@tonic-gate 
3457c478bd9Sstevel@tonic-gate 	/*
3467c478bd9Sstevel@tonic-gate 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
3477c478bd9Sstevel@tonic-gate 	 * the all hosts address. Otherwise it is unicast to the next target.
3487c478bd9Sstevel@tonic-gate 	 */
3497c478bd9Sstevel@tonic-gate 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
3507c478bd9Sstevel@tonic-gate 	    pii->pii_rtt_target_next != NULL));
3517c478bd9Sstevel@tonic-gate 
3527c478bd9Sstevel@tonic-gate 	if (pii->pii_af == AF_INET6) {
3537c478bd9Sstevel@tonic-gate 		bzero(&whereto6, sizeof (whereto6));
3547c478bd9Sstevel@tonic-gate 		whereto6.sin6_family = AF_INET6;
3557c478bd9Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
3567c478bd9Sstevel@tonic-gate 			whereto6.sin6_addr = all_nodes_mcast_v6;
3577c478bd9Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
3587c478bd9Sstevel@tonic-gate 			whereto6.sin6_addr = pii->pii_target_next->tg_address;
3597c478bd9Sstevel@tonic-gate 		} else  {
3607c478bd9Sstevel@tonic-gate 			/* type is PROBE_RTT */
3617c478bd9Sstevel@tonic-gate 			whereto6.sin6_addr =
3627c478bd9Sstevel@tonic-gate 			    pii->pii_rtt_target_next->tg_address;
3637c478bd9Sstevel@tonic-gate 		}
3647c478bd9Sstevel@tonic-gate 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
3657c478bd9Sstevel@tonic-gate 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
3667c478bd9Sstevel@tonic-gate 		    sizeof (whereto6)) != sizeof (probe_pkt)) {
3677c478bd9Sstevel@tonic-gate 			logperror_pii(pii, "probe: probe sendto");
3687c478bd9Sstevel@tonic-gate 			sent = _B_FALSE;
3697c478bd9Sstevel@tonic-gate 		}
3707c478bd9Sstevel@tonic-gate 	} else {
3717c478bd9Sstevel@tonic-gate 		bzero(&whereto, sizeof (whereto));
3727c478bd9Sstevel@tonic-gate 		whereto.sin_family = AF_INET;
3737c478bd9Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
3747c478bd9Sstevel@tonic-gate 			whereto.sin_addr = all_nodes_mcast_v4;
3757c478bd9Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
3767c478bd9Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
3777c478bd9Sstevel@tonic-gate 			    &pii->pii_target_next->tg_address,
3787c478bd9Sstevel@tonic-gate 			    &whereto.sin_addr);
3797c478bd9Sstevel@tonic-gate 		} else {
3807c478bd9Sstevel@tonic-gate 			/* type is PROBE_RTT */
3817c478bd9Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
3827c478bd9Sstevel@tonic-gate 			    &pii->pii_rtt_target_next->tg_address,
3837c478bd9Sstevel@tonic-gate 			    &whereto.sin_addr);
3847c478bd9Sstevel@tonic-gate 		}
3857c478bd9Sstevel@tonic-gate 
3867c478bd9Sstevel@tonic-gate 		/*
3877c478bd9Sstevel@tonic-gate 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
3887c478bd9Sstevel@tonic-gate 		 */
3897c478bd9Sstevel@tonic-gate 		probe_pkt.pr_icmp_cksum =
3907c478bd9Sstevel@tonic-gate 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
3917c478bd9Sstevel@tonic-gate 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
3927c478bd9Sstevel@tonic-gate 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
3937c478bd9Sstevel@tonic-gate 		    sizeof (whereto)) != sizeof (probe_pkt)) {
3947c478bd9Sstevel@tonic-gate 			logperror_pii(pii, "probe: probe sendto");
3957c478bd9Sstevel@tonic-gate 			sent = _B_FALSE;
3967c478bd9Sstevel@tonic-gate 		}
3977c478bd9Sstevel@tonic-gate 	}
3987c478bd9Sstevel@tonic-gate 
3997c478bd9Sstevel@tonic-gate 	/*
4007c478bd9Sstevel@tonic-gate 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
4017c478bd9Sstevel@tonic-gate 	 * update our tables. We will need this info in processing the probe
4027c478bd9Sstevel@tonic-gate 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
4037c478bd9Sstevel@tonic-gate 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
4047c478bd9Sstevel@tonic-gate 	 * are only used to construct a list of targets. PROBE_RTT packets are
4057c478bd9Sstevel@tonic-gate 	 * used only for updating the rtt and not for failure detection.
4067c478bd9Sstevel@tonic-gate 	 */
4077c478bd9Sstevel@tonic-gate 	if (probe_type == PROBE_UNI && sent) {
4087c478bd9Sstevel@tonic-gate 		pr_ndx = pii->pii_probe_next;
4097c478bd9Sstevel@tonic-gate 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
4107c478bd9Sstevel@tonic-gate 
4117c478bd9Sstevel@tonic-gate 		/* Collect statistics, before we reuse the last slot. */
4127c478bd9Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
4137c478bd9Sstevel@tonic-gate 			pii->pii_cum_stats.lost++;
4147c478bd9Sstevel@tonic-gate 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
4157c478bd9Sstevel@tonic-gate 			pii->pii_cum_stats.acked++;
4167c478bd9Sstevel@tonic-gate 		pii->pii_cum_stats.sent++;
4177c478bd9Sstevel@tonic-gate 
4187c478bd9Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
4197c478bd9Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
4207c478bd9Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
4217c478bd9Sstevel@tonic-gate 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
4227c478bd9Sstevel@tonic-gate 		pii->pii_target_next = target_next(pii->pii_target_next);
4237c478bd9Sstevel@tonic-gate 		assert(pii->pii_target_next != NULL);
4247c478bd9Sstevel@tonic-gate 		/*
4257c478bd9Sstevel@tonic-gate 		 * If we have a single variable to denote the next target to
4267c478bd9Sstevel@tonic-gate 		 * probe for both rtt probes and failure detection probes, we
4277c478bd9Sstevel@tonic-gate 		 * could end up with a situation where the failure detection
4287c478bd9Sstevel@tonic-gate 		 * probe targets become disjoint from the rtt probe targets.
4297c478bd9Sstevel@tonic-gate 		 * Eg. if 2 targets and the actual fdt is double the user
4307c478bd9Sstevel@tonic-gate 		 * specified fdt. So we have 2 variables. In this scheme
4317c478bd9Sstevel@tonic-gate 		 * we also reset pii_rtt_target_next for every fdt probe,
4327c478bd9Sstevel@tonic-gate 		 * though that may not be necessary.
4337c478bd9Sstevel@tonic-gate 		 */
4347c478bd9Sstevel@tonic-gate 		pii->pii_rtt_target_next = pii->pii_target_next;
4357c478bd9Sstevel@tonic-gate 		pii->pii_snxt++;
4367c478bd9Sstevel@tonic-gate 	} else if (probe_type == PROBE_RTT) {
4377c478bd9Sstevel@tonic-gate 		pii->pii_rtt_target_next =
4387c478bd9Sstevel@tonic-gate 		    target_next(pii->pii_rtt_target_next);
4397c478bd9Sstevel@tonic-gate 		assert(pii->pii_rtt_target_next != NULL);
4407c478bd9Sstevel@tonic-gate 	}
4417c478bd9Sstevel@tonic-gate }
4427c478bd9Sstevel@tonic-gate 
4437c478bd9Sstevel@tonic-gate /*
4447c478bd9Sstevel@tonic-gate  * Incoming IPv4 data from wire, is received here. Called from main.
4457c478bd9Sstevel@tonic-gate  */
4467c478bd9Sstevel@tonic-gate void
4477c478bd9Sstevel@tonic-gate in_data(struct phyint_instance *pii)
4487c478bd9Sstevel@tonic-gate {
4497c478bd9Sstevel@tonic-gate 	struct	sockaddr_in 	from;
4507c478bd9Sstevel@tonic-gate 	struct	in6_addr	fromaddr;
4517c478bd9Sstevel@tonic-gate 	uint_t	fromlen;
4527c478bd9Sstevel@tonic-gate 	static uint_t in_packet[(IP_MAXPACKET + 1)/4];
4537c478bd9Sstevel@tonic-gate 	struct ip *ip;
4547c478bd9Sstevel@tonic-gate 	int 	iphlen;
4557c478bd9Sstevel@tonic-gate 	int 	len;
4567c478bd9Sstevel@tonic-gate 	char 	abuf[INET_ADDRSTRLEN];
4577c478bd9Sstevel@tonic-gate 	struct	pr_icmp	*reply;
4587c478bd9Sstevel@tonic-gate 
4597c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE) {
4607c478bd9Sstevel@tonic-gate 		logdebug("in_data(%s %s)\n",
4617c478bd9Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
4627c478bd9Sstevel@tonic-gate 	}
4637c478bd9Sstevel@tonic-gate 
4647c478bd9Sstevel@tonic-gate 	/*
4657c478bd9Sstevel@tonic-gate 	 * Poll has already told us that a message is waiting,
4667c478bd9Sstevel@tonic-gate 	 * on this socket. Read it now. We should not block.
4677c478bd9Sstevel@tonic-gate 	 */
4687c478bd9Sstevel@tonic-gate 	fromlen = sizeof (from);
4697c478bd9Sstevel@tonic-gate 	len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
4707c478bd9Sstevel@tonic-gate 	    sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
4717c478bd9Sstevel@tonic-gate 	if (len < 0) {
4727c478bd9Sstevel@tonic-gate 		logperror_pii(pii, "in_data: recvfrom");
4737c478bd9Sstevel@tonic-gate 		return;
4747c478bd9Sstevel@tonic-gate 	}
4757c478bd9Sstevel@tonic-gate 
4767c478bd9Sstevel@tonic-gate 	/*
4777c478bd9Sstevel@tonic-gate 	 * If the NIC has indicated the link is down, don't go
4787c478bd9Sstevel@tonic-gate 	 * any further.
4797c478bd9Sstevel@tonic-gate 	 */
4807c478bd9Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
4817c478bd9Sstevel@tonic-gate 		return;
4827c478bd9Sstevel@tonic-gate 
4837c478bd9Sstevel@tonic-gate 	/* Get the printable address for error reporting */
4847c478bd9Sstevel@tonic-gate 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
4857c478bd9Sstevel@tonic-gate 
4867c478bd9Sstevel@tonic-gate 	/* Make sure packet contains at least minimum ICMP header */
4877c478bd9Sstevel@tonic-gate 	ip = (struct ip *)in_packet;
4887c478bd9Sstevel@tonic-gate 	iphlen = ip->ip_hl << 2;
4897c478bd9Sstevel@tonic-gate 	if (len < iphlen + ICMP_MINLEN) {
4907c478bd9Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
4917c478bd9Sstevel@tonic-gate 			logdebug("in_data: packet too short (%d bytes)"
4927c478bd9Sstevel@tonic-gate 			    " from %s\n", len, abuf);
4937c478bd9Sstevel@tonic-gate 		}
4947c478bd9Sstevel@tonic-gate 		return;
4957c478bd9Sstevel@tonic-gate 	}
4967c478bd9Sstevel@tonic-gate 
4977c478bd9Sstevel@tonic-gate 	/*
4987c478bd9Sstevel@tonic-gate 	 * Subtract the IP hdr length, 'len' will be length of the probe
4997c478bd9Sstevel@tonic-gate 	 * reply, starting from the icmp hdr.
5007c478bd9Sstevel@tonic-gate 	 */
5017c478bd9Sstevel@tonic-gate 	len -= iphlen;
5027c478bd9Sstevel@tonic-gate 	/* LINTED */
5037c478bd9Sstevel@tonic-gate 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
5047c478bd9Sstevel@tonic-gate 
5057c478bd9Sstevel@tonic-gate 	/* Probe replies are icmp echo replies. Ignore anything else */
5067c478bd9Sstevel@tonic-gate 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
5077c478bd9Sstevel@tonic-gate 		return;
5087c478bd9Sstevel@tonic-gate 
5097c478bd9Sstevel@tonic-gate 	/*
5107c478bd9Sstevel@tonic-gate 	 * The icmp id should match what we sent, which is stored
5117c478bd9Sstevel@tonic-gate 	 * in pi_icmpid. The icmp code for reply must be 0.
5127c478bd9Sstevel@tonic-gate 	 * The reply content must be a struct pr_icmp
5137c478bd9Sstevel@tonic-gate 	 */
5147c478bd9Sstevel@tonic-gate 	if (reply->pr_icmp_id != pii->pii_icmpid) {
5157c478bd9Sstevel@tonic-gate 		/* Not in response to our probe */
5167c478bd9Sstevel@tonic-gate 		return;
5177c478bd9Sstevel@tonic-gate 	}
5187c478bd9Sstevel@tonic-gate 
5197c478bd9Sstevel@tonic-gate 	if (reply->pr_icmp_code != 0) {
5207c478bd9Sstevel@tonic-gate 		logtrace("probe reply code %d from %s on %s\n",
5217c478bd9Sstevel@tonic-gate 		    reply->pr_icmp_code, abuf, pii->pii_name);
5227c478bd9Sstevel@tonic-gate 		return;
5237c478bd9Sstevel@tonic-gate 	}
5247c478bd9Sstevel@tonic-gate 
5257c478bd9Sstevel@tonic-gate 	if (len < sizeof (struct pr_icmp)) {
5267c478bd9Sstevel@tonic-gate 		logtrace("probe reply too short: %d bytes from %s on %s\n",
5277c478bd9Sstevel@tonic-gate 		    len, abuf, pii->pii_name);
5287c478bd9Sstevel@tonic-gate 		return;
5297c478bd9Sstevel@tonic-gate 	}
5307c478bd9Sstevel@tonic-gate 
5317c478bd9Sstevel@tonic-gate 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
5327c478bd9Sstevel@tonic-gate 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
5337c478bd9Sstevel@tonic-gate 		/* Unicast probe reply */
5347c478bd9Sstevel@tonic-gate 		incoming_echo_reply(pii, reply, fromaddr);
5357c478bd9Sstevel@tonic-gate 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
5367c478bd9Sstevel@tonic-gate 		/* Multicast reply */
5377c478bd9Sstevel@tonic-gate 		incoming_mcast_reply(pii, reply, fromaddr);
5387c478bd9Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
5397c478bd9Sstevel@tonic-gate 		incoming_rtt_reply(pii, reply, fromaddr);
5407c478bd9Sstevel@tonic-gate 	} else {
5417c478bd9Sstevel@tonic-gate 		/* Probably not in response to our probe */
5427c478bd9Sstevel@tonic-gate 		logtrace("probe reply type: %d from %s on %s\n",
5437c478bd9Sstevel@tonic-gate 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
5447c478bd9Sstevel@tonic-gate 		return;
5457c478bd9Sstevel@tonic-gate 	}
5467c478bd9Sstevel@tonic-gate 
5477c478bd9Sstevel@tonic-gate }
5487c478bd9Sstevel@tonic-gate 
5497c478bd9Sstevel@tonic-gate /*
5507c478bd9Sstevel@tonic-gate  * Incoming IPv6 data from wire is received here. Called from main.
5517c478bd9Sstevel@tonic-gate  */
5527c478bd9Sstevel@tonic-gate void
5537c478bd9Sstevel@tonic-gate in6_data(struct phyint_instance *pii)
5547c478bd9Sstevel@tonic-gate {
5557c478bd9Sstevel@tonic-gate 	struct sockaddr_in6 from;
5567c478bd9Sstevel@tonic-gate 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
5577c478bd9Sstevel@tonic-gate 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
5587c478bd9Sstevel@tonic-gate 	int len;
5597c478bd9Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
5607c478bd9Sstevel@tonic-gate 	struct msghdr msg;
5617c478bd9Sstevel@tonic-gate 	struct iovec iov;
5627c478bd9Sstevel@tonic-gate 	uchar_t *opt;
5637c478bd9Sstevel@tonic-gate 	struct	pr_icmp *reply;
5647c478bd9Sstevel@tonic-gate 
5657c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE) {
5667c478bd9Sstevel@tonic-gate 		logdebug("in6_data(%s %s)\n",
5677c478bd9Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
5687c478bd9Sstevel@tonic-gate 	}
5697c478bd9Sstevel@tonic-gate 
5707c478bd9Sstevel@tonic-gate 	iov.iov_base = (char *)in_packet;
5717c478bd9Sstevel@tonic-gate 	iov.iov_len = sizeof (in_packet);
5727c478bd9Sstevel@tonic-gate 	msg.msg_iov = &iov;
5737c478bd9Sstevel@tonic-gate 	msg.msg_iovlen = 1;
5747c478bd9Sstevel@tonic-gate 	msg.msg_name = (struct sockaddr *)&from;
5757c478bd9Sstevel@tonic-gate 	msg.msg_namelen = sizeof (from);
5767c478bd9Sstevel@tonic-gate 	msg.msg_control = ancillary_data;
5777c478bd9Sstevel@tonic-gate 	msg.msg_controllen = sizeof (ancillary_data);
5787c478bd9Sstevel@tonic-gate 
5797c478bd9Sstevel@tonic-gate 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
5807c478bd9Sstevel@tonic-gate 		logperror_pii(pii, "in6_data: recvfrom");
5817c478bd9Sstevel@tonic-gate 		return;
5827c478bd9Sstevel@tonic-gate 	}
5837c478bd9Sstevel@tonic-gate 
5847c478bd9Sstevel@tonic-gate 	/*
5857c478bd9Sstevel@tonic-gate 	 * If the NIC has indicated that the link is down, don't go
5867c478bd9Sstevel@tonic-gate 	 * any further.
5877c478bd9Sstevel@tonic-gate 	 */
5887c478bd9Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
5897c478bd9Sstevel@tonic-gate 		return;
5907c478bd9Sstevel@tonic-gate 
5917c478bd9Sstevel@tonic-gate 	/* Get the printable address for error reporting */
5927c478bd9Sstevel@tonic-gate 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
5937c478bd9Sstevel@tonic-gate 	if (len < ICMP_MINLEN) {
5947c478bd9Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
5957c478bd9Sstevel@tonic-gate 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
5967c478bd9Sstevel@tonic-gate 			    msg.msg_flags, abuf);
5977c478bd9Sstevel@tonic-gate 		}
5987c478bd9Sstevel@tonic-gate 		return;
5997c478bd9Sstevel@tonic-gate 	}
6007c478bd9Sstevel@tonic-gate 	/* Ignore packets > 64k or control buffers that don't fit */
6017c478bd9Sstevel@tonic-gate 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
6027c478bd9Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
6037c478bd9Sstevel@tonic-gate 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
6047c478bd9Sstevel@tonic-gate 			    msg.msg_flags, abuf);
6057c478bd9Sstevel@tonic-gate 		}
6067c478bd9Sstevel@tonic-gate 		return;
6077c478bd9Sstevel@tonic-gate 	}
6087c478bd9Sstevel@tonic-gate 
6097c478bd9Sstevel@tonic-gate 	reply = (struct pr_icmp *)in_packet;
6107c478bd9Sstevel@tonic-gate 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
6117c478bd9Sstevel@tonic-gate 		return;
6127c478bd9Sstevel@tonic-gate 
6137c478bd9Sstevel@tonic-gate 	if (reply->pr_icmp_id != pii->pii_icmpid) {
6147c478bd9Sstevel@tonic-gate 		/* Not in response to our probe */
6157c478bd9Sstevel@tonic-gate 		return;
6167c478bd9Sstevel@tonic-gate 	}
6177c478bd9Sstevel@tonic-gate 
6187c478bd9Sstevel@tonic-gate 	/*
6197c478bd9Sstevel@tonic-gate 	 * The kernel has already verified the the ICMP checksum.
6207c478bd9Sstevel@tonic-gate 	 */
6217c478bd9Sstevel@tonic-gate 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
6227c478bd9Sstevel@tonic-gate 		logtrace("ICMPv6 echo reply source address not linklocal from "
6237c478bd9Sstevel@tonic-gate 		    "%s on %s\n", abuf, pii->pii_name);
6247c478bd9Sstevel@tonic-gate 		return;
6257c478bd9Sstevel@tonic-gate 	}
6267c478bd9Sstevel@tonic-gate 	opt = find_ancillary(&msg, IPV6_RTHDR);
6277c478bd9Sstevel@tonic-gate 	if (opt != NULL) {
6287c478bd9Sstevel@tonic-gate 		/* Can't allow routing headers in probe replies  */
6297c478bd9Sstevel@tonic-gate 		logtrace("message with routing header from %s on %s\n",
6307c478bd9Sstevel@tonic-gate 		    abuf, pii->pii_name);
6317c478bd9Sstevel@tonic-gate 		return;
6327c478bd9Sstevel@tonic-gate 	}
6337c478bd9Sstevel@tonic-gate 	if (reply->pr_icmp_code != 0) {
6347c478bd9Sstevel@tonic-gate 		logtrace("probe reply code: %d from %s on %s\n",
6357c478bd9Sstevel@tonic-gate 		    reply->pr_icmp_code, abuf, pii->pii_name);
6367c478bd9Sstevel@tonic-gate 		return;
6377c478bd9Sstevel@tonic-gate 	}
6387c478bd9Sstevel@tonic-gate 	if (len < (sizeof (struct pr_icmp))) {
6397c478bd9Sstevel@tonic-gate 		logtrace("probe reply too short: %d bytes from %s on %s\n",
6407c478bd9Sstevel@tonic-gate 		    len, abuf, pii->pii_name);
6417c478bd9Sstevel@tonic-gate 		return;
6427c478bd9Sstevel@tonic-gate 	}
6437c478bd9Sstevel@tonic-gate 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
6447c478bd9Sstevel@tonic-gate 		incoming_echo_reply(pii, reply, from.sin6_addr);
6457c478bd9Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
6467c478bd9Sstevel@tonic-gate 		incoming_mcast_reply(pii, reply, from.sin6_addr);
6477c478bd9Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
6487c478bd9Sstevel@tonic-gate 		incoming_rtt_reply(pii, reply, from.sin6_addr);
6497c478bd9Sstevel@tonic-gate 	} else  {
6507c478bd9Sstevel@tonic-gate 		/* Probably not in response to our probe */
6517c478bd9Sstevel@tonic-gate 		logtrace("probe reply type: %d from %s on %s\n",
6527c478bd9Sstevel@tonic-gate 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
6537c478bd9Sstevel@tonic-gate 	}
6547c478bd9Sstevel@tonic-gate }
6557c478bd9Sstevel@tonic-gate 
6567c478bd9Sstevel@tonic-gate /*
6577c478bd9Sstevel@tonic-gate  * Process the incoming rtt reply, in response to our rtt probe.
6587c478bd9Sstevel@tonic-gate  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
6597c478bd9Sstevel@tonic-gate  * have any stored information about the probe we sent. So we don't log
6607c478bd9Sstevel@tonic-gate  * any errors if we receive bad replies.
6617c478bd9Sstevel@tonic-gate  */
6627c478bd9Sstevel@tonic-gate static void
6637c478bd9Sstevel@tonic-gate incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
6647c478bd9Sstevel@tonic-gate     struct in6_addr fromaddr)
6657c478bd9Sstevel@tonic-gate {
6667c478bd9Sstevel@tonic-gate 	int 	m;		/* rtt measurment in ms */
6677c478bd9Sstevel@tonic-gate 	uint32_t cur_time;	/* in ms from some arbitrary point */
6687c478bd9Sstevel@tonic-gate 	char	abuf[INET6_ADDRSTRLEN];
6697c478bd9Sstevel@tonic-gate 	struct	target	*target;
6707c478bd9Sstevel@tonic-gate 	uint32_t pr_icmp_timestamp;
6717c478bd9Sstevel@tonic-gate 	struct 	phyint_group *pg;
6727c478bd9Sstevel@tonic-gate 
6737c478bd9Sstevel@tonic-gate 	/* Get the printable address for error reporting */
6747c478bd9Sstevel@tonic-gate 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
6757c478bd9Sstevel@tonic-gate 
6767c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE) {
6777c478bd9Sstevel@tonic-gate 		logdebug("incoming_rtt_reply: %s %s %s\n",
6787c478bd9Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
6797c478bd9Sstevel@tonic-gate 	}
6807c478bd9Sstevel@tonic-gate 
6817c478bd9Sstevel@tonic-gate 	/* Do we know this target ? */
6827c478bd9Sstevel@tonic-gate 	target = target_lookup(pii, fromaddr);
6837c478bd9Sstevel@tonic-gate 	if (target == NULL)
6847c478bd9Sstevel@tonic-gate 		return;
6857c478bd9Sstevel@tonic-gate 
6867c478bd9Sstevel@tonic-gate 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
6877c478bd9Sstevel@tonic-gate 	cur_time = getcurrenttime();
6887c478bd9Sstevel@tonic-gate 	m = (int)(cur_time - pr_icmp_timestamp);
6897c478bd9Sstevel@tonic-gate 
6907c478bd9Sstevel@tonic-gate 	/* Invalid rtt. It has wrapped around */
6917c478bd9Sstevel@tonic-gate 	if (m < 0)
6927c478bd9Sstevel@tonic-gate 		return;
6937c478bd9Sstevel@tonic-gate 
6947c478bd9Sstevel@tonic-gate 	/*
6957c478bd9Sstevel@tonic-gate 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
6967c478bd9Sstevel@tonic-gate 	 * The initial few responses after the interface is repaired may
6977c478bd9Sstevel@tonic-gate 	 * contain high rtt's because they could have been queued up waiting
6987c478bd9Sstevel@tonic-gate 	 * for ARP/NDP resolution on a failed interface.
6997c478bd9Sstevel@tonic-gate 	 */
7007c478bd9Sstevel@tonic-gate 	pg = pii->pii_phyint->pi_group;
7017c478bd9Sstevel@tonic-gate 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
7027c478bd9Sstevel@tonic-gate 		return;
7037c478bd9Sstevel@tonic-gate 
7047c478bd9Sstevel@tonic-gate 	/*
7057c478bd9Sstevel@tonic-gate 	 * Update rtt only if the new rtt is lower than the current rtt.
7067c478bd9Sstevel@tonic-gate 	 * (specified by the 3rd parameter to pi_set_crtt).
7077c478bd9Sstevel@tonic-gate 	 * If a spike has caused the current probe_interval to be >
7087c478bd9Sstevel@tonic-gate 	 * user_probe_interval, then this mechanism is used to bring down
7097c478bd9Sstevel@tonic-gate 	 * the rtt rapidly once the network stress is removed.
7107c478bd9Sstevel@tonic-gate 	 * If the new rtt is higher than the current rtt, we don't want to
7117c478bd9Sstevel@tonic-gate 	 * update the rtt. We are having more than 1 outstanding probe and
7127c478bd9Sstevel@tonic-gate 	 * the increase in rtt we are seeing is being unnecessarily weighted
7137c478bd9Sstevel@tonic-gate 	 * many times. The regular rtt update will be handled by
7147c478bd9Sstevel@tonic-gate 	 * incoming_echo_reply() and will take care of any rtt increase.
7157c478bd9Sstevel@tonic-gate 	 */
7167c478bd9Sstevel@tonic-gate 	pi_set_crtt(target, m, _B_FALSE);
7177c478bd9Sstevel@tonic-gate 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
7187c478bd9Sstevel@tonic-gate 	    (user_failure_detection_time < pg->pg_fdt) &&
7197c478bd9Sstevel@tonic-gate 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
7207c478bd9Sstevel@tonic-gate 		/*
7217c478bd9Sstevel@tonic-gate 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
7227c478bd9Sstevel@tonic-gate 		 * investigate if we can improve the failure detection time to
7237c478bd9Sstevel@tonic-gate 		 * meet whatever the user specified.
7247c478bd9Sstevel@tonic-gate 		 */
7257c478bd9Sstevel@tonic-gate 		if (check_pg_crtt_improved(pg)) {
7267c478bd9Sstevel@tonic-gate 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
7277c478bd9Sstevel@tonic-gate 			    user_failure_detection_time);
7287c478bd9Sstevel@tonic-gate 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
7297c478bd9Sstevel@tonic-gate 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
7307c478bd9Sstevel@tonic-gate 				logerr("Improved failure detection time %d ms "
7317c478bd9Sstevel@tonic-gate 				    "on (%s %s) for group \"%s\"\n",
7327c478bd9Sstevel@tonic-gate 				    pg->pg_fdt, AF_STR(pii->pii_af),
7337c478bd9Sstevel@tonic-gate 				    pii->pii_name,
7347c478bd9Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_name);
7357c478bd9Sstevel@tonic-gate 			}
7367c478bd9Sstevel@tonic-gate 			if (user_failure_detection_time == pg->pg_fdt) {
7377c478bd9Sstevel@tonic-gate 				/* Avoid any truncation or rounding errors */
7387c478bd9Sstevel@tonic-gate 				pg->pg_probeint = user_probe_interval;
7397c478bd9Sstevel@tonic-gate 				/*
7407c478bd9Sstevel@tonic-gate 				 * No more rtt probes will be sent. The actual
7417c478bd9Sstevel@tonic-gate 				 * fdt has dropped to the user specified value.
7427c478bd9Sstevel@tonic-gate 				 * pii_fd_snxt_basetime and pii_snxt_basetime
7437c478bd9Sstevel@tonic-gate 				 * will be in sync henceforth.
7447c478bd9Sstevel@tonic-gate 				 */
7457c478bd9Sstevel@tonic-gate 				reset_snxt_basetimes();
7467c478bd9Sstevel@tonic-gate 			}
7477c478bd9Sstevel@tonic-gate 		}
7487c478bd9Sstevel@tonic-gate 	}
7497c478bd9Sstevel@tonic-gate }
7507c478bd9Sstevel@tonic-gate 
7517c478bd9Sstevel@tonic-gate /*
7527c478bd9Sstevel@tonic-gate  * Process the incoming echo reply, in response to our unicast probe.
7537c478bd9Sstevel@tonic-gate  * Common for both IPv4 and IPv6
7547c478bd9Sstevel@tonic-gate  */
7557c478bd9Sstevel@tonic-gate static void
7567c478bd9Sstevel@tonic-gate incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
7577c478bd9Sstevel@tonic-gate     struct in6_addr fromaddr)
7587c478bd9Sstevel@tonic-gate {
7597c478bd9Sstevel@tonic-gate 	int 	m;		/* rtt measurment in ms */
7607c478bd9Sstevel@tonic-gate 	uint32_t cur_time;	/* in ms from some arbitrary point */
7617c478bd9Sstevel@tonic-gate 	char	abuf[INET6_ADDRSTRLEN];
7627c478bd9Sstevel@tonic-gate 	int	pr_ndx;
7637c478bd9Sstevel@tonic-gate 	struct	target	*target;
7647c478bd9Sstevel@tonic-gate 	boolean_t exception;
7657c478bd9Sstevel@tonic-gate 	uint32_t pr_icmp_timestamp;
7667c478bd9Sstevel@tonic-gate 	uint16_t pr_icmp_seq;
7677c478bd9Sstevel@tonic-gate 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
7687c478bd9Sstevel@tonic-gate 
7697c478bd9Sstevel@tonic-gate 	/* Get the printable address for error reporting */
7707c478bd9Sstevel@tonic-gate 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
7717c478bd9Sstevel@tonic-gate 
7727c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE) {
7737c478bd9Sstevel@tonic-gate 		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
7747c478bd9Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
7757c478bd9Sstevel@tonic-gate 		    ntohs(reply->pr_icmp_seq));
7767c478bd9Sstevel@tonic-gate 	}
7777c478bd9Sstevel@tonic-gate 
7787c478bd9Sstevel@tonic-gate 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
7797c478bd9Sstevel@tonic-gate 	pr_icmp_seq  = ntohs(reply->pr_icmp_seq);
7807c478bd9Sstevel@tonic-gate 
7817c478bd9Sstevel@tonic-gate 	/* Reject out of window probe replies */
7827c478bd9Sstevel@tonic-gate 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
7837c478bd9Sstevel@tonic-gate 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
7847c478bd9Sstevel@tonic-gate 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
7857c478bd9Sstevel@tonic-gate 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
7867c478bd9Sstevel@tonic-gate 		pii->pii_cum_stats.unknown++;
7877c478bd9Sstevel@tonic-gate 		return;
7887c478bd9Sstevel@tonic-gate 	}
7897c478bd9Sstevel@tonic-gate 	cur_time = getcurrenttime();
7907c478bd9Sstevel@tonic-gate 	m = (int)(cur_time - pr_icmp_timestamp);
7917c478bd9Sstevel@tonic-gate 	if (m < 0) {
7927c478bd9Sstevel@tonic-gate 		/*
7937c478bd9Sstevel@tonic-gate 		 * This is a ridiculously high value of rtt. rtt has wrapped
7947c478bd9Sstevel@tonic-gate 		 * around. Log a message, and ignore the rtt.
7957c478bd9Sstevel@tonic-gate 		 */
7967c478bd9Sstevel@tonic-gate 		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
7977c478bd9Sstevel@tonic-gate 		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
7987c478bd9Sstevel@tonic-gate 	}
7997c478bd9Sstevel@tonic-gate 
8007c478bd9Sstevel@tonic-gate 	/*
8017c478bd9Sstevel@tonic-gate 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
8027c478bd9Sstevel@tonic-gate 	 * number in our pii->pii_probes[] array. The icmp sequence number
8037c478bd9Sstevel@tonic-gate 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
8047c478bd9Sstevel@tonic-gate 	 */
8057c478bd9Sstevel@tonic-gate 	pr_ndx = MOD_SUB(pii->pii_probe_next,
8067c478bd9Sstevel@tonic-gate 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
8077c478bd9Sstevel@tonic-gate 
8087c478bd9Sstevel@tonic-gate 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
8097c478bd9Sstevel@tonic-gate 
8107c478bd9Sstevel@tonic-gate 	target = pii->pii_probes[pr_ndx].pr_target;
8117c478bd9Sstevel@tonic-gate 
8127c478bd9Sstevel@tonic-gate 	/*
8137c478bd9Sstevel@tonic-gate 	 * Perform sanity checks, whether this probe reply that we
8147c478bd9Sstevel@tonic-gate 	 * have received is genuine
8157c478bd9Sstevel@tonic-gate 	 */
8167c478bd9Sstevel@tonic-gate 	if (target != NULL) {
8177c478bd9Sstevel@tonic-gate 		/*
8187c478bd9Sstevel@tonic-gate 		 * Compare the src. addr of the received ICMP or ICMPv6
8197c478bd9Sstevel@tonic-gate 		 * probe reply with the target address in our tables.
8207c478bd9Sstevel@tonic-gate 		 */
8217c478bd9Sstevel@tonic-gate 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
8227c478bd9Sstevel@tonic-gate 			/*
8237c478bd9Sstevel@tonic-gate 			 * We don't have any record of having sent a probe to
8247c478bd9Sstevel@tonic-gate 			 * this target. This is a fake probe reply. Log an error
8257c478bd9Sstevel@tonic-gate 			 */
8267c478bd9Sstevel@tonic-gate 			logtrace("probe status %d Fake probe reply seq %u "
8277c478bd9Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
8287c478bd9Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
8297c478bd9Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
8307c478bd9Sstevel@tonic-gate 			pii->pii_cum_stats.unknown++;
8317c478bd9Sstevel@tonic-gate 			return;
8327c478bd9Sstevel@tonic-gate 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
8337c478bd9Sstevel@tonic-gate 			/*
8347c478bd9Sstevel@tonic-gate 			 * The address matches, but our tables indicate that
8357c478bd9Sstevel@tonic-gate 			 * this probe reply has been acked already. So this
8367c478bd9Sstevel@tonic-gate 			 * is a duplicate probe reply. Log an error
8377c478bd9Sstevel@tonic-gate 			 */
8387c478bd9Sstevel@tonic-gate 			logtrace("probe status %d Duplicate probe reply seq %u "
8397c478bd9Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
8407c478bd9Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
8417c478bd9Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
8427c478bd9Sstevel@tonic-gate 			pii->pii_cum_stats.unknown++;
8437c478bd9Sstevel@tonic-gate 			return;
8447c478bd9Sstevel@tonic-gate 		}
8457c478bd9Sstevel@tonic-gate 	} else {
8467c478bd9Sstevel@tonic-gate 		/*
8477c478bd9Sstevel@tonic-gate 		 * Target must not be NULL in the PR_UNACKED state
8487c478bd9Sstevel@tonic-gate 		 */
8497c478bd9Sstevel@tonic-gate 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
8507c478bd9Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
8517c478bd9Sstevel@tonic-gate 			/*
8527c478bd9Sstevel@tonic-gate 			 * The probe stats slot is unused. So we didn't
8537c478bd9Sstevel@tonic-gate 			 * send out any probe to this target. This is a fake.
8547c478bd9Sstevel@tonic-gate 			 * Log an error.
8557c478bd9Sstevel@tonic-gate 			 */
8567c478bd9Sstevel@tonic-gate 			logtrace("probe status %d Fake probe reply seq %u "
8577c478bd9Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
8587c478bd9Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
8597c478bd9Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
8607c478bd9Sstevel@tonic-gate 		}
8617c478bd9Sstevel@tonic-gate 		pii->pii_cum_stats.unknown++;
8627c478bd9Sstevel@tonic-gate 		return;
8637c478bd9Sstevel@tonic-gate 	}
8647c478bd9Sstevel@tonic-gate 
8657c478bd9Sstevel@tonic-gate 	/*
8667c478bd9Sstevel@tonic-gate 	 * If the rtt does not appear to be right, don't update the
8677c478bd9Sstevel@tonic-gate 	 * rtt stats. This can happen if the system dropped into the
8687c478bd9Sstevel@tonic-gate 	 * debugger, or the system was hung or too busy for a
8697c478bd9Sstevel@tonic-gate 	 * substantial time that we didn't get a chance to run.
8707c478bd9Sstevel@tonic-gate 	 */
8717c478bd9Sstevel@tonic-gate 	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
8727c478bd9Sstevel@tonic-gate 		/*
8737c478bd9Sstevel@tonic-gate 		 * If the probe corresponding to this receieved response
8747c478bd9Sstevel@tonic-gate 		 * was truly sent 'm' ms. ago, then this response must
8757c478bd9Sstevel@tonic-gate 		 * have been rejected by the sequence number checks. The
8767c478bd9Sstevel@tonic-gate 		 * fact that it has passed the sequence number checks
8777c478bd9Sstevel@tonic-gate 		 * means that the measured rtt is wrong. We were probably
8787c478bd9Sstevel@tonic-gate 		 * scheduled long after the packet was received.
8797c478bd9Sstevel@tonic-gate 		 */
8807c478bd9Sstevel@tonic-gate 		goto out;
8817c478bd9Sstevel@tonic-gate 	}
8827c478bd9Sstevel@tonic-gate 
8837c478bd9Sstevel@tonic-gate 	/*
8847c478bd9Sstevel@tonic-gate 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
8857c478bd9Sstevel@tonic-gate 	 * The initial few responses after the interface is repaired may
8867c478bd9Sstevel@tonic-gate 	 * contain high rtt's because they could have been queued up waiting
8877c478bd9Sstevel@tonic-gate 	 * for ARP/NDP resolution on a failed interface.
8887c478bd9Sstevel@tonic-gate 	 */
8897c478bd9Sstevel@tonic-gate 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
8907c478bd9Sstevel@tonic-gate 		goto out;
8917c478bd9Sstevel@tonic-gate 
8927c478bd9Sstevel@tonic-gate 	/*
8937c478bd9Sstevel@tonic-gate 	 * Don't update the Conservative Round Trip Time estimate for this
8947c478bd9Sstevel@tonic-gate 	 * (phint, target) pair if this is the not the highest ack seq seen
8957c478bd9Sstevel@tonic-gate 	 * thus far on this target.
8967c478bd9Sstevel@tonic-gate 	 */
8977c478bd9Sstevel@tonic-gate 	if (!highest_ack_tg(pr_icmp_seq, target))
8987c478bd9Sstevel@tonic-gate 		goto out;
8997c478bd9Sstevel@tonic-gate 
9007c478bd9Sstevel@tonic-gate 	/*
9017c478bd9Sstevel@tonic-gate 	 * Always update the rtt. This is a failure detection probe
9027c478bd9Sstevel@tonic-gate 	 * and we want to measure both increase / decrease in rtt.
9037c478bd9Sstevel@tonic-gate 	 */
9047c478bd9Sstevel@tonic-gate 	pi_set_crtt(target, m, _B_TRUE);
9057c478bd9Sstevel@tonic-gate 
9067c478bd9Sstevel@tonic-gate 	/*
9077c478bd9Sstevel@tonic-gate 	 * If the crtt exceeds the average time between probes,
9087c478bd9Sstevel@tonic-gate 	 * investigate if this slow target is an exception. If so we
9097c478bd9Sstevel@tonic-gate 	 * can avoid this target and still meet the failure detection
9107c478bd9Sstevel@tonic-gate 	 * time. Otherwise we can't meet the failure detection time.
9117c478bd9Sstevel@tonic-gate 	 */
9127c478bd9Sstevel@tonic-gate 	if (target->tg_crtt > pg->pg_probeint) {
9137c478bd9Sstevel@tonic-gate 		exception = check_exception_target(pii, target);
9147c478bd9Sstevel@tonic-gate 		if (exception) {
9157c478bd9Sstevel@tonic-gate 			/*
9167c478bd9Sstevel@tonic-gate 			 * This target is exceptionally slow. Don't use it
9177c478bd9Sstevel@tonic-gate 			 * for future probes. check_exception_target() has
9187c478bd9Sstevel@tonic-gate 			 * made sure that we have at least MIN_PROBE_TARGETS
9197c478bd9Sstevel@tonic-gate 			 * other active targets
9207c478bd9Sstevel@tonic-gate 			 */
9217c478bd9Sstevel@tonic-gate 			if (pii->pii_targets_are_routers) {
9227c478bd9Sstevel@tonic-gate 				/*
9237c478bd9Sstevel@tonic-gate 				 * This is a slow router, mark it as slow
9247c478bd9Sstevel@tonic-gate 				 * and don't use it for further probes. We
9257c478bd9Sstevel@tonic-gate 				 * don't delete it, since it will be populated
9267c478bd9Sstevel@tonic-gate 				 * again when we do a router scan. Hence we
9277c478bd9Sstevel@tonic-gate 				 * need to maintain extra state (unlike the
9287c478bd9Sstevel@tonic-gate 				 * host case below).  Mark it as TG_SLOW.
9297c478bd9Sstevel@tonic-gate 				 */
9307c478bd9Sstevel@tonic-gate 				if (target->tg_status == TG_ACTIVE)
9317c478bd9Sstevel@tonic-gate 					pii->pii_ntargets--;
9327c478bd9Sstevel@tonic-gate 				target->tg_status = TG_SLOW;
9337c478bd9Sstevel@tonic-gate 				target->tg_latime = gethrtime();
9347c478bd9Sstevel@tonic-gate 				target->tg_rtt_sa = -1;
9357c478bd9Sstevel@tonic-gate 				target->tg_crtt = 0;
9367c478bd9Sstevel@tonic-gate 				target->tg_rtt_sd = 0;
9377c478bd9Sstevel@tonic-gate 				if (pii->pii_target_next == target) {
9387c478bd9Sstevel@tonic-gate 					pii->pii_target_next =
9397c478bd9Sstevel@tonic-gate 					    target_next(target);
9407c478bd9Sstevel@tonic-gate 				}
9417c478bd9Sstevel@tonic-gate 			} else {
9427c478bd9Sstevel@tonic-gate 				/*
9437c478bd9Sstevel@tonic-gate 				 * the slow target is not a router, we can
9447c478bd9Sstevel@tonic-gate 				 * just delete it. Send an icmp multicast and
9457c478bd9Sstevel@tonic-gate 				 * pick the fastest responder that is not
9467c478bd9Sstevel@tonic-gate 				 * already an active target. target_delete()
9477c478bd9Sstevel@tonic-gate 				 * adjusts pii->pii_target_next
9487c478bd9Sstevel@tonic-gate 				 */
9497c478bd9Sstevel@tonic-gate 				target_delete(target);
9507c478bd9Sstevel@tonic-gate 				probe(pii, PROBE_MULTI, cur_time);
9517c478bd9Sstevel@tonic-gate 			}
9527c478bd9Sstevel@tonic-gate 		} else {
9537c478bd9Sstevel@tonic-gate 			/*
9547c478bd9Sstevel@tonic-gate 			 * We can't meet the failure detection time.
9557c478bd9Sstevel@tonic-gate 			 * Log a message, and update the detection time to
9567c478bd9Sstevel@tonic-gate 			 * whatever we can achieve.
9577c478bd9Sstevel@tonic-gate 			 */
9587c478bd9Sstevel@tonic-gate 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
9597c478bd9Sstevel@tonic-gate 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
9607c478bd9Sstevel@tonic-gate 			last_fdt_bumpup_time = gethrtime();
9617c478bd9Sstevel@tonic-gate 			if (pg != phyint_anongroup) {
9627c478bd9Sstevel@tonic-gate 				logerr("Cannot meet requested failure detection"
9637c478bd9Sstevel@tonic-gate 				    " time of %d ms on (%s %s) new failure"
9647c478bd9Sstevel@tonic-gate 				    " detection time for group \"%s\" is %d"
9657c478bd9Sstevel@tonic-gate 				    " ms\n", user_failure_detection_time,
9667c478bd9Sstevel@tonic-gate 				    AF_STR(pii->pii_af), pii->pii_name,
9677c478bd9Sstevel@tonic-gate 				    pg->pg_name, pg->pg_fdt);
9687c478bd9Sstevel@tonic-gate 			}
9697c478bd9Sstevel@tonic-gate 		}
9707c478bd9Sstevel@tonic-gate 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
9717c478bd9Sstevel@tonic-gate 	    (user_failure_detection_time < pg->pg_fdt) &&
9727c478bd9Sstevel@tonic-gate 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
9737c478bd9Sstevel@tonic-gate 		/*
9747c478bd9Sstevel@tonic-gate 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
9757c478bd9Sstevel@tonic-gate 		 * investigate if we can improve the failure detection time to
9767c478bd9Sstevel@tonic-gate 		 * meet whatever the user specified.
9777c478bd9Sstevel@tonic-gate 		 */
9787c478bd9Sstevel@tonic-gate 		if (check_pg_crtt_improved(pg)) {
9797c478bd9Sstevel@tonic-gate 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
9807c478bd9Sstevel@tonic-gate 			    user_failure_detection_time);
9817c478bd9Sstevel@tonic-gate 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
9827c478bd9Sstevel@tonic-gate 			if (pg != phyint_anongroup) {
9837c478bd9Sstevel@tonic-gate 				logerr("Improved failure detection time %d ms "
9847c478bd9Sstevel@tonic-gate 				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
9857c478bd9Sstevel@tonic-gate 				    AF_STR(pii->pii_af), pii->pii_name,
9867c478bd9Sstevel@tonic-gate 				    pg->pg_name);
9877c478bd9Sstevel@tonic-gate 			}
9887c478bd9Sstevel@tonic-gate 			if (user_failure_detection_time == pg->pg_fdt) {
9897c478bd9Sstevel@tonic-gate 				/* Avoid any truncation or rounding errors */
9907c478bd9Sstevel@tonic-gate 				pg->pg_probeint = user_probe_interval;
9917c478bd9Sstevel@tonic-gate 				/*
9927c478bd9Sstevel@tonic-gate 				 * No more rtt probes will be sent. The actual
9937c478bd9Sstevel@tonic-gate 				 * fdt has dropped to the user specified value.
9947c478bd9Sstevel@tonic-gate 				 * pii_fd_snxt_basetime and pii_snxt_basetime
9957c478bd9Sstevel@tonic-gate 				 * will be in sync henceforth.
9967c478bd9Sstevel@tonic-gate 				 */
9977c478bd9Sstevel@tonic-gate 				reset_snxt_basetimes();
9987c478bd9Sstevel@tonic-gate 			}
9997c478bd9Sstevel@tonic-gate 		}
10007c478bd9Sstevel@tonic-gate 	}
10017c478bd9Sstevel@tonic-gate out:
10027c478bd9Sstevel@tonic-gate 	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
10037c478bd9Sstevel@tonic-gate 	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;
10047c478bd9Sstevel@tonic-gate 
10057c478bd9Sstevel@tonic-gate 	/*
10067c478bd9Sstevel@tonic-gate 	 * Update pii->pii_rack, i.e. the sequence number of the last received
10077c478bd9Sstevel@tonic-gate 	 * probe response, based on the echo reply we have received now, if
10087c478bd9Sstevel@tonic-gate 	 * either of the following conditions are satisfied.
10097c478bd9Sstevel@tonic-gate 	 * a. pii_rack is outside the current receive window of
10107c478bd9Sstevel@tonic-gate 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
10117c478bd9Sstevel@tonic-gate 	 *    This means we have not received probe responses for a
10127c478bd9Sstevel@tonic-gate 	 *    long time, and the sequence number has wrapped around.
10137c478bd9Sstevel@tonic-gate 	 * b. pii_rack is within the current receive window and this echo
10147c478bd9Sstevel@tonic-gate 	 *    reply corresponds to the highest sequence number we have seen
10157c478bd9Sstevel@tonic-gate 	 *    so far.
10167c478bd9Sstevel@tonic-gate 	 */
10177c478bd9Sstevel@tonic-gate 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
10187c478bd9Sstevel@tonic-gate 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
10197c478bd9Sstevel@tonic-gate 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
10207c478bd9Sstevel@tonic-gate 		pii->pii_rack = pr_icmp_seq;
10217c478bd9Sstevel@tonic-gate 	}
10227c478bd9Sstevel@tonic-gate }
10237c478bd9Sstevel@tonic-gate 
10247c478bd9Sstevel@tonic-gate /*
10257c478bd9Sstevel@tonic-gate  * Returns true if seq is the highest unacknowledged seq for target tg
10267c478bd9Sstevel@tonic-gate  * else returns false
10277c478bd9Sstevel@tonic-gate  */
10287c478bd9Sstevel@tonic-gate static boolean_t
10297c478bd9Sstevel@tonic-gate highest_ack_tg(uint16_t seq, struct target *tg)
10307c478bd9Sstevel@tonic-gate {
10317c478bd9Sstevel@tonic-gate 	struct phyint_instance *pii;
10327c478bd9Sstevel@tonic-gate 	int	 pr_ndx;
10337c478bd9Sstevel@tonic-gate 	uint16_t pr_seq;
10347c478bd9Sstevel@tonic-gate 
10357c478bd9Sstevel@tonic-gate 	pii = tg->tg_phyint_inst;
10367c478bd9Sstevel@tonic-gate 
10377c478bd9Sstevel@tonic-gate 	/*
10387c478bd9Sstevel@tonic-gate 	 * Get the seq number of the most recent probe sent so far,
10397c478bd9Sstevel@tonic-gate 	 * and also get the corresponding probe index in the probe stats
10407c478bd9Sstevel@tonic-gate 	 * array.
10417c478bd9Sstevel@tonic-gate 	 */
10427c478bd9Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
10437c478bd9Sstevel@tonic-gate 	pr_seq = pii->pii_snxt;
10447c478bd9Sstevel@tonic-gate 	pr_seq--;
10457c478bd9Sstevel@tonic-gate 
10467c478bd9Sstevel@tonic-gate 	/*
10477c478bd9Sstevel@tonic-gate 	 * Start from the most recent probe and walk back, trying to find
10487c478bd9Sstevel@tonic-gate 	 * an acked probe corresponding to target tg.
10497c478bd9Sstevel@tonic-gate 	 */
10507c478bd9Sstevel@tonic-gate 	for (; pr_ndx != pii->pii_probe_next;
10517c478bd9Sstevel@tonic-gate 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
10527c478bd9Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
10537c478bd9Sstevel@tonic-gate 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
10547c478bd9Sstevel@tonic-gate 			if (SEQ_GT(pr_seq, seq))
10557c478bd9Sstevel@tonic-gate 				return (_B_FALSE);
10567c478bd9Sstevel@tonic-gate 		}
10577c478bd9Sstevel@tonic-gate 	}
10587c478bd9Sstevel@tonic-gate 	return (_B_TRUE);
10597c478bd9Sstevel@tonic-gate }
10607c478bd9Sstevel@tonic-gate 
10617c478bd9Sstevel@tonic-gate /*
10627c478bd9Sstevel@tonic-gate  * Check whether the crtt for the group has improved by a factor of
10637c478bd9Sstevel@tonic-gate  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
10647c478bd9Sstevel@tonic-gate  * detection time flapping in the face of small crtt changes.
10657c478bd9Sstevel@tonic-gate  */
10667c478bd9Sstevel@tonic-gate static boolean_t
10677c478bd9Sstevel@tonic-gate check_pg_crtt_improved(struct phyint_group *pg)
10687c478bd9Sstevel@tonic-gate {
10697c478bd9Sstevel@tonic-gate 	struct	phyint *pi;
10707c478bd9Sstevel@tonic-gate 
10717c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE)
10727c478bd9Sstevel@tonic-gate 		logdebug("check_pg_crtt_improved()\n");
10737c478bd9Sstevel@tonic-gate 
10747c478bd9Sstevel@tonic-gate 	/*
10757c478bd9Sstevel@tonic-gate 	 * The crtt for the group is only improved if each phyint_instance
10767c478bd9Sstevel@tonic-gate 	 * for both ipv4 and ipv6 is improved.
10777c478bd9Sstevel@tonic-gate 	 */
10787c478bd9Sstevel@tonic-gate 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
10797c478bd9Sstevel@tonic-gate 		if (!check_pii_crtt_improved(pi->pi_v4) ||
10807c478bd9Sstevel@tonic-gate 		    !check_pii_crtt_improved(pi->pi_v6))
10817c478bd9Sstevel@tonic-gate 			return (_B_FALSE);
10827c478bd9Sstevel@tonic-gate 	}
10837c478bd9Sstevel@tonic-gate 
10847c478bd9Sstevel@tonic-gate 	return (_B_TRUE);
10857c478bd9Sstevel@tonic-gate }
10867c478bd9Sstevel@tonic-gate 
10877c478bd9Sstevel@tonic-gate /*
10887c478bd9Sstevel@tonic-gate  * Check whether the crtt has improved substantially on this phyint_instance.
10897c478bd9Sstevel@tonic-gate  * Returns _B_TRUE if there's no crtt information available, because pii
10907c478bd9Sstevel@tonic-gate  * is NULL or the phyint_instance is not capable of probing.
10917c478bd9Sstevel@tonic-gate  */
10927c478bd9Sstevel@tonic-gate boolean_t
10937c478bd9Sstevel@tonic-gate check_pii_crtt_improved(struct phyint_instance *pii) {
10947c478bd9Sstevel@tonic-gate 	struct 	target *tg;
10957c478bd9Sstevel@tonic-gate 
10967c478bd9Sstevel@tonic-gate 	if (pii == NULL)
10977c478bd9Sstevel@tonic-gate 		return (_B_TRUE);
10987c478bd9Sstevel@tonic-gate 
10997c478bd9Sstevel@tonic-gate 	if (!PROBE_CAPABLE(pii) ||
11007c478bd9Sstevel@tonic-gate 	    pii->pii_phyint->pi_state == PI_FAILED)
11017c478bd9Sstevel@tonic-gate 		return (_B_TRUE);
11027c478bd9Sstevel@tonic-gate 
11037c478bd9Sstevel@tonic-gate 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
11047c478bd9Sstevel@tonic-gate 		if (tg->tg_status != TG_ACTIVE)
11057c478bd9Sstevel@tonic-gate 			continue;
11067c478bd9Sstevel@tonic-gate 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
11077c478bd9Sstevel@tonic-gate 		    LOWER_FDT_TRIGGER)) {
11087c478bd9Sstevel@tonic-gate 			return (_B_FALSE);
11097c478bd9Sstevel@tonic-gate 		}
11107c478bd9Sstevel@tonic-gate 	}
11117c478bd9Sstevel@tonic-gate 
11127c478bd9Sstevel@tonic-gate 	return (_B_TRUE);
11137c478bd9Sstevel@tonic-gate }
11147c478bd9Sstevel@tonic-gate 
11157c478bd9Sstevel@tonic-gate /*
11167c478bd9Sstevel@tonic-gate  * This target responds very slowly to probes. The target's crtt exceeds
11177c478bd9Sstevel@tonic-gate  * the probe interval of its group. Compare against other targets
11187c478bd9Sstevel@tonic-gate  * and determine if this target is an exception, if so return true, else false
11197c478bd9Sstevel@tonic-gate  */
11207c478bd9Sstevel@tonic-gate static boolean_t
11217c478bd9Sstevel@tonic-gate check_exception_target(struct phyint_instance *pii, struct target *target)
11227c478bd9Sstevel@tonic-gate {
11237c478bd9Sstevel@tonic-gate 	struct	target *tg;
11247c478bd9Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
11257c478bd9Sstevel@tonic-gate 
11267c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE) {
11277c478bd9Sstevel@tonic-gate 		logdebug("check_exception_target(%s %s target %s)\n",
11287c478bd9Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name,
11297c478bd9Sstevel@tonic-gate 		    pr_addr(pii->pii_af, target->tg_address,
11307c478bd9Sstevel@tonic-gate 			abuf, sizeof (abuf)));
11317c478bd9Sstevel@tonic-gate 	}
11327c478bd9Sstevel@tonic-gate 
11337c478bd9Sstevel@tonic-gate 	/*
11347c478bd9Sstevel@tonic-gate 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
11357c478bd9Sstevel@tonic-gate 	 * to make a good judgement. Otherwise don't drop this target.
11367c478bd9Sstevel@tonic-gate 	 */
11377c478bd9Sstevel@tonic-gate 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
11387c478bd9Sstevel@tonic-gate 		return (_B_FALSE);
11397c478bd9Sstevel@tonic-gate 
11407c478bd9Sstevel@tonic-gate 	/*
11417c478bd9Sstevel@tonic-gate 	 * Determine whether only this particular target is slow.
11427c478bd9Sstevel@tonic-gate 	 * We know that this target's crtt exceeds the group's probe interval.
11437c478bd9Sstevel@tonic-gate 	 * If all other active targets have a
11447c478bd9Sstevel@tonic-gate 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
11457c478bd9Sstevel@tonic-gate 	 * then this target is considered slow.
11467c478bd9Sstevel@tonic-gate 	 */
11477c478bd9Sstevel@tonic-gate 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
11487c478bd9Sstevel@tonic-gate 		if (tg != target && tg->tg_status == TG_ACTIVE) {
11497c478bd9Sstevel@tonic-gate 			if (tg->tg_crtt >
11507c478bd9Sstevel@tonic-gate 			    pii->pii_phyint->pi_group->pg_probeint /
11517c478bd9Sstevel@tonic-gate 			    EXCEPTION_FACTOR) {
11527c478bd9Sstevel@tonic-gate 				return (_B_FALSE);
11537c478bd9Sstevel@tonic-gate 			}
11547c478bd9Sstevel@tonic-gate 		}
11557c478bd9Sstevel@tonic-gate 	}
11567c478bd9Sstevel@tonic-gate 
11577c478bd9Sstevel@tonic-gate 	return (_B_TRUE);
11587c478bd9Sstevel@tonic-gate }
11597c478bd9Sstevel@tonic-gate 
11607c478bd9Sstevel@tonic-gate /*
11617c478bd9Sstevel@tonic-gate  * Update the target list. The icmp all hosts multicast has given us
11627c478bd9Sstevel@tonic-gate  * some host to which we can send probes. If we already have sufficient
11637c478bd9Sstevel@tonic-gate  * targets, discard it.
11647c478bd9Sstevel@tonic-gate  */
11657c478bd9Sstevel@tonic-gate static void
11667c478bd9Sstevel@tonic-gate incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
11677c478bd9Sstevel@tonic-gate     struct in6_addr fromaddr)
11687c478bd9Sstevel@tonic-gate /* ARGSUSED */
11697c478bd9Sstevel@tonic-gate {
11707c478bd9Sstevel@tonic-gate 	int af;
11717c478bd9Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
11727c478bd9Sstevel@tonic-gate 	struct phyint *pi;
11737c478bd9Sstevel@tonic-gate 
11747c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE) {
11757c478bd9Sstevel@tonic-gate 		logdebug("incoming_mcast_reply(%s %s %s)\n",
11767c478bd9Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name,
11777c478bd9Sstevel@tonic-gate 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
11787c478bd9Sstevel@tonic-gate 	}
11797c478bd9Sstevel@tonic-gate 
11807c478bd9Sstevel@tonic-gate 	/*
11817c478bd9Sstevel@tonic-gate 	 * Using host targets is a fallback mechanism. If we have
11827c478bd9Sstevel@tonic-gate 	 * found a router, don't add this host target. If we already
11837c478bd9Sstevel@tonic-gate 	 * know MAX_PROBE_TARGETS, don't add another target.
11847c478bd9Sstevel@tonic-gate 	 */
11857c478bd9Sstevel@tonic-gate 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
11867c478bd9Sstevel@tonic-gate 	if (pii->pii_targets != NULL) {
11877c478bd9Sstevel@tonic-gate 		if (pii->pii_targets_are_routers ||
11887c478bd9Sstevel@tonic-gate 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
11897c478bd9Sstevel@tonic-gate 			return;
11907c478bd9Sstevel@tonic-gate 		}
11917c478bd9Sstevel@tonic-gate 	}
11927c478bd9Sstevel@tonic-gate 
11937c478bd9Sstevel@tonic-gate 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
11947c478bd9Sstevel@tonic-gate 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
11957c478bd9Sstevel@tonic-gate 		/*
11967c478bd9Sstevel@tonic-gate 		 * Guard against response from 0.0.0.0
11977c478bd9Sstevel@tonic-gate 		 * and ::. Log a trace message
11987c478bd9Sstevel@tonic-gate 		 */
11997c478bd9Sstevel@tonic-gate 		logtrace("probe response from %s on %s\n",
12007c478bd9Sstevel@tonic-gate 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
12017c478bd9Sstevel@tonic-gate 		    pii->pii_name);
12027c478bd9Sstevel@tonic-gate 		return;
12037c478bd9Sstevel@tonic-gate 	}
12047c478bd9Sstevel@tonic-gate 
12057c478bd9Sstevel@tonic-gate 	/*
12067c478bd9Sstevel@tonic-gate 	 * This address is one of our own, so reject this address as a
12077c478bd9Sstevel@tonic-gate 	 * valid probe target.
12087c478bd9Sstevel@tonic-gate 	 */
12097c478bd9Sstevel@tonic-gate 	af = pii->pii_af;
12107c478bd9Sstevel@tonic-gate 	if (own_address(af, fromaddr))
12117c478bd9Sstevel@tonic-gate 		return;
12127c478bd9Sstevel@tonic-gate 
12137c478bd9Sstevel@tonic-gate 	/*
12147c478bd9Sstevel@tonic-gate 	 * If the phyint is part a named group, then add the address to all
12157c478bd9Sstevel@tonic-gate 	 * members of the group.  Otherwise, add the address only to the
12167c478bd9Sstevel@tonic-gate 	 * phyint itself, since other phyints in the anongroup may not be on
12177c478bd9Sstevel@tonic-gate 	 * the same subnet.
12187c478bd9Sstevel@tonic-gate 	 */
12197c478bd9Sstevel@tonic-gate 	pi = pii->pii_phyint;
12207c478bd9Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup) {
12217c478bd9Sstevel@tonic-gate 		target_add(pii, fromaddr, _B_FALSE);
12227c478bd9Sstevel@tonic-gate 	} else {
12237c478bd9Sstevel@tonic-gate 		pi = pi->pi_group->pg_phyint;
12247c478bd9Sstevel@tonic-gate 		for (; pi != NULL; pi = pi->pi_pgnext)
12257c478bd9Sstevel@tonic-gate 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
12267c478bd9Sstevel@tonic-gate 	}
12277c478bd9Sstevel@tonic-gate }
12287c478bd9Sstevel@tonic-gate 
12297c478bd9Sstevel@tonic-gate /*
12307c478bd9Sstevel@tonic-gate  * Compute CRTT given an existing scaled average, scaled deviation estimate
12317c478bd9Sstevel@tonic-gate  * and a new rtt time.  The formula is from Jacobson and Karels'
12327c478bd9Sstevel@tonic-gate  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
12337c478bd9Sstevel@tonic-gate  * are the same as those in Appendix A.2 of that paper.
12347c478bd9Sstevel@tonic-gate  *
12357c478bd9Sstevel@tonic-gate  * m = new measurement
12367c478bd9Sstevel@tonic-gate  * sa = scaled RTT average (8 * average estimates)
12377c478bd9Sstevel@tonic-gate  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
12387c478bd9Sstevel@tonic-gate  * crtt = Conservative round trip time. Used to determine whether probe
12397c478bd9Sstevel@tonic-gate  * has timed out.
12407c478bd9Sstevel@tonic-gate  *
12417c478bd9Sstevel@tonic-gate  * New scaled average and deviation are passed back via sap and svp
12427c478bd9Sstevel@tonic-gate  */
12437c478bd9Sstevel@tonic-gate static int
12447c478bd9Sstevel@tonic-gate compute_crtt(int *sap, int *svp, int m)
12457c478bd9Sstevel@tonic-gate {
12467c478bd9Sstevel@tonic-gate 	int sa = *sap;
12477c478bd9Sstevel@tonic-gate 	int sv = *svp;
12487c478bd9Sstevel@tonic-gate 	int crtt;
12497c478bd9Sstevel@tonic-gate 	int saved_m = m;
12507c478bd9Sstevel@tonic-gate 
12517c478bd9Sstevel@tonic-gate 	assert(*sap >= -1);
12527c478bd9Sstevel@tonic-gate 	assert(*svp >= 0);
12537c478bd9Sstevel@tonic-gate 
12547c478bd9Sstevel@tonic-gate 	if (sa != -1) {
12557c478bd9Sstevel@tonic-gate 		/*
12567c478bd9Sstevel@tonic-gate 		 * Update average estimator:
12577c478bd9Sstevel@tonic-gate 		 *	new rtt = old rtt + 1/8 Error
12587c478bd9Sstevel@tonic-gate 		 *	    where Error = m - old rtt
12597c478bd9Sstevel@tonic-gate 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
12607c478bd9Sstevel@tonic-gate 		 *	i.e. new sa =  old sa + Error
12617c478bd9Sstevel@tonic-gate 		 */
12627c478bd9Sstevel@tonic-gate 		m -= sa >> 3;		/* m is now Error in estimate. */
12637c478bd9Sstevel@tonic-gate 		if ((sa += m) < 0) {
12647c478bd9Sstevel@tonic-gate 			/* Don't allow the smoothed average to be negative. */
12657c478bd9Sstevel@tonic-gate 			sa = 0;
12667c478bd9Sstevel@tonic-gate 		}
12677c478bd9Sstevel@tonic-gate 
12687c478bd9Sstevel@tonic-gate 		/*
12697c478bd9Sstevel@tonic-gate 		 * Update deviation estimator:
12707c478bd9Sstevel@tonic-gate 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
12717c478bd9Sstevel@tonic-gate 		 *	i.e. 4 * new mdev = 4 * old mdev +
12727c478bd9Sstevel@tonic-gate 		 *		(abs(Error) - old mdev)
12737c478bd9Sstevel@tonic-gate 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
12747c478bd9Sstevel@tonic-gate 		 */
12757c478bd9Sstevel@tonic-gate 		if (m < 0)
12767c478bd9Sstevel@tonic-gate 			m = -m;
12777c478bd9Sstevel@tonic-gate 		m -= sv >> 2;
12787c478bd9Sstevel@tonic-gate 		sv += m;
12797c478bd9Sstevel@tonic-gate 	} else {
12807c478bd9Sstevel@tonic-gate 		/* Initialization. This is the first response received. */
12817c478bd9Sstevel@tonic-gate 		sa = (m << 3);
12827c478bd9Sstevel@tonic-gate 		sv = (m << 1);
12837c478bd9Sstevel@tonic-gate 	}
12847c478bd9Sstevel@tonic-gate 
12857c478bd9Sstevel@tonic-gate 	crtt = (sa >> 3) + sv;
12867c478bd9Sstevel@tonic-gate 
12877c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE) {
12887c478bd9Sstevel@tonic-gate 		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
12897c478bd9Sstevel@tonic-gate 		    "%d\n", saved_m, sa, sv, crtt);
12907c478bd9Sstevel@tonic-gate 	}
12917c478bd9Sstevel@tonic-gate 
12927c478bd9Sstevel@tonic-gate 	*sap = sa;
12937c478bd9Sstevel@tonic-gate 	*svp = sv;
12947c478bd9Sstevel@tonic-gate 
12957c478bd9Sstevel@tonic-gate 	/*
12967c478bd9Sstevel@tonic-gate 	 * CRTT = average estimates  + 4 * deviation estimates
12977c478bd9Sstevel@tonic-gate 	 *	= sa / 8 + sv
12987c478bd9Sstevel@tonic-gate 	 */
12997c478bd9Sstevel@tonic-gate 	return (crtt);
13007c478bd9Sstevel@tonic-gate }
13017c478bd9Sstevel@tonic-gate 
13027c478bd9Sstevel@tonic-gate static void
13037c478bd9Sstevel@tonic-gate pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
13047c478bd9Sstevel@tonic-gate {
13057c478bd9Sstevel@tonic-gate 	struct phyint_instance *pii = tg->tg_phyint_inst;
13067c478bd9Sstevel@tonic-gate 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
13077c478bd9Sstevel@tonic-gate 	int sa = tg->tg_rtt_sa;
13087c478bd9Sstevel@tonic-gate 	int sv = tg->tg_rtt_sd;
13097c478bd9Sstevel@tonic-gate 	int new_crtt;
13107c478bd9Sstevel@tonic-gate 	int i;
13117c478bd9Sstevel@tonic-gate 
13127c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE)
13137c478bd9Sstevel@tonic-gate 		logdebug("pi_set_crtt: target -  m %d\n", m);
13147c478bd9Sstevel@tonic-gate 
13157c478bd9Sstevel@tonic-gate 	/* store the round trip time, in case we need to defer computation */
13167c478bd9Sstevel@tonic-gate 	tg->tg_deferred[tg->tg_num_deferred] = m;
13177c478bd9Sstevel@tonic-gate 
13187c478bd9Sstevel@tonic-gate 	new_crtt = compute_crtt(&sa, &sv, m);
13197c478bd9Sstevel@tonic-gate 
13207c478bd9Sstevel@tonic-gate 	/*
13217c478bd9Sstevel@tonic-gate 	 * If this probe's round trip time would singlehandedly cause an
13227c478bd9Sstevel@tonic-gate 	 * increase in the group's probe interval consider it suspect.
13237c478bd9Sstevel@tonic-gate 	 */
13247c478bd9Sstevel@tonic-gate 	if ((new_crtt > probe_interval) && is_probe_uni) {
13257c478bd9Sstevel@tonic-gate 		if (debug & D_PROBE) {
13267c478bd9Sstevel@tonic-gate 			logdebug("Received a suspect probe on %s, new_crtt ="
13277c478bd9Sstevel@tonic-gate 			    " %d, probe_interval = %d, num_deferred = %d\n",
13287c478bd9Sstevel@tonic-gate 			    pii->pii_probe_logint->li_name, new_crtt,
13297c478bd9Sstevel@tonic-gate 			    probe_interval, tg->tg_num_deferred);
13307c478bd9Sstevel@tonic-gate 		}
13317c478bd9Sstevel@tonic-gate 
13327c478bd9Sstevel@tonic-gate 		/*
13337c478bd9Sstevel@tonic-gate 		 * If we've deferred as many rtts as we plan on deferring, then
13347c478bd9Sstevel@tonic-gate 		 * assume the link really did slow down and process all queued
13357c478bd9Sstevel@tonic-gate 		 * rtts
13367c478bd9Sstevel@tonic-gate 		 */
13377c478bd9Sstevel@tonic-gate 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
13387c478bd9Sstevel@tonic-gate 			if (debug & D_PROBE) {
13397c478bd9Sstevel@tonic-gate 				logdebug("Received MAXDEFERREDRTT probes which "
13407c478bd9Sstevel@tonic-gate 				    "would cause an increased probe_interval.  "
13417c478bd9Sstevel@tonic-gate 				    "Integrating queued rtt data points.\n");
13427c478bd9Sstevel@tonic-gate 			}
13437c478bd9Sstevel@tonic-gate 
13447c478bd9Sstevel@tonic-gate 			for (i = 0; i <= tg->tg_num_deferred; i++) {
13457c478bd9Sstevel@tonic-gate 				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
13467c478bd9Sstevel@tonic-gate 				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
13477c478bd9Sstevel@tonic-gate 			}
13487c478bd9Sstevel@tonic-gate 
13497c478bd9Sstevel@tonic-gate 			tg->tg_num_deferred = 0;
13507c478bd9Sstevel@tonic-gate 		} else {
13517c478bd9Sstevel@tonic-gate 			tg->tg_num_deferred++;
13527c478bd9Sstevel@tonic-gate 		}
13537c478bd9Sstevel@tonic-gate 		return;
13547c478bd9Sstevel@tonic-gate 	}
13557c478bd9Sstevel@tonic-gate 
13567c478bd9Sstevel@tonic-gate 	/*
13577c478bd9Sstevel@tonic-gate 	 * If this is a normal probe, or an RTT probe that would lead to a
13587c478bd9Sstevel@tonic-gate 	 * reduced CRTT, then update our CRTT data.  Further, if this was
13597c478bd9Sstevel@tonic-gate 	 * a normal probe, pitch any deferred probes since our probes are
13607c478bd9Sstevel@tonic-gate 	 * again being answered within our CRTT estimates.
13617c478bd9Sstevel@tonic-gate 	 */
13627c478bd9Sstevel@tonic-gate 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
13637c478bd9Sstevel@tonic-gate 		tg->tg_rtt_sa = sa;
13647c478bd9Sstevel@tonic-gate 		tg->tg_rtt_sd = sv;
13657c478bd9Sstevel@tonic-gate 		tg->tg_crtt = new_crtt;
13667c478bd9Sstevel@tonic-gate 		if (is_probe_uni)
13677c478bd9Sstevel@tonic-gate 			tg->tg_num_deferred = 0;
13687c478bd9Sstevel@tonic-gate 	}
13697c478bd9Sstevel@tonic-gate }
13707c478bd9Sstevel@tonic-gate 
/*
 * Search the ancillary data of msg for an IPPROTO_IPV6 level control
 * message of type cmsg_type.  Returns a pointer to that message's data
 * buffer, or NULL if there is no such control message.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == IPPROTO_IPV6 &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));

		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
13897c478bd9Sstevel@tonic-gate 
13907c478bd9Sstevel@tonic-gate /*
13917c478bd9Sstevel@tonic-gate  * See if a previously failed interface has started working again.
13927c478bd9Sstevel@tonic-gate  */
13937c478bd9Sstevel@tonic-gate void
13947c478bd9Sstevel@tonic-gate phyint_check_for_repair(struct phyint *pi)
13957c478bd9Sstevel@tonic-gate {
13967c478bd9Sstevel@tonic-gate 	if (phyint_repaired(pi)) {
13977c478bd9Sstevel@tonic-gate 		if (pi->pi_group == phyint_anongroup) {
13987c478bd9Sstevel@tonic-gate 			logerr("NIC repair detected on %s\n", pi->pi_name);
13997c478bd9Sstevel@tonic-gate 		} else {
14007c478bd9Sstevel@tonic-gate 			logerr("NIC repair detected on %s of group %s\n",
14017c478bd9Sstevel@tonic-gate 			    pi->pi_name, pi->pi_group->pg_name);
14027c478bd9Sstevel@tonic-gate 		}
14037c478bd9Sstevel@tonic-gate 
14047c478bd9Sstevel@tonic-gate 		/*
14057c478bd9Sstevel@tonic-gate 		 * If the interface is offline, just clear the FAILED flag,
14067c478bd9Sstevel@tonic-gate 		 * delaying the state change and failback operation until it
14077c478bd9Sstevel@tonic-gate 		 * is brought back online.
14087c478bd9Sstevel@tonic-gate 		 */
14097c478bd9Sstevel@tonic-gate 		if (pi->pi_state == PI_OFFLINE) {
14107c478bd9Sstevel@tonic-gate 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
14117c478bd9Sstevel@tonic-gate 			return;
14127c478bd9Sstevel@tonic-gate 		}
14137c478bd9Sstevel@tonic-gate 
1414*49df4566Sethindra 		if (pi->pi_flags & IFF_STANDBY) {
14157c478bd9Sstevel@tonic-gate 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
14167c478bd9Sstevel@tonic-gate 		} else {
14177c478bd9Sstevel@tonic-gate 			if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) {
14187c478bd9Sstevel@tonic-gate 				(void) change_lif_flags(pi,
14197c478bd9Sstevel@tonic-gate 				    IFF_FAILED, _B_FALSE);
14207c478bd9Sstevel@tonic-gate 				/* Per state diagram */
14217c478bd9Sstevel@tonic-gate 				pi->pi_empty = 0;
14227c478bd9Sstevel@tonic-gate 			}
14237c478bd9Sstevel@tonic-gate 		}
14247c478bd9Sstevel@tonic-gate 
14257c478bd9Sstevel@tonic-gate 		phyint_chstate(pi, PI_RUNNING);
14267c478bd9Sstevel@tonic-gate 
14277c478bd9Sstevel@tonic-gate 		if (GROUP_FAILED(pi->pi_group)) {
14287c478bd9Sstevel@tonic-gate 			/*
14297c478bd9Sstevel@tonic-gate 			 * This is the 1st phyint to receive a response
14307c478bd9Sstevel@tonic-gate 			 * after group failure.
14317c478bd9Sstevel@tonic-gate 			 */
14327c478bd9Sstevel@tonic-gate 			logerr("At least 1 interface (%s) of group %s has "
14337c478bd9Sstevel@tonic-gate 			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
14347c478bd9Sstevel@tonic-gate 			phyint_group_chstate(pi->pi_group, PG_RUNNING);
14357c478bd9Sstevel@tonic-gate 		}
14367c478bd9Sstevel@tonic-gate 	}
14377c478bd9Sstevel@tonic-gate }
14387c478bd9Sstevel@tonic-gate 
14397c478bd9Sstevel@tonic-gate /*
14407c478bd9Sstevel@tonic-gate  * See if a previously functioning interface has failed, or if the
14417c478bd9Sstevel@tonic-gate  * whole group of interfaces has failed.
14427c478bd9Sstevel@tonic-gate  */
14437c478bd9Sstevel@tonic-gate static void
14447c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(struct phyint_instance *pii)
14457c478bd9Sstevel@tonic-gate {
14467c478bd9Sstevel@tonic-gate 	struct	phyint	*pi;
14477c478bd9Sstevel@tonic-gate 	struct	phyint	*pi2;
14487c478bd9Sstevel@tonic-gate 
14497c478bd9Sstevel@tonic-gate 	pi = pii->pii_phyint;
14507c478bd9Sstevel@tonic-gate 
14517c478bd9Sstevel@tonic-gate 	switch (failure_state(pii)) {
14527c478bd9Sstevel@tonic-gate 	case PHYINT_FAILURE:
14537c478bd9Sstevel@tonic-gate 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
14547c478bd9Sstevel@tonic-gate 		if (pi->pi_group == phyint_anongroup) {
14557c478bd9Sstevel@tonic-gate 			logerr("NIC failure detected on %s\n", pii->pii_name);
14567c478bd9Sstevel@tonic-gate 		} else {
14577c478bd9Sstevel@tonic-gate 			logerr("NIC failure detected on %s of group %s\n",
14587c478bd9Sstevel@tonic-gate 			    pii->pii_name, pi->pi_group->pg_name);
14597c478bd9Sstevel@tonic-gate 		}
14607c478bd9Sstevel@tonic-gate 		/*
14617c478bd9Sstevel@tonic-gate 		 * Do the failover, unless the interface is offline (in
14627c478bd9Sstevel@tonic-gate 		 * which case we've already failed over).
14637c478bd9Sstevel@tonic-gate 		 */
14647c478bd9Sstevel@tonic-gate 		if (pi->pi_state != PI_OFFLINE) {
14657c478bd9Sstevel@tonic-gate 			phyint_chstate(pi, PI_FAILED);
14667c478bd9Sstevel@tonic-gate 			reset_crtt_all(pi);
14677c478bd9Sstevel@tonic-gate 			if (!(pi->pi_flags & IFF_INACTIVE))
14687c478bd9Sstevel@tonic-gate 				(void) try_failover(pi, FAILOVER_NORMAL);
14697c478bd9Sstevel@tonic-gate 		}
14707c478bd9Sstevel@tonic-gate 		break;
14717c478bd9Sstevel@tonic-gate 
14727c478bd9Sstevel@tonic-gate 	case GROUP_FAILURE:
14737c478bd9Sstevel@tonic-gate 		logerr("All Interfaces in group %s have failed\n",
14747c478bd9Sstevel@tonic-gate 		    pi->pi_group->pg_name);
14757c478bd9Sstevel@tonic-gate 		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
14767c478bd9Sstevel@tonic-gate 		    pi2 = pi2->pi_pgnext) {
14777c478bd9Sstevel@tonic-gate 			if (pi2->pi_flags & IFF_OFFLINE)
14787c478bd9Sstevel@tonic-gate 				continue;
14797c478bd9Sstevel@tonic-gate 			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
14807c478bd9Sstevel@tonic-gate 			reset_crtt_all(pi2);
14817c478bd9Sstevel@tonic-gate 
14827c478bd9Sstevel@tonic-gate 			/*
14837c478bd9Sstevel@tonic-gate 			 * In the case of host targets, we
14847c478bd9Sstevel@tonic-gate 			 * would have flushed the targets,
14857c478bd9Sstevel@tonic-gate 			 * and gone to PI_NOTARGETS state.
14867c478bd9Sstevel@tonic-gate 			 */
14877c478bd9Sstevel@tonic-gate 			if (pi2->pi_state == PI_RUNNING)
1488*49df4566Sethindra 				phyint_chstate(pi2, PI_FAILED);
14897c478bd9Sstevel@tonic-gate 
14907c478bd9Sstevel@tonic-gate 			pi2->pi_empty = 0;
14917c478bd9Sstevel@tonic-gate 			pi2->pi_full = 0;
14927c478bd9Sstevel@tonic-gate 		}
14937c478bd9Sstevel@tonic-gate 		break;
14947c478bd9Sstevel@tonic-gate 
14957c478bd9Sstevel@tonic-gate 	default:
14967c478bd9Sstevel@tonic-gate 		break;
14977c478bd9Sstevel@tonic-gate 	}
14987c478bd9Sstevel@tonic-gate }
14997c478bd9Sstevel@tonic-gate 
15007c478bd9Sstevel@tonic-gate /*
15017c478bd9Sstevel@tonic-gate  * Determines if any timeout event has occurred and returns the number of
15027c478bd9Sstevel@tonic-gate  * milliseconds until the next timeout event for the phyint. Returns
15037c478bd9Sstevel@tonic-gate  * TIMER_INFINITY for "never".
15047c478bd9Sstevel@tonic-gate  */
15057c478bd9Sstevel@tonic-gate uint_t
15067c478bd9Sstevel@tonic-gate phyint_inst_timer(struct phyint_instance *pii)
15077c478bd9Sstevel@tonic-gate {
15087c478bd9Sstevel@tonic-gate 	int 	pr_ndx;
15097c478bd9Sstevel@tonic-gate 	uint_t	timeout;
15107c478bd9Sstevel@tonic-gate 	struct	target	*cur_tg;
15117c478bd9Sstevel@tonic-gate 	struct	probe_stats *pr_statp;
15127c478bd9Sstevel@tonic-gate 	struct	phyint_instance *pii_other;
15137c478bd9Sstevel@tonic-gate 	struct	phyint *pi;
15147c478bd9Sstevel@tonic-gate 	int	valid_unack_count;
15157c478bd9Sstevel@tonic-gate 	int	i;
15167c478bd9Sstevel@tonic-gate 	int	interval;
15177c478bd9Sstevel@tonic-gate 	uint_t	check_time;
15187c478bd9Sstevel@tonic-gate 	uint_t	cur_time;
15197c478bd9Sstevel@tonic-gate 	hrtime_t cur_hrtime;
15207c478bd9Sstevel@tonic-gate 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
15217c478bd9Sstevel@tonic-gate 
15227c478bd9Sstevel@tonic-gate 	cur_time = getcurrenttime();
15237c478bd9Sstevel@tonic-gate 
15247c478bd9Sstevel@tonic-gate 	if (debug & D_TIMER) {
15257c478bd9Sstevel@tonic-gate 		logdebug("phyint_inst_timer(%s %s)\n",
15267c478bd9Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
15277c478bd9Sstevel@tonic-gate 	}
15287c478bd9Sstevel@tonic-gate 
15297c478bd9Sstevel@tonic-gate 	pii_other = phyint_inst_other(pii);
15307c478bd9Sstevel@tonic-gate 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
15317c478bd9Sstevel@tonic-gate 		/*
15327c478bd9Sstevel@tonic-gate 		 * Check to see if we're here due to link up/down flapping; If
15337c478bd9Sstevel@tonic-gate 		 * enough time has passed, then try to bring the interface
15347c478bd9Sstevel@tonic-gate 		 * back up; otherwise, schedule a timer to bring it back up
15357c478bd9Sstevel@tonic-gate 		 * when enough time *has* elapsed.
15367c478bd9Sstevel@tonic-gate 		 */
15377c478bd9Sstevel@tonic-gate 		pi = pii->pii_phyint;
15387c478bd9Sstevel@tonic-gate 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
15397c478bd9Sstevel@tonic-gate 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
15407c478bd9Sstevel@tonic-gate 			if (check_time > cur_time)
15417c478bd9Sstevel@tonic-gate 				return (check_time - cur_time);
15427c478bd9Sstevel@tonic-gate 
15437c478bd9Sstevel@tonic-gate 			phyint_check_for_repair(pi);
15447c478bd9Sstevel@tonic-gate 		}
15457c478bd9Sstevel@tonic-gate 	}
15467c478bd9Sstevel@tonic-gate 
15477c478bd9Sstevel@tonic-gate 	/*
15487c478bd9Sstevel@tonic-gate 	 * If this phyint is not yet initialized for probes,
15497c478bd9Sstevel@tonic-gate 	 * don't proceed further
15507c478bd9Sstevel@tonic-gate 	 */
15517c478bd9Sstevel@tonic-gate 	if (pii->pii_probe_sock == -1)
15527c478bd9Sstevel@tonic-gate 		return (TIMER_INFINITY);
15537c478bd9Sstevel@tonic-gate 
15547c478bd9Sstevel@tonic-gate 	/*
15557c478bd9Sstevel@tonic-gate 	 * If the timer has fired too soon, probably triggered
15567c478bd9Sstevel@tonic-gate 	 * by some other phyint instance, return the remaining
15577c478bd9Sstevel@tonic-gate 	 * time
15587c478bd9Sstevel@tonic-gate 	 */
15597c478bd9Sstevel@tonic-gate 	if (TIME_LT(cur_time, pii->pii_snxt_time))
15607c478bd9Sstevel@tonic-gate 		return (pii->pii_snxt_time - cur_time);
15617c478bd9Sstevel@tonic-gate 
15627c478bd9Sstevel@tonic-gate 	/*
15637c478bd9Sstevel@tonic-gate 	 * If the link is down, don't send any probes for now.
15647c478bd9Sstevel@tonic-gate 	 */
15657c478bd9Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
15667c478bd9Sstevel@tonic-gate 		return (TIMER_INFINITY);
15677c478bd9Sstevel@tonic-gate 
15687c478bd9Sstevel@tonic-gate 	/*
15697c478bd9Sstevel@tonic-gate 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
15707c478bd9Sstevel@tonic-gate 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
15717c478bd9Sstevel@tonic-gate 	 * Base probe time is strictly periodic.
15727c478bd9Sstevel@tonic-gate 	 */
15737c478bd9Sstevel@tonic-gate 	interval = GET_RANDOM(
15747c478bd9Sstevel@tonic-gate 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
15757c478bd9Sstevel@tonic-gate 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
15767c478bd9Sstevel@tonic-gate 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
15777c478bd9Sstevel@tonic-gate 
15787c478bd9Sstevel@tonic-gate 	/*
15797c478bd9Sstevel@tonic-gate 	 * Check if the current time > next time to probe. If so, we missed
15807c478bd9Sstevel@tonic-gate 	 * sending 1 or more probes, probably due to heavy system load. At least
15817c478bd9Sstevel@tonic-gate 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
15827c478bd9Sstevel@tonic-gate 	 * were scheduled. Make adjustments to the times, in multiples of
15837c478bd9Sstevel@tonic-gate 	 * user_probe_interval.
15847c478bd9Sstevel@tonic-gate 	 */
15857c478bd9Sstevel@tonic-gate 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
15867c478bd9Sstevel@tonic-gate 		int n;
15877c478bd9Sstevel@tonic-gate 
15887c478bd9Sstevel@tonic-gate 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
15897c478bd9Sstevel@tonic-gate 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
15907c478bd9Sstevel@tonic-gate 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
15917c478bd9Sstevel@tonic-gate 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
15927c478bd9Sstevel@tonic-gate 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
15937c478bd9Sstevel@tonic-gate 		    pii->pii_snxt_basetime);
15947c478bd9Sstevel@tonic-gate 
15957c478bd9Sstevel@tonic-gate 		/* Collect statistics about missed probes */
15967c478bd9Sstevel@tonic-gate 		probes_missed.pm_nprobes += n + 1;
15977c478bd9Sstevel@tonic-gate 		probes_missed.pm_ntimes++;
15987c478bd9Sstevel@tonic-gate 	}
15997c478bd9Sstevel@tonic-gate 	pii->pii_snxt_basetime += user_probe_interval;
16007c478bd9Sstevel@tonic-gate 	interval = pii->pii_snxt_time - cur_time;
16017c478bd9Sstevel@tonic-gate 	if (debug & D_TARGET) {
16027c478bd9Sstevel@tonic-gate 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
16037c478bd9Sstevel@tonic-gate 		    " interval %u\n", cur_time, pii->pii_snxt_time,
16047c478bd9Sstevel@tonic-gate 		    pii->pii_snxt_basetime, interval);
16057c478bd9Sstevel@tonic-gate 	}
16067c478bd9Sstevel@tonic-gate 
16077c478bd9Sstevel@tonic-gate 	/*
16087c478bd9Sstevel@tonic-gate 	 * If no targets are known, we need to send an ICMP multicast. The
16097c478bd9Sstevel@tonic-gate 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
16107c478bd9Sstevel@tonic-gate 	 * to see if we found a target.
16117c478bd9Sstevel@tonic-gate 	 */
16127c478bd9Sstevel@tonic-gate 	if (pii->pii_target_next == NULL) {
16137c478bd9Sstevel@tonic-gate 		assert(pii->pii_ntargets == 0);
16147c478bd9Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
16157c478bd9Sstevel@tonic-gate 		probe(pii, PROBE_MULTI, cur_time);
16167c478bd9Sstevel@tonic-gate 		return (interval);
16177c478bd9Sstevel@tonic-gate 	}
16187c478bd9Sstevel@tonic-gate 
16197c478bd9Sstevel@tonic-gate 	if ((user_probe_interval != probe_interval) &&
16207c478bd9Sstevel@tonic-gate 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
16217c478bd9Sstevel@tonic-gate 		/*
16227c478bd9Sstevel@tonic-gate 		 * the failure detection (fd) probe timer has not yet fired.
16237c478bd9Sstevel@tonic-gate 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
16247c478bd9Sstevel@tonic-gate 		 */
16257c478bd9Sstevel@tonic-gate 		probe(pii, PROBE_RTT, cur_time);
16267c478bd9Sstevel@tonic-gate 		return (interval);
16277c478bd9Sstevel@tonic-gate 	}
16287c478bd9Sstevel@tonic-gate 	/*
16297c478bd9Sstevel@tonic-gate 	 * the fd probe timer has fired. Need to do all failure
16307c478bd9Sstevel@tonic-gate 	 * detection / recovery calculations, and then send an fd probe
16317c478bd9Sstevel@tonic-gate 	 * of type PROBE_UNI.
16327c478bd9Sstevel@tonic-gate 	 */
16337c478bd9Sstevel@tonic-gate 	if (user_probe_interval == probe_interval) {
16347c478bd9Sstevel@tonic-gate 		/*
16357c478bd9Sstevel@tonic-gate 		 * We could have missed some probes, and then adjusted
16367c478bd9Sstevel@tonic-gate 		 * pii_snxt_basetime above. Otherwise we could have
16377c478bd9Sstevel@tonic-gate 		 * blindly added probe_interval to pii_fd_snxt_basetime.
16387c478bd9Sstevel@tonic-gate 		 */
16397c478bd9Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
16407c478bd9Sstevel@tonic-gate 	} else {
16417c478bd9Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime += probe_interval;
16427c478bd9Sstevel@tonic-gate 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
16437c478bd9Sstevel@tonic-gate 			int n;
16447c478bd9Sstevel@tonic-gate 
16457c478bd9Sstevel@tonic-gate 			n = (cur_time - pii->pii_fd_snxt_basetime) /
16467c478bd9Sstevel@tonic-gate 			    probe_interval;
16477c478bd9Sstevel@tonic-gate 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
16487c478bd9Sstevel@tonic-gate 		}
16497c478bd9Sstevel@tonic-gate 	}
16507c478bd9Sstevel@tonic-gate 
16517c478bd9Sstevel@tonic-gate 	/*
16527c478bd9Sstevel@tonic-gate 	 * We can have at most, the latest 2 probes that we sent, in
16537c478bd9Sstevel@tonic-gate 	 * the PR_UNACKED state. All previous probes sent, are either
16547c478bd9Sstevel@tonic-gate 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
16557c478bd9Sstevel@tonic-gate 	 * timed out if the probe's time_sent + the CRTT < currenttime.
16567c478bd9Sstevel@tonic-gate 	 * For each of the last 2 probes, examine whether it has timed
16577c478bd9Sstevel@tonic-gate 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
16587c478bd9Sstevel@tonic-gate 	 */
16597c478bd9Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
16607c478bd9Sstevel@tonic-gate 	valid_unack_count = 0;
16617c478bd9Sstevel@tonic-gate 
16627c478bd9Sstevel@tonic-gate 	for (i = 0; i < 2; i++) {
16637c478bd9Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[pr_ndx];
16647c478bd9Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
16657c478bd9Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
16667c478bd9Sstevel@tonic-gate 		case PR_ACKED:
16677c478bd9Sstevel@tonic-gate 			/*
16687c478bd9Sstevel@tonic-gate 			 * We received back an ACK, so the switch clearly
16697c478bd9Sstevel@tonic-gate 			 * is not dropping our traffic, and thus we can
16707c478bd9Sstevel@tonic-gate 			 * enable failure detection immediately.
16717c478bd9Sstevel@tonic-gate 			 */
16727c478bd9Sstevel@tonic-gate 			if (pii->pii_fd_hrtime > gethrtime()) {
16737c478bd9Sstevel@tonic-gate 				if (debug & D_PROBE) {
16747c478bd9Sstevel@tonic-gate 					logdebug("successful probe on %s; "
16757c478bd9Sstevel@tonic-gate 					    "ending quiet period\n",
16767c478bd9Sstevel@tonic-gate 					    pii->pii_phyint->pi_name);
16777c478bd9Sstevel@tonic-gate 				}
16787c478bd9Sstevel@tonic-gate 				pii->pii_fd_hrtime = gethrtime();
16797c478bd9Sstevel@tonic-gate 			}
16807c478bd9Sstevel@tonic-gate 			break;
16817c478bd9Sstevel@tonic-gate 
16827c478bd9Sstevel@tonic-gate 		case PR_UNACKED:
16837c478bd9Sstevel@tonic-gate 			assert(cur_tg != NULL);
16847c478bd9Sstevel@tonic-gate 			/*
16857c478bd9Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
16867c478bd9Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
16877c478bd9Sstevel@tonic-gate 			 * not available use group's probe interval,
16887c478bd9Sstevel@tonic-gate 			 * which is a worst case estimate.
16897c478bd9Sstevel@tonic-gate 			 */
16907c478bd9Sstevel@tonic-gate 			if (cur_tg->tg_crtt != 0) {
16917c478bd9Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
16927c478bd9Sstevel@tonic-gate 				    cur_tg->tg_crtt;
16937c478bd9Sstevel@tonic-gate 			} else {
16947c478bd9Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
16957c478bd9Sstevel@tonic-gate 				    probe_interval;
16967c478bd9Sstevel@tonic-gate 			}
16977c478bd9Sstevel@tonic-gate 			if (TIME_LT(timeout, cur_time)) {
16987c478bd9Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
16997c478bd9Sstevel@tonic-gate 				pr_statp->pr_time_lost = timeout;
17007c478bd9Sstevel@tonic-gate 			} else if (i == 1) {
17017c478bd9Sstevel@tonic-gate 				/*
17027c478bd9Sstevel@tonic-gate 				 * We are forced to consider this probe
17037c478bd9Sstevel@tonic-gate 				 * lost, as we can have at most 2 unack.
17047c478bd9Sstevel@tonic-gate 				 * probes any time, and we will be sending a
17057c478bd9Sstevel@tonic-gate 				 * probe at the end of this function.
17067c478bd9Sstevel@tonic-gate 				 * Normally, we should not be here, but
17077c478bd9Sstevel@tonic-gate 				 * this can happen if an incoming response
17087c478bd9Sstevel@tonic-gate 				 * that was considered lost has increased
17097c478bd9Sstevel@tonic-gate 				 * the crtt for this target, and also bumped
17107c478bd9Sstevel@tonic-gate 				 * up the FDT. Note that we never cancel or
17117c478bd9Sstevel@tonic-gate 				 * increase the current pii_time_left, so
17127c478bd9Sstevel@tonic-gate 				 * when the timer fires, we find 2 valid
17137c478bd9Sstevel@tonic-gate 				 * unacked probes, and they are yet to timeout
17147c478bd9Sstevel@tonic-gate 				 */
17157c478bd9Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
17167c478bd9Sstevel@tonic-gate 				pr_statp->pr_time_lost = cur_time;
17177c478bd9Sstevel@tonic-gate 			} else {
17187c478bd9Sstevel@tonic-gate 				/*
17197c478bd9Sstevel@tonic-gate 				 * Only the most recent probe can enter
17207c478bd9Sstevel@tonic-gate 				 * this 'else' arm. The second most recent
17217c478bd9Sstevel@tonic-gate 				 * probe must take either of the above arms,
17227c478bd9Sstevel@tonic-gate 				 * if it is unacked.
17237c478bd9Sstevel@tonic-gate 				 */
17247c478bd9Sstevel@tonic-gate 				valid_unack_count++;
17257c478bd9Sstevel@tonic-gate 			}
17267c478bd9Sstevel@tonic-gate 			break;
17277c478bd9Sstevel@tonic-gate 		}
17287c478bd9Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
17297c478bd9Sstevel@tonic-gate 	}
17307c478bd9Sstevel@tonic-gate 
17317c478bd9Sstevel@tonic-gate 	/*
17327c478bd9Sstevel@tonic-gate 	 * We send out 1 probe randomly in the interval between one half
17337c478bd9Sstevel@tonic-gate 	 * and one probe interval for the group. Given that the CRTT is always
17347c478bd9Sstevel@tonic-gate 	 * less than the group's probe interval, we can have at most 1
17357c478bd9Sstevel@tonic-gate 	 * unacknowledged probe now.  All previous probes are either lost or
17367c478bd9Sstevel@tonic-gate 	 * acked.
17377c478bd9Sstevel@tonic-gate 	 */
17387c478bd9Sstevel@tonic-gate 	assert(valid_unack_count == 0 || valid_unack_count == 1);
17397c478bd9Sstevel@tonic-gate 
17407c478bd9Sstevel@tonic-gate 	/*
17417c478bd9Sstevel@tonic-gate 	 * The timer has fired. Take appropriate action depending
17427c478bd9Sstevel@tonic-gate 	 * on the current state of the phyint.
17437c478bd9Sstevel@tonic-gate 	 *
17447c478bd9Sstevel@tonic-gate 	 * PI_RUNNING state 	- Failure detection and failover
17457c478bd9Sstevel@tonic-gate 	 * PI_FAILED state 	- Repair detection and failback
17467c478bd9Sstevel@tonic-gate 	 */
17477c478bd9Sstevel@tonic-gate 	switch (pii->pii_phyint->pi_state) {
17487c478bd9Sstevel@tonic-gate 	case PI_FAILED:
17497c478bd9Sstevel@tonic-gate 		/*
17507c478bd9Sstevel@tonic-gate 		 * If the most recent probe (excluding unacked probes that
17517c478bd9Sstevel@tonic-gate 		 * are yet to time out) has been acked, check whether the
17527c478bd9Sstevel@tonic-gate 		 * phyint is now repaired. If the phyint is repaired, then
17537c478bd9Sstevel@tonic-gate 		 * attempt failback, unless it is an inactive standby.
17547c478bd9Sstevel@tonic-gate 		 */
17557c478bd9Sstevel@tonic-gate 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
17567c478bd9Sstevel@tonic-gate 			phyint_check_for_repair(pii->pii_phyint);
17577c478bd9Sstevel@tonic-gate 		}
17587c478bd9Sstevel@tonic-gate 		break;
17597c478bd9Sstevel@tonic-gate 
17607c478bd9Sstevel@tonic-gate 	case PI_RUNNING:
17617c478bd9Sstevel@tonic-gate 		/*
17627c478bd9Sstevel@tonic-gate 		 * It's possible our probes have been lost because of a
17637c478bd9Sstevel@tonic-gate 		 * spanning-tree mandated quiet period on the switch.  If so,
17647c478bd9Sstevel@tonic-gate 		 * ignore the lost probes and consider the interface to still
17657c478bd9Sstevel@tonic-gate 		 * be functioning.
17667c478bd9Sstevel@tonic-gate 		 */
17677c478bd9Sstevel@tonic-gate 		cur_hrtime = gethrtime();
17687c478bd9Sstevel@tonic-gate 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
17697c478bd9Sstevel@tonic-gate 			break;
17707c478bd9Sstevel@tonic-gate 
17717c478bd9Sstevel@tonic-gate 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
17727c478bd9Sstevel@tonic-gate 			/*
17737c478bd9Sstevel@tonic-gate 			 * We have 1 or more failed probes (excluding unacked
17747c478bd9Sstevel@tonic-gate 			 * probes that are yet to time out). Determine if the
17757c478bd9Sstevel@tonic-gate 			 * phyint has failed. If so attempt a failover,
17767c478bd9Sstevel@tonic-gate 			 * unless it is an inactive standby
17777c478bd9Sstevel@tonic-gate 			 */
17787c478bd9Sstevel@tonic-gate 			phyint_inst_check_for_failure(pii);
17797c478bd9Sstevel@tonic-gate 		}
17807c478bd9Sstevel@tonic-gate 		break;
17817c478bd9Sstevel@tonic-gate 
17827c478bd9Sstevel@tonic-gate 	default:
17837c478bd9Sstevel@tonic-gate 		logerr("phyint_inst_timer: invalid state %d\n",
17847c478bd9Sstevel@tonic-gate 		    pii->pii_phyint->pi_state);
17857c478bd9Sstevel@tonic-gate 		abort();
17867c478bd9Sstevel@tonic-gate 	}
17877c478bd9Sstevel@tonic-gate 
17887c478bd9Sstevel@tonic-gate 	/*
17897c478bd9Sstevel@tonic-gate 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
17907c478bd9Sstevel@tonic-gate 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
17917c478bd9Sstevel@tonic-gate 	 * was called, the target list may be empty.
17927c478bd9Sstevel@tonic-gate 	 */
17937c478bd9Sstevel@tonic-gate 	if (pii->pii_target_next != NULL) {
17947c478bd9Sstevel@tonic-gate 		probe(pii, PROBE_UNI, cur_time);
17957c478bd9Sstevel@tonic-gate 		/*
17967c478bd9Sstevel@tonic-gate 		 * If we have just the one probe target, and we're not using
17977c478bd9Sstevel@tonic-gate 		 * router targets, try to find another as we presently have
17987c478bd9Sstevel@tonic-gate 		 * no resilience.
17997c478bd9Sstevel@tonic-gate 		 */
18007c478bd9Sstevel@tonic-gate 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
18017c478bd9Sstevel@tonic-gate 			probe(pii, PROBE_MULTI, cur_time);
18027c478bd9Sstevel@tonic-gate 	} else {
18037c478bd9Sstevel@tonic-gate 		probe(pii, PROBE_MULTI, cur_time);
18047c478bd9Sstevel@tonic-gate 	}
18057c478bd9Sstevel@tonic-gate 	return (interval);
18067c478bd9Sstevel@tonic-gate }
18077c478bd9Sstevel@tonic-gate 
18087c478bd9Sstevel@tonic-gate /*
18097c478bd9Sstevel@tonic-gate  * Start the probe timer for an interface instance.
18107c478bd9Sstevel@tonic-gate  */
18117c478bd9Sstevel@tonic-gate void
18127c478bd9Sstevel@tonic-gate start_timer(struct phyint_instance *pii)
18137c478bd9Sstevel@tonic-gate {
18147c478bd9Sstevel@tonic-gate 	uint32_t interval;
18157c478bd9Sstevel@tonic-gate 
18167c478bd9Sstevel@tonic-gate 	/*
18177c478bd9Sstevel@tonic-gate 	 * Spread the base probe times (pi_snxt_basetime) across phyints
18187c478bd9Sstevel@tonic-gate 	 * uniformly over the (curtime..curtime + the group's probe_interval).
18197c478bd9Sstevel@tonic-gate 	 * pi_snxt_basetime is strictly periodic with a frequency of
18207c478bd9Sstevel@tonic-gate 	 * the group's probe interval. The actual probe time pi_snxt_time
18217c478bd9Sstevel@tonic-gate 	 * adds some randomness to pi_snxt_basetime and happens in probe().
18227c478bd9Sstevel@tonic-gate 	 * For the 1st probe on each phyint after the timer is started,
18237c478bd9Sstevel@tonic-gate 	 * pi_snxt_time and pi_snxt_basetime are the same.
18247c478bd9Sstevel@tonic-gate 	 */
18257c478bd9Sstevel@tonic-gate 	interval = GET_RANDOM(0,
18267c478bd9Sstevel@tonic-gate 	    (int)pii->pii_phyint->pi_group->pg_probeint);
18277c478bd9Sstevel@tonic-gate 
18287c478bd9Sstevel@tonic-gate 	pii->pii_snxt_basetime = getcurrenttime() + interval;
18297c478bd9Sstevel@tonic-gate 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
18307c478bd9Sstevel@tonic-gate 	pii->pii_snxt_time = pii->pii_snxt_basetime;
18317c478bd9Sstevel@tonic-gate 	timer_schedule(interval);
18327c478bd9Sstevel@tonic-gate }
18337c478bd9Sstevel@tonic-gate 
18347c478bd9Sstevel@tonic-gate /*
18357c478bd9Sstevel@tonic-gate  * Restart the probe timer on an interface instance.
18367c478bd9Sstevel@tonic-gate  */
18377c478bd9Sstevel@tonic-gate static void
18387c478bd9Sstevel@tonic-gate restart_timer(struct phyint_instance *pii)
18397c478bd9Sstevel@tonic-gate {
18407c478bd9Sstevel@tonic-gate 	/*
18417c478bd9Sstevel@tonic-gate 	 * We don't need to restart the timer if it was never started in
18427c478bd9Sstevel@tonic-gate 	 * the first place (pii->pii_basetime_inited not set), as the timer
18437c478bd9Sstevel@tonic-gate 	 * won't have gone off yet.
18447c478bd9Sstevel@tonic-gate 	 */
18457c478bd9Sstevel@tonic-gate 	if (pii->pii_basetime_inited != 0) {
18467c478bd9Sstevel@tonic-gate 
18477c478bd9Sstevel@tonic-gate 		if (debug & D_LINKNOTE)
18487c478bd9Sstevel@tonic-gate 			logdebug("restart timer: restarting timer on %s, "
18497c478bd9Sstevel@tonic-gate 			    "address family %s\n", pii->pii_phyint->pi_name,
18507c478bd9Sstevel@tonic-gate 			    AF_STR(pii->pii_af));
18517c478bd9Sstevel@tonic-gate 
18527c478bd9Sstevel@tonic-gate 		start_timer(pii);
18537c478bd9Sstevel@tonic-gate 	}
18547c478bd9Sstevel@tonic-gate }
18557c478bd9Sstevel@tonic-gate 
18567c478bd9Sstevel@tonic-gate static void
18577c478bd9Sstevel@tonic-gate process_link_state_down(struct phyint *pi)
18587c478bd9Sstevel@tonic-gate {
18597c478bd9Sstevel@tonic-gate 	logerr("The link has gone down on %s\n", pi->pi_name);
18607c478bd9Sstevel@tonic-gate 
18617c478bd9Sstevel@tonic-gate 	/*
18627c478bd9Sstevel@tonic-gate 	 * Clear the probe statistics arrays, we don't want the repair
18637c478bd9Sstevel@tonic-gate 	 * detection logic relying on probes that were succesful prior
18647c478bd9Sstevel@tonic-gate 	 *  to the link going down.
18657c478bd9Sstevel@tonic-gate 	 */
18667c478bd9Sstevel@tonic-gate 	if (PROBE_CAPABLE(pi->pi_v4))
18677c478bd9Sstevel@tonic-gate 		clear_pii_probe_stats(pi->pi_v4);
18687c478bd9Sstevel@tonic-gate 	if (PROBE_CAPABLE(pi->pi_v6))
18697c478bd9Sstevel@tonic-gate 		clear_pii_probe_stats(pi->pi_v6);
18707c478bd9Sstevel@tonic-gate 	/*
18717c478bd9Sstevel@tonic-gate 	 * Check for interface failure.  Although we know the interface
18727c478bd9Sstevel@tonic-gate 	 * has failed, we don't know if all the other interfaces in the
18737c478bd9Sstevel@tonic-gate 	 * group have failed as well.
18747c478bd9Sstevel@tonic-gate 	 */
18757c478bd9Sstevel@tonic-gate 	if ((pi->pi_state == PI_RUNNING) ||
18767c478bd9Sstevel@tonic-gate 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
18777c478bd9Sstevel@tonic-gate 		if (debug & D_LINKNOTE) {
18787c478bd9Sstevel@tonic-gate 			logdebug("process_link_state_down:"
18797c478bd9Sstevel@tonic-gate 			    " checking for failure on %s\n", pi->pi_name);
18807c478bd9Sstevel@tonic-gate 		}
18817c478bd9Sstevel@tonic-gate 
18827c478bd9Sstevel@tonic-gate 		if (pi->pi_v4 != NULL)
18837c478bd9Sstevel@tonic-gate 			phyint_inst_check_for_failure(pi->pi_v4);
18847c478bd9Sstevel@tonic-gate 		else if (pi->pi_v6 != NULL)
18857c478bd9Sstevel@tonic-gate 			phyint_inst_check_for_failure(pi->pi_v6);
18867c478bd9Sstevel@tonic-gate 	}
18877c478bd9Sstevel@tonic-gate }
18887c478bd9Sstevel@tonic-gate 
18897c478bd9Sstevel@tonic-gate static void
18907c478bd9Sstevel@tonic-gate process_link_state_up(struct phyint *pi)
18917c478bd9Sstevel@tonic-gate {
18927c478bd9Sstevel@tonic-gate 	logerr("The link has come up on %s\n", pi->pi_name);
18937c478bd9Sstevel@tonic-gate 
18947c478bd9Sstevel@tonic-gate 	/*
18957c478bd9Sstevel@tonic-gate 	 * We stopped any running timers on each instance when the link
18967c478bd9Sstevel@tonic-gate 	 * went down, so restart them.
18977c478bd9Sstevel@tonic-gate 	 */
18987c478bd9Sstevel@tonic-gate 	if (pi->pi_v4)
18997c478bd9Sstevel@tonic-gate 		restart_timer(pi->pi_v4);
19007c478bd9Sstevel@tonic-gate 	if (pi->pi_v6)
19017c478bd9Sstevel@tonic-gate 		restart_timer(pi->pi_v6);
19027c478bd9Sstevel@tonic-gate 
19037c478bd9Sstevel@tonic-gate 	phyint_check_for_repair(pi);
19047c478bd9Sstevel@tonic-gate 
19057c478bd9Sstevel@tonic-gate 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
19067c478bd9Sstevel@tonic-gate 	if (pi->pi_whendx == LINK_UP_PERMIN)
19077c478bd9Sstevel@tonic-gate 		pi->pi_whendx = 0;
19087c478bd9Sstevel@tonic-gate }
19097c478bd9Sstevel@tonic-gate 
19107c478bd9Sstevel@tonic-gate /*
19117c478bd9Sstevel@tonic-gate  * Process any changes in link state passed up from the interfaces.
19127c478bd9Sstevel@tonic-gate  */
19137c478bd9Sstevel@tonic-gate void
19147c478bd9Sstevel@tonic-gate process_link_state_changes(void)
19157c478bd9Sstevel@tonic-gate {
19167c478bd9Sstevel@tonic-gate 	struct phyint *pi;
19177c478bd9Sstevel@tonic-gate 
19187c478bd9Sstevel@tonic-gate 	/* Look for interfaces where the link state has just changed */
19197c478bd9Sstevel@tonic-gate 
19207c478bd9Sstevel@tonic-gate 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
19217c478bd9Sstevel@tonic-gate 		boolean_t old_link_state_up = LINK_UP(pi);
19227c478bd9Sstevel@tonic-gate 
19237c478bd9Sstevel@tonic-gate 		/*
19247c478bd9Sstevel@tonic-gate 		 * Except when the "phyint" structure is created, this is
19257c478bd9Sstevel@tonic-gate 		 * the only place the link state is updated.  This allows
19267c478bd9Sstevel@tonic-gate 		 * this routine to detect changes in link state, rather
19277c478bd9Sstevel@tonic-gate 		 * than just the current state.
19287c478bd9Sstevel@tonic-gate 		 */
19297c478bd9Sstevel@tonic-gate 		UPDATE_LINK_STATE(pi);
19307c478bd9Sstevel@tonic-gate 
19317c478bd9Sstevel@tonic-gate 		if (LINK_DOWN(pi)) {
19327c478bd9Sstevel@tonic-gate 			/*
19337c478bd9Sstevel@tonic-gate 			 * Has link just gone down?
19347c478bd9Sstevel@tonic-gate 			 */
19357c478bd9Sstevel@tonic-gate 			if (old_link_state_up)
19367c478bd9Sstevel@tonic-gate 				process_link_state_down(pi);
19377c478bd9Sstevel@tonic-gate 		} else {
19387c478bd9Sstevel@tonic-gate 			/*
19397c478bd9Sstevel@tonic-gate 			 * Has link just gone back up?
19407c478bd9Sstevel@tonic-gate 			 */
19417c478bd9Sstevel@tonic-gate 			if (!old_link_state_up)
19427c478bd9Sstevel@tonic-gate 				process_link_state_up(pi);
19437c478bd9Sstevel@tonic-gate 		}
19447c478bd9Sstevel@tonic-gate 	}
19457c478bd9Sstevel@tonic-gate }
19467c478bd9Sstevel@tonic-gate 
19477c478bd9Sstevel@tonic-gate void
19487c478bd9Sstevel@tonic-gate reset_crtt_all(struct phyint *pi)
19497c478bd9Sstevel@tonic-gate {
19507c478bd9Sstevel@tonic-gate 	struct phyint_instance *pii;
19517c478bd9Sstevel@tonic-gate 	struct target *tg;
19527c478bd9Sstevel@tonic-gate 
19537c478bd9Sstevel@tonic-gate 	pii = pi->pi_v4;
19547c478bd9Sstevel@tonic-gate 	if (pii != NULL) {
19557c478bd9Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
19567c478bd9Sstevel@tonic-gate 			tg->tg_crtt = 0;
19577c478bd9Sstevel@tonic-gate 			tg->tg_rtt_sa = -1;
19587c478bd9Sstevel@tonic-gate 			tg->tg_rtt_sd = 0;
19597c478bd9Sstevel@tonic-gate 		}
19607c478bd9Sstevel@tonic-gate 	}
19617c478bd9Sstevel@tonic-gate 
19627c478bd9Sstevel@tonic-gate 	pii = pi->pi_v6;
19637c478bd9Sstevel@tonic-gate 	if (pii != NULL) {
19647c478bd9Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
19657c478bd9Sstevel@tonic-gate 			tg->tg_crtt = 0;
19667c478bd9Sstevel@tonic-gate 			tg->tg_rtt_sa = -1;
19677c478bd9Sstevel@tonic-gate 			tg->tg_rtt_sd = 0;
19687c478bd9Sstevel@tonic-gate 		}
19697c478bd9Sstevel@tonic-gate 	}
19707c478bd9Sstevel@tonic-gate }
19717c478bd9Sstevel@tonic-gate 
19727c478bd9Sstevel@tonic-gate /*
19737c478bd9Sstevel@tonic-gate  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
19747c478bd9Sstevel@tonic-gate  * probes on both instances IPv4 and IPv6.
19757c478bd9Sstevel@tonic-gate  * If the interface has failed, return the time of the first probe failure
19767c478bd9Sstevel@tonic-gate  * in "tff".
19777c478bd9Sstevel@tonic-gate  */
19787c478bd9Sstevel@tonic-gate static int
19797c478bd9Sstevel@tonic-gate phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
19807c478bd9Sstevel@tonic-gate {
19817c478bd9Sstevel@tonic-gate 	uint_t	pi_tff;
19827c478bd9Sstevel@tonic-gate 	struct	target *cur_tg;
19837c478bd9Sstevel@tonic-gate 	struct	probe_fail_count pfinfo;
19847c478bd9Sstevel@tonic-gate 	struct	phyint_instance *pii_other;
19857c478bd9Sstevel@tonic-gate 	int	pr_ndx;
19867c478bd9Sstevel@tonic-gate 
19877c478bd9Sstevel@tonic-gate 	/*
19887c478bd9Sstevel@tonic-gate 	 * Get the number of consecutive failed probes on
19897c478bd9Sstevel@tonic-gate 	 * this phyint across all targets. Also get the number
19907c478bd9Sstevel@tonic-gate 	 * of consecutive failed probes on this target only
19917c478bd9Sstevel@tonic-gate 	 */
19927c478bd9Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
19937c478bd9Sstevel@tonic-gate 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
19947c478bd9Sstevel@tonic-gate 	probe_fail_info(pii, cur_tg, &pfinfo);
19957c478bd9Sstevel@tonic-gate 
19967c478bd9Sstevel@tonic-gate 	/* Get the time of first failure, for later use */
19977c478bd9Sstevel@tonic-gate 	pi_tff = pfinfo.pf_tff;
19987c478bd9Sstevel@tonic-gate 
19997c478bd9Sstevel@tonic-gate 	/*
20007c478bd9Sstevel@tonic-gate 	 * If the current target has not responded to the
20017c478bd9Sstevel@tonic-gate 	 * last NUM_PROBE_FAILS probes, and other targets are
20027c478bd9Sstevel@tonic-gate 	 * responding delete this target. Dead gateway detection
20037c478bd9Sstevel@tonic-gate 	 * will eventually remove this target (if router) from the
20047c478bd9Sstevel@tonic-gate 	 * routing tables. If that does not occur, we may end
20057c478bd9Sstevel@tonic-gate 	 * up adding this to our list again.
20067c478bd9Sstevel@tonic-gate 	 */
20077c478bd9Sstevel@tonic-gate 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
20087c478bd9Sstevel@tonic-gate 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
20097c478bd9Sstevel@tonic-gate 		if (pii->pii_targets_are_routers) {
20107c478bd9Sstevel@tonic-gate 			if (cur_tg->tg_status == TG_ACTIVE)
20117c478bd9Sstevel@tonic-gate 				pii->pii_ntargets--;
20127c478bd9Sstevel@tonic-gate 			cur_tg->tg_status = TG_DEAD;
20137c478bd9Sstevel@tonic-gate 			cur_tg->tg_crtt = 0;
20147c478bd9Sstevel@tonic-gate 			cur_tg->tg_rtt_sa = -1;
20157c478bd9Sstevel@tonic-gate 			cur_tg->tg_rtt_sd = 0;
20167c478bd9Sstevel@tonic-gate 			if (pii->pii_target_next == cur_tg)
20177c478bd9Sstevel@tonic-gate 				pii->pii_target_next = target_next(cur_tg);
20187c478bd9Sstevel@tonic-gate 		} else {
20197c478bd9Sstevel@tonic-gate 			target_delete(cur_tg);
20207c478bd9Sstevel@tonic-gate 			probe(pii, PROBE_MULTI, getcurrenttime());
20217c478bd9Sstevel@tonic-gate 		}
20227c478bd9Sstevel@tonic-gate 		return (PHYINT_OK);
20237c478bd9Sstevel@tonic-gate 	}
20247c478bd9Sstevel@tonic-gate 
20257c478bd9Sstevel@tonic-gate 	/*
20267c478bd9Sstevel@tonic-gate 	 * If the phyint has lost NUM_PROBE_FAILS or more
20277c478bd9Sstevel@tonic-gate 	 * consecutive probes, on both IPv4 and IPv6 protocol
20287c478bd9Sstevel@tonic-gate 	 * instances of the phyint, then trigger failure
20297c478bd9Sstevel@tonic-gate 	 * detection, else return false
20307c478bd9Sstevel@tonic-gate 	 */
20317c478bd9Sstevel@tonic-gate 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
20327c478bd9Sstevel@tonic-gate 		return (PHYINT_OK);
20337c478bd9Sstevel@tonic-gate 
20347c478bd9Sstevel@tonic-gate 	pii_other = phyint_inst_other(pii);
20357c478bd9Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii_other)) {
20367c478bd9Sstevel@tonic-gate 		probe_fail_info(pii_other, NULL, &pfinfo);
20377c478bd9Sstevel@tonic-gate 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
20387c478bd9Sstevel@tonic-gate 			/*
20397c478bd9Sstevel@tonic-gate 			 * We have NUM_PROBE_FAILS or more failures
20407c478bd9Sstevel@tonic-gate 			 * on both IPv4 and IPv6. Get the earliest
20417c478bd9Sstevel@tonic-gate 			 * time when failure was detected on this
20427c478bd9Sstevel@tonic-gate 			 * phyint across IPv4 and IPv6.
20437c478bd9Sstevel@tonic-gate 			 */
20447c478bd9Sstevel@tonic-gate 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
20457c478bd9Sstevel@tonic-gate 				pi_tff = pfinfo.pf_tff;
20467c478bd9Sstevel@tonic-gate 		} else {
20477c478bd9Sstevel@tonic-gate 			/*
20487c478bd9Sstevel@tonic-gate 			 * This instance has < NUM_PROBE_FAILS failure.
20497c478bd9Sstevel@tonic-gate 			 * So return false
20507c478bd9Sstevel@tonic-gate 			 */
20517c478bd9Sstevel@tonic-gate 			return (PHYINT_OK);
20527c478bd9Sstevel@tonic-gate 		}
20537c478bd9Sstevel@tonic-gate 	}
20547c478bd9Sstevel@tonic-gate 	*tff = pi_tff;
20557c478bd9Sstevel@tonic-gate 	return (PHYINT_FAILURE);
20567c478bd9Sstevel@tonic-gate }
20577c478bd9Sstevel@tonic-gate 
20587c478bd9Sstevel@tonic-gate /*
20597c478bd9Sstevel@tonic-gate  * Check if the link has gone down on this phyint, or it has failed the
20607c478bd9Sstevel@tonic-gate  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
20617c478bd9Sstevel@tonic-gate  * Also look at other phyints of this group, for group failures.
20627c478bd9Sstevel@tonic-gate  */
20637c478bd9Sstevel@tonic-gate int
20647c478bd9Sstevel@tonic-gate failure_state(struct phyint_instance *pii)
20657c478bd9Sstevel@tonic-gate {
20667c478bd9Sstevel@tonic-gate 	struct	probe_success_count psinfo;
20677c478bd9Sstevel@tonic-gate 	uint_t	pi2_tls;		/* time last success */
20687c478bd9Sstevel@tonic-gate 	uint_t	pi_tff;			/* time first fail */
20697c478bd9Sstevel@tonic-gate 	struct	phyint	*pi2;
20707c478bd9Sstevel@tonic-gate 	struct	phyint *pi;
20717c478bd9Sstevel@tonic-gate 	struct	phyint_instance *pii2;
20727c478bd9Sstevel@tonic-gate 	struct  phyint_group *pg;
20737c478bd9Sstevel@tonic-gate 	boolean_t alone;
20747c478bd9Sstevel@tonic-gate 
20757c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER)
20767c478bd9Sstevel@tonic-gate 		logdebug("phyint_failed(%s)\n", pii->pii_name);
20777c478bd9Sstevel@tonic-gate 
20787c478bd9Sstevel@tonic-gate 	pi = pii->pii_phyint;
20797c478bd9Sstevel@tonic-gate 	pg = pi->pi_group;
20807c478bd9Sstevel@tonic-gate 
20817c478bd9Sstevel@tonic-gate 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
20827c478bd9Sstevel@tonic-gate 		PHYINT_OK)
20837c478bd9Sstevel@tonic-gate 		return (PHYINT_OK);
20847c478bd9Sstevel@tonic-gate 
20857c478bd9Sstevel@tonic-gate 	/*
20867c478bd9Sstevel@tonic-gate 	 * At this point, the link is down, or the phyint is suspect,
20877c478bd9Sstevel@tonic-gate 	 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
20887c478bd9Sstevel@tonic-gate 	 * does not belong to any group, or is the only member of the
20897c478bd9Sstevel@tonic-gate 	 * group capable of being probed, return PHYINT_FAILURE.
20907c478bd9Sstevel@tonic-gate 	 */
20917c478bd9Sstevel@tonic-gate 	alone = _B_TRUE;
20927c478bd9Sstevel@tonic-gate 	if (pg != phyint_anongroup) {
20937c478bd9Sstevel@tonic-gate 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
20947c478bd9Sstevel@tonic-gate 			if (pi2 == pi)
20957c478bd9Sstevel@tonic-gate 				continue;
20967c478bd9Sstevel@tonic-gate 			if (PROBE_CAPABLE(pi2->pi_v4) ||
20977c478bd9Sstevel@tonic-gate 			    PROBE_CAPABLE(pi2->pi_v6)) {
20987c478bd9Sstevel@tonic-gate 				alone = _B_FALSE;
20997c478bd9Sstevel@tonic-gate 				break;
21007c478bd9Sstevel@tonic-gate 			}
21017c478bd9Sstevel@tonic-gate 		}
21027c478bd9Sstevel@tonic-gate 	}
21037c478bd9Sstevel@tonic-gate 	if (alone)
21047c478bd9Sstevel@tonic-gate 		return (PHYINT_FAILURE);
21057c478bd9Sstevel@tonic-gate 
21067c478bd9Sstevel@tonic-gate 	/*
21077c478bd9Sstevel@tonic-gate 	 * Need to compare against other phyints of the same group
21087c478bd9Sstevel@tonic-gate 	 * to exclude group failures. If the failure was detected via
21097c478bd9Sstevel@tonic-gate 	 * probing, then if the time of last success (tls) of any
21107c478bd9Sstevel@tonic-gate 	 * phyint is more recent than the time of first fail (tff) of the
21117c478bd9Sstevel@tonic-gate 	 * phyint in question, and the link is up on the phyint,
21127c478bd9Sstevel@tonic-gate 	 * then it is a phyint failure. Otherwise it is a group failure.
21137c478bd9Sstevel@tonic-gate 	 * If failure was detected via a link down notification sent from
21147c478bd9Sstevel@tonic-gate 	 * the driver to IP, we see if any phyints in the group are still
21157c478bd9Sstevel@tonic-gate 	 * running and haven't received a link down notification.  We
21167c478bd9Sstevel@tonic-gate 	 * will usually be processing the link down notification shortly
21177c478bd9Sstevel@tonic-gate 	 * after it was received, so there is no point looking at the tls
21187c478bd9Sstevel@tonic-gate 	 * of other phyints.
21197c478bd9Sstevel@tonic-gate 	 */
21207c478bd9Sstevel@tonic-gate 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
21217c478bd9Sstevel@tonic-gate 		/* Exclude ourself from comparison */
21227c478bd9Sstevel@tonic-gate 		if (pi2 == pi)
21237c478bd9Sstevel@tonic-gate 			continue;
21247c478bd9Sstevel@tonic-gate 
21257c478bd9Sstevel@tonic-gate 		if (LINK_DOWN(pi)) {
21267c478bd9Sstevel@tonic-gate 			/*
21277c478bd9Sstevel@tonic-gate 			 * We use FLAGS_TO_LINK_STATE() to test the
21287c478bd9Sstevel@tonic-gate 			 * flags directly, rather then LINK_UP() or
21297c478bd9Sstevel@tonic-gate 			 * LINK_DOWN(), as we may not have got round
21307c478bd9Sstevel@tonic-gate 			 * to processing the link state for the other
21317c478bd9Sstevel@tonic-gate 			 * phyints in the group yet.
21327c478bd9Sstevel@tonic-gate 			 *
21337c478bd9Sstevel@tonic-gate 			 * The check for PI_RUNNING and group
21347c478bd9Sstevel@tonic-gate 			 * failure handles the case when the
21357c478bd9Sstevel@tonic-gate 			 * group begins to recover.  The first
21367c478bd9Sstevel@tonic-gate 			 * phyint to recover should not trigger
21377c478bd9Sstevel@tonic-gate 			 * a failover from the soon-to-recover
21387c478bd9Sstevel@tonic-gate 			 * other phyints to the first recovered
21397c478bd9Sstevel@tonic-gate 			 * phyint. PI_RUNNING will be set, and
21407c478bd9Sstevel@tonic-gate 			 * pg_groupfailed cleared only after
21417c478bd9Sstevel@tonic-gate 			 * receipt of NUM_PROBE_REPAIRS, by
21427c478bd9Sstevel@tonic-gate 			 * which time the other phyints should
21437c478bd9Sstevel@tonic-gate 			 * have received at least 1 packet,
21447c478bd9Sstevel@tonic-gate 			 * and so will not have NUM_PROBE_FAILS.
21457c478bd9Sstevel@tonic-gate 			 */
21467c478bd9Sstevel@tonic-gate 			if ((pi2->pi_state == PI_RUNNING) &&
21477c478bd9Sstevel@tonic-gate 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
21487c478bd9Sstevel@tonic-gate 				return (PHYINT_FAILURE);
21497c478bd9Sstevel@tonic-gate 		} else {
21507c478bd9Sstevel@tonic-gate 			/*
21517c478bd9Sstevel@tonic-gate 			 * Need to compare against both IPv4 and
21527c478bd9Sstevel@tonic-gate 			 * IPv6 instances.
21537c478bd9Sstevel@tonic-gate 			 */
21547c478bd9Sstevel@tonic-gate 			pii2 = pi2->pi_v4;
21557c478bd9Sstevel@tonic-gate 			if (pii2 != NULL) {
21567c478bd9Sstevel@tonic-gate 				probe_success_info(pii2, NULL, &psinfo);
21577c478bd9Sstevel@tonic-gate 				if (psinfo.ps_tls_valid) {
21587c478bd9Sstevel@tonic-gate 					pi2_tls = psinfo.ps_tls;
21597c478bd9Sstevel@tonic-gate 					/*
21607c478bd9Sstevel@tonic-gate 					 * See comment above regarding check
21617c478bd9Sstevel@tonic-gate 					 * for PI_RUNNING and group failure.
21627c478bd9Sstevel@tonic-gate 					 */
21637c478bd9Sstevel@tonic-gate 					if (TIME_GT(pi2_tls, pi_tff) &&
21647c478bd9Sstevel@tonic-gate 					    (pi2->pi_state == PI_RUNNING) &&
21657c478bd9Sstevel@tonic-gate 					    !GROUP_FAILED(pg) &&
21667c478bd9Sstevel@tonic-gate 					    FLAGS_TO_LINK_STATE(pi2))
21677c478bd9Sstevel@tonic-gate 						return (PHYINT_FAILURE);
21687c478bd9Sstevel@tonic-gate 				}
21697c478bd9Sstevel@tonic-gate 			}
21707c478bd9Sstevel@tonic-gate 
21717c478bd9Sstevel@tonic-gate 			pii2 = pi2->pi_v6;
21727c478bd9Sstevel@tonic-gate 			if (pii2 != NULL) {
21737c478bd9Sstevel@tonic-gate 				probe_success_info(pii2, NULL, &psinfo);
21747c478bd9Sstevel@tonic-gate 				if (psinfo.ps_tls_valid) {
21757c478bd9Sstevel@tonic-gate 					pi2_tls = psinfo.ps_tls;
21767c478bd9Sstevel@tonic-gate 					/*
21777c478bd9Sstevel@tonic-gate 					 * See comment above regarding check
21787c478bd9Sstevel@tonic-gate 					 * for PI_RUNNING and group failure.
21797c478bd9Sstevel@tonic-gate 					 */
21807c478bd9Sstevel@tonic-gate 					if (TIME_GT(pi2_tls, pi_tff) &&
21817c478bd9Sstevel@tonic-gate 					    (pi2->pi_state == PI_RUNNING) &&
21827c478bd9Sstevel@tonic-gate 					    !GROUP_FAILED(pg) &&
21837c478bd9Sstevel@tonic-gate 					    FLAGS_TO_LINK_STATE(pi2))
21847c478bd9Sstevel@tonic-gate 						return (PHYINT_FAILURE);
21857c478bd9Sstevel@tonic-gate 				}
21867c478bd9Sstevel@tonic-gate 			}
21877c478bd9Sstevel@tonic-gate 		}
21887c478bd9Sstevel@tonic-gate 	}
21897c478bd9Sstevel@tonic-gate 
21907c478bd9Sstevel@tonic-gate 	/*
21917c478bd9Sstevel@tonic-gate 	 * Change the group state to PG_FAILED if it's not already.
21927c478bd9Sstevel@tonic-gate 	 */
21937c478bd9Sstevel@tonic-gate 	if (!GROUP_FAILED(pg))
21947c478bd9Sstevel@tonic-gate 		phyint_group_chstate(pg, PG_FAILED);
21957c478bd9Sstevel@tonic-gate 
21967c478bd9Sstevel@tonic-gate 	return (GROUP_FAILURE);
21977c478bd9Sstevel@tonic-gate }
21987c478bd9Sstevel@tonic-gate 
21997c478bd9Sstevel@tonic-gate /*
22007c478bd9Sstevel@tonic-gate  * Return the information associated with consecutive probe successes
22017c478bd9Sstevel@tonic-gate  * starting with the most recent probe. At most the last 2 probes can be
22027c478bd9Sstevel@tonic-gate  * in the unacknowledged state. All previous probes have either failed
22037c478bd9Sstevel@tonic-gate  * or succeeded.
22047c478bd9Sstevel@tonic-gate  */
22057c478bd9Sstevel@tonic-gate static void
22067c478bd9Sstevel@tonic-gate probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
22077c478bd9Sstevel@tonic-gate     struct probe_success_count *psinfo)
22087c478bd9Sstevel@tonic-gate {
22097c478bd9Sstevel@tonic-gate 	uint_t	i;
22107c478bd9Sstevel@tonic-gate 	struct probe_stats *pr_statp;
22117c478bd9Sstevel@tonic-gate 	uint_t most_recent;
22127c478bd9Sstevel@tonic-gate 	uint_t second_most_recent;
22137c478bd9Sstevel@tonic-gate 	boolean_t pi_found_failure = _B_FALSE;
22147c478bd9Sstevel@tonic-gate 	boolean_t tg_found_failure = _B_FALSE;
22157c478bd9Sstevel@tonic-gate 	uint_t now;
22167c478bd9Sstevel@tonic-gate 	uint_t timeout;
22177c478bd9Sstevel@tonic-gate 	struct target *tg;
22187c478bd9Sstevel@tonic-gate 
22197c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER)
22207c478bd9Sstevel@tonic-gate 		logdebug("probe_success_info(%s)\n", pii->pii_name);
22217c478bd9Sstevel@tonic-gate 
22227c478bd9Sstevel@tonic-gate 	bzero(psinfo, sizeof (*psinfo));
22237c478bd9Sstevel@tonic-gate 	now = getcurrenttime();
22247c478bd9Sstevel@tonic-gate 
22257c478bd9Sstevel@tonic-gate 	/*
22267c478bd9Sstevel@tonic-gate 	 * Start with the most recent probe, and count the number
22277c478bd9Sstevel@tonic-gate 	 * of consecutive probe successes. Latch the number of successes
22287c478bd9Sstevel@tonic-gate 	 * on hitting a failure.
22297c478bd9Sstevel@tonic-gate 	 */
22307c478bd9Sstevel@tonic-gate 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
22317c478bd9Sstevel@tonic-gate 	second_most_recent = PROBE_INDEX_PREV(most_recent);
22327c478bd9Sstevel@tonic-gate 
22337c478bd9Sstevel@tonic-gate 	for (i = most_recent; i != pii->pii_probe_next;
22347c478bd9Sstevel@tonic-gate 	    i = PROBE_INDEX_PREV(i)) {
22357c478bd9Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[i];
22367c478bd9Sstevel@tonic-gate 
22377c478bd9Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
22387c478bd9Sstevel@tonic-gate 		case PR_UNACKED:
22397c478bd9Sstevel@tonic-gate 			/*
22407c478bd9Sstevel@tonic-gate 			 * Only the most recent 2 probes can be unacknowledged
22417c478bd9Sstevel@tonic-gate 			 */
22427c478bd9Sstevel@tonic-gate 			assert(i == most_recent || i == second_most_recent);
22437c478bd9Sstevel@tonic-gate 
22447c478bd9Sstevel@tonic-gate 			tg = pr_statp->pr_target;
22457c478bd9Sstevel@tonic-gate 			assert(tg != NULL);
22467c478bd9Sstevel@tonic-gate 			/*
22477c478bd9Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
22487c478bd9Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
22497c478bd9Sstevel@tonic-gate 			 * not available use the value of the group's probe
22507c478bd9Sstevel@tonic-gate 			 * interval which is a worst case estimate.
22517c478bd9Sstevel@tonic-gate 			 */
22527c478bd9Sstevel@tonic-gate 			if (tg->tg_crtt != 0) {
22537c478bd9Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
22547c478bd9Sstevel@tonic-gate 			} else {
22557c478bd9Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
22567c478bd9Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_probeint;
22577c478bd9Sstevel@tonic-gate 			}
22587c478bd9Sstevel@tonic-gate 
22597c478bd9Sstevel@tonic-gate 			if (TIME_LT(timeout, now)) {
22607c478bd9Sstevel@tonic-gate 				/*
22617c478bd9Sstevel@tonic-gate 				 * We hit a failure. Latch the total number of
22627c478bd9Sstevel@tonic-gate 				 * recent consecutive successes.
22637c478bd9Sstevel@tonic-gate 				 */
22647c478bd9Sstevel@tonic-gate 				pr_statp->pr_time_lost = timeout;
22657c478bd9Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
22667c478bd9Sstevel@tonic-gate 				pi_found_failure = _B_TRUE;
22677c478bd9Sstevel@tonic-gate 				if (cur_tg != NULL && tg == cur_tg) {
22687c478bd9Sstevel@tonic-gate 					/*
22697c478bd9Sstevel@tonic-gate 					 * We hit a failure for the desired
22707c478bd9Sstevel@tonic-gate 					 * target. Latch the number of recent
22717c478bd9Sstevel@tonic-gate 					 * consecutive successes for this target
22727c478bd9Sstevel@tonic-gate 					 */
22737c478bd9Sstevel@tonic-gate 					tg_found_failure = _B_TRUE;
22747c478bd9Sstevel@tonic-gate 				}
22757c478bd9Sstevel@tonic-gate 			}
22767c478bd9Sstevel@tonic-gate 			break;
22777c478bd9Sstevel@tonic-gate 
22787c478bd9Sstevel@tonic-gate 		case PR_ACKED:
22797c478bd9Sstevel@tonic-gate 			/*
22807c478bd9Sstevel@tonic-gate 			 * Bump up the count of probe successes, if we
22817c478bd9Sstevel@tonic-gate 			 * have not seen any failure so far.
22827c478bd9Sstevel@tonic-gate 			 */
22837c478bd9Sstevel@tonic-gate 			if (!pi_found_failure)
22847c478bd9Sstevel@tonic-gate 				psinfo->ps_nsucc++;
22857c478bd9Sstevel@tonic-gate 
22867c478bd9Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
22877c478bd9Sstevel@tonic-gate 			    !tg_found_failure) {
22887c478bd9Sstevel@tonic-gate 				psinfo->ps_nsucc_tg++;
22897c478bd9Sstevel@tonic-gate 			}
22907c478bd9Sstevel@tonic-gate 
22917c478bd9Sstevel@tonic-gate 			/*
22927c478bd9Sstevel@tonic-gate 			 * Record the time of last success, if this is
22937c478bd9Sstevel@tonic-gate 			 * the most recent probe success.
22947c478bd9Sstevel@tonic-gate 			 */
22957c478bd9Sstevel@tonic-gate 			if (!psinfo->ps_tls_valid) {
22967c478bd9Sstevel@tonic-gate 				psinfo->ps_tls = pr_statp->pr_time_acked;
22977c478bd9Sstevel@tonic-gate 				psinfo->ps_tls_valid = _B_TRUE;
22987c478bd9Sstevel@tonic-gate 			}
22997c478bd9Sstevel@tonic-gate 			break;
23007c478bd9Sstevel@tonic-gate 
23017c478bd9Sstevel@tonic-gate 		case PR_LOST:
23027c478bd9Sstevel@tonic-gate 			/*
23037c478bd9Sstevel@tonic-gate 			 * We hit a failure. Latch the total number of
23047c478bd9Sstevel@tonic-gate 			 * recent consecutive successes.
23057c478bd9Sstevel@tonic-gate 			 */
23067c478bd9Sstevel@tonic-gate 			pi_found_failure = _B_TRUE;
23077c478bd9Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
23087c478bd9Sstevel@tonic-gate 				/*
23097c478bd9Sstevel@tonic-gate 				 * We hit a failure for the desired target.
23107c478bd9Sstevel@tonic-gate 				 * Latch the number of recent consecutive
23117c478bd9Sstevel@tonic-gate 				 * successes for this target
23127c478bd9Sstevel@tonic-gate 				 */
23137c478bd9Sstevel@tonic-gate 				tg_found_failure = _B_TRUE;
23147c478bd9Sstevel@tonic-gate 			}
23157c478bd9Sstevel@tonic-gate 			break;
23167c478bd9Sstevel@tonic-gate 
23177c478bd9Sstevel@tonic-gate 		default:
23187c478bd9Sstevel@tonic-gate 			return;
23197c478bd9Sstevel@tonic-gate 
23207c478bd9Sstevel@tonic-gate 		}
23217c478bd9Sstevel@tonic-gate 	}
23227c478bd9Sstevel@tonic-gate }
23237c478bd9Sstevel@tonic-gate 
23247c478bd9Sstevel@tonic-gate /*
23257c478bd9Sstevel@tonic-gate  * Return the information associated with consecutive probe failures
23267c478bd9Sstevel@tonic-gate  * starting with the most recent probe. Only the last 2 probes can be in the
23277c478bd9Sstevel@tonic-gate  * unacknowledged state. All previous probes have either failed or succeeded.
23287c478bd9Sstevel@tonic-gate  */
23297c478bd9Sstevel@tonic-gate static void
23307c478bd9Sstevel@tonic-gate probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
23317c478bd9Sstevel@tonic-gate     struct probe_fail_count *pfinfo)
23327c478bd9Sstevel@tonic-gate {
23337c478bd9Sstevel@tonic-gate 	int	i;
23347c478bd9Sstevel@tonic-gate 	struct probe_stats *pr_statp;
23357c478bd9Sstevel@tonic-gate 	boolean_t	tg_found_success = _B_FALSE;
23367c478bd9Sstevel@tonic-gate 	boolean_t	pi_found_success = _B_FALSE;
23377c478bd9Sstevel@tonic-gate 	int	most_recent;
23387c478bd9Sstevel@tonic-gate 	int	second_most_recent;
23397c478bd9Sstevel@tonic-gate 	uint_t	now;
23407c478bd9Sstevel@tonic-gate 	uint_t	timeout;
23417c478bd9Sstevel@tonic-gate 	struct	target *tg;
23427c478bd9Sstevel@tonic-gate 
23437c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER)
23447c478bd9Sstevel@tonic-gate 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
23457c478bd9Sstevel@tonic-gate 
23467c478bd9Sstevel@tonic-gate 	bzero(pfinfo, sizeof (*pfinfo));
23477c478bd9Sstevel@tonic-gate 	now = getcurrenttime();
23487c478bd9Sstevel@tonic-gate 
23497c478bd9Sstevel@tonic-gate 	/*
23507c478bd9Sstevel@tonic-gate 	 * Start with the most recent probe, and count the number
23517c478bd9Sstevel@tonic-gate 	 * of consecutive probe failures. Latch the number of failures
23527c478bd9Sstevel@tonic-gate 	 * on hitting a probe success.
23537c478bd9Sstevel@tonic-gate 	 */
23547c478bd9Sstevel@tonic-gate 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
23557c478bd9Sstevel@tonic-gate 	second_most_recent = PROBE_INDEX_PREV(most_recent);
23567c478bd9Sstevel@tonic-gate 
23577c478bd9Sstevel@tonic-gate 	for (i = most_recent; i != pii->pii_probe_next;
23587c478bd9Sstevel@tonic-gate 	    i = PROBE_INDEX_PREV(i)) {
23597c478bd9Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[i];
23607c478bd9Sstevel@tonic-gate 
23617c478bd9Sstevel@tonic-gate 		assert(PR_STATUS_VALID(pr_statp->pr_status));
23627c478bd9Sstevel@tonic-gate 
23637c478bd9Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
23647c478bd9Sstevel@tonic-gate 		case PR_UNACKED:
23657c478bd9Sstevel@tonic-gate 			/*
23667c478bd9Sstevel@tonic-gate 			 * Only the most recent 2 probes can be unacknowledged
23677c478bd9Sstevel@tonic-gate 			 */
23687c478bd9Sstevel@tonic-gate 			assert(i == most_recent || i == second_most_recent);
23697c478bd9Sstevel@tonic-gate 
23707c478bd9Sstevel@tonic-gate 			tg = pr_statp->pr_target;
23717c478bd9Sstevel@tonic-gate 			/*
23727c478bd9Sstevel@tonic-gate 			 * Target is guaranteed to exist in the unack. state
23737c478bd9Sstevel@tonic-gate 			 */
23747c478bd9Sstevel@tonic-gate 			assert(tg != NULL);
23757c478bd9Sstevel@tonic-gate 			/*
23767c478bd9Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
23777c478bd9Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
23787c478bd9Sstevel@tonic-gate 			 * not available use the group's probe interval,
23797c478bd9Sstevel@tonic-gate 			 * which is a worst case estimate.
23807c478bd9Sstevel@tonic-gate 			 */
23817c478bd9Sstevel@tonic-gate 			if (tg->tg_crtt != 0) {
23827c478bd9Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
23837c478bd9Sstevel@tonic-gate 			} else {
23847c478bd9Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
23857c478bd9Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_probeint;
23867c478bd9Sstevel@tonic-gate 			}
23877c478bd9Sstevel@tonic-gate 
23887c478bd9Sstevel@tonic-gate 			if (TIME_GT(timeout, now))
23897c478bd9Sstevel@tonic-gate 				break;
23907c478bd9Sstevel@tonic-gate 
23917c478bd9Sstevel@tonic-gate 			pr_statp->pr_time_lost = timeout;
23927c478bd9Sstevel@tonic-gate 			pr_statp->pr_status = PR_LOST;
23937c478bd9Sstevel@tonic-gate 			/* FALLTHRU */
23947c478bd9Sstevel@tonic-gate 
23957c478bd9Sstevel@tonic-gate 		case PR_LOST:
23967c478bd9Sstevel@tonic-gate 			if (!pi_found_success) {
23977c478bd9Sstevel@tonic-gate 				pfinfo->pf_nfail++;
23987c478bd9Sstevel@tonic-gate 				pfinfo->pf_tff = pr_statp->pr_time_lost;
23997c478bd9Sstevel@tonic-gate 			}
24007c478bd9Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
24017c478bd9Sstevel@tonic-gate 			    !tg_found_success)  {
24027c478bd9Sstevel@tonic-gate 				pfinfo->pf_nfail_tg++;
24037c478bd9Sstevel@tonic-gate 			}
24047c478bd9Sstevel@tonic-gate 			break;
24057c478bd9Sstevel@tonic-gate 
24067c478bd9Sstevel@tonic-gate 		default:
24077c478bd9Sstevel@tonic-gate 			/*
24087c478bd9Sstevel@tonic-gate 			 * We hit a success or unused slot. Latch the
24097c478bd9Sstevel@tonic-gate 			 * total number of recent consecutive failures.
24107c478bd9Sstevel@tonic-gate 			 */
24117c478bd9Sstevel@tonic-gate 			pi_found_success = _B_TRUE;
24127c478bd9Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
24137c478bd9Sstevel@tonic-gate 				/*
24147c478bd9Sstevel@tonic-gate 				 * We hit a success for the desired target.
24157c478bd9Sstevel@tonic-gate 				 * Latch the number of recent consecutive
24167c478bd9Sstevel@tonic-gate 				 * failures for this target
24177c478bd9Sstevel@tonic-gate 				 */
24187c478bd9Sstevel@tonic-gate 				tg_found_success = _B_TRUE;
24197c478bd9Sstevel@tonic-gate 			}
24207c478bd9Sstevel@tonic-gate 		}
24217c478bd9Sstevel@tonic-gate 	}
24227c478bd9Sstevel@tonic-gate }
24237c478bd9Sstevel@tonic-gate 
24247c478bd9Sstevel@tonic-gate /*
24257c478bd9Sstevel@tonic-gate  * Check if the phyint has been repaired.  If no test address has been
24267c478bd9Sstevel@tonic-gate  * configured, then consider the interface repaired if the link is up (unless
24277c478bd9Sstevel@tonic-gate  * the link is flapping; see below).  Otherwise, look for proof of probes
24287c478bd9Sstevel@tonic-gate  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
24297c478bd9Sstevel@tonic-gate  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
24307c478bd9Sstevel@tonic-gate  */
24317c478bd9Sstevel@tonic-gate static boolean_t
24327c478bd9Sstevel@tonic-gate phyint_repaired(struct phyint *pi)
24337c478bd9Sstevel@tonic-gate {
24347c478bd9Sstevel@tonic-gate 	struct	probe_success_count psinfo;
24357c478bd9Sstevel@tonic-gate 	struct	phyint_instance *pii;
24367c478bd9Sstevel@tonic-gate 	struct	target *cur_tg;
24377c478bd9Sstevel@tonic-gate 	int	pr_ndx;
24387c478bd9Sstevel@tonic-gate 	uint_t	cur_time;
24397c478bd9Sstevel@tonic-gate 
24407c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER)
24417c478bd9Sstevel@tonic-gate 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
24427c478bd9Sstevel@tonic-gate 
24437c478bd9Sstevel@tonic-gate 	if (LINK_DOWN(pi))
24447c478bd9Sstevel@tonic-gate 		return (_B_FALSE);
24457c478bd9Sstevel@tonic-gate 
24467c478bd9Sstevel@tonic-gate 	/*
24477c478bd9Sstevel@tonic-gate 	 * If we don't have any test addresses and the link is up, then
24487c478bd9Sstevel@tonic-gate 	 * consider the interface repaired, unless we've received more than
24497c478bd9Sstevel@tonic-gate 	 * LINK_UP_PERMIN link up notifications in the last minute, in
24507c478bd9Sstevel@tonic-gate 	 * which case we keep the link down until we drop back below
24517c478bd9Sstevel@tonic-gate 	 * the threshold.
24527c478bd9Sstevel@tonic-gate 	 */
24537c478bd9Sstevel@tonic-gate 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
24547c478bd9Sstevel@tonic-gate 		cur_time = getcurrenttime();
24557c478bd9Sstevel@tonic-gate 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
24567c478bd9Sstevel@tonic-gate 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
24577c478bd9Sstevel@tonic-gate 			pi->pi_lfmsg_printed = 0;
24587c478bd9Sstevel@tonic-gate 			return (_B_TRUE);
24597c478bd9Sstevel@tonic-gate 		}
24607c478bd9Sstevel@tonic-gate 		if (!pi->pi_lfmsg_printed) {
24617c478bd9Sstevel@tonic-gate 			logerr("The link has come up on %s more than %d times "
24627c478bd9Sstevel@tonic-gate 			    "in the last minute; disabling failback until it "
24637c478bd9Sstevel@tonic-gate 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
24647c478bd9Sstevel@tonic-gate 			pi->pi_lfmsg_printed = 1;
24657c478bd9Sstevel@tonic-gate 		}
24667c478bd9Sstevel@tonic-gate 
24677c478bd9Sstevel@tonic-gate 		return (_B_FALSE);
24687c478bd9Sstevel@tonic-gate 	}
24697c478bd9Sstevel@tonic-gate 
24707c478bd9Sstevel@tonic-gate 	pii = pi->pi_v4;
24717c478bd9Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii)) {
24727c478bd9Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
24737c478bd9Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
24747c478bd9Sstevel@tonic-gate 		probe_success_info(pii, cur_tg, &psinfo);
24757c478bd9Sstevel@tonic-gate 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
24767c478bd9Sstevel@tonic-gate 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
24777c478bd9Sstevel@tonic-gate 			return (_B_TRUE);
24787c478bd9Sstevel@tonic-gate 	}
24797c478bd9Sstevel@tonic-gate 
24807c478bd9Sstevel@tonic-gate 	pii = pi->pi_v6;
24817c478bd9Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii)) {
24827c478bd9Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
24837c478bd9Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
24847c478bd9Sstevel@tonic-gate 		probe_success_info(pii, cur_tg, &psinfo);
24857c478bd9Sstevel@tonic-gate 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
24867c478bd9Sstevel@tonic-gate 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
24877c478bd9Sstevel@tonic-gate 			return (_B_TRUE);
24887c478bd9Sstevel@tonic-gate 	}
24897c478bd9Sstevel@tonic-gate 
24907c478bd9Sstevel@tonic-gate 	return (_B_FALSE);
24917c478bd9Sstevel@tonic-gate }
24927c478bd9Sstevel@tonic-gate 
24937c478bd9Sstevel@tonic-gate /*
24947c478bd9Sstevel@tonic-gate  * Try failover from phyint 'pi' to a suitable destination.
24957c478bd9Sstevel@tonic-gate  */
24967c478bd9Sstevel@tonic-gate int
24977c478bd9Sstevel@tonic-gate try_failover(struct phyint *pi, int failover_type)
24987c478bd9Sstevel@tonic-gate {
24997c478bd9Sstevel@tonic-gate 	struct phyint *dst;
25007c478bd9Sstevel@tonic-gate 	int err;
25017c478bd9Sstevel@tonic-gate 
25027c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER)
25037c478bd9Sstevel@tonic-gate 		logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
25047c478bd9Sstevel@tonic-gate 
25057c478bd9Sstevel@tonic-gate 	/*
25067c478bd9Sstevel@tonic-gate 	 * Attempt to find a failover destination 'dst'.
25077c478bd9Sstevel@tonic-gate 	 * dst will be null if any of the following is true
25087c478bd9Sstevel@tonic-gate 	 * Phyint is not part of a group  OR
25097c478bd9Sstevel@tonic-gate 	 * Phyint is the only member of a group OR
25107c478bd9Sstevel@tonic-gate 	 * No suitable failover dst was available
25117c478bd9Sstevel@tonic-gate 	 */
25127c478bd9Sstevel@tonic-gate 	dst = get_failover_dst(pi, failover_type);
25137c478bd9Sstevel@tonic-gate 	if (dst == NULL)
25147c478bd9Sstevel@tonic-gate 		return (IPMP_EMINRED);
25157c478bd9Sstevel@tonic-gate 
25167c478bd9Sstevel@tonic-gate 	dst->pi_empty = 0;			/* Per state diagram */
25177c478bd9Sstevel@tonic-gate 	pi->pi_full = 0;			/* Per state diagram */
25187c478bd9Sstevel@tonic-gate 
25197c478bd9Sstevel@tonic-gate 	err = failover(pi, dst);
25207c478bd9Sstevel@tonic-gate 
25217c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
25227c478bd9Sstevel@tonic-gate 		logdebug("failed over from %s to %s ret %d\n",
25237c478bd9Sstevel@tonic-gate 		    pi->pi_name, dst->pi_name, err);
25247c478bd9Sstevel@tonic-gate 	}
25257c478bd9Sstevel@tonic-gate 	if (err == 0) {
25267c478bd9Sstevel@tonic-gate 		pi->pi_empty = 1;		/* Per state diagram */
25277c478bd9Sstevel@tonic-gate 		/*
25287c478bd9Sstevel@tonic-gate 		 * we don't want to print out this message if a
25297c478bd9Sstevel@tonic-gate 		 * phyint is leaving the group, nor for failover from
25307c478bd9Sstevel@tonic-gate 		 * standby
25317c478bd9Sstevel@tonic-gate 		 */
25327c478bd9Sstevel@tonic-gate 		if (failover_type == FAILOVER_NORMAL) {
25337c478bd9Sstevel@tonic-gate 			logerr("Successfully failed over from NIC %s to NIC "
25347c478bd9Sstevel@tonic-gate 			    "%s\n", pi->pi_name, dst->pi_name);
25357c478bd9Sstevel@tonic-gate 		}
25367c478bd9Sstevel@tonic-gate 		return (0);
25377c478bd9Sstevel@tonic-gate 	} else {
25387c478bd9Sstevel@tonic-gate 		/*
25397c478bd9Sstevel@tonic-gate 		 * The failover did not succeed. We must retry the failover
25407c478bd9Sstevel@tonic-gate 		 * only after resyncing our state based on the kernel's.
25417c478bd9Sstevel@tonic-gate 		 * For eg. either the src or the dst might have been unplumbed
25427c478bd9Sstevel@tonic-gate 		 * causing this failure. initifs() will be called again,
25437c478bd9Sstevel@tonic-gate 		 * from main, since full_scan_required has been set to true
25447c478bd9Sstevel@tonic-gate 		 * by failover();
25457c478bd9Sstevel@tonic-gate 		 */
25467c478bd9Sstevel@tonic-gate 		return (IPMP_FAILURE);
25477c478bd9Sstevel@tonic-gate 	}
25487c478bd9Sstevel@tonic-gate }
25497c478bd9Sstevel@tonic-gate 
/*
 * global_errno captures the errno value, if failover() or failback()
 * fails. This is sent to if_mpadm(1M).
 *
 * failover() stores errno here right after a failed SIOCLIFFAILOVER ioctl,
 * before the error is logged, and does not reset it when the ioctl
 * succeeds.
 */
int global_errno;
25557c478bd9Sstevel@tonic-gate 
25567c478bd9Sstevel@tonic-gate /*
25577c478bd9Sstevel@tonic-gate  * Attempt failover from phyint 'from' to phyint 'to'.
25587c478bd9Sstevel@tonic-gate  * IP moves everything from phyint 'from' to phyint 'to'.
25597c478bd9Sstevel@tonic-gate  */
25607c478bd9Sstevel@tonic-gate static int
25617c478bd9Sstevel@tonic-gate failover(struct phyint *from, struct phyint *to)
25627c478bd9Sstevel@tonic-gate {
25637c478bd9Sstevel@tonic-gate 	struct	lifreq	lifr;
25647c478bd9Sstevel@tonic-gate 	int 	ret;
25657c478bd9Sstevel@tonic-gate 
25667c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
25677c478bd9Sstevel@tonic-gate 		logdebug("failing over from %s to %s\n",
25687c478bd9Sstevel@tonic-gate 		    from->pi_name, to->pi_name);
25697c478bd9Sstevel@tonic-gate 	}
25707c478bd9Sstevel@tonic-gate 
25717c478bd9Sstevel@tonic-gate 	/*
25727c478bd9Sstevel@tonic-gate 	 * Perform the failover. Both IPv4 and IPv6 are failed over
25737c478bd9Sstevel@tonic-gate 	 * using a single ioctl by passing in AF_UNSPEC family.
25747c478bd9Sstevel@tonic-gate 	 */
25757c478bd9Sstevel@tonic-gate 	lifr.lifr_addr.ss_family = AF_UNSPEC;
25767c478bd9Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
25777c478bd9Sstevel@tonic-gate 	lifr.lifr_movetoindex = to->pi_ifindex;
25787c478bd9Sstevel@tonic-gate 
25797c478bd9Sstevel@tonic-gate 	ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
25807c478bd9Sstevel@tonic-gate 	if (ret < 0) {
25817c478bd9Sstevel@tonic-gate 		global_errno = errno;
25827c478bd9Sstevel@tonic-gate 		logperror("failover: ioctl (failover)");
25837c478bd9Sstevel@tonic-gate 	}
25847c478bd9Sstevel@tonic-gate 
25857c478bd9Sstevel@tonic-gate 	/*
25867c478bd9Sstevel@tonic-gate 	 * Set full_scan_required to true. This will make us read
25877c478bd9Sstevel@tonic-gate 	 * the state from the kernel in initifs() and update our tables,
25887c478bd9Sstevel@tonic-gate 	 * to reflect the current state after the failover. If the
25897c478bd9Sstevel@tonic-gate 	 * failover has failed it will then reissue the failover.
25907c478bd9Sstevel@tonic-gate 	 */
25917c478bd9Sstevel@tonic-gate 	full_scan_required = _B_TRUE;
25927c478bd9Sstevel@tonic-gate 	return (ret);
25937c478bd9Sstevel@tonic-gate }
25947c478bd9Sstevel@tonic-gate 
25957c478bd9Sstevel@tonic-gate /*
25967c478bd9Sstevel@tonic-gate  * phyint 'pi' has recovered. Attempt failback from every phyint in the same
25977c478bd9Sstevel@tonic-gate  * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
25987c478bd9Sstevel@tonic-gate  * Return values:
25997c478bd9Sstevel@tonic-gate  * IPMP_SUCCESS:		Failback successful from each of the other
26007c478bd9Sstevel@tonic-gate  *				phyints in the group.
26017c478bd9Sstevel@tonic-gate  * IPMP_EFBPARTIAL: 		Failback successful from some of the other
26027c478bd9Sstevel@tonic-gate  *				phyints in the group.
26037c478bd9Sstevel@tonic-gate  * IPMP_FAILURE:		Failback syscall failed with some error.
26047c478bd9Sstevel@tonic-gate  *
26057c478bd9Sstevel@tonic-gate  * Note that failback is attempted regardless of the setting of the
26067c478bd9Sstevel@tonic-gate  * failback_enabled flag.
26077c478bd9Sstevel@tonic-gate  */
26087c478bd9Sstevel@tonic-gate int
26097c478bd9Sstevel@tonic-gate do_failback(struct phyint *pi, boolean_t check_only)
26107c478bd9Sstevel@tonic-gate {
26117c478bd9Sstevel@tonic-gate 	struct  phyint *from;
26127c478bd9Sstevel@tonic-gate 	boolean_t done;
26137c478bd9Sstevel@tonic-gate 	boolean_t partial;
26147c478bd9Sstevel@tonic-gate 	boolean_t attempted_failback = _B_FALSE;
26157c478bd9Sstevel@tonic-gate 
26167c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER)
26177c478bd9Sstevel@tonic-gate 		logdebug("do_failback(%s)\n", pi->pi_name);
26187c478bd9Sstevel@tonic-gate 
26197c478bd9Sstevel@tonic-gate 	/* If this phyint is not part of a named group, return. */
26207c478bd9Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup) {
26217c478bd9Sstevel@tonic-gate 		pi->pi_full = 1;
26227c478bd9Sstevel@tonic-gate 		return (IPMP_SUCCESS);
26237c478bd9Sstevel@tonic-gate 	}
26247c478bd9Sstevel@tonic-gate 
26257c478bd9Sstevel@tonic-gate 	/*
26267c478bd9Sstevel@tonic-gate 	 * Attempt failback from every phyint in the group to 'pi'.
26277c478bd9Sstevel@tonic-gate 	 * The reason for doing this, instead of only from the
26287c478bd9Sstevel@tonic-gate 	 * phyint to which we did the failover is given below.
26297c478bd9Sstevel@tonic-gate 	 *
26307c478bd9Sstevel@tonic-gate 	 * After 'pi' failed, if any app. tries to join on a multicast
26317c478bd9Sstevel@tonic-gate 	 * address (IPv6), on the failed phyint, IP picks any arbitrary
26327c478bd9Sstevel@tonic-gate 	 * non-failed phyint in the group, instead of the failed phyint,
26337c478bd9Sstevel@tonic-gate 	 * in.mpathd is not aware of this. Thus failing back only from the
26347c478bd9Sstevel@tonic-gate 	 * interface to which 'pi' failed over, will failback the ipif's
26357c478bd9Sstevel@tonic-gate 	 * but not the ilm's. So we need to failback from all members of
26367c478bd9Sstevel@tonic-gate 	 * the phyint group
26377c478bd9Sstevel@tonic-gate 	 */
26387c478bd9Sstevel@tonic-gate 	done = _B_TRUE;
26397c478bd9Sstevel@tonic-gate 	partial = _B_FALSE;
26407c478bd9Sstevel@tonic-gate 	for (from = pi->pi_group->pg_phyint; from != NULL;
26417c478bd9Sstevel@tonic-gate 	    from = from->pi_pgnext) {
26427c478bd9Sstevel@tonic-gate 		/* Exclude ourself as a failback src */
26437c478bd9Sstevel@tonic-gate 		if (from == pi)
26447c478bd9Sstevel@tonic-gate 			continue;
26457c478bd9Sstevel@tonic-gate 
26467c478bd9Sstevel@tonic-gate 		/*
26477c478bd9Sstevel@tonic-gate 		 * If the 'from' phyint has IPv4 plumbed, the 'to'
26487c478bd9Sstevel@tonic-gate 		 * phyint must also have IPv4 plumbed. Similar check
26497c478bd9Sstevel@tonic-gate 		 * for IPv6. IP makes the same check. Otherwise the
26507c478bd9Sstevel@tonic-gate 		 * failback will fail.
26517c478bd9Sstevel@tonic-gate 		 */
26527c478bd9Sstevel@tonic-gate 		if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
26537c478bd9Sstevel@tonic-gate 		    (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
26547c478bd9Sstevel@tonic-gate 			partial = _B_TRUE;
26557c478bd9Sstevel@tonic-gate 			continue;
26567c478bd9Sstevel@tonic-gate 		}
26577c478bd9Sstevel@tonic-gate 
26587c478bd9Sstevel@tonic-gate 		if (!check_only) {
26597c478bd9Sstevel@tonic-gate 			pi->pi_empty = 0;	/* Per state diagram */
26607c478bd9Sstevel@tonic-gate 			attempted_failback = _B_TRUE;
26617c478bd9Sstevel@tonic-gate 			if (failback(from, pi) != 0) {
26627c478bd9Sstevel@tonic-gate 				done = _B_FALSE;
26637c478bd9Sstevel@tonic-gate 				break;
26647c478bd9Sstevel@tonic-gate 			}
26657c478bd9Sstevel@tonic-gate 		}
26667c478bd9Sstevel@tonic-gate 	}
26677c478bd9Sstevel@tonic-gate 
26687c478bd9Sstevel@tonic-gate 	if (check_only) {
26697c478bd9Sstevel@tonic-gate 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
26707c478bd9Sstevel@tonic-gate 	}
26717c478bd9Sstevel@tonic-gate 
26727c478bd9Sstevel@tonic-gate 	/*
26737c478bd9Sstevel@tonic-gate 	 * We are done. No more phyint from which we can src the failback
26747c478bd9Sstevel@tonic-gate 	 */
26757c478bd9Sstevel@tonic-gate 	if (done) {
26767c478bd9Sstevel@tonic-gate 		if (!partial)
26777c478bd9Sstevel@tonic-gate 			pi->pi_full = 1;	/* Per state diagram */
26787c478bd9Sstevel@tonic-gate 		/*
26797c478bd9Sstevel@tonic-gate 		 * Don't print out a message unless there is a
26807c478bd9Sstevel@tonic-gate 		 * transition from FAILED to RUNNING. For eg.
26817c478bd9Sstevel@tonic-gate 		 * we don't want to print out this message if a
26827c478bd9Sstevel@tonic-gate 		 * phyint is leaving the group, or at startup
26837c478bd9Sstevel@tonic-gate 		 */
26847c478bd9Sstevel@tonic-gate 		if (attempted_failback && (pi->pi_flags &
26857c478bd9Sstevel@tonic-gate 		    (IFF_FAILED | IFF_OFFLINE))) {
26867c478bd9Sstevel@tonic-gate 			logerr("Successfully failed back to NIC %s\n",
26877c478bd9Sstevel@tonic-gate 			    pi->pi_name);
26887c478bd9Sstevel@tonic-gate 		}
26897c478bd9Sstevel@tonic-gate 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
26907c478bd9Sstevel@tonic-gate 	}
26917c478bd9Sstevel@tonic-gate 
26927c478bd9Sstevel@tonic-gate 	return (IPMP_FAILURE);
26937c478bd9Sstevel@tonic-gate }
26947c478bd9Sstevel@tonic-gate 
26957c478bd9Sstevel@tonic-gate /*
26967c478bd9Sstevel@tonic-gate  * This function is similar to do_failback() above, but respects the
26977c478bd9Sstevel@tonic-gate  * failback_enabled flag for phyints in named groups.
26987c478bd9Sstevel@tonic-gate  */
26997c478bd9Sstevel@tonic-gate int
27007c478bd9Sstevel@tonic-gate try_failback(struct phyint *pi, boolean_t check_only)
27017c478bd9Sstevel@tonic-gate {
27027c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER)
27037c478bd9Sstevel@tonic-gate 		logdebug("try_failback(%s)\n", pi->pi_name);
27047c478bd9Sstevel@tonic-gate 
27057c478bd9Sstevel@tonic-gate 	if (pi->pi_group != phyint_anongroup && !failback_enabled)
27067c478bd9Sstevel@tonic-gate 		return (IPMP_EFBDISABLED);
27077c478bd9Sstevel@tonic-gate 
27087c478bd9Sstevel@tonic-gate 	return (do_failback(pi, check_only));
27097c478bd9Sstevel@tonic-gate }
27107c478bd9Sstevel@tonic-gate 
27117c478bd9Sstevel@tonic-gate /*
27127c478bd9Sstevel@tonic-gate  * Failback everything from phyint 'from' that has the same ifindex
27137c478bd9Sstevel@tonic-gate  * as phyint to's ifindex.
27147c478bd9Sstevel@tonic-gate  */
27157c478bd9Sstevel@tonic-gate static int
27167c478bd9Sstevel@tonic-gate failback(struct phyint *from, struct phyint *to)
27177c478bd9Sstevel@tonic-gate {
27187c478bd9Sstevel@tonic-gate 	struct lifreq lifr;
27197c478bd9Sstevel@tonic-gate 	int ret;
27207c478bd9Sstevel@tonic-gate 
27217c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER)
27227c478bd9Sstevel@tonic-gate 		logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
27237c478bd9Sstevel@tonic-gate 
27247c478bd9Sstevel@tonic-gate 	lifr.lifr_addr.ss_family = AF_UNSPEC;
27257c478bd9Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
27267c478bd9Sstevel@tonic-gate 	lifr.lifr_movetoindex = to->pi_ifindex;
27277c478bd9Sstevel@tonic-gate 
27287c478bd9Sstevel@tonic-gate 	ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
27297c478bd9Sstevel@tonic-gate 	if (ret < 0) {
27307c478bd9Sstevel@tonic-gate 		global_errno = errno;
27317c478bd9Sstevel@tonic-gate 		logperror("failback: ioctl (failback)");
27327c478bd9Sstevel@tonic-gate 	}
27337c478bd9Sstevel@tonic-gate 
27347c478bd9Sstevel@tonic-gate 	/*
27357c478bd9Sstevel@tonic-gate 	 * Set full_scan_required to true. This will make us read
27367c478bd9Sstevel@tonic-gate 	 * the state from the kernel in initifs() and update our tables,
27377c478bd9Sstevel@tonic-gate 	 * to reflect the current state after the failback. If the
27387c478bd9Sstevel@tonic-gate 	 * failback has failed it will then reissue the failback.
27397c478bd9Sstevel@tonic-gate 	 */
27407c478bd9Sstevel@tonic-gate 	full_scan_required = _B_TRUE;
27417c478bd9Sstevel@tonic-gate 
27427c478bd9Sstevel@tonic-gate 	return (ret);
27437c478bd9Sstevel@tonic-gate }
27447c478bd9Sstevel@tonic-gate 
27457c478bd9Sstevel@tonic-gate /*
27467c478bd9Sstevel@tonic-gate  * Select a target phyint for failing over from 'pi'.
27477c478bd9Sstevel@tonic-gate  * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
27487c478bd9Sstevel@tonic-gate  * target phyint is chosen as follows,
27497c478bd9Sstevel@tonic-gate  *	1. Pick any inactive standby interface.
27507c478bd9Sstevel@tonic-gate  *	2. If no inactive standby is available, select any phyint in the
27517c478bd9Sstevel@tonic-gate  *	   same group that has the least number of logints, (excluding
27527c478bd9Sstevel@tonic-gate  *	   IFF_NOFAILOVER and !IFF_UP logints)
27537c478bd9Sstevel@tonic-gate  * If we are failing over from a standby, failover_type is
27547c478bd9Sstevel@tonic-gate  * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
27557c478bd9Sstevel@tonic-gate  * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
27567c478bd9Sstevel@tonic-gate  * and we won't return NULL, as long as there is at least 1 other phyint
27577c478bd9Sstevel@tonic-gate  * in the group.
27587c478bd9Sstevel@tonic-gate  */
27597c478bd9Sstevel@tonic-gate static struct phyint *
27607c478bd9Sstevel@tonic-gate get_failover_dst(struct phyint *pi, int failover_type)
27617c478bd9Sstevel@tonic-gate {
27627c478bd9Sstevel@tonic-gate 	struct phyint	*maybe = NULL;
27637c478bd9Sstevel@tonic-gate 	struct phyint	*pi2;
27647c478bd9Sstevel@tonic-gate 	struct phyint 	*last_choice = NULL;
27657c478bd9Sstevel@tonic-gate 
27667c478bd9Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup)
27677c478bd9Sstevel@tonic-gate 		return (NULL);
27687c478bd9Sstevel@tonic-gate 
27697c478bd9Sstevel@tonic-gate 	/*
27707c478bd9Sstevel@tonic-gate 	 * Loop thru the phyints in the group, and pick the preferred
27717c478bd9Sstevel@tonic-gate 	 * phyint for the target.
27727c478bd9Sstevel@tonic-gate 	 */
27737c478bd9Sstevel@tonic-gate 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
27747c478bd9Sstevel@tonic-gate 		/* Exclude ourself and offlined interfaces */
27757c478bd9Sstevel@tonic-gate 		if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
27767c478bd9Sstevel@tonic-gate 			continue;
27777c478bd9Sstevel@tonic-gate 
27787c478bd9Sstevel@tonic-gate 		/*
27797c478bd9Sstevel@tonic-gate 		 * The chosen target phyint must have IPv4 instance
27807c478bd9Sstevel@tonic-gate 		 * plumbed, if the src phyint has IPv4 plumbed. Similarly
27817c478bd9Sstevel@tonic-gate 		 * for IPv6.
27827c478bd9Sstevel@tonic-gate 		 */
27837c478bd9Sstevel@tonic-gate 		if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
27847c478bd9Sstevel@tonic-gate 		    (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
27857c478bd9Sstevel@tonic-gate 			continue;
27867c478bd9Sstevel@tonic-gate 
27877c478bd9Sstevel@tonic-gate 		/* The chosen target must be PI_RUNNING. */
27887c478bd9Sstevel@tonic-gate 		if (pi2->pi_state != PI_RUNNING) {
27897c478bd9Sstevel@tonic-gate 			last_choice = pi2;
27907c478bd9Sstevel@tonic-gate 			continue;
27917c478bd9Sstevel@tonic-gate 		}
27927c478bd9Sstevel@tonic-gate 
2793*49df4566Sethindra 		if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) &&
27947c478bd9Sstevel@tonic-gate 		    (failover_type != FAILOVER_TO_NONSTANDBY)) {
27957c478bd9Sstevel@tonic-gate 			return (pi2);
27967c478bd9Sstevel@tonic-gate 		} else {
27977c478bd9Sstevel@tonic-gate 			if (maybe == NULL)
27987c478bd9Sstevel@tonic-gate 				maybe = pi2;
27997c478bd9Sstevel@tonic-gate 			else if (logint_upcount(pi2) < logint_upcount(maybe))
28007c478bd9Sstevel@tonic-gate 				maybe = pi2;
28017c478bd9Sstevel@tonic-gate 		}
28027c478bd9Sstevel@tonic-gate 	}
28037c478bd9Sstevel@tonic-gate 	if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
28047c478bd9Sstevel@tonic-gate 		return (last_choice);
28057c478bd9Sstevel@tonic-gate 	else
28067c478bd9Sstevel@tonic-gate 		return (maybe);
28077c478bd9Sstevel@tonic-gate }
28087c478bd9Sstevel@tonic-gate 
28097c478bd9Sstevel@tonic-gate /*
28107c478bd9Sstevel@tonic-gate  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
28117c478bd9Sstevel@tonic-gate  */
28127c478bd9Sstevel@tonic-gate boolean_t
28137c478bd9Sstevel@tonic-gate change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
28147c478bd9Sstevel@tonic-gate {
28157c478bd9Sstevel@tonic-gate 	int ifsock;
28167c478bd9Sstevel@tonic-gate 	struct lifreq lifr;
28177c478bd9Sstevel@tonic-gate 
28187c478bd9Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
28197c478bd9Sstevel@tonic-gate 		logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
28207c478bd9Sstevel@tonic-gate 		    pi->pi_name, flags, (int)setfl);
28217c478bd9Sstevel@tonic-gate 	}
28227c478bd9Sstevel@tonic-gate 
28237c478bd9Sstevel@tonic-gate 	if (pi->pi_v4 != NULL) {
28247c478bd9Sstevel@tonic-gate 		ifsock = ifsock_v4;
28257c478bd9Sstevel@tonic-gate 	} else  {
28267c478bd9Sstevel@tonic-gate 		ifsock = ifsock_v6;
28277c478bd9Sstevel@tonic-gate 	}
28287c478bd9Sstevel@tonic-gate 
28297c478bd9Sstevel@tonic-gate 	/*
28307c478bd9Sstevel@tonic-gate 	 * Get the current flags from the kernel, and set/clear the
28317c478bd9Sstevel@tonic-gate 	 * desired phyint flags. Since we set only phyint flags, we can
28327c478bd9Sstevel@tonic-gate 	 * do it on either IPv4 or IPv6 instance.
28337c478bd9Sstevel@tonic-gate 	 */
28347c478bd9Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
28357c478bd9Sstevel@tonic-gate 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
28367c478bd9Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
28377c478bd9Sstevel@tonic-gate 		if (errno != ENXIO)
28387c478bd9Sstevel@tonic-gate 			logperror("change_lif_flags: ioctl (get flags)");
28397c478bd9Sstevel@tonic-gate 		return (_B_FALSE);
28407c478bd9Sstevel@tonic-gate 	}
28417c478bd9Sstevel@tonic-gate 	if (setfl)
28427c478bd9Sstevel@tonic-gate 		lifr.lifr_flags |= flags;
28437c478bd9Sstevel@tonic-gate 	else
28447c478bd9Sstevel@tonic-gate 		lifr.lifr_flags &= ~flags;
28457c478bd9Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
28467c478bd9Sstevel@tonic-gate 		if (errno != ENXIO)
28477c478bd9Sstevel@tonic-gate 			logperror("change_lif_flags: ioctl (set flags)");
28487c478bd9Sstevel@tonic-gate 		return (_B_FALSE);
28497c478bd9Sstevel@tonic-gate 	}
28507c478bd9Sstevel@tonic-gate 
28517c478bd9Sstevel@tonic-gate 	/*
28527c478bd9Sstevel@tonic-gate 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
28537c478bd9Sstevel@tonic-gate 	 * phyint flags.
28547c478bd9Sstevel@tonic-gate 	 */
28557c478bd9Sstevel@tonic-gate 	if (setfl)
28567c478bd9Sstevel@tonic-gate 		pi->pi_flags |= flags;
28577c478bd9Sstevel@tonic-gate 	else
28587c478bd9Sstevel@tonic-gate 		pi->pi_flags &= ~flags;
28597c478bd9Sstevel@tonic-gate 
28607c478bd9Sstevel@tonic-gate 	if (pi->pi_v4)
28617c478bd9Sstevel@tonic-gate 		pi->pi_v4->pii_flags = pi->pi_flags;
28627c478bd9Sstevel@tonic-gate 
28637c478bd9Sstevel@tonic-gate 	if (pi->pi_v6)
28647c478bd9Sstevel@tonic-gate 		pi->pi_v6->pii_flags = pi->pi_flags;
28657c478bd9Sstevel@tonic-gate 
28667c478bd9Sstevel@tonic-gate 	return (_B_TRUE);
28677c478bd9Sstevel@tonic-gate }
28687c478bd9Sstevel@tonic-gate 
/*
 * icmp cksum computation for IPv4.
 *
 * Sums the 'len' bytes at 'addr' as native 16-bit words in a 32-bit
 * accumulator, folds the carries back into the low 16 bits, and returns
 * the one's complement of the result truncated to 16 bits.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	ushort_t	*wp = addr;
	ushort_t	tail = 0;
	ushort_t	cksum;
	int		acc = 0;
	int		remaining;

	/* Add up all the complete 16-bit words. */
	for (remaining = len; remaining > 1; remaining -= 2)
		acc += *wp++;

	/*
	 * A trailing odd byte is placed in the first byte of an otherwise
	 * zero 16-bit word and added in.
	 */
	if (remaining == 1) {
		*(uchar_t *)&tail = *(uchar_t *)wp;
		acc += tail;
	}

	/* Fold the carry-outs of the top 16 bits into the low 16 bits. */
	acc = (acc >> 16) + (acc & 0xffff);	/* add hi 16 to low 16 */
	acc += (acc >> 16);			/* add carry */
	cksum = ~acc;				/* truncate to 16 bits */
	return (cksum);
}
29067c478bd9Sstevel@tonic-gate 
29077c478bd9Sstevel@tonic-gate static void
29087c478bd9Sstevel@tonic-gate reset_snxt_basetimes(void)
29097c478bd9Sstevel@tonic-gate {
29107c478bd9Sstevel@tonic-gate 	struct phyint_instance *pii;
29117c478bd9Sstevel@tonic-gate 
29127c478bd9Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
29137c478bd9Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
29147c478bd9Sstevel@tonic-gate 	}
29157c478bd9Sstevel@tonic-gate }
29167c478bd9Sstevel@tonic-gate 
29177c478bd9Sstevel@tonic-gate /*
29187c478bd9Sstevel@tonic-gate  * Is the address one of our own addresses? Unfortunately,
29197c478bd9Sstevel@tonic-gate  * we cannot check our phyint tables to determine if the address
29207c478bd9Sstevel@tonic-gate  * is our own. This is because, we don't track interfaces that
29217c478bd9Sstevel@tonic-gate  * are not part of any group. We have to either use a 'bind' or
29227c478bd9Sstevel@tonic-gate  * get the complete list of all interfaces using SIOCGLIFCONF,
29237c478bd9Sstevel@tonic-gate  * to do this check. We choose to use 'bind'. We could use
29247c478bd9Sstevel@tonic-gate  * SIOCTMYADDR, but bind is preferred, since it is stronger.
29257c478bd9Sstevel@tonic-gate  * SIOCTMYADDR excludes down interfaces, while bind includes even
29267c478bd9Sstevel@tonic-gate  * down interfaces.
29277c478bd9Sstevel@tonic-gate  */
29287c478bd9Sstevel@tonic-gate boolean_t
29297c478bd9Sstevel@tonic-gate own_address(int af, struct in6_addr addr)
29307c478bd9Sstevel@tonic-gate {
29317c478bd9Sstevel@tonic-gate 	int sock;
29327c478bd9Sstevel@tonic-gate 	boolean_t ours = _B_TRUE;
29337c478bd9Sstevel@tonic-gate 
29347c478bd9Sstevel@tonic-gate 	sock = socket(AF_INET6, SOCK_DGRAM, 0);
29357c478bd9Sstevel@tonic-gate 	if (sock  == -1) {
29367c478bd9Sstevel@tonic-gate 		logperror("own_address: socket");
29377c478bd9Sstevel@tonic-gate 		/*
29387c478bd9Sstevel@tonic-gate 		 * If the socket call fails, err on the side of caution,
29397c478bd9Sstevel@tonic-gate 		 * and return true.
29407c478bd9Sstevel@tonic-gate 		 */
29417c478bd9Sstevel@tonic-gate 	} else {
29427c478bd9Sstevel@tonic-gate 		struct sockaddr_in6 sin6;
29437c478bd9Sstevel@tonic-gate 
29447c478bd9Sstevel@tonic-gate 		(void) memset(&sin6, 0, sizeof (struct sockaddr_in6));
29457c478bd9Sstevel@tonic-gate 		sin6.sin6_family = AF_INET6;
29467c478bd9Sstevel@tonic-gate 		sin6.sin6_addr = addr;
29477c478bd9Sstevel@tonic-gate 		/*
29487c478bd9Sstevel@tonic-gate 		 * If the bind succeeds, then this address is one of our
29497c478bd9Sstevel@tonic-gate 		 * addresses.
29507c478bd9Sstevel@tonic-gate 		 * If bind returns error EADDRNOTAVAIL, the address is
29517c478bd9Sstevel@tonic-gate 		 * not one of ours.
29527c478bd9Sstevel@tonic-gate 		 * If bind returns an error other than EADDRNOTAVAIL, err
29537c478bd9Sstevel@tonic-gate 		 * on the side of caution and report the address as one of
29547c478bd9Sstevel@tonic-gate 		 * our own.
29557c478bd9Sstevel@tonic-gate 		 */
29567c478bd9Sstevel@tonic-gate 		if (bind(sock, (struct sockaddr *)&sin6,
29577c478bd9Sstevel@tonic-gate 		    sizeof (struct sockaddr_in6)) == -1) {
29587c478bd9Sstevel@tonic-gate 			if (errno == EADDRNOTAVAIL)
29597c478bd9Sstevel@tonic-gate 				ours = _B_FALSE;
29607c478bd9Sstevel@tonic-gate 			else
29617c478bd9Sstevel@tonic-gate 				logperror("own_address: bind");
29627c478bd9Sstevel@tonic-gate 		}
29637c478bd9Sstevel@tonic-gate 		(void) close(sock);
29647c478bd9Sstevel@tonic-gate 	}
29657c478bd9Sstevel@tonic-gate 	if (debug & D_TARGET) {
29667c478bd9Sstevel@tonic-gate 		char abuf[INET6_ADDRSTRLEN];
29677c478bd9Sstevel@tonic-gate 
29687c478bd9Sstevel@tonic-gate 		logdebug("own_address: addr %s is %s ours\n",
29697c478bd9Sstevel@tonic-gate 		    pr_addr(af, addr, abuf, sizeof (abuf)),
29707c478bd9Sstevel@tonic-gate 		    ours ? "one of" : "not");
29717c478bd9Sstevel@tonic-gate 	}
29727c478bd9Sstevel@tonic-gate 	return (ours);
29737c478bd9Sstevel@tonic-gate }
2974