17c478bd9Sstevel@tonic-gate /*
2e11c3f44Smeem  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
37c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
47c478bd9Sstevel@tonic-gate  */
57c478bd9Sstevel@tonic-gate 
67c478bd9Sstevel@tonic-gate /*
77c478bd9Sstevel@tonic-gate  * Copyright (c) 1987 Regents of the University of California.
87c478bd9Sstevel@tonic-gate  * All rights reserved.
97c478bd9Sstevel@tonic-gate  *
107c478bd9Sstevel@tonic-gate  * Redistribution and use in source and binary forms are permitted
117c478bd9Sstevel@tonic-gate  * provided that the above copyright notice and this paragraph are
127c478bd9Sstevel@tonic-gate  * duplicated in all such forms and that any documentation,
137c478bd9Sstevel@tonic-gate  * advertising materials, and other materials related to such
147c478bd9Sstevel@tonic-gate  * distribution and use acknowledge that the software was developed
157c478bd9Sstevel@tonic-gate  * by the University of California, Berkeley. The name of the
167c478bd9Sstevel@tonic-gate  * University may not be used to endorse or promote products derived
177c478bd9Sstevel@tonic-gate  * from this software without specific prior written permission.
187c478bd9Sstevel@tonic-gate  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
197c478bd9Sstevel@tonic-gate  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
207c478bd9Sstevel@tonic-gate  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
217c478bd9Sstevel@tonic-gate  */
227c478bd9Sstevel@tonic-gate 
237c478bd9Sstevel@tonic-gate #include "mpd_defs.h"
247c478bd9Sstevel@tonic-gate #include "mpd_tables.h"
257c478bd9Sstevel@tonic-gate 
/*
 * Probe types for probe().  The selected value is carried in the
 * pr_icmp_mtype field of every probe packet.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */
347c478bd9Sstevel@tonic-gate 
/*
 * Format of probe / probe response packets. This is an ICMP Echo request
 * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6.
 *
 * probe() stores pr_icmp_seq, pr_icmp_timestamp and pr_icmp_mtype in
 * network byte order (htons/htonll/htonl).  The field order defines the
 * on-the-wire layout and must not be changed.  Note that on ABIs where
 * uint64_t is 8-byte aligned the structure ends with padding after
 * pr_icmp_mtype; senders that transmit sizeof (struct pr_icmp) bytes should
 * zero the structure first.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* type field */
	uint8_t  pr_icmp_code;		/* code field */
	uint16_t pr_icmp_cksum;		/* checksum field */
	uint16_t pr_icmp_id;		/* Identification */
	uint16_t pr_icmp_seq;		/* sequence number */
	uint64_t pr_icmp_timestamp;	/* Time stamp (in ns) */
	uint32_t pr_icmp_mtype;		/* Message type */
};
497c478bd9Sstevel@tonic-gate 
507c478bd9Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
517c478bd9Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
527c478bd9Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
537c478bd9Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x1 } };
547c478bd9Sstevel@tonic-gate 
557c478bd9Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
567c478bd9Sstevel@tonic-gate 
577c478bd9Sstevel@tonic-gate static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
587c478bd9Sstevel@tonic-gate 
59e11c3f44Smeem static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
60e11c3f44Smeem     int cmsg_type);
61e11c3f44Smeem static void		pi_set_crtt(struct target *tg, int64_t m,
627c478bd9Sstevel@tonic-gate     boolean_t is_probe_uni);
637c478bd9Sstevel@tonic-gate static void		incoming_echo_reply(struct phyint_instance *pii,
64e11c3f44Smeem     struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
657c478bd9Sstevel@tonic-gate static void		incoming_rtt_reply(struct phyint_instance *pii,
667c478bd9Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
677c478bd9Sstevel@tonic-gate static void		incoming_mcast_reply(struct phyint_instance *pii,
687c478bd9Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
697c478bd9Sstevel@tonic-gate 
707c478bd9Sstevel@tonic-gate static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
717c478bd9Sstevel@tonic-gate static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
727c478bd9Sstevel@tonic-gate static boolean_t	check_exception_target(struct phyint_instance *pii,
737c478bd9Sstevel@tonic-gate     struct target *target);
747c478bd9Sstevel@tonic-gate static void		probe_fail_info(struct phyint_instance *pii,
757c478bd9Sstevel@tonic-gate     struct target *cur_tg, struct probe_fail_count *pfinfo);
767c478bd9Sstevel@tonic-gate static void		probe_success_info(struct phyint_instance *pii,
777c478bd9Sstevel@tonic-gate     struct target *cur_tg, struct probe_success_count *psinfo);
787c478bd9Sstevel@tonic-gate static boolean_t	phyint_repaired(struct phyint *pi);
797c478bd9Sstevel@tonic-gate 
807c478bd9Sstevel@tonic-gate static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
817c478bd9Sstevel@tonic-gate static int 		in_cksum(ushort_t *addr, int len);
827c478bd9Sstevel@tonic-gate static void		reset_snxt_basetimes(void);
83e11c3f44Smeem static int		ns2ms(int64_t ns);
84e11c3f44Smeem static int64_t		tv2ns(struct timeval *);
857c478bd9Sstevel@tonic-gate 
867c478bd9Sstevel@tonic-gate /*
877c478bd9Sstevel@tonic-gate  * CRTT - Conservative Round Trip Time Estimate
887c478bd9Sstevel@tonic-gate  * Probe success - A matching probe reply received before CRTT ms has elapsed
897c478bd9Sstevel@tonic-gate  *	after sending the probe.
907c478bd9Sstevel@tonic-gate  * Probe failure - No probe reply received and more than CRTT ms has elapsed
917c478bd9Sstevel@tonic-gate  *	after sending the probe.
927c478bd9Sstevel@tonic-gate  *
937c478bd9Sstevel@tonic-gate  * TLS - Time last success. Most recent probe ack received at this time.
947c478bd9Sstevel@tonic-gate  * TFF - Time first fail. The time of the earliest probe failure in
957c478bd9Sstevel@tonic-gate  *	a consecutive series of probe failures.
967c478bd9Sstevel@tonic-gate  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
977c478bd9Sstevel@tonic-gate  * 	before declaring phyint repair.
987c478bd9Sstevel@tonic-gate  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
997c478bd9Sstevel@tonic-gate  *	declare a phyint failure.
1007c478bd9Sstevel@tonic-gate  *
1017c478bd9Sstevel@tonic-gate  * 			Phyint state diagram
1027c478bd9Sstevel@tonic-gate  *
1037c478bd9Sstevel@tonic-gate  * The state of a phyint that is capable of being probed, is completely
104e11c3f44Smeem  * specified by the 3-tuple <pi_state, pg_state, I>.
1057c478bd9Sstevel@tonic-gate  *
106fcdc8680Smeem  * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
107fcdc8680Smeem  * IFF_OFFLINE is set.  If the phyint is also configured with a test address
108fcdc8680Smeem  * (the common case) and probe targets, then a phyint must also successfully
109fcdc8680Smeem  * be able to send and receive probes in order to remain in the PI_RUNNING
110fcdc8680Smeem  * state (otherwise, it transitions to PI_FAILED).
1117c478bd9Sstevel@tonic-gate  *
1127c478bd9Sstevel@tonic-gate  * Further, if a PI_RUNNING phyint is configured with a test address but is
1137c478bd9Sstevel@tonic-gate  * unable to find any probe targets, it will transition to the PI_NOTARGETS
1147c478bd9Sstevel@tonic-gate  * state, which indicates that the link is apparently functional but that
1157c478bd9Sstevel@tonic-gate  * in.mpathd is unable to send probes to verify functionality (in this case,
1167c478bd9Sstevel@tonic-gate  * in.mpathd makes the optimistic assumption that the interface is working
117e11c3f44Smeem  * correctly and thus does not mark the interface FAILED, but reports it as
118e11c3f44Smeem  * IPMP_IF_UNKNOWN through the async events and query interfaces).
1197c478bd9Sstevel@tonic-gate  *
1207c478bd9Sstevel@tonic-gate  * At any point, a phyint may be administratively marked offline via if_mpadm.
1217c478bd9Sstevel@tonic-gate  * In this case, the interface always transitions to PI_OFFLINE, regardless
1227c478bd9Sstevel@tonic-gate  * of its previous state.  When the interface is later brought back online,
1237c478bd9Sstevel@tonic-gate  * in.mpathd acts as if the interface is new (and thus it transitions to
1247c478bd9Sstevel@tonic-gate  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
1257c478bd9Sstevel@tonic-gate  * its probes, if probes are sent).
1267c478bd9Sstevel@tonic-gate  *
1277c478bd9Sstevel@tonic-gate  * pi_state -  PI_RUNNING or PI_FAILED
1287c478bd9Sstevel@tonic-gate  *	PI_RUNNING: The failure detection logic says the phyint is good.
1297c478bd9Sstevel@tonic-gate  *	PI_FAILED: The failure detection logic says the phyint has failed.
1307c478bd9Sstevel@tonic-gate  *
131e11c3f44Smeem  * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
132e11c3f44Smeem  *	PG_OK: All interfaces in the group are OK.
133e11c3f44Smeem  *	PG_DEGRADED: Some interfaces in the group are unusable.
134e11c3f44Smeem  *	PG_FAILED: All interfaces in the group are unusable.
135e11c3f44Smeem  *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all-hosts address, to reconstruct
 *	the target list. So the phyints are in the PI_NOTARGETS state.
1417c478bd9Sstevel@tonic-gate  *
1427c478bd9Sstevel@tonic-gate  * I -	value of (pi_flags & IFF_INACTIVE)
143e11c3f44Smeem  *	IFF_INACTIVE: This phyint will not send or receive packets.
144e11c3f44Smeem  *	Usually, inactive is tied to standby interfaces that are not yet
145e11c3f44Smeem  *	needed (e.g., no non-standby interfaces in the group have failed).
146e11c3f44Smeem  *	When failback has been disabled (FAILBACK=no configured), phyint can
147e11c3f44Smeem  *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
148e11c3f44Smeem  *	subsequently recovers after a failure.
1497c478bd9Sstevel@tonic-gate  *
150e11c3f44Smeem  * Not all 9 possible combinations of the above 3-tuple are possible.
1517c478bd9Sstevel@tonic-gate  *
152e11c3f44Smeem  * I is tracked by IP. pi_state is tracked by mpathd.
1537c478bd9Sstevel@tonic-gate  *
1547c478bd9Sstevel@tonic-gate  *			pi_state state machine
1557c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1567c478bd9Sstevel@tonic-gate  *	Event			State			New State
1577c478bd9Sstevel@tonic-gate  *				Action:
1587c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
159e11c3f44Smeem  *	IP interface failure	(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
1607c478bd9Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
1617c478bd9Sstevel@tonic-gate  *
162e11c3f44Smeem  *	IP interface failure	(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
1637c478bd9Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
1647c478bd9Sstevel@tonic-gate  *
165e11c3f44Smeem  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=yes)
16649df4566Sethindra  *	detection				     -> (PI_RUNNING, I == 0)
1677c478bd9Sstevel@tonic-gate  *				: clear IFF_FAILED on this phyint
1687c478bd9Sstevel@tonic-gate  *
169e11c3f44Smeem  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=no)
17049df4566Sethindra  *	detection				     ->	(PI_RUNNING, I == 1)
17149df4566Sethindra  *				: clear IFF_FAILED on this phyint
17249df4566Sethindra  *				: if failback is disabled set I == 1
1737c478bd9Sstevel@tonic-gate  *
1747c478bd9Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
1757c478bd9Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_FAILED
1767c478bd9Sstevel@tonic-gate  *	(Router targets)	: set IFF_FAILED
1777c478bd9Sstevel@tonic-gate  *
1787c478bd9Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
1797c478bd9Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_NOTARGETS
1807c478bd9Sstevel@tonic-gate  *	(Host targets)		: set IFF_FAILED
1817c478bd9Sstevel@tonic-gate  *				: delete the target list on all phyints
1827c478bd9Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1837c478bd9Sstevel@tonic-gate  */
1847c478bd9Sstevel@tonic-gate 
1857c478bd9Sstevel@tonic-gate struct probes_missed probes_missed;
1867c478bd9Sstevel@tonic-gate 
1877c478bd9Sstevel@tonic-gate /*
1887c478bd9Sstevel@tonic-gate  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
1897c478bd9Sstevel@tonic-gate  * will be added on by the kernel.  The id field identifies this phyint.
1907c478bd9Sstevel@tonic-gate  * and the sequence number is an increasing (modulo 2^^16) integer. The data
1917c478bd9Sstevel@tonic-gate  * portion holds the time value when the packet is sent. On echo this is
1927c478bd9Sstevel@tonic-gate  * extracted to compute the round-trip time. Three different types of
1937c478bd9Sstevel@tonic-gate  * probe packets are used.
1947c478bd9Sstevel@tonic-gate  *
1957c478bd9Sstevel@tonic-gate  * PROBE_UNI: This type is used to do failure detection / failure recovery
1967c478bd9Sstevel@tonic-gate  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
1977c478bd9Sstevel@tonic-gate  *	not less than the current CRTT. pii_probes[] stores data
1987c478bd9Sstevel@tonic-gate  *	about these probes. These packets consume sequence number space.
1997c478bd9Sstevel@tonic-gate  *
200e11c3f44Smeem  * PROBE_RTT: This type is used to make only rtt measurements. Normally these
2017c478bd9Sstevel@tonic-gate  * 	are not used. Under heavy network load, the rtt may go up very high,
2027c478bd9Sstevel@tonic-gate  *	due to a spike, or may appear to go high, due to extreme scheduling
2037c478bd9Sstevel@tonic-gate  * 	delays. Once the network stress is removed, mpathd takes long time to
2047c478bd9Sstevel@tonic-gate  *	recover, because the probe_interval is already high, and it takes
2057c478bd9Sstevel@tonic-gate  *	a long time to send out sufficient number of probes to bring down the
2067c478bd9Sstevel@tonic-gate  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
2077c478bd9Sstevel@tonic-gate  *	user_probe_interval ms. and will cause only rtt updates. These packets
2087c478bd9Sstevel@tonic-gate  *	do not consume sequence number space nor is information about these
2097c478bd9Sstevel@tonic-gate  *	packets stored in the pii_probes[]
2107c478bd9Sstevel@tonic-gate  *
2117c478bd9Sstevel@tonic-gate  * PROBE_MULTI: This type is only used to construct a list of targets, when
2127c478bd9Sstevel@tonic-gate  *	no targets are known. The packet is multicast to the all hosts addr.
2137c478bd9Sstevel@tonic-gate  */
2147c478bd9Sstevel@tonic-gate static void
probe(struct phyint_instance * pii,uint_t probe_type,hrtime_t start_hrtime)215e11c3f44Smeem probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
2167c478bd9Sstevel@tonic-gate {
217e11c3f44Smeem 	hrtime_t sent_hrtime;
218e11c3f44Smeem 	struct timeval sent_tv;
2197c478bd9Sstevel@tonic-gate 	struct pr_icmp probe_pkt;	/* Probe packet */
220e11c3f44Smeem 	struct sockaddr_storage targ;	/* target address */
221e11c3f44Smeem 	uint_t	targaddrlen;		/* targed address length */
2227c478bd9Sstevel@tonic-gate 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
223b6bc5f8fSGeorge Shepherd 	boolean_t sent = _B_FALSE;
224b6bc5f8fSGeorge Shepherd 	int	rval;
2257c478bd9Sstevel@tonic-gate 
2267c478bd9Sstevel@tonic-gate 	if (debug & D_TARGET) {
227e11c3f44Smeem 		logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
228e11c3f44Smeem 		    pii->pii_name, probe_type, start_hrtime);
2297c478bd9Sstevel@tonic-gate 	}
2307c478bd9Sstevel@tonic-gate 
2317c478bd9Sstevel@tonic-gate 	assert(pii->pii_probe_sock != -1);
2327c478bd9Sstevel@tonic-gate 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
2337c478bd9Sstevel@tonic-gate 	    probe_type == PROBE_RTT);
2347c478bd9Sstevel@tonic-gate 
2357c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
2367c478bd9Sstevel@tonic-gate 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
2377c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_code = 0;
2387c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_cksum = 0;
2397c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
2407c478bd9Sstevel@tonic-gate 
2417c478bd9Sstevel@tonic-gate 	/*
2427c478bd9Sstevel@tonic-gate 	 * Since there is no need to do arithmetic on the icmpid,
2437c478bd9Sstevel@tonic-gate 	 * (only equality check is done) pii_icmpid is stored in
2447c478bd9Sstevel@tonic-gate 	 * network byte order at initialization itself.
2457c478bd9Sstevel@tonic-gate 	 */
2467c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
247e11c3f44Smeem 	probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
2487c478bd9Sstevel@tonic-gate 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
2497c478bd9Sstevel@tonic-gate 
2507c478bd9Sstevel@tonic-gate 	/*
2517c478bd9Sstevel@tonic-gate 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
2527c478bd9Sstevel@tonic-gate 	 * the all hosts address. Otherwise it is unicast to the next target.
2537c478bd9Sstevel@tonic-gate 	 */
2547c478bd9Sstevel@tonic-gate 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
2557c478bd9Sstevel@tonic-gate 	    pii->pii_rtt_target_next != NULL));
2567c478bd9Sstevel@tonic-gate 
257e11c3f44Smeem 	bzero(&targ, sizeof (targ));
258e11c3f44Smeem 	targ.ss_family = pii->pii_af;
259e11c3f44Smeem 
2607c478bd9Sstevel@tonic-gate 	if (pii->pii_af == AF_INET6) {
261e11c3f44Smeem 		struct in6_addr *addr6;
262e11c3f44Smeem 
263e11c3f44Smeem 		addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
264e11c3f44Smeem 		targaddrlen = sizeof (struct sockaddr_in6);
2657c478bd9Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
266e11c3f44Smeem 			*addr6 = all_nodes_mcast_v6;
2677c478bd9Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
268e11c3f44Smeem 			*addr6 = pii->pii_target_next->tg_address;
269e11c3f44Smeem 		} else { /* type is PROBE_RTT */
270e11c3f44Smeem 			*addr6 = pii->pii_rtt_target_next->tg_address;
2717c478bd9Sstevel@tonic-gate 		}
2727c478bd9Sstevel@tonic-gate 	} else {
273e11c3f44Smeem 		struct in_addr *addr4;
274e11c3f44Smeem 
275e11c3f44Smeem 		addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
276e11c3f44Smeem 		targaddrlen = sizeof (struct sockaddr_in);
2777c478bd9Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
278e11c3f44Smeem 			*addr4 = all_nodes_mcast_v4;
2797c478bd9Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
2807c478bd9Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
281e11c3f44Smeem 			    &pii->pii_target_next->tg_address, addr4);
282e11c3f44Smeem 		} else { /* type is PROBE_RTT */
2837c478bd9Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
284e11c3f44Smeem 			    &pii->pii_rtt_target_next->tg_address, addr4);
2857c478bd9Sstevel@tonic-gate 		}
2867c478bd9Sstevel@tonic-gate 
2877c478bd9Sstevel@tonic-gate 		/*
2887c478bd9Sstevel@tonic-gate 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
2897c478bd9Sstevel@tonic-gate 		 */
2907c478bd9Sstevel@tonic-gate 		probe_pkt.pr_icmp_cksum =
2917c478bd9Sstevel@tonic-gate 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
292e11c3f44Smeem 	}
293e11c3f44Smeem 
294e11c3f44Smeem 	/*
295e11c3f44Smeem 	 * Use the current time as the time we sent.  Not atomic, but the best
296e11c3f44Smeem 	 * we can do from here.
297e11c3f44Smeem 	 */
298e11c3f44Smeem 	sent_hrtime = gethrtime();
299e11c3f44Smeem 	(void) gettimeofday(&sent_tv, NULL);
300b6bc5f8fSGeorge Shepherd 	rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
301b6bc5f8fSGeorge Shepherd 	    (struct sockaddr *)&targ, targaddrlen);
302b6bc5f8fSGeorge Shepherd 	/*
303b6bc5f8fSGeorge Shepherd 	 * If the send would block, this may either be transient or a hang in a
304b6bc5f8fSGeorge Shepherd 	 * lower layer. We pretend the probe was actually sent, the daemon will
305b6bc5f8fSGeorge Shepherd 	 * not see a reply to the probe and will fail the interface if normal
306b6bc5f8fSGeorge Shepherd 	 * failure detection criteria are met.
307b6bc5f8fSGeorge Shepherd 	 */
308b6bc5f8fSGeorge Shepherd 	if (rval == sizeof (probe_pkt) ||
309b6bc5f8fSGeorge Shepherd 	    (rval == -1 && errno == EWOULDBLOCK)) {
310b6bc5f8fSGeorge Shepherd 		sent = _B_TRUE;
311b6bc5f8fSGeorge Shepherd 	} else {
312e11c3f44Smeem 		logperror_pii(pii, "probe: probe sendto");
3137c478bd9Sstevel@tonic-gate 	}
3147c478bd9Sstevel@tonic-gate 
3157c478bd9Sstevel@tonic-gate 	/*
3167c478bd9Sstevel@tonic-gate 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
3177c478bd9Sstevel@tonic-gate 	 * update our tables. We will need this info in processing the probe
3187c478bd9Sstevel@tonic-gate 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
3197c478bd9Sstevel@tonic-gate 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
3207c478bd9Sstevel@tonic-gate 	 * are only used to construct a list of targets. PROBE_RTT packets are
3217c478bd9Sstevel@tonic-gate 	 * used only for updating the rtt and not for failure detection.
3227c478bd9Sstevel@tonic-gate 	 */
3237c478bd9Sstevel@tonic-gate 	if (probe_type == PROBE_UNI && sent) {
3247c478bd9Sstevel@tonic-gate 		pr_ndx = pii->pii_probe_next;
3257c478bd9Sstevel@tonic-gate 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
3267c478bd9Sstevel@tonic-gate 
3277c478bd9Sstevel@tonic-gate 		/* Collect statistics, before we reuse the last slot. */
3287c478bd9Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
3297c478bd9Sstevel@tonic-gate 			pii->pii_cum_stats.lost++;
3307c478bd9Sstevel@tonic-gate 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
3317c478bd9Sstevel@tonic-gate 			pii->pii_cum_stats.acked++;
3327c478bd9Sstevel@tonic-gate 		pii->pii_cum_stats.sent++;
3337c478bd9Sstevel@tonic-gate 
334e11c3f44Smeem 		pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
335e11c3f44Smeem 		pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
336e11c3f44Smeem 		pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
337e11c3f44Smeem 		pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
3387c478bd9Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
339e11c3f44Smeem 		probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
340e11c3f44Smeem 
3417c478bd9Sstevel@tonic-gate 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
3427c478bd9Sstevel@tonic-gate 		pii->pii_target_next = target_next(pii->pii_target_next);
3437c478bd9Sstevel@tonic-gate 		assert(pii->pii_target_next != NULL);
3447c478bd9Sstevel@tonic-gate 		/*
3457c478bd9Sstevel@tonic-gate 		 * If we have a single variable to denote the next target to
3467c478bd9Sstevel@tonic-gate 		 * probe for both rtt probes and failure detection probes, we
3477c478bd9Sstevel@tonic-gate 		 * could end up with a situation where the failure detection
3487c478bd9Sstevel@tonic-gate 		 * probe targets become disjoint from the rtt probe targets.
3497c478bd9Sstevel@tonic-gate 		 * Eg. if 2 targets and the actual fdt is double the user
3507c478bd9Sstevel@tonic-gate 		 * specified fdt. So we have 2 variables. In this scheme
3517c478bd9Sstevel@tonic-gate 		 * we also reset pii_rtt_target_next for every fdt probe,
3527c478bd9Sstevel@tonic-gate 		 * though that may not be necessary.
3537c478bd9Sstevel@tonic-gate 		 */
3547c478bd9Sstevel@tonic-gate 		pii->pii_rtt_target_next = pii->pii_target_next;
3557c478bd9Sstevel@tonic-gate 		pii->pii_snxt++;
3567c478bd9Sstevel@tonic-gate 	} else if (probe_type == PROBE_RTT) {
3577c478bd9Sstevel@tonic-gate 		pii->pii_rtt_target_next =
3587c478bd9Sstevel@tonic-gate 		    target_next(pii->pii_rtt_target_next);
3597c478bd9Sstevel@tonic-gate 		assert(pii->pii_rtt_target_next != NULL);
3607c478bd9Sstevel@tonic-gate 	}
3617c478bd9Sstevel@tonic-gate }
3627c478bd9Sstevel@tonic-gate 
3637c478bd9Sstevel@tonic-gate /*
3647c478bd9Sstevel@tonic-gate  * Incoming IPv4 data from wire, is received here. Called from main.
3657c478bd9Sstevel@tonic-gate  */
3667c478bd9Sstevel@tonic-gate void
in_data(struct phyint_instance * pii)3677c478bd9Sstevel@tonic-gate in_data(struct phyint_instance *pii)
3687c478bd9Sstevel@tonic-gate {
3697c478bd9Sstevel@tonic-gate 	struct	sockaddr_in 	from;
3707c478bd9Sstevel@tonic-gate 	struct	in6_addr	fromaddr;
371e11c3f44Smeem 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
372e11c3f44Smeem 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
3737c478bd9Sstevel@tonic-gate 	struct ip *ip;
3747c478bd9Sstevel@tonic-gate 	int 	iphlen;
3757c478bd9Sstevel@tonic-gate 	int 	len;
3767c478bd9Sstevel@tonic-gate 	char 	abuf[INET_ADDRSTRLEN];
377e11c3f44Smeem 	struct msghdr msg;
378e11c3f44Smeem 	struct iovec iov;
379e11c3f44Smeem 	struct pr_icmp *reply;
380e11c3f44Smeem 	struct timeval *recv_tvp;
3817c478bd9Sstevel@tonic-gate 
3827c478bd9Sstevel@tonic-gate 	if (debug & D_PROBE) {
3837c478bd9Sstevel@tonic-gate 		logdebug("in_data(%s %s)\n",
3847c478bd9Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
3857c478bd9Sstevel@tonic-gate 	}
3867c478bd9Sstevel@tonic-gate 
387e11c3f44Smeem 	iov.iov_base = (char *)in_packet;
388e11c3f44Smeem 	iov.iov_len = sizeof (in_packet);
389