17c478bd9Sstevel@tonic-gate /*
2e11c3f44Smeem * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
37c478bd9Sstevel@tonic-gate * Use is subject to license terms.
47c478bd9Sstevel@tonic-gate */
57c478bd9Sstevel@tonic-gate
67c478bd9Sstevel@tonic-gate /*
77c478bd9Sstevel@tonic-gate * Copyright (c) 1987 Regents of the University of California.
87c478bd9Sstevel@tonic-gate * All rights reserved.
97c478bd9Sstevel@tonic-gate *
107c478bd9Sstevel@tonic-gate * Redistribution and use in source and binary forms are permitted
117c478bd9Sstevel@tonic-gate * provided that the above copyright notice and this paragraph are
127c478bd9Sstevel@tonic-gate * duplicated in all such forms and that any documentation,
137c478bd9Sstevel@tonic-gate * advertising materials, and other materials related to such
147c478bd9Sstevel@tonic-gate * distribution and use acknowledge that the software was developed
157c478bd9Sstevel@tonic-gate * by the University of California, Berkeley. The name of the
167c478bd9Sstevel@tonic-gate * University may not be used to endorse or promote products derived
177c478bd9Sstevel@tonic-gate * from this software without specific prior written permission.
187c478bd9Sstevel@tonic-gate * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
197c478bd9Sstevel@tonic-gate * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
207c478bd9Sstevel@tonic-gate * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
217c478bd9Sstevel@tonic-gate */
227c478bd9Sstevel@tonic-gate
237c478bd9Sstevel@tonic-gate #include "mpd_defs.h"
247c478bd9Sstevel@tonic-gate #include "mpd_tables.h"
257c478bd9Sstevel@tonic-gate
/*
 * Probe types accepted by probe().  The numeric values are fixed; do not
 * renumber them.
 */
#define	PROBE_UNI	0x1234	/* unicast probe packet */
#define	PROBE_MULTI	0x5678	/* multicast probe packet */
#define	PROBE_RTT	0x9abc	/* probe packet used for RTT only */

/* Milliseconds per minute, expressed in terms of MILLISEC. */
#define	MSEC_PERMIN	(60 * MILLISEC)
347c478bd9Sstevel@tonic-gate
/*
 * Layout of a probe and of a probe response.  Both are ICMP Echo messages
 * (request and reply respectively), and the same layout is used for IPv4
 * and IPv6.  Member order and types define the packet format and must not
 * be changed.
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* ICMP type */
	uint8_t		pr_icmp_code;		/* ICMP code */
	uint16_t	pr_icmp_cksum;		/* ICMP checksum */
	uint16_t	pr_icmp_id;		/* identification */
	uint16_t	pr_icmp_seq;		/* sequence number */
	uint64_t	pr_icmp_timestamp;	/* time stamp (in ns) */
	uint32_t	pr_icmp_mtype;		/* message (probe) type */
};
497c478bd9Sstevel@tonic-gate
507c478bd9Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
517c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0,
527c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0,
537c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x1 } };
547c478bd9Sstevel@tonic-gate
557c478bd9Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
567c478bd9Sstevel@tonic-gate
577c478bd9Sstevel@tonic-gate static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */
587c478bd9Sstevel@tonic-gate
59e11c3f44Smeem static void *find_ancillary(struct msghdr *msg, int cmsg_level,
60e11c3f44Smeem int cmsg_type);
61e11c3f44Smeem static void pi_set_crtt(struct target *tg, int64_t m,
627c478bd9Sstevel@tonic-gate boolean_t is_probe_uni);
637c478bd9Sstevel@tonic-gate static void incoming_echo_reply(struct phyint_instance *pii,
64e11c3f44Smeem struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
657c478bd9Sstevel@tonic-gate static void incoming_rtt_reply(struct phyint_instance *pii,
667c478bd9Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr);
677c478bd9Sstevel@tonic-gate static void incoming_mcast_reply(struct phyint_instance *pii,
687c478bd9Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr);
697c478bd9Sstevel@tonic-gate
707c478bd9Sstevel@tonic-gate static boolean_t check_pg_crtt_improved(struct phyint_group *pg);
717c478bd9Sstevel@tonic-gate static boolean_t check_pii_crtt_improved(struct phyint_instance *pii);
727c478bd9Sstevel@tonic-gate static boolean_t check_exception_target(struct phyint_instance *pii,
737c478bd9Sstevel@tonic-gate struct target *target);
747c478bd9Sstevel@tonic-gate static void probe_fail_info(struct phyint_instance *pii,
757c478bd9Sstevel@tonic-gate struct target *cur_tg, struct probe_fail_count *pfinfo);
767c478bd9Sstevel@tonic-gate static void probe_success_info(struct phyint_instance *pii,
777c478bd9Sstevel@tonic-gate struct target *cur_tg, struct probe_success_count *psinfo);
787c478bd9Sstevel@tonic-gate static boolean_t phyint_repaired(struct phyint *pi);
797c478bd9Sstevel@tonic-gate
807c478bd9Sstevel@tonic-gate static boolean_t highest_ack_tg(uint16_t seq, struct target *tg);
817c478bd9Sstevel@tonic-gate static int in_cksum(ushort_t *addr, int len);
827c478bd9Sstevel@tonic-gate static void reset_snxt_basetimes(void);
83e11c3f44Smeem static int ns2ms(int64_t ns);
84e11c3f44Smeem static int64_t tv2ns(struct timeval *);
857c478bd9Sstevel@tonic-gate
867c478bd9Sstevel@tonic-gate /*
877c478bd9Sstevel@tonic-gate * CRTT - Conservative Round Trip Time Estimate
887c478bd9Sstevel@tonic-gate * Probe success - A matching probe reply received before CRTT ms has elapsed
897c478bd9Sstevel@tonic-gate * after sending the probe.
907c478bd9Sstevel@tonic-gate * Probe failure - No probe reply received and more than CRTT ms has elapsed
917c478bd9Sstevel@tonic-gate * after sending the probe.
927c478bd9Sstevel@tonic-gate *
937c478bd9Sstevel@tonic-gate * TLS - Time last success. Most recent probe ack received at this time.
947c478bd9Sstevel@tonic-gate * TFF - Time first fail. The time of the earliest probe failure in
957c478bd9Sstevel@tonic-gate * a consecutive series of probe failures.
967c478bd9Sstevel@tonic-gate * NUM_PROBE_REPAIRS - Number of consecutive successful probes required
977c478bd9Sstevel@tonic-gate * before declaring phyint repair.
987c478bd9Sstevel@tonic-gate * NUM_PROBE_FAILS - Number of consecutive probe failures required to
997c478bd9Sstevel@tonic-gate * declare a phyint failure.
1007c478bd9Sstevel@tonic-gate *
1017c478bd9Sstevel@tonic-gate * Phyint state diagram
1027c478bd9Sstevel@tonic-gate *
1037c478bd9Sstevel@tonic-gate * The state of a phyint that is capable of being probed, is completely
104e11c3f44Smeem * specified by the 3-tuple <pi_state, pg_state, I>.
1057c478bd9Sstevel@tonic-gate *
106fcdc8680Smeem * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
107fcdc8680Smeem * IFF_OFFLINE is set. If the phyint is also configured with a test address
108fcdc8680Smeem * (the common case) and probe targets, then a phyint must also successfully
109fcdc8680Smeem * be able to send and receive probes in order to remain in the PI_RUNNING
110fcdc8680Smeem * state (otherwise, it transitions to PI_FAILED).
1117c478bd9Sstevel@tonic-gate *
1127c478bd9Sstevel@tonic-gate * Further, if a PI_RUNNING phyint is configured with a test address but is
1137c478bd9Sstevel@tonic-gate * unable to find any probe targets, it will transition to the PI_NOTARGETS
1147c478bd9Sstevel@tonic-gate * state, which indicates that the link is apparently functional but that
1157c478bd9Sstevel@tonic-gate * in.mpathd is unable to send probes to verify functionality (in this case,
1167c478bd9Sstevel@tonic-gate * in.mpathd makes the optimistic assumption that the interface is working
117e11c3f44Smeem * correctly and thus does not mark the interface FAILED, but reports it as
118e11c3f44Smeem * IPMP_IF_UNKNOWN through the async events and query interfaces).
1197c478bd9Sstevel@tonic-gate *
1207c478bd9Sstevel@tonic-gate * At any point, a phyint may be administratively marked offline via if_mpadm.
1217c478bd9Sstevel@tonic-gate * In this case, the interface always transitions to PI_OFFLINE, regardless
1227c478bd9Sstevel@tonic-gate * of its previous state. When the interface is later brought back online,
1237c478bd9Sstevel@tonic-gate * in.mpathd acts as if the interface is new (and thus it transitions to
1247c478bd9Sstevel@tonic-gate * PI_RUNNING or PI_FAILED based on the status of the link and the result of
1257c478bd9Sstevel@tonic-gate * its probes, if probes are sent).
1267c478bd9Sstevel@tonic-gate *
1277c478bd9Sstevel@tonic-gate * pi_state - PI_RUNNING or PI_FAILED
1287c478bd9Sstevel@tonic-gate * PI_RUNNING: The failure detection logic says the phyint is good.
1297c478bd9Sstevel@tonic-gate * PI_FAILED: The failure detection logic says the phyint has failed.
1307c478bd9Sstevel@tonic-gate *
131e11c3f44Smeem * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED.
132e11c3f44Smeem * PG_OK: All interfaces in the group are OK.
133e11c3f44Smeem * PG_DEGRADED: Some interfaces in the group are unusable.
134e11c3f44Smeem * PG_FAILED: All interfaces in the group are unusable.
135e11c3f44Smeem *
1367c478bd9Sstevel@tonic-gate * In the case of router targets, we assume that the current list of
1377c478bd9Sstevel@tonic-gate * targets obtained from the routing table, is still valid, so the
 * phyint state is PI_FAILED. In the case of host targets, we delete the
1397c478bd9Sstevel@tonic-gate * list of targets, and multicast to the all hosts, to reconstruct the
1407c478bd9Sstevel@tonic-gate * target list. So the phyints are in the PI_NOTARGETS state.
1417c478bd9Sstevel@tonic-gate *
1427c478bd9Sstevel@tonic-gate * I - value of (pi_flags & IFF_INACTIVE)
143e11c3f44Smeem * IFF_INACTIVE: This phyint will not send or receive packets.
144e11c3f44Smeem * Usually, inactive is tied to standby interfaces that are not yet
145e11c3f44Smeem * needed (e.g., no non-standby interfaces in the group have failed).
146e11c3f44Smeem * When failback has been disabled (FAILBACK=no configured), phyint can
147e11c3f44Smeem * also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
148e11c3f44Smeem * subsequently recovers after a failure.
1497c478bd9Sstevel@tonic-gate *
150e11c3f44Smeem * Not all 9 possible combinations of the above 3-tuple are possible.
1517c478bd9Sstevel@tonic-gate *
152e11c3f44Smeem * I is tracked by IP. pi_state is tracked by mpathd.
1537c478bd9Sstevel@tonic-gate *
1547c478bd9Sstevel@tonic-gate * pi_state state machine
1557c478bd9Sstevel@tonic-gate * ---------------------------------------------------------------------------
1567c478bd9Sstevel@tonic-gate * Event State New State
1577c478bd9Sstevel@tonic-gate * Action:
1587c478bd9Sstevel@tonic-gate * ---------------------------------------------------------------------------
159e11c3f44Smeem * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
1607c478bd9Sstevel@tonic-gate * detection : set IFF_FAILED on this phyint
1617c478bd9Sstevel@tonic-gate *
162e11c3f44Smeem * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
1637c478bd9Sstevel@tonic-gate * detection : set IFF_FAILED on this phyint
1647c478bd9Sstevel@tonic-gate *
165e11c3f44Smeem * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes)
16649df4566Sethindra * detection -> (PI_RUNNING, I == 0)
1677c478bd9Sstevel@tonic-gate * : clear IFF_FAILED on this phyint
1687c478bd9Sstevel@tonic-gate *
169e11c3f44Smeem * IP interface repair (PI_FAILED, I == 0, FAILBACK=no)
17049df4566Sethindra * detection -> (PI_RUNNING, I == 1)
17149df4566Sethindra * : clear IFF_FAILED on this phyint
17249df4566Sethindra * : if failback is disabled set I == 1
1737c478bd9Sstevel@tonic-gate *
1747c478bd9Sstevel@tonic-gate * Group failure (perform on all phyints in the group)
1757c478bd9Sstevel@tonic-gate * detection PI_RUNNING PI_FAILED
1767c478bd9Sstevel@tonic-gate * (Router targets) : set IFF_FAILED
1777c478bd9Sstevel@tonic-gate *
1787c478bd9Sstevel@tonic-gate * Group failure (perform on all phyints in the group)
1797c478bd9Sstevel@tonic-gate * detection PI_RUNNING PI_NOTARGETS
1807c478bd9Sstevel@tonic-gate * (Host targets) : set IFF_FAILED
1817c478bd9Sstevel@tonic-gate * : delete the target list on all phyints
1827c478bd9Sstevel@tonic-gate * ---------------------------------------------------------------------------
1837c478bd9Sstevel@tonic-gate */
1847c478bd9Sstevel@tonic-gate
1857c478bd9Sstevel@tonic-gate struct probes_missed probes_missed;
1867c478bd9Sstevel@tonic-gate
1877c478bd9Sstevel@tonic-gate /*
1887c478bd9Sstevel@tonic-gate * Compose and transmit an ICMP ECHO REQUEST packet. The IP header
1897c478bd9Sstevel@tonic-gate * will be added on by the kernel. The id field identifies this phyint.
1907c478bd9Sstevel@tonic-gate * and the sequence number is an increasing (modulo 2^^16) integer. The data
1917c478bd9Sstevel@tonic-gate * portion holds the time value when the packet is sent. On echo this is
1927c478bd9Sstevel@tonic-gate * extracted to compute the round-trip time. Three different types of
1937c478bd9Sstevel@tonic-gate * probe packets are used.
1947c478bd9Sstevel@tonic-gate *
1957c478bd9Sstevel@tonic-gate * PROBE_UNI: This type is used to do failure detection / failure recovery
1967c478bd9Sstevel@tonic-gate * and RTT calculation. PROBE_UNI probes are spaced apart in time,
1977c478bd9Sstevel@tonic-gate * not less than the current CRTT. pii_probes[] stores data
1987c478bd9Sstevel@tonic-gate * about these probes. These packets consume sequence number space.
1997c478bd9Sstevel@tonic-gate *
200e11c3f44Smeem * PROBE_RTT: This type is used to make only rtt measurements. Normally these
2017c478bd9Sstevel@tonic-gate * are not used. Under heavy network load, the rtt may go up very high,
2027c478bd9Sstevel@tonic-gate * due to a spike, or may appear to go high, due to extreme scheduling
2037c478bd9Sstevel@tonic-gate * delays. Once the network stress is removed, mpathd takes long time to
2047c478bd9Sstevel@tonic-gate * recover, because the probe_interval is already high, and it takes
2057c478bd9Sstevel@tonic-gate * a long time to send out sufficient number of probes to bring down the
2067c478bd9Sstevel@tonic-gate * rtt. To avoid this problem, PROBE_RTT probes are sent out every
2077c478bd9Sstevel@tonic-gate * user_probe_interval ms. and will cause only rtt updates. These packets
2087c478bd9Sstevel@tonic-gate * do not consume sequence number space nor is information about these
2097c478bd9Sstevel@tonic-gate * packets stored in the pii_probes[]
2107c478bd9Sstevel@tonic-gate *
2117c478bd9Sstevel@tonic-gate * PROBE_MULTI: This type is only used to construct a list of targets, when
2127c478bd9Sstevel@tonic-gate * no targets are known. The packet is multicast to the all hosts addr.
2137c478bd9Sstevel@tonic-gate */
2147c478bd9Sstevel@tonic-gate static void
probe(struct phyint_instance * pii,uint_t probe_type,hrtime_t start_hrtime)215e11c3f44Smeem probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
2167c478bd9Sstevel@tonic-gate {
217e11c3f44Smeem hrtime_t sent_hrtime;
218e11c3f44Smeem struct timeval sent_tv;
2197c478bd9Sstevel@tonic-gate struct pr_icmp probe_pkt; /* Probe packet */
220e11c3f44Smeem struct sockaddr_storage targ; /* target address */
221e11c3f44Smeem uint_t targaddrlen; /* targed address length */
2227c478bd9Sstevel@tonic-gate int pr_ndx; /* probe index in pii->pii_probes[] */
223b6bc5f8fSGeorge Shepherd boolean_t sent = _B_FALSE;
224b6bc5f8fSGeorge Shepherd int rval;
2257c478bd9Sstevel@tonic-gate
2267c478bd9Sstevel@tonic-gate if (debug & D_TARGET) {
227e11c3f44Smeem logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
228e11c3f44Smeem pii->pii_name, probe_type, start_hrtime);
2297c478bd9Sstevel@tonic-gate }
2307c478bd9Sstevel@tonic-gate
2317c478bd9Sstevel@tonic-gate assert(pii->pii_probe_sock != -1);
2327c478bd9Sstevel@tonic-gate assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
2337c478bd9Sstevel@tonic-gate probe_type == PROBE_RTT);
2347c478bd9Sstevel@tonic-gate
2357c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
2367c478bd9Sstevel@tonic-gate ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
2377c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_code = 0;
2387c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_cksum = 0;
2397c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
2407c478bd9Sstevel@tonic-gate
2417c478bd9Sstevel@tonic-gate /*
2427c478bd9Sstevel@tonic-gate * Since there is no need to do arithmetic on the icmpid,
2437c478bd9Sstevel@tonic-gate * (only equality check is done) pii_icmpid is stored in
2447c478bd9Sstevel@tonic-gate * network byte order at initialization itself.
2457c478bd9Sstevel@tonic-gate */
2467c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_id = pii->pii_icmpid;
247e11c3f44Smeem probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
2487c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_mtype = htonl(probe_type);
2497c478bd9Sstevel@tonic-gate
2507c478bd9Sstevel@tonic-gate /*
2517c478bd9Sstevel@tonic-gate * If probe_type is PROBE_MULTI, this packet will be multicast to
2527c478bd9Sstevel@tonic-gate * the all hosts address. Otherwise it is unicast to the next target.
2537c478bd9Sstevel@tonic-gate */
2547c478bd9Sstevel@tonic-gate assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
2557c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next != NULL));
2567c478bd9Sstevel@tonic-gate
257e11c3f44Smeem bzero(&targ, sizeof (targ));
258e11c3f44Smeem targ.ss_family = pii->pii_af;
259e11c3f44Smeem
2607c478bd9Sstevel@tonic-gate if (pii->pii_af == AF_INET6) {
261e11c3f44Smeem struct in6_addr *addr6;
262e11c3f44Smeem
263e11c3f44Smeem addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
264e11c3f44Smeem targaddrlen = sizeof (struct sockaddr_in6);
2657c478bd9Sstevel@tonic-gate if (probe_type == PROBE_MULTI) {
266e11c3f44Smeem *addr6 = all_nodes_mcast_v6;
2677c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) {
268e11c3f44Smeem *addr6 = pii->pii_target_next->tg_address;
269e11c3f44Smeem } else { /* type is PROBE_RTT */
270e11c3f44Smeem *addr6 = pii->pii_rtt_target_next->tg_address;
2717c478bd9Sstevel@tonic-gate }
2727c478bd9Sstevel@tonic-gate } else {
273e11c3f44Smeem struct in_addr *addr4;
274e11c3f44Smeem
275e11c3f44Smeem addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
276e11c3f44Smeem targaddrlen = sizeof (struct sockaddr_in);
2777c478bd9Sstevel@tonic-gate if (probe_type == PROBE_MULTI) {
278e11c3f44Smeem *addr4 = all_nodes_mcast_v4;
2797c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) {
2807c478bd9Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR(
281e11c3f44Smeem &pii->pii_target_next->tg_address, addr4);
282e11c3f44Smeem } else { /* type is PROBE_RTT */
2837c478bd9Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR(
284e11c3f44Smeem &pii->pii_rtt_target_next->tg_address, addr4);
2857c478bd9Sstevel@tonic-gate }
2867c478bd9Sstevel@tonic-gate
2877c478bd9Sstevel@tonic-gate /*
2887c478bd9Sstevel@tonic-gate * Compute the IPv4 icmp checksum. Does not cover the IP header.
2897c478bd9Sstevel@tonic-gate */
2907c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_cksum =
2917c478bd9Sstevel@tonic-gate in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
292e11c3f44Smeem }
293e11c3f44Smeem
294e11c3f44Smeem /*
295e11c3f44Smeem * Use the current time as the time we sent. Not atomic, but the best
296e11c3f44Smeem * we can do from here.
297e11c3f44Smeem */
298e11c3f44Smeem sent_hrtime = gethrtime();
299e11c3f44Smeem (void) gettimeofday(&sent_tv, NULL);
300b6bc5f8fSGeorge Shepherd rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
301b6bc5f8fSGeorge Shepherd (struct sockaddr *)&targ, targaddrlen);
302b6bc5f8fSGeorge Shepherd /*
303b6bc5f8fSGeorge Shepherd * If the send would block, this may either be transient or a hang in a
304b6bc5f8fSGeorge Shepherd * lower layer. We pretend the probe was actually sent, the daemon will
305b6bc5f8fSGeorge Shepherd * not see a reply to the probe and will fail the interface if normal
306b6bc5f8fSGeorge Shepherd * failure detection criteria are met.
307b6bc5f8fSGeorge Shepherd */
308b6bc5f8fSGeorge Shepherd if (rval == sizeof (probe_pkt) ||
309b6bc5f8fSGeorge Shepherd (rval == -1 && errno == EWOULDBLOCK)) {
310b6bc5f8fSGeorge Shepherd sent = _B_TRUE;
311b6bc5f8fSGeorge Shepherd } else {
312e11c3f44Smeem logperror_pii(pii, "probe: probe sendto");
3137c478bd9Sstevel@tonic-gate }
3147c478bd9Sstevel@tonic-gate
3157c478bd9Sstevel@tonic-gate /*
3167c478bd9Sstevel@tonic-gate * If this is a PROBE_UNI probe packet being unicast to a target, then
3177c478bd9Sstevel@tonic-gate * update our tables. We will need this info in processing the probe
3187c478bd9Sstevel@tonic-gate * response. PROBE_MULTI and PROBE_RTT packets are not used for
3197c478bd9Sstevel@tonic-gate * the purpose of failure or recovery detection. PROBE_MULTI packets
3207c478bd9Sstevel@tonic-gate * are only used to construct a list of targets. PROBE_RTT packets are
3217c478bd9Sstevel@tonic-gate * used only for updating the rtt and not for failure detection.
3227c478bd9Sstevel@tonic-gate */
3237c478bd9Sstevel@tonic-gate if (probe_type == PROBE_UNI && sent) {
3247c478bd9Sstevel@tonic-gate pr_ndx = pii->pii_probe_next;
3257c478bd9Sstevel@tonic-gate assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
3267c478bd9Sstevel@tonic-gate
3277c478bd9Sstevel@tonic-gate /* Collect statistics, before we reuse the last slot. */
3287c478bd9Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
3297c478bd9Sstevel@tonic-gate pii->pii_cum_stats.lost++;
3307c478bd9Sstevel@tonic-gate else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
3317c478bd9Sstevel@tonic-gate pii->pii_cum_stats.acked++;
3327c478bd9Sstevel@tonic-gate pii->pii_cum_stats.sent++;
3337c478bd9Sstevel@tonic-gate
334e11c3f44Smeem pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
335e11c3f44Smeem pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
336e11c3f44Smeem pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
337e11c3f44Smeem pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
3387c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
339e11c3f44Smeem probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
340e11c3f44Smeem
3417c478bd9Sstevel@tonic-gate pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
3427c478bd9Sstevel@tonic-gate pii->pii_target_next = target_next(pii->pii_target_next);
3437c478bd9Sstevel@tonic-gate assert(pii->pii_target_next != NULL);
3447c478bd9Sstevel@tonic-gate /*
3457c478bd9Sstevel@tonic-gate * If we have a single variable to denote the next target to
3467c478bd9Sstevel@tonic-gate * probe for both rtt probes and failure detection probes, we
3477c478bd9Sstevel@tonic-gate * could end up with a situation where the failure detection
3487c478bd9Sstevel@tonic-gate * probe targets become disjoint from the rtt probe targets.
3497c478bd9Sstevel@tonic-gate * Eg. if 2 targets and the actual fdt is double the user
3507c478bd9Sstevel@tonic-gate * specified fdt. So we have 2 variables. In this scheme
3517c478bd9Sstevel@tonic-gate * we also reset pii_rtt_target_next for every fdt probe,
3527c478bd9Sstevel@tonic-gate * though that may not be necessary.
3537c478bd9Sstevel@tonic-gate */
3547c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next = pii->pii_target_next;
3557c478bd9Sstevel@tonic-gate pii->pii_snxt++;
3567c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_RTT) {
3577c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next =
3587c478bd9Sstevel@tonic-gate target_next(pii->pii_rtt_target_next);
3597c478bd9Sstevel@tonic-gate assert(pii->pii_rtt_target_next != NULL);
3607c478bd9Sstevel@tonic-gate }
3617c478bd9Sstevel@tonic-gate }
3627c478bd9Sstevel@tonic-gate
3637c478bd9Sstevel@tonic-gate /*
3647c478bd9Sstevel@tonic-gate * Incoming IPv4 data from wire, is received here. Called from main.
3657c478bd9Sstevel@tonic-gate */
3667c478bd9Sstevel@tonic-gate void
in_data(struct phyint_instance * pii)3677c478bd9Sstevel@tonic-gate in_data(struct phyint_instance *pii)
3687c478bd9Sstevel@tonic-gate {
3697c478bd9Sstevel@tonic-gate struct sockaddr_in from;
3707c478bd9Sstevel@tonic-gate struct in6_addr fromaddr;
371e11c3f44Smeem static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
372e11c3f44Smeem static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
3737c478bd9Sstevel@tonic-gate struct ip *ip;
3747c478bd9Sstevel@tonic-gate int iphlen;
3757c478bd9Sstevel@tonic-gate int len;
3767c478bd9Sstevel@tonic-gate char abuf[INET_ADDRSTRLEN];
377e11c3f44Smeem struct msghdr msg;
378e11c3f44Smeem struct iovec iov;
379e11c3f44Smeem struct pr_icmp *reply;
380e11c3f44Smeem struct timeval *recv_tvp;
3817c478bd9Sstevel@tonic-gate
3827c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
3837c478bd9Sstevel@tonic-gate logdebug("in_data(%s %s)\n",
3847c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name);
3857c478bd9Sstevel@tonic-gate }
3867c478bd9Sstevel@tonic-gate
387e11c3f44Smeem iov.iov_base = (char *)in_packet;
388e11c3f44Smeem iov.iov_len = sizeof (in_packet);
389