17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 37c478bd9Sstevel@tonic-gate * Use is subject to license terms. 47c478bd9Sstevel@tonic-gate */ 57c478bd9Sstevel@tonic-gate 67c478bd9Sstevel@tonic-gate /* 77c478bd9Sstevel@tonic-gate * Copyright (c) 1987 Regents of the University of California. 87c478bd9Sstevel@tonic-gate * All rights reserved. 97c478bd9Sstevel@tonic-gate * 107c478bd9Sstevel@tonic-gate * Redistribution and use in source and binary forms are permitted 117c478bd9Sstevel@tonic-gate * provided that the above copyright notice and this paragraph are 127c478bd9Sstevel@tonic-gate * duplicated in all such forms and that any documentation, 137c478bd9Sstevel@tonic-gate * advertising materials, and other materials related to such 147c478bd9Sstevel@tonic-gate * distribution and use acknowledge that the software was developed 157c478bd9Sstevel@tonic-gate * by the University of California, Berkeley. The name of the 167c478bd9Sstevel@tonic-gate * University may not be used to endorse or promote products derived 177c478bd9Sstevel@tonic-gate * from this software without specific prior written permission. 187c478bd9Sstevel@tonic-gate * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 197c478bd9Sstevel@tonic-gate * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 207c478bd9Sstevel@tonic-gate * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
217c478bd9Sstevel@tonic-gate */ 227c478bd9Sstevel@tonic-gate 237c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 247c478bd9Sstevel@tonic-gate 257c478bd9Sstevel@tonic-gate #include "mpd_defs.h" 267c478bd9Sstevel@tonic-gate #include "mpd_tables.h" 277c478bd9Sstevel@tonic-gate 287c478bd9Sstevel@tonic-gate /* 297c478bd9Sstevel@tonic-gate * Probe types for probe() 307c478bd9Sstevel@tonic-gate */ 317c478bd9Sstevel@tonic-gate #define PROBE_UNI 0x1234 /* Unicast probe packet */ 327c478bd9Sstevel@tonic-gate #define PROBE_MULTI 0x5678 /* Multicast probe packet */ 337c478bd9Sstevel@tonic-gate #define PROBE_RTT 0x9abc /* RTT only probe packet */ 347c478bd9Sstevel@tonic-gate 357c478bd9Sstevel@tonic-gate #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ 367c478bd9Sstevel@tonic-gate 377c478bd9Sstevel@tonic-gate /* 387c478bd9Sstevel@tonic-gate * Format of probe / probe response packets. This is an ICMP Echo request 397c478bd9Sstevel@tonic-gate * or ICMP Echo reply. 
Packet format is same for both IPv4 and IPv6 407c478bd9Sstevel@tonic-gate */ 417c478bd9Sstevel@tonic-gate struct pr_icmp 427c478bd9Sstevel@tonic-gate { 437c478bd9Sstevel@tonic-gate uint8_t pr_icmp_type; /* type field */ 447c478bd9Sstevel@tonic-gate uint8_t pr_icmp_code; /* code field */ 457c478bd9Sstevel@tonic-gate uint16_t pr_icmp_cksum; /* checksum field */ 467c478bd9Sstevel@tonic-gate uint16_t pr_icmp_id; /* Identification */ 477c478bd9Sstevel@tonic-gate uint16_t pr_icmp_seq; /* sequence number */ 487c478bd9Sstevel@tonic-gate uint32_t pr_icmp_timestamp; /* Time stamp */ 497c478bd9Sstevel@tonic-gate uint32_t pr_icmp_mtype; /* Message type */ 507c478bd9Sstevel@tonic-gate }; 517c478bd9Sstevel@tonic-gate 527c478bd9Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0, 537c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0, 547c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0, 557c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x1 } }; 567c478bd9Sstevel@tonic-gate 577c478bd9Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; 587c478bd9Sstevel@tonic-gate 597c478bd9Sstevel@tonic-gate static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ 607c478bd9Sstevel@tonic-gate 617c478bd9Sstevel@tonic-gate static void *find_ancillary(struct msghdr *msg, int cmsg_type); 627c478bd9Sstevel@tonic-gate static void pi_set_crtt(struct target *tg, int m, 637c478bd9Sstevel@tonic-gate boolean_t is_probe_uni); 647c478bd9Sstevel@tonic-gate static void incoming_echo_reply(struct phyint_instance *pii, 657c478bd9Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 667c478bd9Sstevel@tonic-gate static void incoming_rtt_reply(struct phyint_instance *pii, 677c478bd9Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 687c478bd9Sstevel@tonic-gate static void incoming_mcast_reply(struct phyint_instance *pii, 697c478bd9Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 
707c478bd9Sstevel@tonic-gate 717c478bd9Sstevel@tonic-gate static boolean_t check_pg_crtt_improved(struct phyint_group *pg); 727c478bd9Sstevel@tonic-gate static boolean_t check_pii_crtt_improved(struct phyint_instance *pii); 737c478bd9Sstevel@tonic-gate static boolean_t check_exception_target(struct phyint_instance *pii, 747c478bd9Sstevel@tonic-gate struct target *target); 757c478bd9Sstevel@tonic-gate static void probe_fail_info(struct phyint_instance *pii, 767c478bd9Sstevel@tonic-gate struct target *cur_tg, struct probe_fail_count *pfinfo); 777c478bd9Sstevel@tonic-gate static void probe_success_info(struct phyint_instance *pii, 787c478bd9Sstevel@tonic-gate struct target *cur_tg, struct probe_success_count *psinfo); 797c478bd9Sstevel@tonic-gate static boolean_t phyint_repaired(struct phyint *pi); 807c478bd9Sstevel@tonic-gate 817c478bd9Sstevel@tonic-gate static int failover(struct phyint *from, struct phyint *to); 827c478bd9Sstevel@tonic-gate static int failback(struct phyint *from, struct phyint *to); 837c478bd9Sstevel@tonic-gate static struct phyint *get_failover_dst(struct phyint *pi, int failover_type); 847c478bd9Sstevel@tonic-gate 857c478bd9Sstevel@tonic-gate static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); 867c478bd9Sstevel@tonic-gate static int in_cksum(ushort_t *addr, int len); 877c478bd9Sstevel@tonic-gate static void reset_snxt_basetimes(void); 887c478bd9Sstevel@tonic-gate 897c478bd9Sstevel@tonic-gate /* 907c478bd9Sstevel@tonic-gate * CRTT - Conservative Round Trip Time Estimate 917c478bd9Sstevel@tonic-gate * Probe success - A matching probe reply received before CRTT ms has elapsed 927c478bd9Sstevel@tonic-gate * after sending the probe. 937c478bd9Sstevel@tonic-gate * Probe failure - No probe reply received and more than CRTT ms has elapsed 947c478bd9Sstevel@tonic-gate * after sending the probe. 957c478bd9Sstevel@tonic-gate * 967c478bd9Sstevel@tonic-gate * TLS - Time last success. Most recent probe ack received at this time. 
977c478bd9Sstevel@tonic-gate * TFF - Time first fail. The time of the earliest probe failure in 987c478bd9Sstevel@tonic-gate * a consecutive series of probe failures. 997c478bd9Sstevel@tonic-gate * NUM_PROBE_REPAIRS - Number of consecutive successful probes required 1007c478bd9Sstevel@tonic-gate * before declaring phyint repair. 1017c478bd9Sstevel@tonic-gate * NUM_PROBE_FAILS - Number of consecutive probe failures required to 1027c478bd9Sstevel@tonic-gate * declare a phyint failure. 1037c478bd9Sstevel@tonic-gate * 1047c478bd9Sstevel@tonic-gate * Phyint state diagram 1057c478bd9Sstevel@tonic-gate * 1067c478bd9Sstevel@tonic-gate * The state of a phyint that is capable of being probed, is completely 1077c478bd9Sstevel@tonic-gate * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>. 1087c478bd9Sstevel@tonic-gate * 1097c478bd9Sstevel@tonic-gate * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state 1107c478bd9Sstevel@tonic-gate * of the link (according to the driver). If the phyint is also configured 1117c478bd9Sstevel@tonic-gate * with a test address (the common case) and probe targets, then a phyint must 1127c478bd9Sstevel@tonic-gate * also successfully be able to send and receive probes in order to remain in 1137c478bd9Sstevel@tonic-gate * the PI_RUNNING state (otherwise, it transitions to PI_FAILED). 
1147c478bd9Sstevel@tonic-gate * 1157c478bd9Sstevel@tonic-gate * Further, if a PI_RUNNING phyint is configured with a test address but is 1167c478bd9Sstevel@tonic-gate * unable to find any probe targets, it will transition to the PI_NOTARGETS 1177c478bd9Sstevel@tonic-gate * state, which indicates that the link is apparently functional but that 1187c478bd9Sstevel@tonic-gate * in.mpathd is unable to send probes to verify functionality (in this case, 1197c478bd9Sstevel@tonic-gate * in.mpathd makes the optimistic assumption that the interface is working 1207c478bd9Sstevel@tonic-gate * correctly and thus does not perform a failover, but reports the interface 1217c478bd9Sstevel@tonic-gate * as IPMP_IF_UNKNOWN through the async events and query interfaces). 1227c478bd9Sstevel@tonic-gate * 1237c478bd9Sstevel@tonic-gate * At any point, a phyint may be administratively marked offline via if_mpadm. 1247c478bd9Sstevel@tonic-gate * In this case, the interface always transitions to PI_OFFLINE, regardless 1257c478bd9Sstevel@tonic-gate * of its previous state. When the interface is later brought back online, 1267c478bd9Sstevel@tonic-gate * in.mpathd acts as if the interface is new (and thus it transitions to 1277c478bd9Sstevel@tonic-gate * PI_RUNNING or PI_FAILED based on the status of the link and the result of 1287c478bd9Sstevel@tonic-gate * its probes, if probes are sent). 1297c478bd9Sstevel@tonic-gate * 1307c478bd9Sstevel@tonic-gate * pi_state - PI_RUNNING or PI_FAILED 1317c478bd9Sstevel@tonic-gate * PI_RUNNING: The failure detection logic says the phyint is good. 1327c478bd9Sstevel@tonic-gate * PI_FAILED: The failure detection logic says the phyint has failed. 1337c478bd9Sstevel@tonic-gate * 1347c478bd9Sstevel@tonic-gate * pg_groupfailed - Group failure, all interfaces in the group have failed. 1357c478bd9Sstevel@tonic-gate * The pi_state may be either PI_FAILED or PI_NOTARGETS. 
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts, to reconstruct the
 *	target list. So the phyints are in the PI_NOTARGETS state.
 *
 * I - value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: No failovers have been done to this phyint, from
 *		other phyints. This phyint is inactive. Phyint can be a Standby.
 *		When failback has been disabled (FAILBACK=no configured),
 *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
 *		is set when phyint subsequently recovers after a failure.
 *
 * pi_empty
 *	This phyint has failed over successfully to another phyint, and
 *	this phyint is currently "empty". It does not host any addresses or
 *	multicast membership etc. This is the state of a phyint after a
 *	failover from the phyint has completed successfully and no subsequent
 *	'failover to' or 'failback to' has occurred on the phyint.
 *	IP guarantees that no new logicals will be hosted nor any multicast
 *	joins permitted on the phyint, since the phyint is either failed or
 *	inactive. pi_empty being set implies the phyint is either failed or
 *	inactive.
 *
 * pi_full
 *	The phyint hosts all of its own addresses that it "owns". If the
 *	phyint was previously failed or inactive, failbacks to the phyint
 *	have completed successfully. i.e. No more failbacks to this phyint
 *	can produce any change in system state whatsoever.
 *
 * Not all 32 possible combinations of the above 5-tuple are possible.
 * Furthermore some of the above combinations are transient. They may occur
 * only because the failover or failback did not complete successfully. The
 * failover/failback will be retried and eventually a stable state will be
 * reached.
 *
 * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
 * The following are the state machines.
'from' and 'to' are the src and 1747c478bd9Sstevel@tonic-gate * dst of the failover/failback, below 1757c478bd9Sstevel@tonic-gate * 1767c478bd9Sstevel@tonic-gate * pi_empty state machine 1777c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 1787c478bd9Sstevel@tonic-gate * Event State -> New State 1797c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 1807c478bd9Sstevel@tonic-gate * successful completion from.pi_empty = 0 -> from.pi_empty = 1 1817c478bd9Sstevel@tonic-gate * of failover 1827c478bd9Sstevel@tonic-gate * 1837c478bd9Sstevel@tonic-gate * Initiate failover to.pi_empty = X -> to.pi_empty = 0 1847c478bd9Sstevel@tonic-gate * 1857c478bd9Sstevel@tonic-gate * Initiate failback to.pi_empty = X -> to.pi_empty = 0 1867c478bd9Sstevel@tonic-gate * 1877c478bd9Sstevel@tonic-gate * group failure pi_empty = X -> pi_empty = 0 1887c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 1897c478bd9Sstevel@tonic-gate * 1907c478bd9Sstevel@tonic-gate * pi_full state machine 1917c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 1927c478bd9Sstevel@tonic-gate * Event State -> New State 1937c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 1947c478bd9Sstevel@tonic-gate * successful completion to.pi_full = 0 -> to.pi_full = 1 1957c478bd9Sstevel@tonic-gate * of failback from 1967c478bd9Sstevel@tonic-gate * each of the other phyints 1977c478bd9Sstevel@tonic-gate * 1987c478bd9Sstevel@tonic-gate * Initiate failover from.pi_full = X -> from.pi_full = 0 1997c478bd9Sstevel@tonic-gate * 2007c478bd9Sstevel@tonic-gate * group failure pi_full = X -> pi_full = 0 2017c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 2027c478bd9Sstevel@tonic-gate * 2037c478bd9Sstevel@tonic-gate * 
pi_state state machine 2047c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 2057c478bd9Sstevel@tonic-gate * Event State New State 2067c478bd9Sstevel@tonic-gate * Action: 2077c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 2087c478bd9Sstevel@tonic-gate * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 2097c478bd9Sstevel@tonic-gate * detection : set IFF_FAILED on this phyint 2107c478bd9Sstevel@tonic-gate * : failover from this phyint to another 2117c478bd9Sstevel@tonic-gate * 212*49df4566Sethindra * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 2137c478bd9Sstevel@tonic-gate * detection : set IFF_FAILED on this phyint 2147c478bd9Sstevel@tonic-gate * 215*49df4566Sethindra * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) 216*49df4566Sethindra * detection -> (PI_RUNNING, I == 0) 217*49df4566Sethindra * : to.pi_empty = 0 2187c478bd9Sstevel@tonic-gate * : clear IFF_FAILED on this phyint 219*49df4566Sethindra * : failback to this phyint if enabled 2207c478bd9Sstevel@tonic-gate * 221*49df4566Sethindra * NIC repair (PI_FAILED, I == 0, FAILBACK=no) 222*49df4566Sethindra * detection -> (PI_RUNNING, I == 1) 223*49df4566Sethindra * : to.pi_empty = 0 224*49df4566Sethindra * : clear IFF_FAILED on this phyint 225*49df4566Sethindra * : if failback is disabled set I == 1 2267c478bd9Sstevel@tonic-gate * 2277c478bd9Sstevel@tonic-gate * Group failure (perform on all phyints in the group) 2287c478bd9Sstevel@tonic-gate * detection PI_RUNNING PI_FAILED 2297c478bd9Sstevel@tonic-gate * (Router targets) : set IFF_FAILED 2307c478bd9Sstevel@tonic-gate * : clear pi_empty and pi_full 2317c478bd9Sstevel@tonic-gate * 2327c478bd9Sstevel@tonic-gate * Group failure (perform on all phyints in the group) 2337c478bd9Sstevel@tonic-gate * detection PI_RUNNING PI_NOTARGETS 2347c478bd9Sstevel@tonic-gate * (Host targets) : set IFF_FAILED 2357c478bd9Sstevel@tonic-gate * : clear pi_empty 
and pi_full 2367c478bd9Sstevel@tonic-gate * : delete the target list on all phyints 2377c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 2387c478bd9Sstevel@tonic-gate * 2397c478bd9Sstevel@tonic-gate * I state machine 2407c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 2417c478bd9Sstevel@tonic-gate * Event State Action: 2427c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 243*49df4566Sethindra * Turn on I pi_empty == 0, STANDBY : failover from standby 2447c478bd9Sstevel@tonic-gate * 245*49df4566Sethindra * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 2467c478bd9Sstevel@tonic-gate * pi_full == 0 : failback to this if enabled 2477c478bd9Sstevel@tonic-gate * --------------------------------------------------------------------------- 2487c478bd9Sstevel@tonic-gate * 2497c478bd9Sstevel@tonic-gate * Assertions: (Read '==>' as implies) 2507c478bd9Sstevel@tonic-gate * 2517c478bd9Sstevel@tonic-gate * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED) 2527c478bd9Sstevel@tonic-gate * (pi_empty == 1) ==> (pi_full == 0) 2537c478bd9Sstevel@tonic-gate * (pi_full == 1) ==> (pi_empty == 0) 2547c478bd9Sstevel@tonic-gate * 2557c478bd9Sstevel@tonic-gate * Invariants 2567c478bd9Sstevel@tonic-gate * 2577c478bd9Sstevel@tonic-gate * pg_groupfailed = 0 && 258*49df4566Sethindra * 1. (I == 1, pi_empty == 0) ==> initiate failover from standby 2597c478bd9Sstevel@tonic-gate * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint 2607c478bd9Sstevel@tonic-gate * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint 2617c478bd9Sstevel@tonic-gate * 2627c478bd9Sstevel@tonic-gate * 1. says that an inactive standby, that is not empty, has to be failed 2637c478bd9Sstevel@tonic-gate * over. For a standby to be truly inactive, it should not host any 2647c478bd9Sstevel@tonic-gate * addresses. 
So we move them to some other phyint. Usually we catch the 2657c478bd9Sstevel@tonic-gate * turn on of IFF_INACTIVE, and perform this action. However if the failover 2667c478bd9Sstevel@tonic-gate * did not complete successfully, then subsequently we have lost the edge 2677c478bd9Sstevel@tonic-gate * trigger, and this invariant kicks in and completes the action. 2687c478bd9Sstevel@tonic-gate * 2697c478bd9Sstevel@tonic-gate * 2. says that any failed phyint that is not empty must be failed over. 2707c478bd9Sstevel@tonic-gate * Usually we do the failover when we detect NIC failure. However if the 2717c478bd9Sstevel@tonic-gate * failover does not complete successfully, this invariant kicks in and 2727c478bd9Sstevel@tonic-gate * completes the failover. We exclude inactive standby which is covered by 1. 2737c478bd9Sstevel@tonic-gate * 2747c478bd9Sstevel@tonic-gate * 3. says that any running phyint that is not full must be failed back. 2757c478bd9Sstevel@tonic-gate * Usually we do the failback when we detect NIC repair. However if the 2767c478bd9Sstevel@tonic-gate * failback does not complete successfully, this invariant kicks in and 2777c478bd9Sstevel@tonic-gate * completes the failback. Note that we don't want to failback to an inactive 2787c478bd9Sstevel@tonic-gate * standby. 2797c478bd9Sstevel@tonic-gate * 2807c478bd9Sstevel@tonic-gate * The invariants 1 - 3 and the actions are in initifs(). 2817c478bd9Sstevel@tonic-gate */ 2827c478bd9Sstevel@tonic-gate 2837c478bd9Sstevel@tonic-gate struct probes_missed probes_missed; 2847c478bd9Sstevel@tonic-gate 2857c478bd9Sstevel@tonic-gate /* 2867c478bd9Sstevel@tonic-gate * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 2877c478bd9Sstevel@tonic-gate * will be added on by the kernel. The id field identifies this phyint. 2887c478bd9Sstevel@tonic-gate * and the sequence number is an increasing (modulo 2^^16) integer. The data 2897c478bd9Sstevel@tonic-gate * portion holds the time value when the packet is sent. 
On echo this is 2907c478bd9Sstevel@tonic-gate * extracted to compute the round-trip time. Three different types of 2917c478bd9Sstevel@tonic-gate * probe packets are used. 2927c478bd9Sstevel@tonic-gate * 2937c478bd9Sstevel@tonic-gate * PROBE_UNI: This type is used to do failure detection / failure recovery 2947c478bd9Sstevel@tonic-gate * and RTT calculation. PROBE_UNI probes are spaced apart in time, 2957c478bd9Sstevel@tonic-gate * not less than the current CRTT. pii_probes[] stores data 2967c478bd9Sstevel@tonic-gate * about these probes. These packets consume sequence number space. 2977c478bd9Sstevel@tonic-gate * 2987c478bd9Sstevel@tonic-gate * PROBE_RTT: This type is used to make only rtt measurments. Normally these 2997c478bd9Sstevel@tonic-gate * are not used. Under heavy network load, the rtt may go up very high, 3007c478bd9Sstevel@tonic-gate * due to a spike, or may appear to go high, due to extreme scheduling 3017c478bd9Sstevel@tonic-gate * delays. Once the network stress is removed, mpathd takes long time to 3027c478bd9Sstevel@tonic-gate * recover, because the probe_interval is already high, and it takes 3037c478bd9Sstevel@tonic-gate * a long time to send out sufficient number of probes to bring down the 3047c478bd9Sstevel@tonic-gate * rtt. To avoid this problem, PROBE_RTT probes are sent out every 3057c478bd9Sstevel@tonic-gate * user_probe_interval ms. and will cause only rtt updates. These packets 3067c478bd9Sstevel@tonic-gate * do not consume sequence number space nor is information about these 3077c478bd9Sstevel@tonic-gate * packets stored in the pii_probes[] 3087c478bd9Sstevel@tonic-gate * 3097c478bd9Sstevel@tonic-gate * PROBE_MULTI: This type is only used to construct a list of targets, when 3107c478bd9Sstevel@tonic-gate * no targets are known. The packet is multicast to the all hosts addr. 
3117c478bd9Sstevel@tonic-gate */ 3127c478bd9Sstevel@tonic-gate static void 3137c478bd9Sstevel@tonic-gate probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) 3147c478bd9Sstevel@tonic-gate { 3157c478bd9Sstevel@tonic-gate struct pr_icmp probe_pkt; /* Probe packet */ 3167c478bd9Sstevel@tonic-gate struct sockaddr_in6 whereto6; /* target address IPv6 */ 3177c478bd9Sstevel@tonic-gate struct sockaddr_in whereto; /* target address IPv4 */ 3187c478bd9Sstevel@tonic-gate int pr_ndx; /* probe index in pii->pii_probes[] */ 3197c478bd9Sstevel@tonic-gate boolean_t sent = _B_TRUE; 3207c478bd9Sstevel@tonic-gate 3217c478bd9Sstevel@tonic-gate if (debug & D_TARGET) { 3227c478bd9Sstevel@tonic-gate logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), 3237c478bd9Sstevel@tonic-gate pii->pii_name, probe_type, cur_time); 3247c478bd9Sstevel@tonic-gate } 3257c478bd9Sstevel@tonic-gate 3267c478bd9Sstevel@tonic-gate assert(pii->pii_probe_sock != -1); 3277c478bd9Sstevel@tonic-gate assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 3287c478bd9Sstevel@tonic-gate probe_type == PROBE_RTT); 3297c478bd9Sstevel@tonic-gate 3307c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 3317c478bd9Sstevel@tonic-gate ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 3327c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_code = 0; 3337c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_cksum = 0; 3347c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 3357c478bd9Sstevel@tonic-gate 3367c478bd9Sstevel@tonic-gate /* 3377c478bd9Sstevel@tonic-gate * Since there is no need to do arithmetic on the icmpid, 3387c478bd9Sstevel@tonic-gate * (only equality check is done) pii_icmpid is stored in 3397c478bd9Sstevel@tonic-gate * network byte order at initialization itself. 
3407c478bd9Sstevel@tonic-gate */ 3417c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_id = pii->pii_icmpid; 3427c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_timestamp = htonl(cur_time); 3437c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_mtype = htonl(probe_type); 3447c478bd9Sstevel@tonic-gate 3457c478bd9Sstevel@tonic-gate /* 3467c478bd9Sstevel@tonic-gate * If probe_type is PROBE_MULTI, this packet will be multicast to 3477c478bd9Sstevel@tonic-gate * the all hosts address. Otherwise it is unicast to the next target. 3487c478bd9Sstevel@tonic-gate */ 3497c478bd9Sstevel@tonic-gate assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 3507c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next != NULL)); 3517c478bd9Sstevel@tonic-gate 3527c478bd9Sstevel@tonic-gate if (pii->pii_af == AF_INET6) { 3537c478bd9Sstevel@tonic-gate bzero(&whereto6, sizeof (whereto6)); 3547c478bd9Sstevel@tonic-gate whereto6.sin6_family = AF_INET6; 3557c478bd9Sstevel@tonic-gate if (probe_type == PROBE_MULTI) { 3567c478bd9Sstevel@tonic-gate whereto6.sin6_addr = all_nodes_mcast_v6; 3577c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) { 3587c478bd9Sstevel@tonic-gate whereto6.sin6_addr = pii->pii_target_next->tg_address; 3597c478bd9Sstevel@tonic-gate } else { 3607c478bd9Sstevel@tonic-gate /* type is PROBE_RTT */ 3617c478bd9Sstevel@tonic-gate whereto6.sin6_addr = 3627c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next->tg_address; 3637c478bd9Sstevel@tonic-gate } 3647c478bd9Sstevel@tonic-gate if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 3657c478bd9Sstevel@tonic-gate sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, 3667c478bd9Sstevel@tonic-gate sizeof (whereto6)) != sizeof (probe_pkt)) { 3677c478bd9Sstevel@tonic-gate logperror_pii(pii, "probe: probe sendto"); 3687c478bd9Sstevel@tonic-gate sent = _B_FALSE; 3697c478bd9Sstevel@tonic-gate } 3707c478bd9Sstevel@tonic-gate } else { 3717c478bd9Sstevel@tonic-gate bzero(&whereto, sizeof (whereto)); 3727c478bd9Sstevel@tonic-gate 
whereto.sin_family = AF_INET; 3737c478bd9Sstevel@tonic-gate if (probe_type == PROBE_MULTI) { 3747c478bd9Sstevel@tonic-gate whereto.sin_addr = all_nodes_mcast_v4; 3757c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) { 3767c478bd9Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR( 3777c478bd9Sstevel@tonic-gate &pii->pii_target_next->tg_address, 3787c478bd9Sstevel@tonic-gate &whereto.sin_addr); 3797c478bd9Sstevel@tonic-gate } else { 3807c478bd9Sstevel@tonic-gate /* type is PROBE_RTT */ 3817c478bd9Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR( 3827c478bd9Sstevel@tonic-gate &pii->pii_rtt_target_next->tg_address, 3837c478bd9Sstevel@tonic-gate &whereto.sin_addr); 3847c478bd9Sstevel@tonic-gate } 3857c478bd9Sstevel@tonic-gate 3867c478bd9Sstevel@tonic-gate /* 3877c478bd9Sstevel@tonic-gate * Compute the IPv4 icmp checksum. Does not cover the IP header. 3887c478bd9Sstevel@tonic-gate */ 3897c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_cksum = 3907c478bd9Sstevel@tonic-gate in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 3917c478bd9Sstevel@tonic-gate if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 3927c478bd9Sstevel@tonic-gate sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, 3937c478bd9Sstevel@tonic-gate sizeof (whereto)) != sizeof (probe_pkt)) { 3947c478bd9Sstevel@tonic-gate logperror_pii(pii, "probe: probe sendto"); 3957c478bd9Sstevel@tonic-gate sent = _B_FALSE; 3967c478bd9Sstevel@tonic-gate } 3977c478bd9Sstevel@tonic-gate } 3987c478bd9Sstevel@tonic-gate 3997c478bd9Sstevel@tonic-gate /* 4007c478bd9Sstevel@tonic-gate * If this is a PROBE_UNI probe packet being unicast to a target, then 4017c478bd9Sstevel@tonic-gate * update our tables. We will need this info in processing the probe 4027c478bd9Sstevel@tonic-gate * response. PROBE_MULTI and PROBE_RTT packets are not used for 4037c478bd9Sstevel@tonic-gate * the purpose of failure or recovery detection. PROBE_MULTI packets 4047c478bd9Sstevel@tonic-gate * are only used to construct a list of targets. 
PROBE_RTT packets are 4057c478bd9Sstevel@tonic-gate * used only for updating the rtt and not for failure detection. 4067c478bd9Sstevel@tonic-gate */ 4077c478bd9Sstevel@tonic-gate if (probe_type == PROBE_UNI && sent) { 4087c478bd9Sstevel@tonic-gate pr_ndx = pii->pii_probe_next; 4097c478bd9Sstevel@tonic-gate assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 4107c478bd9Sstevel@tonic-gate 4117c478bd9Sstevel@tonic-gate /* Collect statistics, before we reuse the last slot. */ 4127c478bd9Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 4137c478bd9Sstevel@tonic-gate pii->pii_cum_stats.lost++; 4147c478bd9Sstevel@tonic-gate else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 4157c478bd9Sstevel@tonic-gate pii->pii_cum_stats.acked++; 4167c478bd9Sstevel@tonic-gate pii->pii_cum_stats.sent++; 4177c478bd9Sstevel@tonic-gate 4187c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; 4197c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 4207c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_time_sent = cur_time; 4217c478bd9Sstevel@tonic-gate pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 4227c478bd9Sstevel@tonic-gate pii->pii_target_next = target_next(pii->pii_target_next); 4237c478bd9Sstevel@tonic-gate assert(pii->pii_target_next != NULL); 4247c478bd9Sstevel@tonic-gate /* 4257c478bd9Sstevel@tonic-gate * If we have a single variable to denote the next target to 4267c478bd9Sstevel@tonic-gate * probe for both rtt probes and failure detection probes, we 4277c478bd9Sstevel@tonic-gate * could end up with a situation where the failure detection 4287c478bd9Sstevel@tonic-gate * probe targets become disjoint from the rtt probe targets. 4297c478bd9Sstevel@tonic-gate * Eg. if 2 targets and the actual fdt is double the user 4307c478bd9Sstevel@tonic-gate * specified fdt. So we have 2 variables. 
In this scheme 4317c478bd9Sstevel@tonic-gate * we also reset pii_rtt_target_next for every fdt probe, 4327c478bd9Sstevel@tonic-gate * though that may not be necessary. 4337c478bd9Sstevel@tonic-gate */ 4347c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next = pii->pii_target_next; 4357c478bd9Sstevel@tonic-gate pii->pii_snxt++; 4367c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_RTT) { 4377c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next = 4387c478bd9Sstevel@tonic-gate target_next(pii->pii_rtt_target_next); 4397c478bd9Sstevel@tonic-gate assert(pii->pii_rtt_target_next != NULL); 4407c478bd9Sstevel@tonic-gate } 4417c478bd9Sstevel@tonic-gate } 4427c478bd9Sstevel@tonic-gate 4437c478bd9Sstevel@tonic-gate /* 4447c478bd9Sstevel@tonic-gate * Incoming IPv4 data from wire, is received here. Called from main. 4457c478bd9Sstevel@tonic-gate */ 4467c478bd9Sstevel@tonic-gate void 4477c478bd9Sstevel@tonic-gate in_data(struct phyint_instance *pii) 4487c478bd9Sstevel@tonic-gate { 4497c478bd9Sstevel@tonic-gate struct sockaddr_in from; 4507c478bd9Sstevel@tonic-gate struct in6_addr fromaddr; 4517c478bd9Sstevel@tonic-gate uint_t fromlen; 4527c478bd9Sstevel@tonic-gate static uint_t in_packet[(IP_MAXPACKET + 1)/4]; 4537c478bd9Sstevel@tonic-gate struct ip *ip; 4547c478bd9Sstevel@tonic-gate int iphlen; 4557c478bd9Sstevel@tonic-gate int len; 4567c478bd9Sstevel@tonic-gate char abuf[INET_ADDRSTRLEN]; 4577c478bd9Sstevel@tonic-gate struct pr_icmp *reply; 4587c478bd9Sstevel@tonic-gate 4597c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 4607c478bd9Sstevel@tonic-gate logdebug("in_data(%s %s)\n", 4617c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 4627c478bd9Sstevel@tonic-gate } 4637c478bd9Sstevel@tonic-gate 4647c478bd9Sstevel@tonic-gate /* 4657c478bd9Sstevel@tonic-gate * Poll has already told us that a message is waiting, 4667c478bd9Sstevel@tonic-gate * on this socket. Read it now. We should not block. 
4677c478bd9Sstevel@tonic-gate */ 4687c478bd9Sstevel@tonic-gate fromlen = sizeof (from); 4697c478bd9Sstevel@tonic-gate len = recvfrom(pii->pii_probe_sock, (char *)in_packet, 4707c478bd9Sstevel@tonic-gate sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); 4717c478bd9Sstevel@tonic-gate if (len < 0) { 4727c478bd9Sstevel@tonic-gate logperror_pii(pii, "in_data: recvfrom"); 4737c478bd9Sstevel@tonic-gate return; 4747c478bd9Sstevel@tonic-gate } 4757c478bd9Sstevel@tonic-gate 4767c478bd9Sstevel@tonic-gate /* 4777c478bd9Sstevel@tonic-gate * If the NIC has indicated the link is down, don't go 4787c478bd9Sstevel@tonic-gate * any further. 4797c478bd9Sstevel@tonic-gate */ 4807c478bd9Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 4817c478bd9Sstevel@tonic-gate return; 4827c478bd9Sstevel@tonic-gate 4837c478bd9Sstevel@tonic-gate /* Get the printable address for error reporting */ 4847c478bd9Sstevel@tonic-gate (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 4857c478bd9Sstevel@tonic-gate 4867c478bd9Sstevel@tonic-gate /* Make sure packet contains at least minimum ICMP header */ 4877c478bd9Sstevel@tonic-gate ip = (struct ip *)in_packet; 4887c478bd9Sstevel@tonic-gate iphlen = ip->ip_hl << 2; 4897c478bd9Sstevel@tonic-gate if (len < iphlen + ICMP_MINLEN) { 4907c478bd9Sstevel@tonic-gate if (debug & D_PKTBAD) { 4917c478bd9Sstevel@tonic-gate logdebug("in_data: packet too short (%d bytes)" 4927c478bd9Sstevel@tonic-gate " from %s\n", len, abuf); 4937c478bd9Sstevel@tonic-gate } 4947c478bd9Sstevel@tonic-gate return; 4957c478bd9Sstevel@tonic-gate } 4967c478bd9Sstevel@tonic-gate 4977c478bd9Sstevel@tonic-gate /* 4987c478bd9Sstevel@tonic-gate * Subtract the IP hdr length, 'len' will be length of the probe 4997c478bd9Sstevel@tonic-gate * reply, starting from the icmp hdr. 
5007c478bd9Sstevel@tonic-gate */ 5017c478bd9Sstevel@tonic-gate len -= iphlen; 5027c478bd9Sstevel@tonic-gate /* LINTED */ 5037c478bd9Sstevel@tonic-gate reply = (struct pr_icmp *)((char *)in_packet + iphlen); 5047c478bd9Sstevel@tonic-gate 5057c478bd9Sstevel@tonic-gate /* Probe replies are icmp echo replies. Ignore anything else */ 5067c478bd9Sstevel@tonic-gate if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 5077c478bd9Sstevel@tonic-gate return; 5087c478bd9Sstevel@tonic-gate 5097c478bd9Sstevel@tonic-gate /* 5107c478bd9Sstevel@tonic-gate * The icmp id should match what we sent, which is stored 5117c478bd9Sstevel@tonic-gate * in pi_icmpid. The icmp code for reply must be 0. 5127c478bd9Sstevel@tonic-gate * The reply content must be a struct pr_icmp 5137c478bd9Sstevel@tonic-gate */ 5147c478bd9Sstevel@tonic-gate if (reply->pr_icmp_id != pii->pii_icmpid) { 5157c478bd9Sstevel@tonic-gate /* Not in response to our probe */ 5167c478bd9Sstevel@tonic-gate return; 5177c478bd9Sstevel@tonic-gate } 5187c478bd9Sstevel@tonic-gate 5197c478bd9Sstevel@tonic-gate if (reply->pr_icmp_code != 0) { 5207c478bd9Sstevel@tonic-gate logtrace("probe reply code %d from %s on %s\n", 5217c478bd9Sstevel@tonic-gate reply->pr_icmp_code, abuf, pii->pii_name); 5227c478bd9Sstevel@tonic-gate return; 5237c478bd9Sstevel@tonic-gate } 5247c478bd9Sstevel@tonic-gate 5257c478bd9Sstevel@tonic-gate if (len < sizeof (struct pr_icmp)) { 5267c478bd9Sstevel@tonic-gate logtrace("probe reply too short: %d bytes from %s on %s\n", 5277c478bd9Sstevel@tonic-gate len, abuf, pii->pii_name); 5287c478bd9Sstevel@tonic-gate return; 5297c478bd9Sstevel@tonic-gate } 5307c478bd9Sstevel@tonic-gate 5317c478bd9Sstevel@tonic-gate IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 5327c478bd9Sstevel@tonic-gate if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 5337c478bd9Sstevel@tonic-gate /* Unicast probe reply */ 5347c478bd9Sstevel@tonic-gate incoming_echo_reply(pii, reply, fromaddr); 5357c478bd9Sstevel@tonic-gate else if (reply->pr_icmp_mtype == 
htonl(PROBE_MULTI)) { 5367c478bd9Sstevel@tonic-gate /* Multicast reply */ 5377c478bd9Sstevel@tonic-gate incoming_mcast_reply(pii, reply, fromaddr); 5387c478bd9Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 5397c478bd9Sstevel@tonic-gate incoming_rtt_reply(pii, reply, fromaddr); 5407c478bd9Sstevel@tonic-gate } else { 5417c478bd9Sstevel@tonic-gate /* Probably not in response to our probe */ 5427c478bd9Sstevel@tonic-gate logtrace("probe reply type: %d from %s on %s\n", 5437c478bd9Sstevel@tonic-gate reply->pr_icmp_mtype, abuf, pii->pii_name); 5447c478bd9Sstevel@tonic-gate return; 5457c478bd9Sstevel@tonic-gate } 5467c478bd9Sstevel@tonic-gate 5477c478bd9Sstevel@tonic-gate } 5487c478bd9Sstevel@tonic-gate 5497c478bd9Sstevel@tonic-gate /* 5507c478bd9Sstevel@tonic-gate * Incoming IPv6 data from wire is received here. Called from main. 5517c478bd9Sstevel@tonic-gate */ 5527c478bd9Sstevel@tonic-gate void 5537c478bd9Sstevel@tonic-gate in6_data(struct phyint_instance *pii) 5547c478bd9Sstevel@tonic-gate { 5557c478bd9Sstevel@tonic-gate struct sockaddr_in6 from; 5567c478bd9Sstevel@tonic-gate static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 5577c478bd9Sstevel@tonic-gate static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 5587c478bd9Sstevel@tonic-gate int len; 5597c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 5607c478bd9Sstevel@tonic-gate struct msghdr msg; 5617c478bd9Sstevel@tonic-gate struct iovec iov; 5627c478bd9Sstevel@tonic-gate uchar_t *opt; 5637c478bd9Sstevel@tonic-gate struct pr_icmp *reply; 5647c478bd9Sstevel@tonic-gate 5657c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 5667c478bd9Sstevel@tonic-gate logdebug("in6_data(%s %s)\n", 5677c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 5687c478bd9Sstevel@tonic-gate } 5697c478bd9Sstevel@tonic-gate 5707c478bd9Sstevel@tonic-gate iov.iov_base = (char *)in_packet; 5717c478bd9Sstevel@tonic-gate iov.iov_len = sizeof (in_packet); 5727c478bd9Sstevel@tonic-gate msg.msg_iov = &iov; 
5737c478bd9Sstevel@tonic-gate msg.msg_iovlen = 1; 5747c478bd9Sstevel@tonic-gate msg.msg_name = (struct sockaddr *)&from; 5757c478bd9Sstevel@tonic-gate msg.msg_namelen = sizeof (from); 5767c478bd9Sstevel@tonic-gate msg.msg_control = ancillary_data; 5777c478bd9Sstevel@tonic-gate msg.msg_controllen = sizeof (ancillary_data); 5787c478bd9Sstevel@tonic-gate 5797c478bd9Sstevel@tonic-gate if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 5807c478bd9Sstevel@tonic-gate logperror_pii(pii, "in6_data: recvfrom"); 5817c478bd9Sstevel@tonic-gate return; 5827c478bd9Sstevel@tonic-gate } 5837c478bd9Sstevel@tonic-gate 5847c478bd9Sstevel@tonic-gate /* 5857c478bd9Sstevel@tonic-gate * If the NIC has indicated that the link is down, don't go 5867c478bd9Sstevel@tonic-gate * any further. 5877c478bd9Sstevel@tonic-gate */ 5887c478bd9Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 5897c478bd9Sstevel@tonic-gate return; 5907c478bd9Sstevel@tonic-gate 5917c478bd9Sstevel@tonic-gate /* Get the printable address for error reporting */ 5927c478bd9Sstevel@tonic-gate (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 5937c478bd9Sstevel@tonic-gate if (len < ICMP_MINLEN) { 5947c478bd9Sstevel@tonic-gate if (debug & D_PKTBAD) { 5957c478bd9Sstevel@tonic-gate logdebug("Truncated message: msg_flags 0x%x from %s\n", 5967c478bd9Sstevel@tonic-gate msg.msg_flags, abuf); 5977c478bd9Sstevel@tonic-gate } 5987c478bd9Sstevel@tonic-gate return; 5997c478bd9Sstevel@tonic-gate } 6007c478bd9Sstevel@tonic-gate /* Ignore packets > 64k or control buffers that don't fit */ 6017c478bd9Sstevel@tonic-gate if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 6027c478bd9Sstevel@tonic-gate if (debug & D_PKTBAD) { 6037c478bd9Sstevel@tonic-gate logdebug("Truncated message: msg_flags 0x%x from %s\n", 6047c478bd9Sstevel@tonic-gate msg.msg_flags, abuf); 6057c478bd9Sstevel@tonic-gate } 6067c478bd9Sstevel@tonic-gate return; 6077c478bd9Sstevel@tonic-gate } 6087c478bd9Sstevel@tonic-gate 6097c478bd9Sstevel@tonic-gate reply = 
(struct pr_icmp *)in_packet; 6107c478bd9Sstevel@tonic-gate if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 6117c478bd9Sstevel@tonic-gate return; 6127c478bd9Sstevel@tonic-gate 6137c478bd9Sstevel@tonic-gate if (reply->pr_icmp_id != pii->pii_icmpid) { 6147c478bd9Sstevel@tonic-gate /* Not in response to our probe */ 6157c478bd9Sstevel@tonic-gate return; 6167c478bd9Sstevel@tonic-gate } 6177c478bd9Sstevel@tonic-gate 6187c478bd9Sstevel@tonic-gate /* 6197c478bd9Sstevel@tonic-gate * The kernel has already verified the the ICMP checksum. 6207c478bd9Sstevel@tonic-gate */ 6217c478bd9Sstevel@tonic-gate if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 6227c478bd9Sstevel@tonic-gate logtrace("ICMPv6 echo reply source address not linklocal from " 6237c478bd9Sstevel@tonic-gate "%s on %s\n", abuf, pii->pii_name); 6247c478bd9Sstevel@tonic-gate return; 6257c478bd9Sstevel@tonic-gate } 6267c478bd9Sstevel@tonic-gate opt = find_ancillary(&msg, IPV6_RTHDR); 6277c478bd9Sstevel@tonic-gate if (opt != NULL) { 6287c478bd9Sstevel@tonic-gate /* Can't allow routing headers in probe replies */ 6297c478bd9Sstevel@tonic-gate logtrace("message with routing header from %s on %s\n", 6307c478bd9Sstevel@tonic-gate abuf, pii->pii_name); 6317c478bd9Sstevel@tonic-gate return; 6327c478bd9Sstevel@tonic-gate } 6337c478bd9Sstevel@tonic-gate if (reply->pr_icmp_code != 0) { 6347c478bd9Sstevel@tonic-gate logtrace("probe reply code: %d from %s on %s\n", 6357c478bd9Sstevel@tonic-gate reply->pr_icmp_code, abuf, pii->pii_name); 6367c478bd9Sstevel@tonic-gate return; 6377c478bd9Sstevel@tonic-gate } 6387c478bd9Sstevel@tonic-gate if (len < (sizeof (struct pr_icmp))) { 6397c478bd9Sstevel@tonic-gate logtrace("probe reply too short: %d bytes from %s on %s\n", 6407c478bd9Sstevel@tonic-gate len, abuf, pii->pii_name); 6417c478bd9Sstevel@tonic-gate return; 6427c478bd9Sstevel@tonic-gate } 6437c478bd9Sstevel@tonic-gate if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 6447c478bd9Sstevel@tonic-gate incoming_echo_reply(pii, reply, 
from.sin6_addr); 6457c478bd9Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 6467c478bd9Sstevel@tonic-gate incoming_mcast_reply(pii, reply, from.sin6_addr); 6477c478bd9Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 6487c478bd9Sstevel@tonic-gate incoming_rtt_reply(pii, reply, from.sin6_addr); 6497c478bd9Sstevel@tonic-gate } else { 6507c478bd9Sstevel@tonic-gate /* Probably not in response to our probe */ 6517c478bd9Sstevel@tonic-gate logtrace("probe reply type: %d from %s on %s\n", 6527c478bd9Sstevel@tonic-gate reply->pr_icmp_mtype, abuf, pii->pii_name); 6537c478bd9Sstevel@tonic-gate } 6547c478bd9Sstevel@tonic-gate } 6557c478bd9Sstevel@tonic-gate 6567c478bd9Sstevel@tonic-gate /* 6577c478bd9Sstevel@tonic-gate * Process the incoming rtt reply, in response to our rtt probe. 6587c478bd9Sstevel@tonic-gate * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 6597c478bd9Sstevel@tonic-gate * have any stored information about the probe we sent. So we don't log 6607c478bd9Sstevel@tonic-gate * any errors if we receive bad replies. 
6617c478bd9Sstevel@tonic-gate */ 6627c478bd9Sstevel@tonic-gate static void 6637c478bd9Sstevel@tonic-gate incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 6647c478bd9Sstevel@tonic-gate struct in6_addr fromaddr) 6657c478bd9Sstevel@tonic-gate { 6667c478bd9Sstevel@tonic-gate int m; /* rtt measurment in ms */ 6677c478bd9Sstevel@tonic-gate uint32_t cur_time; /* in ms from some arbitrary point */ 6687c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 6697c478bd9Sstevel@tonic-gate struct target *target; 6707c478bd9Sstevel@tonic-gate uint32_t pr_icmp_timestamp; 6717c478bd9Sstevel@tonic-gate struct phyint_group *pg; 6727c478bd9Sstevel@tonic-gate 6737c478bd9Sstevel@tonic-gate /* Get the printable address for error reporting */ 6747c478bd9Sstevel@tonic-gate (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 6757c478bd9Sstevel@tonic-gate 6767c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 6777c478bd9Sstevel@tonic-gate logdebug("incoming_rtt_reply: %s %s %s\n", 6787c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, abuf); 6797c478bd9Sstevel@tonic-gate } 6807c478bd9Sstevel@tonic-gate 6817c478bd9Sstevel@tonic-gate /* Do we know this target ? */ 6827c478bd9Sstevel@tonic-gate target = target_lookup(pii, fromaddr); 6837c478bd9Sstevel@tonic-gate if (target == NULL) 6847c478bd9Sstevel@tonic-gate return; 6857c478bd9Sstevel@tonic-gate 6867c478bd9Sstevel@tonic-gate pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 6877c478bd9Sstevel@tonic-gate cur_time = getcurrenttime(); 6887c478bd9Sstevel@tonic-gate m = (int)(cur_time - pr_icmp_timestamp); 6897c478bd9Sstevel@tonic-gate 6907c478bd9Sstevel@tonic-gate /* Invalid rtt. 
It has wrapped around */ 6917c478bd9Sstevel@tonic-gate if (m < 0) 6927c478bd9Sstevel@tonic-gate return; 6937c478bd9Sstevel@tonic-gate 6947c478bd9Sstevel@tonic-gate /* 6957c478bd9Sstevel@tonic-gate * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 6967c478bd9Sstevel@tonic-gate * The initial few responses after the interface is repaired may 6977c478bd9Sstevel@tonic-gate * contain high rtt's because they could have been queued up waiting 6987c478bd9Sstevel@tonic-gate * for ARP/NDP resolution on a failed interface. 6997c478bd9Sstevel@tonic-gate */ 7007c478bd9Sstevel@tonic-gate pg = pii->pii_phyint->pi_group; 7017c478bd9Sstevel@tonic-gate if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 7027c478bd9Sstevel@tonic-gate return; 7037c478bd9Sstevel@tonic-gate 7047c478bd9Sstevel@tonic-gate /* 7057c478bd9Sstevel@tonic-gate * Update rtt only if the new rtt is lower than the current rtt. 7067c478bd9Sstevel@tonic-gate * (specified by the 3rd parameter to pi_set_crtt). 7077c478bd9Sstevel@tonic-gate * If a spike has caused the current probe_interval to be > 7087c478bd9Sstevel@tonic-gate * user_probe_interval, then this mechanism is used to bring down 7097c478bd9Sstevel@tonic-gate * the rtt rapidly once the network stress is removed. 7107c478bd9Sstevel@tonic-gate * If the new rtt is higher than the current rtt, we don't want to 7117c478bd9Sstevel@tonic-gate * update the rtt. We are having more than 1 outstanding probe and 7127c478bd9Sstevel@tonic-gate * the increase in rtt we are seeing is being unnecessarily weighted 7137c478bd9Sstevel@tonic-gate * many times. The regular rtt update will be handled by 7147c478bd9Sstevel@tonic-gate * incoming_echo_reply() and will take care of any rtt increase. 
7157c478bd9Sstevel@tonic-gate */ 7167c478bd9Sstevel@tonic-gate pi_set_crtt(target, m, _B_FALSE); 7177c478bd9Sstevel@tonic-gate if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 7187c478bd9Sstevel@tonic-gate (user_failure_detection_time < pg->pg_fdt) && 7197c478bd9Sstevel@tonic-gate (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 7207c478bd9Sstevel@tonic-gate /* 7217c478bd9Sstevel@tonic-gate * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 7227c478bd9Sstevel@tonic-gate * investigate if we can improve the failure detection time to 7237c478bd9Sstevel@tonic-gate * meet whatever the user specified. 7247c478bd9Sstevel@tonic-gate */ 7257c478bd9Sstevel@tonic-gate if (check_pg_crtt_improved(pg)) { 7267c478bd9Sstevel@tonic-gate pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 7277c478bd9Sstevel@tonic-gate user_failure_detection_time); 7287c478bd9Sstevel@tonic-gate pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 7297c478bd9Sstevel@tonic-gate if (pii->pii_phyint->pi_group != phyint_anongroup) { 7307c478bd9Sstevel@tonic-gate logerr("Improved failure detection time %d ms " 7317c478bd9Sstevel@tonic-gate "on (%s %s) for group \"%s\"\n", 7327c478bd9Sstevel@tonic-gate pg->pg_fdt, AF_STR(pii->pii_af), 7337c478bd9Sstevel@tonic-gate pii->pii_name, 7347c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_name); 7357c478bd9Sstevel@tonic-gate } 7367c478bd9Sstevel@tonic-gate if (user_failure_detection_time == pg->pg_fdt) { 7377c478bd9Sstevel@tonic-gate /* Avoid any truncation or rounding errors */ 7387c478bd9Sstevel@tonic-gate pg->pg_probeint = user_probe_interval; 7397c478bd9Sstevel@tonic-gate /* 7407c478bd9Sstevel@tonic-gate * No more rtt probes will be sent. The actual 7417c478bd9Sstevel@tonic-gate * fdt has dropped to the user specified value. 7427c478bd9Sstevel@tonic-gate * pii_fd_snxt_basetime and pii_snxt_basetime 7437c478bd9Sstevel@tonic-gate * will be in sync henceforth. 
7447c478bd9Sstevel@tonic-gate */ 7457c478bd9Sstevel@tonic-gate reset_snxt_basetimes(); 7467c478bd9Sstevel@tonic-gate } 7477c478bd9Sstevel@tonic-gate } 7487c478bd9Sstevel@tonic-gate } 7497c478bd9Sstevel@tonic-gate } 7507c478bd9Sstevel@tonic-gate 7517c478bd9Sstevel@tonic-gate /* 7527c478bd9Sstevel@tonic-gate * Process the incoming echo reply, in response to our unicast probe. 7537c478bd9Sstevel@tonic-gate * Common for both IPv4 and IPv6 7547c478bd9Sstevel@tonic-gate */ 7557c478bd9Sstevel@tonic-gate static void 7567c478bd9Sstevel@tonic-gate incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 7577c478bd9Sstevel@tonic-gate struct in6_addr fromaddr) 7587c478bd9Sstevel@tonic-gate { 7597c478bd9Sstevel@tonic-gate int m; /* rtt measurment in ms */ 7607c478bd9Sstevel@tonic-gate uint32_t cur_time; /* in ms from some arbitrary point */ 7617c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 7627c478bd9Sstevel@tonic-gate int pr_ndx; 7637c478bd9Sstevel@tonic-gate struct target *target; 7647c478bd9Sstevel@tonic-gate boolean_t exception; 7657c478bd9Sstevel@tonic-gate uint32_t pr_icmp_timestamp; 7667c478bd9Sstevel@tonic-gate uint16_t pr_icmp_seq; 7677c478bd9Sstevel@tonic-gate struct phyint_group *pg = pii->pii_phyint->pi_group; 7687c478bd9Sstevel@tonic-gate 7697c478bd9Sstevel@tonic-gate /* Get the printable address for error reporting */ 7707c478bd9Sstevel@tonic-gate (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 7717c478bd9Sstevel@tonic-gate 7727c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 7737c478bd9Sstevel@tonic-gate logdebug("incoming_echo_reply: %s %s %s seq %u\n", 7747c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, abuf, 7757c478bd9Sstevel@tonic-gate ntohs(reply->pr_icmp_seq)); 7767c478bd9Sstevel@tonic-gate } 7777c478bd9Sstevel@tonic-gate 7787c478bd9Sstevel@tonic-gate pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 7797c478bd9Sstevel@tonic-gate pr_icmp_seq = ntohs(reply->pr_icmp_seq); 7807c478bd9Sstevel@tonic-gate 
7817c478bd9Sstevel@tonic-gate /* Reject out of window probe replies */ 7827c478bd9Sstevel@tonic-gate if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 7837c478bd9Sstevel@tonic-gate SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 7847c478bd9Sstevel@tonic-gate logtrace("out of window probe seq %u snxt %u on %s from %s\n", 7857c478bd9Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 7867c478bd9Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 7877c478bd9Sstevel@tonic-gate return; 7887c478bd9Sstevel@tonic-gate } 7897c478bd9Sstevel@tonic-gate cur_time = getcurrenttime(); 7907c478bd9Sstevel@tonic-gate m = (int)(cur_time - pr_icmp_timestamp); 7917c478bd9Sstevel@tonic-gate if (m < 0) { 7927c478bd9Sstevel@tonic-gate /* 7937c478bd9Sstevel@tonic-gate * This is a ridiculously high value of rtt. rtt has wrapped 7947c478bd9Sstevel@tonic-gate * around. Log a message, and ignore the rtt. 7957c478bd9Sstevel@tonic-gate */ 7967c478bd9Sstevel@tonic-gate logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " 7977c478bd9Sstevel@tonic-gate "timestamp %u\n", cur_time, pr_icmp_timestamp); 7987c478bd9Sstevel@tonic-gate } 7997c478bd9Sstevel@tonic-gate 8007c478bd9Sstevel@tonic-gate /* 8017c478bd9Sstevel@tonic-gate * Get the probe index pr_ndx corresponding to the received icmp seq. 8027c478bd9Sstevel@tonic-gate * number in our pii->pii_probes[] array. 
The icmp sequence number 8037c478bd9Sstevel@tonic-gate * pii_snxt corresponds to the probe index pii->pii_probe_next 8047c478bd9Sstevel@tonic-gate */ 8057c478bd9Sstevel@tonic-gate pr_ndx = MOD_SUB(pii->pii_probe_next, 8067c478bd9Sstevel@tonic-gate (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 8077c478bd9Sstevel@tonic-gate 8087c478bd9Sstevel@tonic-gate assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 8097c478bd9Sstevel@tonic-gate 8107c478bd9Sstevel@tonic-gate target = pii->pii_probes[pr_ndx].pr_target; 8117c478bd9Sstevel@tonic-gate 8127c478bd9Sstevel@tonic-gate /* 8137c478bd9Sstevel@tonic-gate * Perform sanity checks, whether this probe reply that we 8147c478bd9Sstevel@tonic-gate * have received is genuine 8157c478bd9Sstevel@tonic-gate */ 8167c478bd9Sstevel@tonic-gate if (target != NULL) { 8177c478bd9Sstevel@tonic-gate /* 8187c478bd9Sstevel@tonic-gate * Compare the src. addr of the received ICMP or ICMPv6 8197c478bd9Sstevel@tonic-gate * probe reply with the target address in our tables. 8207c478bd9Sstevel@tonic-gate */ 8217c478bd9Sstevel@tonic-gate if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 8227c478bd9Sstevel@tonic-gate /* 8237c478bd9Sstevel@tonic-gate * We don't have any record of having sent a probe to 8247c478bd9Sstevel@tonic-gate * this target. This is a fake probe reply. 
Log an error 8257c478bd9Sstevel@tonic-gate */ 8267c478bd9Sstevel@tonic-gate logtrace("probe status %d Fake probe reply seq %u " 8277c478bd9Sstevel@tonic-gate "snxt %u on %s from %s\n", 8287c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 8297c478bd9Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 8307c478bd9Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 8317c478bd9Sstevel@tonic-gate return; 8327c478bd9Sstevel@tonic-gate } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 8337c478bd9Sstevel@tonic-gate /* 8347c478bd9Sstevel@tonic-gate * The address matches, but our tables indicate that 8357c478bd9Sstevel@tonic-gate * this probe reply has been acked already. So this 8367c478bd9Sstevel@tonic-gate * is a duplicate probe reply. Log an error 8377c478bd9Sstevel@tonic-gate */ 8387c478bd9Sstevel@tonic-gate logtrace("probe status %d Duplicate probe reply seq %u " 8397c478bd9Sstevel@tonic-gate "snxt %u on %s from %s\n", 8407c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 8417c478bd9Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 8427c478bd9Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 8437c478bd9Sstevel@tonic-gate return; 8447c478bd9Sstevel@tonic-gate } 8457c478bd9Sstevel@tonic-gate } else { 8467c478bd9Sstevel@tonic-gate /* 8477c478bd9Sstevel@tonic-gate * Target must not be NULL in the PR_UNACKED state 8487c478bd9Sstevel@tonic-gate */ 8497c478bd9Sstevel@tonic-gate assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 8507c478bd9Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 8517c478bd9Sstevel@tonic-gate /* 8527c478bd9Sstevel@tonic-gate * The probe stats slot is unused. So we didn't 8537c478bd9Sstevel@tonic-gate * send out any probe to this target. This is a fake. 8547c478bd9Sstevel@tonic-gate * Log an error. 
8557c478bd9Sstevel@tonic-gate */ 8567c478bd9Sstevel@tonic-gate logtrace("probe status %d Fake probe reply seq %u " 8577c478bd9Sstevel@tonic-gate "snxt %u on %s from %s\n", 8587c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 8597c478bd9Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 8607c478bd9Sstevel@tonic-gate } 8617c478bd9Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 8627c478bd9Sstevel@tonic-gate return; 8637c478bd9Sstevel@tonic-gate } 8647c478bd9Sstevel@tonic-gate 8657c478bd9Sstevel@tonic-gate /* 8667c478bd9Sstevel@tonic-gate * If the rtt does not appear to be right, don't update the 8677c478bd9Sstevel@tonic-gate * rtt stats. This can happen if the system dropped into the 8687c478bd9Sstevel@tonic-gate * debugger, or the system was hung or too busy for a 8697c478bd9Sstevel@tonic-gate * substantial time that we didn't get a chance to run. 8707c478bd9Sstevel@tonic-gate */ 8717c478bd9Sstevel@tonic-gate if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { 8727c478bd9Sstevel@tonic-gate /* 8737c478bd9Sstevel@tonic-gate * If the probe corresponding to this receieved response 8747c478bd9Sstevel@tonic-gate * was truly sent 'm' ms. ago, then this response must 8757c478bd9Sstevel@tonic-gate * have been rejected by the sequence number checks. The 8767c478bd9Sstevel@tonic-gate * fact that it has passed the sequence number checks 8777c478bd9Sstevel@tonic-gate * means that the measured rtt is wrong. We were probably 8787c478bd9Sstevel@tonic-gate * scheduled long after the packet was received. 
8797c478bd9Sstevel@tonic-gate */ 8807c478bd9Sstevel@tonic-gate goto out; 8817c478bd9Sstevel@tonic-gate } 8827c478bd9Sstevel@tonic-gate 8837c478bd9Sstevel@tonic-gate /* 8847c478bd9Sstevel@tonic-gate * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 8857c478bd9Sstevel@tonic-gate * The initial few responses after the interface is repaired may 8867c478bd9Sstevel@tonic-gate * contain high rtt's because they could have been queued up waiting 8877c478bd9Sstevel@tonic-gate * for ARP/NDP resolution on a failed interface. 8887c478bd9Sstevel@tonic-gate */ 8897c478bd9Sstevel@tonic-gate if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 8907c478bd9Sstevel@tonic-gate goto out; 8917c478bd9Sstevel@tonic-gate 8927c478bd9Sstevel@tonic-gate /* 8937c478bd9Sstevel@tonic-gate * Don't update the Conservative Round Trip Time estimate for this 8947c478bd9Sstevel@tonic-gate * (phint, target) pair if this is the not the highest ack seq seen 8957c478bd9Sstevel@tonic-gate * thus far on this target. 8967c478bd9Sstevel@tonic-gate */ 8977c478bd9Sstevel@tonic-gate if (!highest_ack_tg(pr_icmp_seq, target)) 8987c478bd9Sstevel@tonic-gate goto out; 8997c478bd9Sstevel@tonic-gate 9007c478bd9Sstevel@tonic-gate /* 9017c478bd9Sstevel@tonic-gate * Always update the rtt. This is a failure detection probe 9027c478bd9Sstevel@tonic-gate * and we want to measure both increase / decrease in rtt. 9037c478bd9Sstevel@tonic-gate */ 9047c478bd9Sstevel@tonic-gate pi_set_crtt(target, m, _B_TRUE); 9057c478bd9Sstevel@tonic-gate 9067c478bd9Sstevel@tonic-gate /* 9077c478bd9Sstevel@tonic-gate * If the crtt exceeds the average time between probes, 9087c478bd9Sstevel@tonic-gate * investigate if this slow target is an exception. If so we 9097c478bd9Sstevel@tonic-gate * can avoid this target and still meet the failure detection 9107c478bd9Sstevel@tonic-gate * time. Otherwise we can't meet the failure detection time. 
9117c478bd9Sstevel@tonic-gate */ 9127c478bd9Sstevel@tonic-gate if (target->tg_crtt > pg->pg_probeint) { 9137c478bd9Sstevel@tonic-gate exception = check_exception_target(pii, target); 9147c478bd9Sstevel@tonic-gate if (exception) { 9157c478bd9Sstevel@tonic-gate /* 9167c478bd9Sstevel@tonic-gate * This target is exceptionally slow. Don't use it 9177c478bd9Sstevel@tonic-gate * for future probes. check_exception_target() has 9187c478bd9Sstevel@tonic-gate * made sure that we have at least MIN_PROBE_TARGETS 9197c478bd9Sstevel@tonic-gate * other active targets 9207c478bd9Sstevel@tonic-gate */ 9217c478bd9Sstevel@tonic-gate if (pii->pii_targets_are_routers) { 9227c478bd9Sstevel@tonic-gate /* 9237c478bd9Sstevel@tonic-gate * This is a slow router, mark it as slow 9247c478bd9Sstevel@tonic-gate * and don't use it for further probes. We 9257c478bd9Sstevel@tonic-gate * don't delete it, since it will be populated 9267c478bd9Sstevel@tonic-gate * again when we do a router scan. Hence we 9277c478bd9Sstevel@tonic-gate * need to maintain extra state (unlike the 9287c478bd9Sstevel@tonic-gate * host case below). Mark it as TG_SLOW. 9297c478bd9Sstevel@tonic-gate */ 9307c478bd9Sstevel@tonic-gate if (target->tg_status == TG_ACTIVE) 9317c478bd9Sstevel@tonic-gate pii->pii_ntargets--; 9327c478bd9Sstevel@tonic-gate target->tg_status = TG_SLOW; 9337c478bd9Sstevel@tonic-gate target->tg_latime = gethrtime(); 9347c478bd9Sstevel@tonic-gate target->tg_rtt_sa = -1; 9357c478bd9Sstevel@tonic-gate target->tg_crtt = 0; 9367c478bd9Sstevel@tonic-gate target->tg_rtt_sd = 0; 9377c478bd9Sstevel@tonic-gate if (pii->pii_target_next == target) { 9387c478bd9Sstevel@tonic-gate pii->pii_target_next = 9397c478bd9Sstevel@tonic-gate target_next(target); 9407c478bd9Sstevel@tonic-gate } 9417c478bd9Sstevel@tonic-gate } else { 9427c478bd9Sstevel@tonic-gate /* 9437c478bd9Sstevel@tonic-gate * the slow target is not a router, we can 9447c478bd9Sstevel@tonic-gate * just delete it. 
Send an icmp multicast and 9457c478bd9Sstevel@tonic-gate * pick the fastest responder that is not 9467c478bd9Sstevel@tonic-gate * already an active target. target_delete() 9477c478bd9Sstevel@tonic-gate * adjusts pii->pii_target_next 9487c478bd9Sstevel@tonic-gate */ 9497c478bd9Sstevel@tonic-gate target_delete(target); 9507c478bd9Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 9517c478bd9Sstevel@tonic-gate } 9527c478bd9Sstevel@tonic-gate } else { 9537c478bd9Sstevel@tonic-gate /* 9547c478bd9Sstevel@tonic-gate * We can't meet the failure detection time. 9557c478bd9Sstevel@tonic-gate * Log a message, and update the detection time to 9567c478bd9Sstevel@tonic-gate * whatever we can achieve. 9577c478bd9Sstevel@tonic-gate */ 9587c478bd9Sstevel@tonic-gate pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 9597c478bd9Sstevel@tonic-gate pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 9607c478bd9Sstevel@tonic-gate last_fdt_bumpup_time = gethrtime(); 9617c478bd9Sstevel@tonic-gate if (pg != phyint_anongroup) { 9627c478bd9Sstevel@tonic-gate logerr("Cannot meet requested failure detection" 9637c478bd9Sstevel@tonic-gate " time of %d ms on (%s %s) new failure" 9647c478bd9Sstevel@tonic-gate " detection time for group \"%s\" is %d" 9657c478bd9Sstevel@tonic-gate " ms\n", user_failure_detection_time, 9667c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 9677c478bd9Sstevel@tonic-gate pg->pg_name, pg->pg_fdt); 9687c478bd9Sstevel@tonic-gate } 9697c478bd9Sstevel@tonic-gate } 9707c478bd9Sstevel@tonic-gate } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 9717c478bd9Sstevel@tonic-gate (user_failure_detection_time < pg->pg_fdt) && 9727c478bd9Sstevel@tonic-gate (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 9737c478bd9Sstevel@tonic-gate /* 9747c478bd9Sstevel@tonic-gate * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 9757c478bd9Sstevel@tonic-gate * investigate if we can improve the failure detection time to 
9767c478bd9Sstevel@tonic-gate * meet whatever the user specified. 9777c478bd9Sstevel@tonic-gate */ 9787c478bd9Sstevel@tonic-gate if (check_pg_crtt_improved(pg)) { 9797c478bd9Sstevel@tonic-gate pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 9807c478bd9Sstevel@tonic-gate user_failure_detection_time); 9817c478bd9Sstevel@tonic-gate pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 9827c478bd9Sstevel@tonic-gate if (pg != phyint_anongroup) { 9837c478bd9Sstevel@tonic-gate logerr("Improved failure detection time %d ms " 9847c478bd9Sstevel@tonic-gate "on (%s %s) for group \"%s\"\n", pg->pg_fdt, 9857c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 9867c478bd9Sstevel@tonic-gate pg->pg_name); 9877c478bd9Sstevel@tonic-gate } 9887c478bd9Sstevel@tonic-gate if (user_failure_detection_time == pg->pg_fdt) { 9897c478bd9Sstevel@tonic-gate /* Avoid any truncation or rounding errors */ 9907c478bd9Sstevel@tonic-gate pg->pg_probeint = user_probe_interval; 9917c478bd9Sstevel@tonic-gate /* 9927c478bd9Sstevel@tonic-gate * No more rtt probes will be sent. The actual 9937c478bd9Sstevel@tonic-gate * fdt has dropped to the user specified value. 9947c478bd9Sstevel@tonic-gate * pii_fd_snxt_basetime and pii_snxt_basetime 9957c478bd9Sstevel@tonic-gate * will be in sync henceforth. 9967c478bd9Sstevel@tonic-gate */ 9977c478bd9Sstevel@tonic-gate reset_snxt_basetimes(); 9987c478bd9Sstevel@tonic-gate } 9997c478bd9Sstevel@tonic-gate } 10007c478bd9Sstevel@tonic-gate } 10017c478bd9Sstevel@tonic-gate out: 10027c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status = PR_ACKED; 10037c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_time_acked = cur_time; 10047c478bd9Sstevel@tonic-gate 10057c478bd9Sstevel@tonic-gate /* 10067c478bd9Sstevel@tonic-gate * Update pii->pii_rack, i.e. 
the sequence number of the last received 10077c478bd9Sstevel@tonic-gate * probe response, based on the echo reply we have received now, if 10087c478bd9Sstevel@tonic-gate * either of the following conditions are satisfied. 10097c478bd9Sstevel@tonic-gate * a. pii_rack is outside the current receive window of 10107c478bd9Sstevel@tonic-gate * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 10117c478bd9Sstevel@tonic-gate * This means we have not received probe responses for a 10127c478bd9Sstevel@tonic-gate * long time, and the sequence number has wrapped around. 10137c478bd9Sstevel@tonic-gate * b. pii_rack is within the current receive window and this echo 10147c478bd9Sstevel@tonic-gate * reply corresponds to the highest sequence number we have seen 10157c478bd9Sstevel@tonic-gate * so far. 10167c478bd9Sstevel@tonic-gate */ 10177c478bd9Sstevel@tonic-gate if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 10187c478bd9Sstevel@tonic-gate SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 10197c478bd9Sstevel@tonic-gate SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 10207c478bd9Sstevel@tonic-gate pii->pii_rack = pr_icmp_seq; 10217c478bd9Sstevel@tonic-gate } 10227c478bd9Sstevel@tonic-gate } 10237c478bd9Sstevel@tonic-gate 10247c478bd9Sstevel@tonic-gate /* 10257c478bd9Sstevel@tonic-gate * Returns true if seq is the highest unacknowledged seq for target tg 10267c478bd9Sstevel@tonic-gate * else returns false 10277c478bd9Sstevel@tonic-gate */ 10287c478bd9Sstevel@tonic-gate static boolean_t 10297c478bd9Sstevel@tonic-gate highest_ack_tg(uint16_t seq, struct target *tg) 10307c478bd9Sstevel@tonic-gate { 10317c478bd9Sstevel@tonic-gate struct phyint_instance *pii; 10327c478bd9Sstevel@tonic-gate int pr_ndx; 10337c478bd9Sstevel@tonic-gate uint16_t pr_seq; 10347c478bd9Sstevel@tonic-gate 10357c478bd9Sstevel@tonic-gate pii = tg->tg_phyint_inst; 10367c478bd9Sstevel@tonic-gate 10377c478bd9Sstevel@tonic-gate /* 10387c478bd9Sstevel@tonic-gate * Get the seq number of the most recent probe sent so 
far, 10397c478bd9Sstevel@tonic-gate * and also get the corresponding probe index in the probe stats 10407c478bd9Sstevel@tonic-gate * array. 10417c478bd9Sstevel@tonic-gate */ 10427c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 10437c478bd9Sstevel@tonic-gate pr_seq = pii->pii_snxt; 10447c478bd9Sstevel@tonic-gate pr_seq--; 10457c478bd9Sstevel@tonic-gate 10467c478bd9Sstevel@tonic-gate /* 10477c478bd9Sstevel@tonic-gate * Start from the most recent probe and walk back, trying to find 10487c478bd9Sstevel@tonic-gate * an acked probe corresponding to target tg. 10497c478bd9Sstevel@tonic-gate */ 10507c478bd9Sstevel@tonic-gate for (; pr_ndx != pii->pii_probe_next; 10517c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 10527c478bd9Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_target == tg && 10537c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 10547c478bd9Sstevel@tonic-gate if (SEQ_GT(pr_seq, seq)) 10557c478bd9Sstevel@tonic-gate return (_B_FALSE); 10567c478bd9Sstevel@tonic-gate } 10577c478bd9Sstevel@tonic-gate } 10587c478bd9Sstevel@tonic-gate return (_B_TRUE); 10597c478bd9Sstevel@tonic-gate } 10607c478bd9Sstevel@tonic-gate 10617c478bd9Sstevel@tonic-gate /* 10627c478bd9Sstevel@tonic-gate * Check whether the crtt for the group has improved by a factor of 10637c478bd9Sstevel@tonic-gate * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 10647c478bd9Sstevel@tonic-gate * detection time flapping in the face of small crtt changes. 
10657c478bd9Sstevel@tonic-gate */ 10667c478bd9Sstevel@tonic-gate static boolean_t 10677c478bd9Sstevel@tonic-gate check_pg_crtt_improved(struct phyint_group *pg) 10687c478bd9Sstevel@tonic-gate { 10697c478bd9Sstevel@tonic-gate struct phyint *pi; 10707c478bd9Sstevel@tonic-gate 10717c478bd9Sstevel@tonic-gate if (debug & D_PROBE) 10727c478bd9Sstevel@tonic-gate logdebug("check_pg_crtt_improved()\n"); 10737c478bd9Sstevel@tonic-gate 10747c478bd9Sstevel@tonic-gate /* 10757c478bd9Sstevel@tonic-gate * The crtt for the group is only improved if each phyint_instance 10767c478bd9Sstevel@tonic-gate * for both ipv4 and ipv6 is improved. 10777c478bd9Sstevel@tonic-gate */ 10787c478bd9Sstevel@tonic-gate for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 10797c478bd9Sstevel@tonic-gate if (!check_pii_crtt_improved(pi->pi_v4) || 10807c478bd9Sstevel@tonic-gate !check_pii_crtt_improved(pi->pi_v6)) 10817c478bd9Sstevel@tonic-gate return (_B_FALSE); 10827c478bd9Sstevel@tonic-gate } 10837c478bd9Sstevel@tonic-gate 10847c478bd9Sstevel@tonic-gate return (_B_TRUE); 10857c478bd9Sstevel@tonic-gate } 10867c478bd9Sstevel@tonic-gate 10877c478bd9Sstevel@tonic-gate /* 10887c478bd9Sstevel@tonic-gate * Check whether the crtt has improved substantially on this phyint_instance. 10897c478bd9Sstevel@tonic-gate * Returns _B_TRUE if there's no crtt information available, because pii 10907c478bd9Sstevel@tonic-gate * is NULL or the phyint_instance is not capable of probing. 
10917c478bd9Sstevel@tonic-gate */ 10927c478bd9Sstevel@tonic-gate boolean_t 10937c478bd9Sstevel@tonic-gate check_pii_crtt_improved(struct phyint_instance *pii) { 10947c478bd9Sstevel@tonic-gate struct target *tg; 10957c478bd9Sstevel@tonic-gate 10967c478bd9Sstevel@tonic-gate if (pii == NULL) 10977c478bd9Sstevel@tonic-gate return (_B_TRUE); 10987c478bd9Sstevel@tonic-gate 10997c478bd9Sstevel@tonic-gate if (!PROBE_CAPABLE(pii) || 11007c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_state == PI_FAILED) 11017c478bd9Sstevel@tonic-gate return (_B_TRUE); 11027c478bd9Sstevel@tonic-gate 11037c478bd9Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 11047c478bd9Sstevel@tonic-gate if (tg->tg_status != TG_ACTIVE) 11057c478bd9Sstevel@tonic-gate continue; 11067c478bd9Sstevel@tonic-gate if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 11077c478bd9Sstevel@tonic-gate LOWER_FDT_TRIGGER)) { 11087c478bd9Sstevel@tonic-gate return (_B_FALSE); 11097c478bd9Sstevel@tonic-gate } 11107c478bd9Sstevel@tonic-gate } 11117c478bd9Sstevel@tonic-gate 11127c478bd9Sstevel@tonic-gate return (_B_TRUE); 11137c478bd9Sstevel@tonic-gate } 11147c478bd9Sstevel@tonic-gate 11157c478bd9Sstevel@tonic-gate /* 11167c478bd9Sstevel@tonic-gate * This target responds very slowly to probes. The target's crtt exceeds 11177c478bd9Sstevel@tonic-gate * the probe interval of its group. 
Compare against other targets 11187c478bd9Sstevel@tonic-gate * and determine if this target is an exception, if so return true, else false 11197c478bd9Sstevel@tonic-gate */ 11207c478bd9Sstevel@tonic-gate static boolean_t 11217c478bd9Sstevel@tonic-gate check_exception_target(struct phyint_instance *pii, struct target *target) 11227c478bd9Sstevel@tonic-gate { 11237c478bd9Sstevel@tonic-gate struct target *tg; 11247c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 11257c478bd9Sstevel@tonic-gate 11267c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 11277c478bd9Sstevel@tonic-gate logdebug("check_exception_target(%s %s target %s)\n", 11287c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 11297c478bd9Sstevel@tonic-gate pr_addr(pii->pii_af, target->tg_address, 11307c478bd9Sstevel@tonic-gate abuf, sizeof (abuf))); 11317c478bd9Sstevel@tonic-gate } 11327c478bd9Sstevel@tonic-gate 11337c478bd9Sstevel@tonic-gate /* 11347c478bd9Sstevel@tonic-gate * We should have at least MIN_PROBE_TARGETS + 1 good targets now, 11357c478bd9Sstevel@tonic-gate * to make a good judgement. Otherwise don't drop this target. 11367c478bd9Sstevel@tonic-gate */ 11377c478bd9Sstevel@tonic-gate if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1) 11387c478bd9Sstevel@tonic-gate return (_B_FALSE); 11397c478bd9Sstevel@tonic-gate 11407c478bd9Sstevel@tonic-gate /* 11417c478bd9Sstevel@tonic-gate * Determine whether only this particular target is slow. 11427c478bd9Sstevel@tonic-gate * We know that this target's crtt exceeds the group's probe interval. 11437c478bd9Sstevel@tonic-gate * If all other active targets have a 11447c478bd9Sstevel@tonic-gate * crtt < (this group's probe interval) / EXCEPTION_FACTOR, 11457c478bd9Sstevel@tonic-gate * then this target is considered slow. 
11467c478bd9Sstevel@tonic-gate */ 11477c478bd9Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 11487c478bd9Sstevel@tonic-gate if (tg != target && tg->tg_status == TG_ACTIVE) { 11497c478bd9Sstevel@tonic-gate if (tg->tg_crtt > 11507c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint / 11517c478bd9Sstevel@tonic-gate EXCEPTION_FACTOR) { 11527c478bd9Sstevel@tonic-gate return (_B_FALSE); 11537c478bd9Sstevel@tonic-gate } 11547c478bd9Sstevel@tonic-gate } 11557c478bd9Sstevel@tonic-gate } 11567c478bd9Sstevel@tonic-gate 11577c478bd9Sstevel@tonic-gate return (_B_TRUE); 11587c478bd9Sstevel@tonic-gate } 11597c478bd9Sstevel@tonic-gate 11607c478bd9Sstevel@tonic-gate /* 11617c478bd9Sstevel@tonic-gate * Update the target list. The icmp all hosts multicast has given us 11627c478bd9Sstevel@tonic-gate * some host to which we can send probes. If we already have sufficient 11637c478bd9Sstevel@tonic-gate * targets, discard it. 11647c478bd9Sstevel@tonic-gate */ 11657c478bd9Sstevel@tonic-gate static void 11667c478bd9Sstevel@tonic-gate incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 11677c478bd9Sstevel@tonic-gate struct in6_addr fromaddr) 11687c478bd9Sstevel@tonic-gate /* ARGSUSED */ 11697c478bd9Sstevel@tonic-gate { 11707c478bd9Sstevel@tonic-gate int af; 11717c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 11727c478bd9Sstevel@tonic-gate struct phyint *pi; 11737c478bd9Sstevel@tonic-gate 11747c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 11757c478bd9Sstevel@tonic-gate logdebug("incoming_mcast_reply(%s %s %s)\n", 11767c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 11777c478bd9Sstevel@tonic-gate pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 11787c478bd9Sstevel@tonic-gate } 11797c478bd9Sstevel@tonic-gate 11807c478bd9Sstevel@tonic-gate /* 11817c478bd9Sstevel@tonic-gate * Using host targets is a fallback mechanism. 
If we have 11827c478bd9Sstevel@tonic-gate * found a router, don't add this host target. If we already 11837c478bd9Sstevel@tonic-gate * know MAX_PROBE_TARGETS, don't add another target. 11847c478bd9Sstevel@tonic-gate */ 11857c478bd9Sstevel@tonic-gate assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 11867c478bd9Sstevel@tonic-gate if (pii->pii_targets != NULL) { 11877c478bd9Sstevel@tonic-gate if (pii->pii_targets_are_routers || 11887c478bd9Sstevel@tonic-gate (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 11897c478bd9Sstevel@tonic-gate return; 11907c478bd9Sstevel@tonic-gate } 11917c478bd9Sstevel@tonic-gate } 11927c478bd9Sstevel@tonic-gate 11937c478bd9Sstevel@tonic-gate if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 11947c478bd9Sstevel@tonic-gate IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 11957c478bd9Sstevel@tonic-gate /* 11967c478bd9Sstevel@tonic-gate * Guard against response from 0.0.0.0 11977c478bd9Sstevel@tonic-gate * and ::. Log a trace message 11987c478bd9Sstevel@tonic-gate */ 11997c478bd9Sstevel@tonic-gate logtrace("probe response from %s on %s\n", 12007c478bd9Sstevel@tonic-gate pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 12017c478bd9Sstevel@tonic-gate pii->pii_name); 12027c478bd9Sstevel@tonic-gate return; 12037c478bd9Sstevel@tonic-gate } 12047c478bd9Sstevel@tonic-gate 12057c478bd9Sstevel@tonic-gate /* 12067c478bd9Sstevel@tonic-gate * This address is one of our own, so reject this address as a 12077c478bd9Sstevel@tonic-gate * valid probe target. 12087c478bd9Sstevel@tonic-gate */ 12097c478bd9Sstevel@tonic-gate af = pii->pii_af; 12107c478bd9Sstevel@tonic-gate if (own_address(af, fromaddr)) 12117c478bd9Sstevel@tonic-gate return; 12127c478bd9Sstevel@tonic-gate 12137c478bd9Sstevel@tonic-gate /* 12147c478bd9Sstevel@tonic-gate * If the phyint is part a named group, then add the address to all 12157c478bd9Sstevel@tonic-gate * members of the group. 
Otherwise, add the address only to the 12167c478bd9Sstevel@tonic-gate * phyint itself, since other phyints in the anongroup may not be on 12177c478bd9Sstevel@tonic-gate * the same subnet. 12187c478bd9Sstevel@tonic-gate */ 12197c478bd9Sstevel@tonic-gate pi = pii->pii_phyint; 12207c478bd9Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 12217c478bd9Sstevel@tonic-gate target_add(pii, fromaddr, _B_FALSE); 12227c478bd9Sstevel@tonic-gate } else { 12237c478bd9Sstevel@tonic-gate pi = pi->pi_group->pg_phyint; 12247c478bd9Sstevel@tonic-gate for (; pi != NULL; pi = pi->pi_pgnext) 12257c478bd9Sstevel@tonic-gate target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 12267c478bd9Sstevel@tonic-gate } 12277c478bd9Sstevel@tonic-gate } 12287c478bd9Sstevel@tonic-gate 12297c478bd9Sstevel@tonic-gate /* 12307c478bd9Sstevel@tonic-gate * Compute CRTT given an existing scaled average, scaled deviation estimate 12317c478bd9Sstevel@tonic-gate * and a new rtt time. The formula is from Jacobson and Karels' 12327c478bd9Sstevel@tonic-gate * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 12337c478bd9Sstevel@tonic-gate * are the same as those in Appendix A.2 of that paper. 12347c478bd9Sstevel@tonic-gate * 12357c478bd9Sstevel@tonic-gate * m = new measurement 12367c478bd9Sstevel@tonic-gate * sa = scaled RTT average (8 * average estimates) 12377c478bd9Sstevel@tonic-gate * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 12387c478bd9Sstevel@tonic-gate * crtt = Conservative round trip time. Used to determine whether probe 12397c478bd9Sstevel@tonic-gate * has timed out. 
12407c478bd9Sstevel@tonic-gate * 12417c478bd9Sstevel@tonic-gate * New scaled average and deviation are passed back via sap and svp 12427c478bd9Sstevel@tonic-gate */ 12437c478bd9Sstevel@tonic-gate static int 12447c478bd9Sstevel@tonic-gate compute_crtt(int *sap, int *svp, int m) 12457c478bd9Sstevel@tonic-gate { 12467c478bd9Sstevel@tonic-gate int sa = *sap; 12477c478bd9Sstevel@tonic-gate int sv = *svp; 12487c478bd9Sstevel@tonic-gate int crtt; 12497c478bd9Sstevel@tonic-gate int saved_m = m; 12507c478bd9Sstevel@tonic-gate 12517c478bd9Sstevel@tonic-gate assert(*sap >= -1); 12527c478bd9Sstevel@tonic-gate assert(*svp >= 0); 12537c478bd9Sstevel@tonic-gate 12547c478bd9Sstevel@tonic-gate if (sa != -1) { 12557c478bd9Sstevel@tonic-gate /* 12567c478bd9Sstevel@tonic-gate * Update average estimator: 12577c478bd9Sstevel@tonic-gate * new rtt = old rtt + 1/8 Error 12587c478bd9Sstevel@tonic-gate * where Error = m - old rtt 12597c478bd9Sstevel@tonic-gate * i.e. 8 * new rtt = 8 * old rtt + Error 12607c478bd9Sstevel@tonic-gate * i.e. new sa = old sa + Error 12617c478bd9Sstevel@tonic-gate */ 12627c478bd9Sstevel@tonic-gate m -= sa >> 3; /* m is now Error in estimate. */ 12637c478bd9Sstevel@tonic-gate if ((sa += m) < 0) { 12647c478bd9Sstevel@tonic-gate /* Don't allow the smoothed average to be negative. */ 12657c478bd9Sstevel@tonic-gate sa = 0; 12667c478bd9Sstevel@tonic-gate } 12677c478bd9Sstevel@tonic-gate 12687c478bd9Sstevel@tonic-gate /* 12697c478bd9Sstevel@tonic-gate * Update deviation estimator: 12707c478bd9Sstevel@tonic-gate * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 12717c478bd9Sstevel@tonic-gate * i.e. 4 * new mdev = 4 * old mdev + 12727c478bd9Sstevel@tonic-gate * (abs(Error) - old mdev) 12737c478bd9Sstevel@tonic-gate * i.e. 
new sv = old sv + (abs(Error) - old mdev) 12747c478bd9Sstevel@tonic-gate */ 12757c478bd9Sstevel@tonic-gate if (m < 0) 12767c478bd9Sstevel@tonic-gate m = -m; 12777c478bd9Sstevel@tonic-gate m -= sv >> 2; 12787c478bd9Sstevel@tonic-gate sv += m; 12797c478bd9Sstevel@tonic-gate } else { 12807c478bd9Sstevel@tonic-gate /* Initialization. This is the first response received. */ 12817c478bd9Sstevel@tonic-gate sa = (m << 3); 12827c478bd9Sstevel@tonic-gate sv = (m << 1); 12837c478bd9Sstevel@tonic-gate } 12847c478bd9Sstevel@tonic-gate 12857c478bd9Sstevel@tonic-gate crtt = (sa >> 3) + sv; 12867c478bd9Sstevel@tonic-gate 12877c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 12887c478bd9Sstevel@tonic-gate logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " 12897c478bd9Sstevel@tonic-gate "%d\n", saved_m, sa, sv, crtt); 12907c478bd9Sstevel@tonic-gate } 12917c478bd9Sstevel@tonic-gate 12927c478bd9Sstevel@tonic-gate *sap = sa; 12937c478bd9Sstevel@tonic-gate *svp = sv; 12947c478bd9Sstevel@tonic-gate 12957c478bd9Sstevel@tonic-gate /* 12967c478bd9Sstevel@tonic-gate * CRTT = average estimates + 4 * deviation estimates 12977c478bd9Sstevel@tonic-gate * = sa / 8 + sv 12987c478bd9Sstevel@tonic-gate */ 12997c478bd9Sstevel@tonic-gate return (crtt); 13007c478bd9Sstevel@tonic-gate } 13017c478bd9Sstevel@tonic-gate 13027c478bd9Sstevel@tonic-gate static void 13037c478bd9Sstevel@tonic-gate pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) 13047c478bd9Sstevel@tonic-gate { 13057c478bd9Sstevel@tonic-gate struct phyint_instance *pii = tg->tg_phyint_inst; 13067c478bd9Sstevel@tonic-gate int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 13077c478bd9Sstevel@tonic-gate int sa = tg->tg_rtt_sa; 13087c478bd9Sstevel@tonic-gate int sv = tg->tg_rtt_sd; 13097c478bd9Sstevel@tonic-gate int new_crtt; 13107c478bd9Sstevel@tonic-gate int i; 13117c478bd9Sstevel@tonic-gate 13127c478bd9Sstevel@tonic-gate if (debug & D_PROBE) 13137c478bd9Sstevel@tonic-gate logdebug("pi_set_crtt: target - m %d\n", 
m); 13147c478bd9Sstevel@tonic-gate 13157c478bd9Sstevel@tonic-gate /* store the round trip time, in case we need to defer computation */ 13167c478bd9Sstevel@tonic-gate tg->tg_deferred[tg->tg_num_deferred] = m; 13177c478bd9Sstevel@tonic-gate 13187c478bd9Sstevel@tonic-gate new_crtt = compute_crtt(&sa, &sv, m); 13197c478bd9Sstevel@tonic-gate 13207c478bd9Sstevel@tonic-gate /* 13217c478bd9Sstevel@tonic-gate * If this probe's round trip time would singlehandedly cause an 13227c478bd9Sstevel@tonic-gate * increase in the group's probe interval consider it suspect. 13237c478bd9Sstevel@tonic-gate */ 13247c478bd9Sstevel@tonic-gate if ((new_crtt > probe_interval) && is_probe_uni) { 13257c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 13267c478bd9Sstevel@tonic-gate logdebug("Received a suspect probe on %s, new_crtt =" 13277c478bd9Sstevel@tonic-gate " %d, probe_interval = %d, num_deferred = %d\n", 13287c478bd9Sstevel@tonic-gate pii->pii_probe_logint->li_name, new_crtt, 13297c478bd9Sstevel@tonic-gate probe_interval, tg->tg_num_deferred); 13307c478bd9Sstevel@tonic-gate } 13317c478bd9Sstevel@tonic-gate 13327c478bd9Sstevel@tonic-gate /* 13337c478bd9Sstevel@tonic-gate * If we've deferred as many rtts as we plan on deferring, then 13347c478bd9Sstevel@tonic-gate * assume the link really did slow down and process all queued 13357c478bd9Sstevel@tonic-gate * rtts 13367c478bd9Sstevel@tonic-gate */ 13377c478bd9Sstevel@tonic-gate if (tg->tg_num_deferred == MAXDEFERREDRTT) { 13387c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 13397c478bd9Sstevel@tonic-gate logdebug("Received MAXDEFERREDRTT probes which " 13407c478bd9Sstevel@tonic-gate "would cause an increased probe_interval. 
" 13417c478bd9Sstevel@tonic-gate "Integrating queued rtt data points.\n"); 13427c478bd9Sstevel@tonic-gate } 13437c478bd9Sstevel@tonic-gate 13447c478bd9Sstevel@tonic-gate for (i = 0; i <= tg->tg_num_deferred; i++) { 13457c478bd9Sstevel@tonic-gate tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, 13467c478bd9Sstevel@tonic-gate &tg->tg_rtt_sd, tg->tg_deferred[i]); 13477c478bd9Sstevel@tonic-gate } 13487c478bd9Sstevel@tonic-gate 13497c478bd9Sstevel@tonic-gate tg->tg_num_deferred = 0; 13507c478bd9Sstevel@tonic-gate } else { 13517c478bd9Sstevel@tonic-gate tg->tg_num_deferred++; 13527c478bd9Sstevel@tonic-gate } 13537c478bd9Sstevel@tonic-gate return; 13547c478bd9Sstevel@tonic-gate } 13557c478bd9Sstevel@tonic-gate 13567c478bd9Sstevel@tonic-gate /* 13577c478bd9Sstevel@tonic-gate * If this is a normal probe, or an RTT probe that would lead to a 13587c478bd9Sstevel@tonic-gate * reduced CRTT, then update our CRTT data. Further, if this was 13597c478bd9Sstevel@tonic-gate * a normal probe, pitch any deferred probes since our probes are 13607c478bd9Sstevel@tonic-gate * again being answered within our CRTT estimates. 13617c478bd9Sstevel@tonic-gate */ 13627c478bd9Sstevel@tonic-gate if (is_probe_uni || new_crtt < tg->tg_crtt) { 13637c478bd9Sstevel@tonic-gate tg->tg_rtt_sa = sa; 13647c478bd9Sstevel@tonic-gate tg->tg_rtt_sd = sv; 13657c478bd9Sstevel@tonic-gate tg->tg_crtt = new_crtt; 13667c478bd9Sstevel@tonic-gate if (is_probe_uni) 13677c478bd9Sstevel@tonic-gate tg->tg_num_deferred = 0; 13687c478bd9Sstevel@tonic-gate } 13697c478bd9Sstevel@tonic-gate } 13707c478bd9Sstevel@tonic-gate 13717c478bd9Sstevel@tonic-gate /* 13727c478bd9Sstevel@tonic-gate * Return a pointer to the specified option buffer. 13737c478bd9Sstevel@tonic-gate * If not found return NULL. 
13747c478bd9Sstevel@tonic-gate */ 13757c478bd9Sstevel@tonic-gate static void * 13767c478bd9Sstevel@tonic-gate find_ancillary(struct msghdr *msg, int cmsg_type) 13777c478bd9Sstevel@tonic-gate { 13787c478bd9Sstevel@tonic-gate struct cmsghdr *cmsg; 13797c478bd9Sstevel@tonic-gate 13807c478bd9Sstevel@tonic-gate for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 13817c478bd9Sstevel@tonic-gate cmsg = CMSG_NXTHDR(msg, cmsg)) { 13827c478bd9Sstevel@tonic-gate if (cmsg->cmsg_level == IPPROTO_IPV6 && 13837c478bd9Sstevel@tonic-gate cmsg->cmsg_type == cmsg_type) { 13847c478bd9Sstevel@tonic-gate return (CMSG_DATA(cmsg)); 13857c478bd9Sstevel@tonic-gate } 13867c478bd9Sstevel@tonic-gate } 13877c478bd9Sstevel@tonic-gate return (NULL); 13887c478bd9Sstevel@tonic-gate } 13897c478bd9Sstevel@tonic-gate 13907c478bd9Sstevel@tonic-gate /* 13917c478bd9Sstevel@tonic-gate * See if a previously failed interface has started working again. 13927c478bd9Sstevel@tonic-gate */ 13937c478bd9Sstevel@tonic-gate void 13947c478bd9Sstevel@tonic-gate phyint_check_for_repair(struct phyint *pi) 13957c478bd9Sstevel@tonic-gate { 13967c478bd9Sstevel@tonic-gate if (phyint_repaired(pi)) { 13977c478bd9Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 13987c478bd9Sstevel@tonic-gate logerr("NIC repair detected on %s\n", pi->pi_name); 13997c478bd9Sstevel@tonic-gate } else { 14007c478bd9Sstevel@tonic-gate logerr("NIC repair detected on %s of group %s\n", 14017c478bd9Sstevel@tonic-gate pi->pi_name, pi->pi_group->pg_name); 14027c478bd9Sstevel@tonic-gate } 14037c478bd9Sstevel@tonic-gate 14047c478bd9Sstevel@tonic-gate /* 14057c478bd9Sstevel@tonic-gate * If the interface is offline, just clear the FAILED flag, 14067c478bd9Sstevel@tonic-gate * delaying the state change and failback operation until it 14077c478bd9Sstevel@tonic-gate * is brought back online. 
14087c478bd9Sstevel@tonic-gate */ 14097c478bd9Sstevel@tonic-gate if (pi->pi_state == PI_OFFLINE) { 14107c478bd9Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 14117c478bd9Sstevel@tonic-gate return; 14127c478bd9Sstevel@tonic-gate } 14137c478bd9Sstevel@tonic-gate 1414*49df4566Sethindra if (pi->pi_flags & IFF_STANDBY) { 14157c478bd9Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 14167c478bd9Sstevel@tonic-gate } else { 14177c478bd9Sstevel@tonic-gate if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) { 14187c478bd9Sstevel@tonic-gate (void) change_lif_flags(pi, 14197c478bd9Sstevel@tonic-gate IFF_FAILED, _B_FALSE); 14207c478bd9Sstevel@tonic-gate /* Per state diagram */ 14217c478bd9Sstevel@tonic-gate pi->pi_empty = 0; 14227c478bd9Sstevel@tonic-gate } 14237c478bd9Sstevel@tonic-gate } 14247c478bd9Sstevel@tonic-gate 14257c478bd9Sstevel@tonic-gate phyint_chstate(pi, PI_RUNNING); 14267c478bd9Sstevel@tonic-gate 14277c478bd9Sstevel@tonic-gate if (GROUP_FAILED(pi->pi_group)) { 14287c478bd9Sstevel@tonic-gate /* 14297c478bd9Sstevel@tonic-gate * This is the 1st phyint to receive a response 14307c478bd9Sstevel@tonic-gate * after group failure. 14317c478bd9Sstevel@tonic-gate */ 14327c478bd9Sstevel@tonic-gate logerr("At least 1 interface (%s) of group %s has " 14337c478bd9Sstevel@tonic-gate "repaired\n", pi->pi_name, pi->pi_group->pg_name); 14347c478bd9Sstevel@tonic-gate phyint_group_chstate(pi->pi_group, PG_RUNNING); 14357c478bd9Sstevel@tonic-gate } 14367c478bd9Sstevel@tonic-gate } 14377c478bd9Sstevel@tonic-gate } 14387c478bd9Sstevel@tonic-gate 14397c478bd9Sstevel@tonic-gate /* 14407c478bd9Sstevel@tonic-gate * See if a previously functioning interface has failed, or if the 14417c478bd9Sstevel@tonic-gate * whole group of interfaces has failed. 
14427c478bd9Sstevel@tonic-gate */ 14437c478bd9Sstevel@tonic-gate static void 14447c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(struct phyint_instance *pii) 14457c478bd9Sstevel@tonic-gate { 14467c478bd9Sstevel@tonic-gate struct phyint *pi; 14477c478bd9Sstevel@tonic-gate struct phyint *pi2; 14487c478bd9Sstevel@tonic-gate 14497c478bd9Sstevel@tonic-gate pi = pii->pii_phyint; 14507c478bd9Sstevel@tonic-gate 14517c478bd9Sstevel@tonic-gate switch (failure_state(pii)) { 14527c478bd9Sstevel@tonic-gate case PHYINT_FAILURE: 14537c478bd9Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 14547c478bd9Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 14557c478bd9Sstevel@tonic-gate logerr("NIC failure detected on %s\n", pii->pii_name); 14567c478bd9Sstevel@tonic-gate } else { 14577c478bd9Sstevel@tonic-gate logerr("NIC failure detected on %s of group %s\n", 14587c478bd9Sstevel@tonic-gate pii->pii_name, pi->pi_group->pg_name); 14597c478bd9Sstevel@tonic-gate } 14607c478bd9Sstevel@tonic-gate /* 14617c478bd9Sstevel@tonic-gate * Do the failover, unless the interface is offline (in 14627c478bd9Sstevel@tonic-gate * which case we've already failed over). 
14637c478bd9Sstevel@tonic-gate */ 14647c478bd9Sstevel@tonic-gate if (pi->pi_state != PI_OFFLINE) { 14657c478bd9Sstevel@tonic-gate phyint_chstate(pi, PI_FAILED); 14667c478bd9Sstevel@tonic-gate reset_crtt_all(pi); 14677c478bd9Sstevel@tonic-gate if (!(pi->pi_flags & IFF_INACTIVE)) 14687c478bd9Sstevel@tonic-gate (void) try_failover(pi, FAILOVER_NORMAL); 14697c478bd9Sstevel@tonic-gate } 14707c478bd9Sstevel@tonic-gate break; 14717c478bd9Sstevel@tonic-gate 14727c478bd9Sstevel@tonic-gate case GROUP_FAILURE: 14737c478bd9Sstevel@tonic-gate logerr("All Interfaces in group %s have failed\n", 14747c478bd9Sstevel@tonic-gate pi->pi_group->pg_name); 14757c478bd9Sstevel@tonic-gate for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; 14767c478bd9Sstevel@tonic-gate pi2 = pi2->pi_pgnext) { 14777c478bd9Sstevel@tonic-gate if (pi2->pi_flags & IFF_OFFLINE) 14787c478bd9Sstevel@tonic-gate continue; 14797c478bd9Sstevel@tonic-gate (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); 14807c478bd9Sstevel@tonic-gate reset_crtt_all(pi2); 14817c478bd9Sstevel@tonic-gate 14827c478bd9Sstevel@tonic-gate /* 14837c478bd9Sstevel@tonic-gate * In the case of host targets, we 14847c478bd9Sstevel@tonic-gate * would have flushed the targets, 14857c478bd9Sstevel@tonic-gate * and gone to PI_NOTARGETS state. 
14867c478bd9Sstevel@tonic-gate */ 14877c478bd9Sstevel@tonic-gate if (pi2->pi_state == PI_RUNNING) 1488*49df4566Sethindra phyint_chstate(pi2, PI_FAILED); 14897c478bd9Sstevel@tonic-gate 14907c478bd9Sstevel@tonic-gate pi2->pi_empty = 0; 14917c478bd9Sstevel@tonic-gate pi2->pi_full = 0; 14927c478bd9Sstevel@tonic-gate } 14937c478bd9Sstevel@tonic-gate break; 14947c478bd9Sstevel@tonic-gate 14957c478bd9Sstevel@tonic-gate default: 14967c478bd9Sstevel@tonic-gate break; 14977c478bd9Sstevel@tonic-gate } 14987c478bd9Sstevel@tonic-gate } 14997c478bd9Sstevel@tonic-gate 15007c478bd9Sstevel@tonic-gate /* 15017c478bd9Sstevel@tonic-gate * Determines if any timeout event has occurred and returns the number of 15027c478bd9Sstevel@tonic-gate * milliseconds until the next timeout event for the phyint. Returns 15037c478bd9Sstevel@tonic-gate * TIMER_INFINITY for "never". 15047c478bd9Sstevel@tonic-gate */ 15057c478bd9Sstevel@tonic-gate uint_t 15067c478bd9Sstevel@tonic-gate phyint_inst_timer(struct phyint_instance *pii) 15077c478bd9Sstevel@tonic-gate { 15087c478bd9Sstevel@tonic-gate int pr_ndx; 15097c478bd9Sstevel@tonic-gate uint_t timeout; 15107c478bd9Sstevel@tonic-gate struct target *cur_tg; 15117c478bd9Sstevel@tonic-gate struct probe_stats *pr_statp; 15127c478bd9Sstevel@tonic-gate struct phyint_instance *pii_other; 15137c478bd9Sstevel@tonic-gate struct phyint *pi; 15147c478bd9Sstevel@tonic-gate int valid_unack_count; 15157c478bd9Sstevel@tonic-gate int i; 15167c478bd9Sstevel@tonic-gate int interval; 15177c478bd9Sstevel@tonic-gate uint_t check_time; 15187c478bd9Sstevel@tonic-gate uint_t cur_time; 15197c478bd9Sstevel@tonic-gate hrtime_t cur_hrtime; 15207c478bd9Sstevel@tonic-gate int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 15217c478bd9Sstevel@tonic-gate 15227c478bd9Sstevel@tonic-gate cur_time = getcurrenttime(); 15237c478bd9Sstevel@tonic-gate 15247c478bd9Sstevel@tonic-gate if (debug & D_TIMER) { 15257c478bd9Sstevel@tonic-gate logdebug("phyint_inst_timer(%s %s)\n", 
15267c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 15277c478bd9Sstevel@tonic-gate } 15287c478bd9Sstevel@tonic-gate 15297c478bd9Sstevel@tonic-gate pii_other = phyint_inst_other(pii); 15307c478bd9Sstevel@tonic-gate if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 15317c478bd9Sstevel@tonic-gate /* 15327c478bd9Sstevel@tonic-gate * Check to see if we're here due to link up/down flapping; If 15337c478bd9Sstevel@tonic-gate * enough time has passed, then try to bring the interface 15347c478bd9Sstevel@tonic-gate * back up; otherwise, schedule a timer to bring it back up 15357c478bd9Sstevel@tonic-gate * when enough time *has* elapsed. 15367c478bd9Sstevel@tonic-gate */ 15377c478bd9Sstevel@tonic-gate pi = pii->pii_phyint; 15387c478bd9Sstevel@tonic-gate if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 15397c478bd9Sstevel@tonic-gate check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 15407c478bd9Sstevel@tonic-gate if (check_time > cur_time) 15417c478bd9Sstevel@tonic-gate return (check_time - cur_time); 15427c478bd9Sstevel@tonic-gate 15437c478bd9Sstevel@tonic-gate phyint_check_for_repair(pi); 15447c478bd9Sstevel@tonic-gate } 15457c478bd9Sstevel@tonic-gate } 15467c478bd9Sstevel@tonic-gate 15477c478bd9Sstevel@tonic-gate /* 15487c478bd9Sstevel@tonic-gate * If this phyint is not yet initialized for probes, 15497c478bd9Sstevel@tonic-gate * don't proceed further 15507c478bd9Sstevel@tonic-gate */ 15517c478bd9Sstevel@tonic-gate if (pii->pii_probe_sock == -1) 15527c478bd9Sstevel@tonic-gate return (TIMER_INFINITY); 15537c478bd9Sstevel@tonic-gate 15547c478bd9Sstevel@tonic-gate /* 15557c478bd9Sstevel@tonic-gate * If the timer has fired too soon, probably triggered 15567c478bd9Sstevel@tonic-gate * by some other phyint instance, return the remaining 15577c478bd9Sstevel@tonic-gate * time 15587c478bd9Sstevel@tonic-gate */ 15597c478bd9Sstevel@tonic-gate if (TIME_LT(cur_time, pii->pii_snxt_time)) 15607c478bd9Sstevel@tonic-gate return (pii->pii_snxt_time - cur_time); 
15617c478bd9Sstevel@tonic-gate 15627c478bd9Sstevel@tonic-gate /* 15637c478bd9Sstevel@tonic-gate * If the link is down, don't send any probes for now. 15647c478bd9Sstevel@tonic-gate */ 15657c478bd9Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 15667c478bd9Sstevel@tonic-gate return (TIMER_INFINITY); 15677c478bd9Sstevel@tonic-gate 15687c478bd9Sstevel@tonic-gate /* 15697c478bd9Sstevel@tonic-gate * Randomize the next probe time, between MIN_RANDOM_FACTOR 15707c478bd9Sstevel@tonic-gate * and MAX_RANDOM_FACTOR with respect to the base probe time. 15717c478bd9Sstevel@tonic-gate * Base probe time is strictly periodic. 15727c478bd9Sstevel@tonic-gate */ 15737c478bd9Sstevel@tonic-gate interval = GET_RANDOM( 15747c478bd9Sstevel@tonic-gate (int)(MIN_RANDOM_FACTOR * user_probe_interval), 15757c478bd9Sstevel@tonic-gate (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 15767c478bd9Sstevel@tonic-gate pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 15777c478bd9Sstevel@tonic-gate 15787c478bd9Sstevel@tonic-gate /* 15797c478bd9Sstevel@tonic-gate * Check if the current time > next time to probe. If so, we missed 15807c478bd9Sstevel@tonic-gate * sending 1 or more probes, probably due to heavy system load. At least 15817c478bd9Sstevel@tonic-gate * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 15827c478bd9Sstevel@tonic-gate * were scheduled. Make adjustments to the times, in multiples of 15837c478bd9Sstevel@tonic-gate * user_probe_interval. 
15847c478bd9Sstevel@tonic-gate */ 15857c478bd9Sstevel@tonic-gate if (TIME_GT(cur_time, pii->pii_snxt_time)) { 15867c478bd9Sstevel@tonic-gate int n; 15877c478bd9Sstevel@tonic-gate 15887c478bd9Sstevel@tonic-gate n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 15897c478bd9Sstevel@tonic-gate pii->pii_snxt_time += (n + 1) * user_probe_interval; 15907c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 15917c478bd9Sstevel@tonic-gate logtrace("missed sending %d probes cur_time %u snxt_time %u" 15927c478bd9Sstevel@tonic-gate " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 15937c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime); 15947c478bd9Sstevel@tonic-gate 15957c478bd9Sstevel@tonic-gate /* Collect statistics about missed probes */ 15967c478bd9Sstevel@tonic-gate probes_missed.pm_nprobes += n + 1; 15977c478bd9Sstevel@tonic-gate probes_missed.pm_ntimes++; 15987c478bd9Sstevel@tonic-gate } 15997c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime += user_probe_interval; 16007c478bd9Sstevel@tonic-gate interval = pii->pii_snxt_time - cur_time; 16017c478bd9Sstevel@tonic-gate if (debug & D_TARGET) { 16027c478bd9Sstevel@tonic-gate logdebug("cur_time %u snxt_time %u snxt_basetime %u" 16037c478bd9Sstevel@tonic-gate " interval %u\n", cur_time, pii->pii_snxt_time, 16047c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime, interval); 16057c478bd9Sstevel@tonic-gate } 16067c478bd9Sstevel@tonic-gate 16077c478bd9Sstevel@tonic-gate /* 16087c478bd9Sstevel@tonic-gate * If no targets are known, we need to send an ICMP multicast. The 16097c478bd9Sstevel@tonic-gate * probe type is PROBE_MULTI. We'll check back in 'interval' msec 16107c478bd9Sstevel@tonic-gate * to see if we found a target. 
16117c478bd9Sstevel@tonic-gate */ 16127c478bd9Sstevel@tonic-gate if (pii->pii_target_next == NULL) { 16137c478bd9Sstevel@tonic-gate assert(pii->pii_ntargets == 0); 16147c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 16157c478bd9Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 16167c478bd9Sstevel@tonic-gate return (interval); 16177c478bd9Sstevel@tonic-gate } 16187c478bd9Sstevel@tonic-gate 16197c478bd9Sstevel@tonic-gate if ((user_probe_interval != probe_interval) && 16207c478bd9Sstevel@tonic-gate TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 16217c478bd9Sstevel@tonic-gate /* 16227c478bd9Sstevel@tonic-gate * the failure detection (fd) probe timer has not yet fired. 16237c478bd9Sstevel@tonic-gate * Need to send only an rtt probe. The probe type is PROBE_RTT. 16247c478bd9Sstevel@tonic-gate */ 16257c478bd9Sstevel@tonic-gate probe(pii, PROBE_RTT, cur_time); 16267c478bd9Sstevel@tonic-gate return (interval); 16277c478bd9Sstevel@tonic-gate } 16287c478bd9Sstevel@tonic-gate /* 16297c478bd9Sstevel@tonic-gate * the fd probe timer has fired. Need to do all failure 16307c478bd9Sstevel@tonic-gate * detection / recovery calculations, and then send an fd probe 16317c478bd9Sstevel@tonic-gate * of type PROBE_UNI. 16327c478bd9Sstevel@tonic-gate */ 16337c478bd9Sstevel@tonic-gate if (user_probe_interval == probe_interval) { 16347c478bd9Sstevel@tonic-gate /* 16357c478bd9Sstevel@tonic-gate * We could have missed some probes, and then adjusted 16367c478bd9Sstevel@tonic-gate * pii_snxt_basetime above. Otherwise we could have 16377c478bd9Sstevel@tonic-gate * blindly added probe_interval to pii_fd_snxt_basetime. 
16387c478bd9Sstevel@tonic-gate */ 16397c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 16407c478bd9Sstevel@tonic-gate } else { 16417c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime += probe_interval; 16427c478bd9Sstevel@tonic-gate if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 16437c478bd9Sstevel@tonic-gate int n; 16447c478bd9Sstevel@tonic-gate 16457c478bd9Sstevel@tonic-gate n = (cur_time - pii->pii_fd_snxt_basetime) / 16467c478bd9Sstevel@tonic-gate probe_interval; 16477c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 16487c478bd9Sstevel@tonic-gate } 16497c478bd9Sstevel@tonic-gate } 16507c478bd9Sstevel@tonic-gate 16517c478bd9Sstevel@tonic-gate /* 16527c478bd9Sstevel@tonic-gate * We can have at most, the latest 2 probes that we sent, in 16537c478bd9Sstevel@tonic-gate * the PR_UNACKED state. All previous probes sent, are either 16547c478bd9Sstevel@tonic-gate * PR_LOST or PR_ACKED. An unacknowledged probe is considered 16557c478bd9Sstevel@tonic-gate * timed out if the probe's time_sent + the CRTT < currenttime. 16567c478bd9Sstevel@tonic-gate * For each of the last 2 probes, examine whether it has timed 16577c478bd9Sstevel@tonic-gate * out. If so, mark it PR_LOST. The probe stats is a circular array. 
16587c478bd9Sstevel@tonic-gate */ 16597c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 16607c478bd9Sstevel@tonic-gate valid_unack_count = 0; 16617c478bd9Sstevel@tonic-gate 16627c478bd9Sstevel@tonic-gate for (i = 0; i < 2; i++) { 16637c478bd9Sstevel@tonic-gate pr_statp = &pii->pii_probes[pr_ndx]; 16647c478bd9Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 16657c478bd9Sstevel@tonic-gate switch (pr_statp->pr_status) { 16667c478bd9Sstevel@tonic-gate case PR_ACKED: 16677c478bd9Sstevel@tonic-gate /* 16687c478bd9Sstevel@tonic-gate * We received back an ACK, so the switch clearly 16697c478bd9Sstevel@tonic-gate * is not dropping our traffic, and thus we can 16707c478bd9Sstevel@tonic-gate * enable failure detection immediately. 16717c478bd9Sstevel@tonic-gate */ 16727c478bd9Sstevel@tonic-gate if (pii->pii_fd_hrtime > gethrtime()) { 16737c478bd9Sstevel@tonic-gate if (debug & D_PROBE) { 16747c478bd9Sstevel@tonic-gate logdebug("successful probe on %s; " 16757c478bd9Sstevel@tonic-gate "ending quiet period\n", 16767c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_name); 16777c478bd9Sstevel@tonic-gate } 16787c478bd9Sstevel@tonic-gate pii->pii_fd_hrtime = gethrtime(); 16797c478bd9Sstevel@tonic-gate } 16807c478bd9Sstevel@tonic-gate break; 16817c478bd9Sstevel@tonic-gate 16827c478bd9Sstevel@tonic-gate case PR_UNACKED: 16837c478bd9Sstevel@tonic-gate assert(cur_tg != NULL); 16847c478bd9Sstevel@tonic-gate /* 16857c478bd9Sstevel@tonic-gate * The crtt could be zero for some reason, 16867c478bd9Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 16877c478bd9Sstevel@tonic-gate * not available use group's probe interval, 16887c478bd9Sstevel@tonic-gate * which is a worst case estimate. 
16897c478bd9Sstevel@tonic-gate */ 16907c478bd9Sstevel@tonic-gate if (cur_tg->tg_crtt != 0) { 16917c478bd9Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 16927c478bd9Sstevel@tonic-gate cur_tg->tg_crtt; 16937c478bd9Sstevel@tonic-gate } else { 16947c478bd9Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 16957c478bd9Sstevel@tonic-gate probe_interval; 16967c478bd9Sstevel@tonic-gate } 16977c478bd9Sstevel@tonic-gate if (TIME_LT(timeout, cur_time)) { 16987c478bd9Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 16997c478bd9Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 17007c478bd9Sstevel@tonic-gate } else if (i == 1) { 17017c478bd9Sstevel@tonic-gate /* 17027c478bd9Sstevel@tonic-gate * We are forced to consider this probe 17037c478bd9Sstevel@tonic-gate * lost, as we can have at most 2 unack. 17047c478bd9Sstevel@tonic-gate * probes any time, and we will be sending a 17057c478bd9Sstevel@tonic-gate * probe at the end of this function. 17067c478bd9Sstevel@tonic-gate * Normally, we should not be here, but 17077c478bd9Sstevel@tonic-gate * this can happen if an incoming response 17087c478bd9Sstevel@tonic-gate * that was considered lost has increased 17097c478bd9Sstevel@tonic-gate * the crtt for this target, and also bumped 17107c478bd9Sstevel@tonic-gate * up the FDT. Note that we never cancel or 17117c478bd9Sstevel@tonic-gate * increase the current pii_time_left, so 17127c478bd9Sstevel@tonic-gate * when the timer fires, we find 2 valid 17137c478bd9Sstevel@tonic-gate * unacked probes, and they are yet to timeout 17147c478bd9Sstevel@tonic-gate */ 17157c478bd9Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 17167c478bd9Sstevel@tonic-gate pr_statp->pr_time_lost = cur_time; 17177c478bd9Sstevel@tonic-gate } else { 17187c478bd9Sstevel@tonic-gate /* 17197c478bd9Sstevel@tonic-gate * Only the most recent probe can enter 17207c478bd9Sstevel@tonic-gate * this 'else' arm. 
The second most recent 17217c478bd9Sstevel@tonic-gate * probe must take either of the above arms, 17227c478bd9Sstevel@tonic-gate * if it is unacked. 17237c478bd9Sstevel@tonic-gate */ 17247c478bd9Sstevel@tonic-gate valid_unack_count++; 17257c478bd9Sstevel@tonic-gate } 17267c478bd9Sstevel@tonic-gate break; 17277c478bd9Sstevel@tonic-gate } 17287c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pr_ndx); 17297c478bd9Sstevel@tonic-gate } 17307c478bd9Sstevel@tonic-gate 17317c478bd9Sstevel@tonic-gate /* 17327c478bd9Sstevel@tonic-gate * We send out 1 probe randomly in the interval between one half 17337c478bd9Sstevel@tonic-gate * and one probe interval for the group. Given that the CRTT is always 17347c478bd9Sstevel@tonic-gate * less than the group's probe interval, we can have at most 1 17357c478bd9Sstevel@tonic-gate * unacknowledged probe now. All previous probes are either lost or 17367c478bd9Sstevel@tonic-gate * acked. 17377c478bd9Sstevel@tonic-gate */ 17387c478bd9Sstevel@tonic-gate assert(valid_unack_count == 0 || valid_unack_count == 1); 17397c478bd9Sstevel@tonic-gate 17407c478bd9Sstevel@tonic-gate /* 17417c478bd9Sstevel@tonic-gate * The timer has fired. Take appropriate action depending 17427c478bd9Sstevel@tonic-gate * on the current state of the phyint. 17437c478bd9Sstevel@tonic-gate * 17447c478bd9Sstevel@tonic-gate * PI_RUNNING state - Failure detection and failover 17457c478bd9Sstevel@tonic-gate * PI_FAILED state - Repair detection and failback 17467c478bd9Sstevel@tonic-gate */ 17477c478bd9Sstevel@tonic-gate switch (pii->pii_phyint->pi_state) { 17487c478bd9Sstevel@tonic-gate case PI_FAILED: 17497c478bd9Sstevel@tonic-gate /* 17507c478bd9Sstevel@tonic-gate * If the most recent probe (excluding unacked probes that 17517c478bd9Sstevel@tonic-gate * are yet to time out) has been acked, check whether the 17527c478bd9Sstevel@tonic-gate * phyint is now repaired. 
If the phyint is repaired, then 17537c478bd9Sstevel@tonic-gate * attempt failback, unless it is an inactive standby. 17547c478bd9Sstevel@tonic-gate */ 17557c478bd9Sstevel@tonic-gate if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 17567c478bd9Sstevel@tonic-gate phyint_check_for_repair(pii->pii_phyint); 17577c478bd9Sstevel@tonic-gate } 17587c478bd9Sstevel@tonic-gate break; 17597c478bd9Sstevel@tonic-gate 17607c478bd9Sstevel@tonic-gate case PI_RUNNING: 17617c478bd9Sstevel@tonic-gate /* 17627c478bd9Sstevel@tonic-gate * It's possible our probes have been lost because of a 17637c478bd9Sstevel@tonic-gate * spanning-tree mandated quiet period on the switch. If so, 17647c478bd9Sstevel@tonic-gate * ignore the lost probes and consider the interface to still 17657c478bd9Sstevel@tonic-gate * be functioning. 17667c478bd9Sstevel@tonic-gate */ 17677c478bd9Sstevel@tonic-gate cur_hrtime = gethrtime(); 17687c478bd9Sstevel@tonic-gate if (pii->pii_fd_hrtime - cur_hrtime > 0) 17697c478bd9Sstevel@tonic-gate break; 17707c478bd9Sstevel@tonic-gate 17717c478bd9Sstevel@tonic-gate if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 17727c478bd9Sstevel@tonic-gate /* 17737c478bd9Sstevel@tonic-gate * We have 1 or more failed probes (excluding unacked 17747c478bd9Sstevel@tonic-gate * probes that are yet to time out). Determine if the 17757c478bd9Sstevel@tonic-gate * phyint has failed. 
If so attempt a failover, 17767c478bd9Sstevel@tonic-gate * unless it is an inactive standby 17777c478bd9Sstevel@tonic-gate */ 17787c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(pii); 17797c478bd9Sstevel@tonic-gate } 17807c478bd9Sstevel@tonic-gate break; 17817c478bd9Sstevel@tonic-gate 17827c478bd9Sstevel@tonic-gate default: 17837c478bd9Sstevel@tonic-gate logerr("phyint_inst_timer: invalid state %d\n", 17847c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_state); 17857c478bd9Sstevel@tonic-gate abort(); 17867c478bd9Sstevel@tonic-gate } 17877c478bd9Sstevel@tonic-gate 17887c478bd9Sstevel@tonic-gate /* 17897c478bd9Sstevel@tonic-gate * Start the next probe. probe() will also set pii->pii_probe_time_left 17907c478bd9Sstevel@tonic-gate * to the group's probe interval. If phyint_failed -> target_flush_hosts 17917c478bd9Sstevel@tonic-gate * was called, the target list may be empty. 17927c478bd9Sstevel@tonic-gate */ 17937c478bd9Sstevel@tonic-gate if (pii->pii_target_next != NULL) { 17947c478bd9Sstevel@tonic-gate probe(pii, PROBE_UNI, cur_time); 17957c478bd9Sstevel@tonic-gate /* 17967c478bd9Sstevel@tonic-gate * If we have just the one probe target, and we're not using 17977c478bd9Sstevel@tonic-gate * router targets, try to find another as we presently have 17987c478bd9Sstevel@tonic-gate * no resilience. 17997c478bd9Sstevel@tonic-gate */ 18007c478bd9Sstevel@tonic-gate if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 18017c478bd9Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 18027c478bd9Sstevel@tonic-gate } else { 18037c478bd9Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 18047c478bd9Sstevel@tonic-gate } 18057c478bd9Sstevel@tonic-gate return (interval); 18067c478bd9Sstevel@tonic-gate } 18077c478bd9Sstevel@tonic-gate 18087c478bd9Sstevel@tonic-gate /* 18097c478bd9Sstevel@tonic-gate * Start the probe timer for an interface instance. 
18107c478bd9Sstevel@tonic-gate */ 18117c478bd9Sstevel@tonic-gate void 18127c478bd9Sstevel@tonic-gate start_timer(struct phyint_instance *pii) 18137c478bd9Sstevel@tonic-gate { 18147c478bd9Sstevel@tonic-gate uint32_t interval; 18157c478bd9Sstevel@tonic-gate 18167c478bd9Sstevel@tonic-gate /* 18177c478bd9Sstevel@tonic-gate * Spread the base probe times (pi_snxt_basetime) across phyints 18187c478bd9Sstevel@tonic-gate * uniformly over the (curtime..curtime + the group's probe_interval). 18197c478bd9Sstevel@tonic-gate * pi_snxt_basetime is strictly periodic with a frequency of 18207c478bd9Sstevel@tonic-gate * the group's probe interval. The actual probe time pi_snxt_time 18217c478bd9Sstevel@tonic-gate * adds some randomness to pi_snxt_basetime and happens in probe(). 18227c478bd9Sstevel@tonic-gate * For the 1st probe on each phyint after the timer is started, 18237c478bd9Sstevel@tonic-gate * pi_snxt_time and pi_snxt_basetime are the same. 18247c478bd9Sstevel@tonic-gate */ 18257c478bd9Sstevel@tonic-gate interval = GET_RANDOM(0, 18267c478bd9Sstevel@tonic-gate (int)pii->pii_phyint->pi_group->pg_probeint); 18277c478bd9Sstevel@tonic-gate 18287c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime = getcurrenttime() + interval; 18297c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 18307c478bd9Sstevel@tonic-gate pii->pii_snxt_time = pii->pii_snxt_basetime; 18317c478bd9Sstevel@tonic-gate timer_schedule(interval); 18327c478bd9Sstevel@tonic-gate } 18337c478bd9Sstevel@tonic-gate 18347c478bd9Sstevel@tonic-gate /* 18357c478bd9Sstevel@tonic-gate * Restart the probe timer on an interface instance. 
18367c478bd9Sstevel@tonic-gate */ 18377c478bd9Sstevel@tonic-gate static void 18387c478bd9Sstevel@tonic-gate restart_timer(struct phyint_instance *pii) 18397c478bd9Sstevel@tonic-gate { 18407c478bd9Sstevel@tonic-gate /* 18417c478bd9Sstevel@tonic-gate * We don't need to restart the timer if it was never started in 18427c478bd9Sstevel@tonic-gate * the first place (pii->pii_basetime_inited not set), as the timer 18437c478bd9Sstevel@tonic-gate * won't have gone off yet. 18447c478bd9Sstevel@tonic-gate */ 18457c478bd9Sstevel@tonic-gate if (pii->pii_basetime_inited != 0) { 18467c478bd9Sstevel@tonic-gate 18477c478bd9Sstevel@tonic-gate if (debug & D_LINKNOTE) 18487c478bd9Sstevel@tonic-gate logdebug("restart timer: restarting timer on %s, " 18497c478bd9Sstevel@tonic-gate "address family %s\n", pii->pii_phyint->pi_name, 18507c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af)); 18517c478bd9Sstevel@tonic-gate 18527c478bd9Sstevel@tonic-gate start_timer(pii); 18537c478bd9Sstevel@tonic-gate } 18547c478bd9Sstevel@tonic-gate } 18557c478bd9Sstevel@tonic-gate 18567c478bd9Sstevel@tonic-gate static void 18577c478bd9Sstevel@tonic-gate process_link_state_down(struct phyint *pi) 18587c478bd9Sstevel@tonic-gate { 18597c478bd9Sstevel@tonic-gate logerr("The link has gone down on %s\n", pi->pi_name); 18607c478bd9Sstevel@tonic-gate 18617c478bd9Sstevel@tonic-gate /* 18627c478bd9Sstevel@tonic-gate * Clear the probe statistics arrays, we don't want the repair 18637c478bd9Sstevel@tonic-gate * detection logic relying on probes that were succesful prior 18647c478bd9Sstevel@tonic-gate * to the link going down. 18657c478bd9Sstevel@tonic-gate */ 18667c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pi->pi_v4)) 18677c478bd9Sstevel@tonic-gate clear_pii_probe_stats(pi->pi_v4); 18687c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pi->pi_v6)) 18697c478bd9Sstevel@tonic-gate clear_pii_probe_stats(pi->pi_v6); 18707c478bd9Sstevel@tonic-gate /* 18717c478bd9Sstevel@tonic-gate * Check for interface failure. 
Although we know the interface 18727c478bd9Sstevel@tonic-gate * has failed, we don't know if all the other interfaces in the 18737c478bd9Sstevel@tonic-gate * group have failed as well. 18747c478bd9Sstevel@tonic-gate */ 18757c478bd9Sstevel@tonic-gate if ((pi->pi_state == PI_RUNNING) || 18767c478bd9Sstevel@tonic-gate (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 18777c478bd9Sstevel@tonic-gate if (debug & D_LINKNOTE) { 18787c478bd9Sstevel@tonic-gate logdebug("process_link_state_down:" 18797c478bd9Sstevel@tonic-gate " checking for failure on %s\n", pi->pi_name); 18807c478bd9Sstevel@tonic-gate } 18817c478bd9Sstevel@tonic-gate 18827c478bd9Sstevel@tonic-gate if (pi->pi_v4 != NULL) 18837c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(pi->pi_v4); 18847c478bd9Sstevel@tonic-gate else if (pi->pi_v6 != NULL) 18857c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(pi->pi_v6); 18867c478bd9Sstevel@tonic-gate } 18877c478bd9Sstevel@tonic-gate } 18887c478bd9Sstevel@tonic-gate 18897c478bd9Sstevel@tonic-gate static void 18907c478bd9Sstevel@tonic-gate process_link_state_up(struct phyint *pi) 18917c478bd9Sstevel@tonic-gate { 18927c478bd9Sstevel@tonic-gate logerr("The link has come up on %s\n", pi->pi_name); 18937c478bd9Sstevel@tonic-gate 18947c478bd9Sstevel@tonic-gate /* 18957c478bd9Sstevel@tonic-gate * We stopped any running timers on each instance when the link 18967c478bd9Sstevel@tonic-gate * went down, so restart them. 
18977c478bd9Sstevel@tonic-gate */ 18987c478bd9Sstevel@tonic-gate if (pi->pi_v4) 18997c478bd9Sstevel@tonic-gate restart_timer(pi->pi_v4); 19007c478bd9Sstevel@tonic-gate if (pi->pi_v6) 19017c478bd9Sstevel@tonic-gate restart_timer(pi->pi_v6); 19027c478bd9Sstevel@tonic-gate 19037c478bd9Sstevel@tonic-gate phyint_check_for_repair(pi); 19047c478bd9Sstevel@tonic-gate 19057c478bd9Sstevel@tonic-gate pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 19067c478bd9Sstevel@tonic-gate if (pi->pi_whendx == LINK_UP_PERMIN) 19077c478bd9Sstevel@tonic-gate pi->pi_whendx = 0; 19087c478bd9Sstevel@tonic-gate } 19097c478bd9Sstevel@tonic-gate 19107c478bd9Sstevel@tonic-gate /* 19117c478bd9Sstevel@tonic-gate * Process any changes in link state passed up from the interfaces. 19127c478bd9Sstevel@tonic-gate */ 19137c478bd9Sstevel@tonic-gate void 19147c478bd9Sstevel@tonic-gate process_link_state_changes(void) 19157c478bd9Sstevel@tonic-gate { 19167c478bd9Sstevel@tonic-gate struct phyint *pi; 19177c478bd9Sstevel@tonic-gate 19187c478bd9Sstevel@tonic-gate /* Look for interfaces where the link state has just changed */ 19197c478bd9Sstevel@tonic-gate 19207c478bd9Sstevel@tonic-gate for (pi = phyints; pi != NULL; pi = pi->pi_next) { 19217c478bd9Sstevel@tonic-gate boolean_t old_link_state_up = LINK_UP(pi); 19227c478bd9Sstevel@tonic-gate 19237c478bd9Sstevel@tonic-gate /* 19247c478bd9Sstevel@tonic-gate * Except when the "phyint" structure is created, this is 19257c478bd9Sstevel@tonic-gate * the only place the link state is updated. This allows 19267c478bd9Sstevel@tonic-gate * this routine to detect changes in link state, rather 19277c478bd9Sstevel@tonic-gate * than just the current state. 19287c478bd9Sstevel@tonic-gate */ 19297c478bd9Sstevel@tonic-gate UPDATE_LINK_STATE(pi); 19307c478bd9Sstevel@tonic-gate 19317c478bd9Sstevel@tonic-gate if (LINK_DOWN(pi)) { 19327c478bd9Sstevel@tonic-gate /* 19337c478bd9Sstevel@tonic-gate * Has link just gone down? 
19347c478bd9Sstevel@tonic-gate */ 19357c478bd9Sstevel@tonic-gate if (old_link_state_up) 19367c478bd9Sstevel@tonic-gate process_link_state_down(pi); 19377c478bd9Sstevel@tonic-gate } else { 19387c478bd9Sstevel@tonic-gate /* 19397c478bd9Sstevel@tonic-gate * Has link just gone back up? 19407c478bd9Sstevel@tonic-gate */ 19417c478bd9Sstevel@tonic-gate if (!old_link_state_up) 19427c478bd9Sstevel@tonic-gate process_link_state_up(pi); 19437c478bd9Sstevel@tonic-gate } 19447c478bd9Sstevel@tonic-gate } 19457c478bd9Sstevel@tonic-gate } 19467c478bd9Sstevel@tonic-gate 19477c478bd9Sstevel@tonic-gate void 19487c478bd9Sstevel@tonic-gate reset_crtt_all(struct phyint *pi) 19497c478bd9Sstevel@tonic-gate { 19507c478bd9Sstevel@tonic-gate struct phyint_instance *pii; 19517c478bd9Sstevel@tonic-gate struct target *tg; 19527c478bd9Sstevel@tonic-gate 19537c478bd9Sstevel@tonic-gate pii = pi->pi_v4; 19547c478bd9Sstevel@tonic-gate if (pii != NULL) { 19557c478bd9Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 19567c478bd9Sstevel@tonic-gate tg->tg_crtt = 0; 19577c478bd9Sstevel@tonic-gate tg->tg_rtt_sa = -1; 19587c478bd9Sstevel@tonic-gate tg->tg_rtt_sd = 0; 19597c478bd9Sstevel@tonic-gate } 19607c478bd9Sstevel@tonic-gate } 19617c478bd9Sstevel@tonic-gate 19627c478bd9Sstevel@tonic-gate pii = pi->pi_v6; 19637c478bd9Sstevel@tonic-gate if (pii != NULL) { 19647c478bd9Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 19657c478bd9Sstevel@tonic-gate tg->tg_crtt = 0; 19667c478bd9Sstevel@tonic-gate tg->tg_rtt_sa = -1; 19677c478bd9Sstevel@tonic-gate tg->tg_rtt_sd = 0; 19687c478bd9Sstevel@tonic-gate } 19697c478bd9Sstevel@tonic-gate } 19707c478bd9Sstevel@tonic-gate } 19717c478bd9Sstevel@tonic-gate 19727c478bd9Sstevel@tonic-gate /* 19737c478bd9Sstevel@tonic-gate * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 19747c478bd9Sstevel@tonic-gate * probes on both instances IPv4 and IPv6. 
19757c478bd9Sstevel@tonic-gate * If the interface has failed, return the time of the first probe failure 19767c478bd9Sstevel@tonic-gate * in "tff". 19777c478bd9Sstevel@tonic-gate */ 19787c478bd9Sstevel@tonic-gate static int 19797c478bd9Sstevel@tonic-gate phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 19807c478bd9Sstevel@tonic-gate { 19817c478bd9Sstevel@tonic-gate uint_t pi_tff; 19827c478bd9Sstevel@tonic-gate struct target *cur_tg; 19837c478bd9Sstevel@tonic-gate struct probe_fail_count pfinfo; 19847c478bd9Sstevel@tonic-gate struct phyint_instance *pii_other; 19857c478bd9Sstevel@tonic-gate int pr_ndx; 19867c478bd9Sstevel@tonic-gate 19877c478bd9Sstevel@tonic-gate /* 19887c478bd9Sstevel@tonic-gate * Get the number of consecutive failed probes on 19897c478bd9Sstevel@tonic-gate * this phyint across all targets. Also get the number 19907c478bd9Sstevel@tonic-gate * of consecutive failed probes on this target only 19917c478bd9Sstevel@tonic-gate */ 19927c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 19937c478bd9Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 19947c478bd9Sstevel@tonic-gate probe_fail_info(pii, cur_tg, &pfinfo); 19957c478bd9Sstevel@tonic-gate 19967c478bd9Sstevel@tonic-gate /* Get the time of first failure, for later use */ 19977c478bd9Sstevel@tonic-gate pi_tff = pfinfo.pf_tff; 19987c478bd9Sstevel@tonic-gate 19997c478bd9Sstevel@tonic-gate /* 20007c478bd9Sstevel@tonic-gate * If the current target has not responded to the 20017c478bd9Sstevel@tonic-gate * last NUM_PROBE_FAILS probes, and other targets are 20027c478bd9Sstevel@tonic-gate * responding delete this target. Dead gateway detection 20037c478bd9Sstevel@tonic-gate * will eventually remove this target (if router) from the 20047c478bd9Sstevel@tonic-gate * routing tables. If that does not occur, we may end 20057c478bd9Sstevel@tonic-gate * up adding this to our list again. 
20067c478bd9Sstevel@tonic-gate */ 20077c478bd9Sstevel@tonic-gate if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 20087c478bd9Sstevel@tonic-gate pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 20097c478bd9Sstevel@tonic-gate if (pii->pii_targets_are_routers) { 20107c478bd9Sstevel@tonic-gate if (cur_tg->tg_status == TG_ACTIVE) 20117c478bd9Sstevel@tonic-gate pii->pii_ntargets--; 20127c478bd9Sstevel@tonic-gate cur_tg->tg_status = TG_DEAD; 20137c478bd9Sstevel@tonic-gate cur_tg->tg_crtt = 0; 20147c478bd9Sstevel@tonic-gate cur_tg->tg_rtt_sa = -1; 20157c478bd9Sstevel@tonic-gate cur_tg->tg_rtt_sd = 0; 20167c478bd9Sstevel@tonic-gate if (pii->pii_target_next == cur_tg) 20177c478bd9Sstevel@tonic-gate pii->pii_target_next = target_next(cur_tg); 20187c478bd9Sstevel@tonic-gate } else { 20197c478bd9Sstevel@tonic-gate target_delete(cur_tg); 20207c478bd9Sstevel@tonic-gate probe(pii, PROBE_MULTI, getcurrenttime()); 20217c478bd9Sstevel@tonic-gate } 20227c478bd9Sstevel@tonic-gate return (PHYINT_OK); 20237c478bd9Sstevel@tonic-gate } 20247c478bd9Sstevel@tonic-gate 20257c478bd9Sstevel@tonic-gate /* 20267c478bd9Sstevel@tonic-gate * If the phyint has lost NUM_PROBE_FAILS or more 20277c478bd9Sstevel@tonic-gate * consecutive probes, on both IPv4 and IPv6 protocol 20287c478bd9Sstevel@tonic-gate * instances of the phyint, then trigger failure 20297c478bd9Sstevel@tonic-gate * detection, else return false 20307c478bd9Sstevel@tonic-gate */ 20317c478bd9Sstevel@tonic-gate if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 20327c478bd9Sstevel@tonic-gate return (PHYINT_OK); 20337c478bd9Sstevel@tonic-gate 20347c478bd9Sstevel@tonic-gate pii_other = phyint_inst_other(pii); 20357c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pii_other)) { 20367c478bd9Sstevel@tonic-gate probe_fail_info(pii_other, NULL, &pfinfo); 20377c478bd9Sstevel@tonic-gate if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 20387c478bd9Sstevel@tonic-gate /* 20397c478bd9Sstevel@tonic-gate * We have NUM_PROBE_FAILS or more failures 20407c478bd9Sstevel@tonic-gate * on both IPv4 
and IPv6. Get the earliest 20417c478bd9Sstevel@tonic-gate * time when failure was detected on this 20427c478bd9Sstevel@tonic-gate * phyint across IPv4 and IPv6. 20437c478bd9Sstevel@tonic-gate */ 20447c478bd9Sstevel@tonic-gate if (TIME_LT(pfinfo.pf_tff, pi_tff)) 20457c478bd9Sstevel@tonic-gate pi_tff = pfinfo.pf_tff; 20467c478bd9Sstevel@tonic-gate } else { 20477c478bd9Sstevel@tonic-gate /* 20487c478bd9Sstevel@tonic-gate * This instance has < NUM_PROBE_FAILS failure. 20497c478bd9Sstevel@tonic-gate * So return false 20507c478bd9Sstevel@tonic-gate */ 20517c478bd9Sstevel@tonic-gate return (PHYINT_OK); 20527c478bd9Sstevel@tonic-gate } 20537c478bd9Sstevel@tonic-gate } 20547c478bd9Sstevel@tonic-gate *tff = pi_tff; 20557c478bd9Sstevel@tonic-gate return (PHYINT_FAILURE); 20567c478bd9Sstevel@tonic-gate } 20577c478bd9Sstevel@tonic-gate 20587c478bd9Sstevel@tonic-gate /* 20597c478bd9Sstevel@tonic-gate * Check if the link has gone down on this phyint, or it has failed the 20607c478bd9Sstevel@tonic-gate * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 20617c478bd9Sstevel@tonic-gate * Also look at other phyints of this group, for group failures. 
20627c478bd9Sstevel@tonic-gate */ 20637c478bd9Sstevel@tonic-gate int 20647c478bd9Sstevel@tonic-gate failure_state(struct phyint_instance *pii) 20657c478bd9Sstevel@tonic-gate { 20667c478bd9Sstevel@tonic-gate struct probe_success_count psinfo; 20677c478bd9Sstevel@tonic-gate uint_t pi2_tls; /* time last success */ 20687c478bd9Sstevel@tonic-gate uint_t pi_tff; /* time first fail */ 20697c478bd9Sstevel@tonic-gate struct phyint *pi2; 20707c478bd9Sstevel@tonic-gate struct phyint *pi; 20717c478bd9Sstevel@tonic-gate struct phyint_instance *pii2; 20727c478bd9Sstevel@tonic-gate struct phyint_group *pg; 20737c478bd9Sstevel@tonic-gate boolean_t alone; 20747c478bd9Sstevel@tonic-gate 20757c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) 20767c478bd9Sstevel@tonic-gate logdebug("phyint_failed(%s)\n", pii->pii_name); 20777c478bd9Sstevel@tonic-gate 20787c478bd9Sstevel@tonic-gate pi = pii->pii_phyint; 20797c478bd9Sstevel@tonic-gate pg = pi->pi_group; 20807c478bd9Sstevel@tonic-gate 20817c478bd9Sstevel@tonic-gate if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 20827c478bd9Sstevel@tonic-gate PHYINT_OK) 20837c478bd9Sstevel@tonic-gate return (PHYINT_OK); 20847c478bd9Sstevel@tonic-gate 20857c478bd9Sstevel@tonic-gate /* 20867c478bd9Sstevel@tonic-gate * At this point, the link is down, or the phyint is suspect, 20877c478bd9Sstevel@tonic-gate * as it has lost NUM_PROBE_FAILS or more probes. If the phyint 20887c478bd9Sstevel@tonic-gate * does not belong to any group, or is the only member of the 20897c478bd9Sstevel@tonic-gate * group capable of being probed, return PHYINT_FAILURE. 
20907c478bd9Sstevel@tonic-gate */ 20917c478bd9Sstevel@tonic-gate alone = _B_TRUE; 20927c478bd9Sstevel@tonic-gate if (pg != phyint_anongroup) { 20937c478bd9Sstevel@tonic-gate for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 20947c478bd9Sstevel@tonic-gate if (pi2 == pi) 20957c478bd9Sstevel@tonic-gate continue; 20967c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pi2->pi_v4) || 20977c478bd9Sstevel@tonic-gate PROBE_CAPABLE(pi2->pi_v6)) { 20987c478bd9Sstevel@tonic-gate alone = _B_FALSE; 20997c478bd9Sstevel@tonic-gate break; 21007c478bd9Sstevel@tonic-gate } 21017c478bd9Sstevel@tonic-gate } 21027c478bd9Sstevel@tonic-gate } 21037c478bd9Sstevel@tonic-gate if (alone) 21047c478bd9Sstevel@tonic-gate return (PHYINT_FAILURE); 21057c478bd9Sstevel@tonic-gate 21067c478bd9Sstevel@tonic-gate /* 21077c478bd9Sstevel@tonic-gate * Need to compare against other phyints of the same group 21087c478bd9Sstevel@tonic-gate * to exclude group failures. If the failure was detected via 21097c478bd9Sstevel@tonic-gate * probing, then if the time of last success (tls) of any 21107c478bd9Sstevel@tonic-gate * phyint is more recent than the time of first fail (tff) of the 21117c478bd9Sstevel@tonic-gate * phyint in question, and the link is up on the phyint, 21127c478bd9Sstevel@tonic-gate * then it is a phyint failure. Otherwise it is a group failure. 21137c478bd9Sstevel@tonic-gate * If failure was detected via a link down notification sent from 21147c478bd9Sstevel@tonic-gate * the driver to IP, we see if any phyints in the group are still 21157c478bd9Sstevel@tonic-gate * running and haven't received a link down notification. We 21167c478bd9Sstevel@tonic-gate * will usually be processing the link down notification shortly 21177c478bd9Sstevel@tonic-gate * after it was received, so there is no point looking at the tls 21187c478bd9Sstevel@tonic-gate * of other phyints. 
21197c478bd9Sstevel@tonic-gate */ 21207c478bd9Sstevel@tonic-gate for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 21217c478bd9Sstevel@tonic-gate /* Exclude ourself from comparison */ 21227c478bd9Sstevel@tonic-gate if (pi2 == pi) 21237c478bd9Sstevel@tonic-gate continue; 21247c478bd9Sstevel@tonic-gate 21257c478bd9Sstevel@tonic-gate if (LINK_DOWN(pi)) { 21267c478bd9Sstevel@tonic-gate /* 21277c478bd9Sstevel@tonic-gate * We use FLAGS_TO_LINK_STATE() to test the 21287c478bd9Sstevel@tonic-gate * flags directly, rather then LINK_UP() or 21297c478bd9Sstevel@tonic-gate * LINK_DOWN(), as we may not have got round 21307c478bd9Sstevel@tonic-gate * to processing the link state for the other 21317c478bd9Sstevel@tonic-gate * phyints in the group yet. 21327c478bd9Sstevel@tonic-gate * 21337c478bd9Sstevel@tonic-gate * The check for PI_RUNNING and group 21347c478bd9Sstevel@tonic-gate * failure handles the case when the 21357c478bd9Sstevel@tonic-gate * group begins to recover. The first 21367c478bd9Sstevel@tonic-gate * phyint to recover should not trigger 21377c478bd9Sstevel@tonic-gate * a failover from the soon-to-recover 21387c478bd9Sstevel@tonic-gate * other phyints to the first recovered 21397c478bd9Sstevel@tonic-gate * phyint. PI_RUNNING will be set, and 21407c478bd9Sstevel@tonic-gate * pg_groupfailed cleared only after 21417c478bd9Sstevel@tonic-gate * receipt of NUM_PROBE_REPAIRS, by 21427c478bd9Sstevel@tonic-gate * which time the other phyints should 21437c478bd9Sstevel@tonic-gate * have received at least 1 packet, 21447c478bd9Sstevel@tonic-gate * and so will not have NUM_PROBE_FAILS. 
21457c478bd9Sstevel@tonic-gate */ 21467c478bd9Sstevel@tonic-gate if ((pi2->pi_state == PI_RUNNING) && 21477c478bd9Sstevel@tonic-gate !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) 21487c478bd9Sstevel@tonic-gate return (PHYINT_FAILURE); 21497c478bd9Sstevel@tonic-gate } else { 21507c478bd9Sstevel@tonic-gate /* 21517c478bd9Sstevel@tonic-gate * Need to compare against both IPv4 and 21527c478bd9Sstevel@tonic-gate * IPv6 instances. 21537c478bd9Sstevel@tonic-gate */ 21547c478bd9Sstevel@tonic-gate pii2 = pi2->pi_v4; 21557c478bd9Sstevel@tonic-gate if (pii2 != NULL) { 21567c478bd9Sstevel@tonic-gate probe_success_info(pii2, NULL, &psinfo); 21577c478bd9Sstevel@tonic-gate if (psinfo.ps_tls_valid) { 21587c478bd9Sstevel@tonic-gate pi2_tls = psinfo.ps_tls; 21597c478bd9Sstevel@tonic-gate /* 21607c478bd9Sstevel@tonic-gate * See comment above regarding check 21617c478bd9Sstevel@tonic-gate * for PI_RUNNING and group failure. 21627c478bd9Sstevel@tonic-gate */ 21637c478bd9Sstevel@tonic-gate if (TIME_GT(pi2_tls, pi_tff) && 21647c478bd9Sstevel@tonic-gate (pi2->pi_state == PI_RUNNING) && 21657c478bd9Sstevel@tonic-gate !GROUP_FAILED(pg) && 21667c478bd9Sstevel@tonic-gate FLAGS_TO_LINK_STATE(pi2)) 21677c478bd9Sstevel@tonic-gate return (PHYINT_FAILURE); 21687c478bd9Sstevel@tonic-gate } 21697c478bd9Sstevel@tonic-gate } 21707c478bd9Sstevel@tonic-gate 21717c478bd9Sstevel@tonic-gate pii2 = pi2->pi_v6; 21727c478bd9Sstevel@tonic-gate if (pii2 != NULL) { 21737c478bd9Sstevel@tonic-gate probe_success_info(pii2, NULL, &psinfo); 21747c478bd9Sstevel@tonic-gate if (psinfo.ps_tls_valid) { 21757c478bd9Sstevel@tonic-gate pi2_tls = psinfo.ps_tls; 21767c478bd9Sstevel@tonic-gate /* 21777c478bd9Sstevel@tonic-gate * See comment above regarding check 21787c478bd9Sstevel@tonic-gate * for PI_RUNNING and group failure. 
21797c478bd9Sstevel@tonic-gate */ 21807c478bd9Sstevel@tonic-gate if (TIME_GT(pi2_tls, pi_tff) && 21817c478bd9Sstevel@tonic-gate (pi2->pi_state == PI_RUNNING) && 21827c478bd9Sstevel@tonic-gate !GROUP_FAILED(pg) && 21837c478bd9Sstevel@tonic-gate FLAGS_TO_LINK_STATE(pi2)) 21847c478bd9Sstevel@tonic-gate return (PHYINT_FAILURE); 21857c478bd9Sstevel@tonic-gate } 21867c478bd9Sstevel@tonic-gate } 21877c478bd9Sstevel@tonic-gate } 21887c478bd9Sstevel@tonic-gate } 21897c478bd9Sstevel@tonic-gate 21907c478bd9Sstevel@tonic-gate /* 21917c478bd9Sstevel@tonic-gate * Change the group state to PG_FAILED if it's not already. 21927c478bd9Sstevel@tonic-gate */ 21937c478bd9Sstevel@tonic-gate if (!GROUP_FAILED(pg)) 21947c478bd9Sstevel@tonic-gate phyint_group_chstate(pg, PG_FAILED); 21957c478bd9Sstevel@tonic-gate 21967c478bd9Sstevel@tonic-gate return (GROUP_FAILURE); 21977c478bd9Sstevel@tonic-gate } 21987c478bd9Sstevel@tonic-gate 21997c478bd9Sstevel@tonic-gate /* 22007c478bd9Sstevel@tonic-gate * Return the information associated with consecutive probe successes 22017c478bd9Sstevel@tonic-gate * starting with the most recent probe. At most the last 2 probes can be 22027c478bd9Sstevel@tonic-gate * in the unacknowledged state. All previous probes have either failed 22037c478bd9Sstevel@tonic-gate * or succeeded. 
22047c478bd9Sstevel@tonic-gate */ 22057c478bd9Sstevel@tonic-gate static void 22067c478bd9Sstevel@tonic-gate probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 22077c478bd9Sstevel@tonic-gate struct probe_success_count *psinfo) 22087c478bd9Sstevel@tonic-gate { 22097c478bd9Sstevel@tonic-gate uint_t i; 22107c478bd9Sstevel@tonic-gate struct probe_stats *pr_statp; 22117c478bd9Sstevel@tonic-gate uint_t most_recent; 22127c478bd9Sstevel@tonic-gate uint_t second_most_recent; 22137c478bd9Sstevel@tonic-gate boolean_t pi_found_failure = _B_FALSE; 22147c478bd9Sstevel@tonic-gate boolean_t tg_found_failure = _B_FALSE; 22157c478bd9Sstevel@tonic-gate uint_t now; 22167c478bd9Sstevel@tonic-gate uint_t timeout; 22177c478bd9Sstevel@tonic-gate struct target *tg; 22187c478bd9Sstevel@tonic-gate 22197c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) 22207c478bd9Sstevel@tonic-gate logdebug("probe_success_info(%s)\n", pii->pii_name); 22217c478bd9Sstevel@tonic-gate 22227c478bd9Sstevel@tonic-gate bzero(psinfo, sizeof (*psinfo)); 22237c478bd9Sstevel@tonic-gate now = getcurrenttime(); 22247c478bd9Sstevel@tonic-gate 22257c478bd9Sstevel@tonic-gate /* 22267c478bd9Sstevel@tonic-gate * Start with the most recent probe, and count the number 22277c478bd9Sstevel@tonic-gate * of consecutive probe successes. Latch the number of successes 22287c478bd9Sstevel@tonic-gate * on hitting a failure. 
22297c478bd9Sstevel@tonic-gate */ 22307c478bd9Sstevel@tonic-gate most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 22317c478bd9Sstevel@tonic-gate second_most_recent = PROBE_INDEX_PREV(most_recent); 22327c478bd9Sstevel@tonic-gate 22337c478bd9Sstevel@tonic-gate for (i = most_recent; i != pii->pii_probe_next; 22347c478bd9Sstevel@tonic-gate i = PROBE_INDEX_PREV(i)) { 22357c478bd9Sstevel@tonic-gate pr_statp = &pii->pii_probes[i]; 22367c478bd9Sstevel@tonic-gate 22377c478bd9Sstevel@tonic-gate switch (pr_statp->pr_status) { 22387c478bd9Sstevel@tonic-gate case PR_UNACKED: 22397c478bd9Sstevel@tonic-gate /* 22407c478bd9Sstevel@tonic-gate * Only the most recent 2 probes can be unacknowledged 22417c478bd9Sstevel@tonic-gate */ 22427c478bd9Sstevel@tonic-gate assert(i == most_recent || i == second_most_recent); 22437c478bd9Sstevel@tonic-gate 22447c478bd9Sstevel@tonic-gate tg = pr_statp->pr_target; 22457c478bd9Sstevel@tonic-gate assert(tg != NULL); 22467c478bd9Sstevel@tonic-gate /* 22477c478bd9Sstevel@tonic-gate * The crtt could be zero for some reason, 22487c478bd9Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 22497c478bd9Sstevel@tonic-gate * not available use the value of the group's probe 22507c478bd9Sstevel@tonic-gate * interval which is a worst case estimate. 22517c478bd9Sstevel@tonic-gate */ 22527c478bd9Sstevel@tonic-gate if (tg->tg_crtt != 0) { 22537c478bd9Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + tg->tg_crtt; 22547c478bd9Sstevel@tonic-gate } else { 22557c478bd9Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 22567c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint; 22577c478bd9Sstevel@tonic-gate } 22587c478bd9Sstevel@tonic-gate 22597c478bd9Sstevel@tonic-gate if (TIME_LT(timeout, now)) { 22607c478bd9Sstevel@tonic-gate /* 22617c478bd9Sstevel@tonic-gate * We hit a failure. Latch the total number of 22627c478bd9Sstevel@tonic-gate * recent consecutive successes. 
22637c478bd9Sstevel@tonic-gate */ 22647c478bd9Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 22657c478bd9Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 22667c478bd9Sstevel@tonic-gate pi_found_failure = _B_TRUE; 22677c478bd9Sstevel@tonic-gate if (cur_tg != NULL && tg == cur_tg) { 22687c478bd9Sstevel@tonic-gate /* 22697c478bd9Sstevel@tonic-gate * We hit a failure for the desired 22707c478bd9Sstevel@tonic-gate * target. Latch the number of recent 22717c478bd9Sstevel@tonic-gate * consecutive successes for this target 22727c478bd9Sstevel@tonic-gate */ 22737c478bd9Sstevel@tonic-gate tg_found_failure = _B_TRUE; 22747c478bd9Sstevel@tonic-gate } 22757c478bd9Sstevel@tonic-gate } 22767c478bd9Sstevel@tonic-gate break; 22777c478bd9Sstevel@tonic-gate 22787c478bd9Sstevel@tonic-gate case PR_ACKED: 22797c478bd9Sstevel@tonic-gate /* 22807c478bd9Sstevel@tonic-gate * Bump up the count of probe successes, if we 22817c478bd9Sstevel@tonic-gate * have not seen any failure so far. 22827c478bd9Sstevel@tonic-gate */ 22837c478bd9Sstevel@tonic-gate if (!pi_found_failure) 22847c478bd9Sstevel@tonic-gate psinfo->ps_nsucc++; 22857c478bd9Sstevel@tonic-gate 22867c478bd9Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 22877c478bd9Sstevel@tonic-gate !tg_found_failure) { 22887c478bd9Sstevel@tonic-gate psinfo->ps_nsucc_tg++; 22897c478bd9Sstevel@tonic-gate } 22907c478bd9Sstevel@tonic-gate 22917c478bd9Sstevel@tonic-gate /* 22927c478bd9Sstevel@tonic-gate * Record the time of last success, if this is 22937c478bd9Sstevel@tonic-gate * the most recent probe success. 
22947c478bd9Sstevel@tonic-gate */ 22957c478bd9Sstevel@tonic-gate if (!psinfo->ps_tls_valid) { 22967c478bd9Sstevel@tonic-gate psinfo->ps_tls = pr_statp->pr_time_acked; 22977c478bd9Sstevel@tonic-gate psinfo->ps_tls_valid = _B_TRUE; 22987c478bd9Sstevel@tonic-gate } 22997c478bd9Sstevel@tonic-gate break; 23007c478bd9Sstevel@tonic-gate 23017c478bd9Sstevel@tonic-gate case PR_LOST: 23027c478bd9Sstevel@tonic-gate /* 23037c478bd9Sstevel@tonic-gate * We hit a failure. Latch the total number of 23047c478bd9Sstevel@tonic-gate * recent consecutive successes. 23057c478bd9Sstevel@tonic-gate */ 23067c478bd9Sstevel@tonic-gate pi_found_failure = _B_TRUE; 23077c478bd9Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 23087c478bd9Sstevel@tonic-gate /* 23097c478bd9Sstevel@tonic-gate * We hit a failure for the desired target. 23107c478bd9Sstevel@tonic-gate * Latch the number of recent consecutive 23117c478bd9Sstevel@tonic-gate * successes for this target 23127c478bd9Sstevel@tonic-gate */ 23137c478bd9Sstevel@tonic-gate tg_found_failure = _B_TRUE; 23147c478bd9Sstevel@tonic-gate } 23157c478bd9Sstevel@tonic-gate break; 23167c478bd9Sstevel@tonic-gate 23177c478bd9Sstevel@tonic-gate default: 23187c478bd9Sstevel@tonic-gate return; 23197c478bd9Sstevel@tonic-gate 23207c478bd9Sstevel@tonic-gate } 23217c478bd9Sstevel@tonic-gate } 23227c478bd9Sstevel@tonic-gate } 23237c478bd9Sstevel@tonic-gate 23247c478bd9Sstevel@tonic-gate /* 23257c478bd9Sstevel@tonic-gate * Return the information associated with consecutive probe failures 23267c478bd9Sstevel@tonic-gate * starting with the most recent probe. Only the last 2 probes can be in the 23277c478bd9Sstevel@tonic-gate * unacknowledged state. All previous probes have either failed or succeeded. 
23287c478bd9Sstevel@tonic-gate */ 23297c478bd9Sstevel@tonic-gate static void 23307c478bd9Sstevel@tonic-gate probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 23317c478bd9Sstevel@tonic-gate struct probe_fail_count *pfinfo) 23327c478bd9Sstevel@tonic-gate { 23337c478bd9Sstevel@tonic-gate int i; 23347c478bd9Sstevel@tonic-gate struct probe_stats *pr_statp; 23357c478bd9Sstevel@tonic-gate boolean_t tg_found_success = _B_FALSE; 23367c478bd9Sstevel@tonic-gate boolean_t pi_found_success = _B_FALSE; 23377c478bd9Sstevel@tonic-gate int most_recent; 23387c478bd9Sstevel@tonic-gate int second_most_recent; 23397c478bd9Sstevel@tonic-gate uint_t now; 23407c478bd9Sstevel@tonic-gate uint_t timeout; 23417c478bd9Sstevel@tonic-gate struct target *tg; 23427c478bd9Sstevel@tonic-gate 23437c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) 23447c478bd9Sstevel@tonic-gate logdebug("probe_fail_info(%s)\n", pii->pii_name); 23457c478bd9Sstevel@tonic-gate 23467c478bd9Sstevel@tonic-gate bzero(pfinfo, sizeof (*pfinfo)); 23477c478bd9Sstevel@tonic-gate now = getcurrenttime(); 23487c478bd9Sstevel@tonic-gate 23497c478bd9Sstevel@tonic-gate /* 23507c478bd9Sstevel@tonic-gate * Start with the most recent probe, and count the number 23517c478bd9Sstevel@tonic-gate * of consecutive probe failures. Latch the number of failures 23527c478bd9Sstevel@tonic-gate * on hitting a probe success. 
23537c478bd9Sstevel@tonic-gate */ 23547c478bd9Sstevel@tonic-gate most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 23557c478bd9Sstevel@tonic-gate second_most_recent = PROBE_INDEX_PREV(most_recent); 23567c478bd9Sstevel@tonic-gate 23577c478bd9Sstevel@tonic-gate for (i = most_recent; i != pii->pii_probe_next; 23587c478bd9Sstevel@tonic-gate i = PROBE_INDEX_PREV(i)) { 23597c478bd9Sstevel@tonic-gate pr_statp = &pii->pii_probes[i]; 23607c478bd9Sstevel@tonic-gate 23617c478bd9Sstevel@tonic-gate assert(PR_STATUS_VALID(pr_statp->pr_status)); 23627c478bd9Sstevel@tonic-gate 23637c478bd9Sstevel@tonic-gate switch (pr_statp->pr_status) { 23647c478bd9Sstevel@tonic-gate case PR_UNACKED: 23657c478bd9Sstevel@tonic-gate /* 23667c478bd9Sstevel@tonic-gate * Only the most recent 2 probes can be unacknowledged 23677c478bd9Sstevel@tonic-gate */ 23687c478bd9Sstevel@tonic-gate assert(i == most_recent || i == second_most_recent); 23697c478bd9Sstevel@tonic-gate 23707c478bd9Sstevel@tonic-gate tg = pr_statp->pr_target; 23717c478bd9Sstevel@tonic-gate /* 23727c478bd9Sstevel@tonic-gate * Target is guaranteed to exist in the unack. state 23737c478bd9Sstevel@tonic-gate */ 23747c478bd9Sstevel@tonic-gate assert(tg != NULL); 23757c478bd9Sstevel@tonic-gate /* 23767c478bd9Sstevel@tonic-gate * The crtt could be zero for some reason, 23777c478bd9Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 23787c478bd9Sstevel@tonic-gate * not available use the group's probe interval, 23797c478bd9Sstevel@tonic-gate * which is a worst case estimate. 
23807c478bd9Sstevel@tonic-gate */ 23817c478bd9Sstevel@tonic-gate if (tg->tg_crtt != 0) { 23827c478bd9Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + tg->tg_crtt; 23837c478bd9Sstevel@tonic-gate } else { 23847c478bd9Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 23857c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint; 23867c478bd9Sstevel@tonic-gate } 23877c478bd9Sstevel@tonic-gate 23887c478bd9Sstevel@tonic-gate if (TIME_GT(timeout, now)) 23897c478bd9Sstevel@tonic-gate break; 23907c478bd9Sstevel@tonic-gate 23917c478bd9Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 23927c478bd9Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 23937c478bd9Sstevel@tonic-gate /* FALLTHRU */ 23947c478bd9Sstevel@tonic-gate 23957c478bd9Sstevel@tonic-gate case PR_LOST: 23967c478bd9Sstevel@tonic-gate if (!pi_found_success) { 23977c478bd9Sstevel@tonic-gate pfinfo->pf_nfail++; 23987c478bd9Sstevel@tonic-gate pfinfo->pf_tff = pr_statp->pr_time_lost; 23997c478bd9Sstevel@tonic-gate } 24007c478bd9Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 24017c478bd9Sstevel@tonic-gate !tg_found_success) { 24027c478bd9Sstevel@tonic-gate pfinfo->pf_nfail_tg++; 24037c478bd9Sstevel@tonic-gate } 24047c478bd9Sstevel@tonic-gate break; 24057c478bd9Sstevel@tonic-gate 24067c478bd9Sstevel@tonic-gate default: 24077c478bd9Sstevel@tonic-gate /* 24087c478bd9Sstevel@tonic-gate * We hit a success or unused slot. Latch the 24097c478bd9Sstevel@tonic-gate * total number of recent consecutive failures. 24107c478bd9Sstevel@tonic-gate */ 24117c478bd9Sstevel@tonic-gate pi_found_success = _B_TRUE; 24127c478bd9Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 24137c478bd9Sstevel@tonic-gate /* 24147c478bd9Sstevel@tonic-gate * We hit a success for the desired target. 
24157c478bd9Sstevel@tonic-gate * Latch the number of recent consecutive 24167c478bd9Sstevel@tonic-gate * failures for this target 24177c478bd9Sstevel@tonic-gate */ 24187c478bd9Sstevel@tonic-gate tg_found_success = _B_TRUE; 24197c478bd9Sstevel@tonic-gate } 24207c478bd9Sstevel@tonic-gate } 24217c478bd9Sstevel@tonic-gate } 24227c478bd9Sstevel@tonic-gate } 24237c478bd9Sstevel@tonic-gate 24247c478bd9Sstevel@tonic-gate /* 24257c478bd9Sstevel@tonic-gate * Check if the phyint has been repaired. If no test address has been 24267c478bd9Sstevel@tonic-gate * configured, then consider the interface repaired if the link is up (unless 24277c478bd9Sstevel@tonic-gate * the link is flapping; see below). Otherwise, look for proof of probes 24287c478bd9Sstevel@tonic-gate * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 24297c478bd9Sstevel@tonic-gate * either IPv4 or IPv6 instance, the phyint can be considered repaired. 24307c478bd9Sstevel@tonic-gate */ 24317c478bd9Sstevel@tonic-gate static boolean_t 24327c478bd9Sstevel@tonic-gate phyint_repaired(struct phyint *pi) 24337c478bd9Sstevel@tonic-gate { 24347c478bd9Sstevel@tonic-gate struct probe_success_count psinfo; 24357c478bd9Sstevel@tonic-gate struct phyint_instance *pii; 24367c478bd9Sstevel@tonic-gate struct target *cur_tg; 24377c478bd9Sstevel@tonic-gate int pr_ndx; 24387c478bd9Sstevel@tonic-gate uint_t cur_time; 24397c478bd9Sstevel@tonic-gate 24407c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) 24417c478bd9Sstevel@tonic-gate logdebug("phyint_repaired(%s)\n", pi->pi_name); 24427c478bd9Sstevel@tonic-gate 24437c478bd9Sstevel@tonic-gate if (LINK_DOWN(pi)) 24447c478bd9Sstevel@tonic-gate return (_B_FALSE); 24457c478bd9Sstevel@tonic-gate 24467c478bd9Sstevel@tonic-gate /* 24477c478bd9Sstevel@tonic-gate * If we don't have any test addresses and the link is up, then 24487c478bd9Sstevel@tonic-gate * consider the interface repaired, unless we've received more than 24497c478bd9Sstevel@tonic-gate * LINK_UP_PERMIN link up 
notifications in the last minute, in 24507c478bd9Sstevel@tonic-gate * which case we keep the link down until we drop back below 24517c478bd9Sstevel@tonic-gate * the threshold. 24527c478bd9Sstevel@tonic-gate */ 24537c478bd9Sstevel@tonic-gate if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 24547c478bd9Sstevel@tonic-gate cur_time = getcurrenttime(); 24557c478bd9Sstevel@tonic-gate if ((pi->pi_whenup[pi->pi_whendx] == 0 || 24567c478bd9Sstevel@tonic-gate (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 24577c478bd9Sstevel@tonic-gate pi->pi_lfmsg_printed = 0; 24587c478bd9Sstevel@tonic-gate return (_B_TRUE); 24597c478bd9Sstevel@tonic-gate } 24607c478bd9Sstevel@tonic-gate if (!pi->pi_lfmsg_printed) { 24617c478bd9Sstevel@tonic-gate logerr("The link has come up on %s more than %d times " 24627c478bd9Sstevel@tonic-gate "in the last minute; disabling failback until it " 24637c478bd9Sstevel@tonic-gate "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 24647c478bd9Sstevel@tonic-gate pi->pi_lfmsg_printed = 1; 24657c478bd9Sstevel@tonic-gate } 24667c478bd9Sstevel@tonic-gate 24677c478bd9Sstevel@tonic-gate return (_B_FALSE); 24687c478bd9Sstevel@tonic-gate } 24697c478bd9Sstevel@tonic-gate 24707c478bd9Sstevel@tonic-gate pii = pi->pi_v4; 24717c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pii)) { 24727c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 24737c478bd9Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 24747c478bd9Sstevel@tonic-gate probe_success_info(pii, cur_tg, &psinfo); 24757c478bd9Sstevel@tonic-gate if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 24767c478bd9Sstevel@tonic-gate psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 24777c478bd9Sstevel@tonic-gate return (_B_TRUE); 24787c478bd9Sstevel@tonic-gate } 24797c478bd9Sstevel@tonic-gate 24807c478bd9Sstevel@tonic-gate pii = pi->pi_v6; 24817c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pii)) { 24827c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 
24837c478bd9Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 24847c478bd9Sstevel@tonic-gate probe_success_info(pii, cur_tg, &psinfo); 24857c478bd9Sstevel@tonic-gate if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 24867c478bd9Sstevel@tonic-gate psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 24877c478bd9Sstevel@tonic-gate return (_B_TRUE); 24887c478bd9Sstevel@tonic-gate } 24897c478bd9Sstevel@tonic-gate 24907c478bd9Sstevel@tonic-gate return (_B_FALSE); 24917c478bd9Sstevel@tonic-gate } 24927c478bd9Sstevel@tonic-gate 24937c478bd9Sstevel@tonic-gate /* 24947c478bd9Sstevel@tonic-gate * Try failover from phyint 'pi' to a suitable destination. 24957c478bd9Sstevel@tonic-gate */ 24967c478bd9Sstevel@tonic-gate int 24977c478bd9Sstevel@tonic-gate try_failover(struct phyint *pi, int failover_type) 24987c478bd9Sstevel@tonic-gate { 24997c478bd9Sstevel@tonic-gate struct phyint *dst; 25007c478bd9Sstevel@tonic-gate int err; 25017c478bd9Sstevel@tonic-gate 25027c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) 25037c478bd9Sstevel@tonic-gate logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); 25047c478bd9Sstevel@tonic-gate 25057c478bd9Sstevel@tonic-gate /* 25067c478bd9Sstevel@tonic-gate * Attempt to find a failover destination 'dst'. 
25077c478bd9Sstevel@tonic-gate * dst will be null if any of the following is true 25087c478bd9Sstevel@tonic-gate * Phyint is not part of a group OR 25097c478bd9Sstevel@tonic-gate * Phyint is the only member of a group OR 25107c478bd9Sstevel@tonic-gate * No suitable failover dst was available 25117c478bd9Sstevel@tonic-gate */ 25127c478bd9Sstevel@tonic-gate dst = get_failover_dst(pi, failover_type); 25137c478bd9Sstevel@tonic-gate if (dst == NULL) 25147c478bd9Sstevel@tonic-gate return (IPMP_EMINRED); 25157c478bd9Sstevel@tonic-gate 25167c478bd9Sstevel@tonic-gate dst->pi_empty = 0; /* Per state diagram */ 25177c478bd9Sstevel@tonic-gate pi->pi_full = 0; /* Per state diagram */ 25187c478bd9Sstevel@tonic-gate 25197c478bd9Sstevel@tonic-gate err = failover(pi, dst); 25207c478bd9Sstevel@tonic-gate 25217c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) { 25227c478bd9Sstevel@tonic-gate logdebug("failed over from %s to %s ret %d\n", 25237c478bd9Sstevel@tonic-gate pi->pi_name, dst->pi_name, err); 25247c478bd9Sstevel@tonic-gate } 25257c478bd9Sstevel@tonic-gate if (err == 0) { 25267c478bd9Sstevel@tonic-gate pi->pi_empty = 1; /* Per state diagram */ 25277c478bd9Sstevel@tonic-gate /* 25287c478bd9Sstevel@tonic-gate * we don't want to print out this message if a 25297c478bd9Sstevel@tonic-gate * phyint is leaving the group, nor for failover from 25307c478bd9Sstevel@tonic-gate * standby 25317c478bd9Sstevel@tonic-gate */ 25327c478bd9Sstevel@tonic-gate if (failover_type == FAILOVER_NORMAL) { 25337c478bd9Sstevel@tonic-gate logerr("Successfully failed over from NIC %s to NIC " 25347c478bd9Sstevel@tonic-gate "%s\n", pi->pi_name, dst->pi_name); 25357c478bd9Sstevel@tonic-gate } 25367c478bd9Sstevel@tonic-gate return (0); 25377c478bd9Sstevel@tonic-gate } else { 25387c478bd9Sstevel@tonic-gate /* 25397c478bd9Sstevel@tonic-gate * The failover did not succeed. We must retry the failover 25407c478bd9Sstevel@tonic-gate * only after resyncing our state based on the kernel's. 
25417c478bd9Sstevel@tonic-gate * For eg. either the src or the dst might have been unplumbed 25427c478bd9Sstevel@tonic-gate * causing this failure. initifs() will be called again, 25437c478bd9Sstevel@tonic-gate * from main, since full_scan_required has been set to true 25447c478bd9Sstevel@tonic-gate * by failover(); 25457c478bd9Sstevel@tonic-gate */ 25467c478bd9Sstevel@tonic-gate return (IPMP_FAILURE); 25477c478bd9Sstevel@tonic-gate } 25487c478bd9Sstevel@tonic-gate } 25497c478bd9Sstevel@tonic-gate 25507c478bd9Sstevel@tonic-gate /* 25517c478bd9Sstevel@tonic-gate * global_errno captures the errno value, if failover() or failback() 25527c478bd9Sstevel@tonic-gate * fails. This is sent to if_mpadm(1M). 25537c478bd9Sstevel@tonic-gate */ 25547c478bd9Sstevel@tonic-gate int global_errno; 25557c478bd9Sstevel@tonic-gate 25567c478bd9Sstevel@tonic-gate /* 25577c478bd9Sstevel@tonic-gate * Attempt failover from phyint 'from' to phyint 'to'. 25587c478bd9Sstevel@tonic-gate * IP moves everything from phyint 'from' to phyint 'to'. 25597c478bd9Sstevel@tonic-gate */ 25607c478bd9Sstevel@tonic-gate static int 25617c478bd9Sstevel@tonic-gate failover(struct phyint *from, struct phyint *to) 25627c478bd9Sstevel@tonic-gate { 25637c478bd9Sstevel@tonic-gate struct lifreq lifr; 25647c478bd9Sstevel@tonic-gate int ret; 25657c478bd9Sstevel@tonic-gate 25667c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) { 25677c478bd9Sstevel@tonic-gate logdebug("failing over from %s to %s\n", 25687c478bd9Sstevel@tonic-gate from->pi_name, to->pi_name); 25697c478bd9Sstevel@tonic-gate } 25707c478bd9Sstevel@tonic-gate 25717c478bd9Sstevel@tonic-gate /* 25727c478bd9Sstevel@tonic-gate * Perform the failover. Both IPv4 and IPv6 are failed over 25737c478bd9Sstevel@tonic-gate * using a single ioctl by passing in AF_UNSPEC family. 
25747c478bd9Sstevel@tonic-gate */ 25757c478bd9Sstevel@tonic-gate lifr.lifr_addr.ss_family = AF_UNSPEC; 25767c478bd9Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 25777c478bd9Sstevel@tonic-gate lifr.lifr_movetoindex = to->pi_ifindex; 25787c478bd9Sstevel@tonic-gate 25797c478bd9Sstevel@tonic-gate ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); 25807c478bd9Sstevel@tonic-gate if (ret < 0) { 25817c478bd9Sstevel@tonic-gate global_errno = errno; 25827c478bd9Sstevel@tonic-gate logperror("failover: ioctl (failover)"); 25837c478bd9Sstevel@tonic-gate } 25847c478bd9Sstevel@tonic-gate 25857c478bd9Sstevel@tonic-gate /* 25867c478bd9Sstevel@tonic-gate * Set full_scan_required to true. This will make us read 25877c478bd9Sstevel@tonic-gate * the state from the kernel in initifs() and update our tables, 25887c478bd9Sstevel@tonic-gate * to reflect the current state after the failover. If the 25897c478bd9Sstevel@tonic-gate * failover has failed it will then reissue the failover. 25907c478bd9Sstevel@tonic-gate */ 25917c478bd9Sstevel@tonic-gate full_scan_required = _B_TRUE; 25927c478bd9Sstevel@tonic-gate return (ret); 25937c478bd9Sstevel@tonic-gate } 25947c478bd9Sstevel@tonic-gate 25957c478bd9Sstevel@tonic-gate /* 25967c478bd9Sstevel@tonic-gate * phyint 'pi' has recovered. Attempt failback from every phyint in the same 25977c478bd9Sstevel@tonic-gate * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. 25987c478bd9Sstevel@tonic-gate * Return values: 25997c478bd9Sstevel@tonic-gate * IPMP_SUCCESS: Failback successful from each of the other 26007c478bd9Sstevel@tonic-gate * phyints in the group. 26017c478bd9Sstevel@tonic-gate * IPMP_EFBPARTIAL: Failback successful from some of the other 26027c478bd9Sstevel@tonic-gate * phyints in the group. 26037c478bd9Sstevel@tonic-gate * IPMP_FAILURE: Failback syscall failed with some error. 
26047c478bd9Sstevel@tonic-gate * 26057c478bd9Sstevel@tonic-gate * Note that failback is attempted regardless of the setting of the 26067c478bd9Sstevel@tonic-gate * failback_enabled flag. 26077c478bd9Sstevel@tonic-gate */ 26087c478bd9Sstevel@tonic-gate int 26097c478bd9Sstevel@tonic-gate do_failback(struct phyint *pi, boolean_t check_only) 26107c478bd9Sstevel@tonic-gate { 26117c478bd9Sstevel@tonic-gate struct phyint *from; 26127c478bd9Sstevel@tonic-gate boolean_t done; 26137c478bd9Sstevel@tonic-gate boolean_t partial; 26147c478bd9Sstevel@tonic-gate boolean_t attempted_failback = _B_FALSE; 26157c478bd9Sstevel@tonic-gate 26167c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) 26177c478bd9Sstevel@tonic-gate logdebug("do_failback(%s)\n", pi->pi_name); 26187c478bd9Sstevel@tonic-gate 26197c478bd9Sstevel@tonic-gate /* If this phyint is not part of a named group, return. */ 26207c478bd9Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 26217c478bd9Sstevel@tonic-gate pi->pi_full = 1; 26227c478bd9Sstevel@tonic-gate return (IPMP_SUCCESS); 26237c478bd9Sstevel@tonic-gate } 26247c478bd9Sstevel@tonic-gate 26257c478bd9Sstevel@tonic-gate /* 26267c478bd9Sstevel@tonic-gate * Attempt failback from every phyint in the group to 'pi'. 26277c478bd9Sstevel@tonic-gate * The reason for doing this, instead of only from the 26287c478bd9Sstevel@tonic-gate * phyint to which we did the failover is given below. 26297c478bd9Sstevel@tonic-gate * 26307c478bd9Sstevel@tonic-gate * After 'pi' failed, if any app. tries to join on a multicast 26317c478bd9Sstevel@tonic-gate * address (IPv6), on the failed phyint, IP picks any arbitrary 26327c478bd9Sstevel@tonic-gate * non-failed phyint in the group, instead of the failed phyint, 26337c478bd9Sstevel@tonic-gate * in.mpathd is not aware of this. Thus failing back only from the 26347c478bd9Sstevel@tonic-gate * interface to which 'pi' failed over, will failback the ipif's 26357c478bd9Sstevel@tonic-gate * but not the ilm's. 
So we need to failback from all members of 26367c478bd9Sstevel@tonic-gate * the phyint group 26377c478bd9Sstevel@tonic-gate */ 26387c478bd9Sstevel@tonic-gate done = _B_TRUE; 26397c478bd9Sstevel@tonic-gate partial = _B_FALSE; 26407c478bd9Sstevel@tonic-gate for (from = pi->pi_group->pg_phyint; from != NULL; 26417c478bd9Sstevel@tonic-gate from = from->pi_pgnext) { 26427c478bd9Sstevel@tonic-gate /* Exclude ourself as a failback src */ 26437c478bd9Sstevel@tonic-gate if (from == pi) 26447c478bd9Sstevel@tonic-gate continue; 26457c478bd9Sstevel@tonic-gate 26467c478bd9Sstevel@tonic-gate /* 26477c478bd9Sstevel@tonic-gate * If the 'from' phyint has IPv4 plumbed, the 'to' 26487c478bd9Sstevel@tonic-gate * phyint must also have IPv4 plumbed. Similar check 26497c478bd9Sstevel@tonic-gate * for IPv6. IP makes the same check. Otherwise the 26507c478bd9Sstevel@tonic-gate * failback will fail. 26517c478bd9Sstevel@tonic-gate */ 26527c478bd9Sstevel@tonic-gate if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || 26537c478bd9Sstevel@tonic-gate (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { 26547c478bd9Sstevel@tonic-gate partial = _B_TRUE; 26557c478bd9Sstevel@tonic-gate continue; 26567c478bd9Sstevel@tonic-gate } 26577c478bd9Sstevel@tonic-gate 26587c478bd9Sstevel@tonic-gate if (!check_only) { 26597c478bd9Sstevel@tonic-gate pi->pi_empty = 0; /* Per state diagram */ 26607c478bd9Sstevel@tonic-gate attempted_failback = _B_TRUE; 26617c478bd9Sstevel@tonic-gate if (failback(from, pi) != 0) { 26627c478bd9Sstevel@tonic-gate done = _B_FALSE; 26637c478bd9Sstevel@tonic-gate break; 26647c478bd9Sstevel@tonic-gate } 26657c478bd9Sstevel@tonic-gate } 26667c478bd9Sstevel@tonic-gate } 26677c478bd9Sstevel@tonic-gate 26687c478bd9Sstevel@tonic-gate if (check_only) { 26697c478bd9Sstevel@tonic-gate return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 26707c478bd9Sstevel@tonic-gate } 26717c478bd9Sstevel@tonic-gate 26727c478bd9Sstevel@tonic-gate /* 26737c478bd9Sstevel@tonic-gate * We are done. 
No more phyint from which we can src the failback 26747c478bd9Sstevel@tonic-gate */ 26757c478bd9Sstevel@tonic-gate if (done) { 26767c478bd9Sstevel@tonic-gate if (!partial) 26777c478bd9Sstevel@tonic-gate pi->pi_full = 1; /* Per state diagram */ 26787c478bd9Sstevel@tonic-gate /* 26797c478bd9Sstevel@tonic-gate * Don't print out a message unless there is a 26807c478bd9Sstevel@tonic-gate * transition from FAILED to RUNNING. For eg. 26817c478bd9Sstevel@tonic-gate * we don't want to print out this message if a 26827c478bd9Sstevel@tonic-gate * phyint is leaving the group, or at startup 26837c478bd9Sstevel@tonic-gate */ 26847c478bd9Sstevel@tonic-gate if (attempted_failback && (pi->pi_flags & 26857c478bd9Sstevel@tonic-gate (IFF_FAILED | IFF_OFFLINE))) { 26867c478bd9Sstevel@tonic-gate logerr("Successfully failed back to NIC %s\n", 26877c478bd9Sstevel@tonic-gate pi->pi_name); 26887c478bd9Sstevel@tonic-gate } 26897c478bd9Sstevel@tonic-gate return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 26907c478bd9Sstevel@tonic-gate } 26917c478bd9Sstevel@tonic-gate 26927c478bd9Sstevel@tonic-gate return (IPMP_FAILURE); 26937c478bd9Sstevel@tonic-gate } 26947c478bd9Sstevel@tonic-gate 26957c478bd9Sstevel@tonic-gate /* 26967c478bd9Sstevel@tonic-gate * This function is similar to do_failback() above, but respects the 26977c478bd9Sstevel@tonic-gate * failback_enabled flag for phyints in named groups. 
26987c478bd9Sstevel@tonic-gate */ 26997c478bd9Sstevel@tonic-gate int 27007c478bd9Sstevel@tonic-gate try_failback(struct phyint *pi, boolean_t check_only) 27017c478bd9Sstevel@tonic-gate { 27027c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) 27037c478bd9Sstevel@tonic-gate logdebug("try_failback(%s)\n", pi->pi_name); 27047c478bd9Sstevel@tonic-gate 27057c478bd9Sstevel@tonic-gate if (pi->pi_group != phyint_anongroup && !failback_enabled) 27067c478bd9Sstevel@tonic-gate return (IPMP_EFBDISABLED); 27077c478bd9Sstevel@tonic-gate 27087c478bd9Sstevel@tonic-gate return (do_failback(pi, check_only)); 27097c478bd9Sstevel@tonic-gate } 27107c478bd9Sstevel@tonic-gate 27117c478bd9Sstevel@tonic-gate /* 27127c478bd9Sstevel@tonic-gate * Failback everything from phyint 'from' that has the same ifindex 27137c478bd9Sstevel@tonic-gate * as phyint to's ifindex. 27147c478bd9Sstevel@tonic-gate */ 27157c478bd9Sstevel@tonic-gate static int 27167c478bd9Sstevel@tonic-gate failback(struct phyint *from, struct phyint *to) 27177c478bd9Sstevel@tonic-gate { 27187c478bd9Sstevel@tonic-gate struct lifreq lifr; 27197c478bd9Sstevel@tonic-gate int ret; 27207c478bd9Sstevel@tonic-gate 27217c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) 27227c478bd9Sstevel@tonic-gate logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); 27237c478bd9Sstevel@tonic-gate 27247c478bd9Sstevel@tonic-gate lifr.lifr_addr.ss_family = AF_UNSPEC; 27257c478bd9Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 27267c478bd9Sstevel@tonic-gate lifr.lifr_movetoindex = to->pi_ifindex; 27277c478bd9Sstevel@tonic-gate 27287c478bd9Sstevel@tonic-gate ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); 27297c478bd9Sstevel@tonic-gate if (ret < 0) { 27307c478bd9Sstevel@tonic-gate global_errno = errno; 27317c478bd9Sstevel@tonic-gate logperror("failback: ioctl (failback)"); 27327c478bd9Sstevel@tonic-gate } 27337c478bd9Sstevel@tonic-gate 27347c478bd9Sstevel@tonic-gate /* 27357c478bd9Sstevel@tonic-gate 
* Set full_scan_required to true. This will make us read 27367c478bd9Sstevel@tonic-gate * the state from the kernel in initifs() and update our tables, 27377c478bd9Sstevel@tonic-gate * to reflect the current state after the failback. If the 27387c478bd9Sstevel@tonic-gate * failback has failed it will then reissue the failback. 27397c478bd9Sstevel@tonic-gate */ 27407c478bd9Sstevel@tonic-gate full_scan_required = _B_TRUE; 27417c478bd9Sstevel@tonic-gate 27427c478bd9Sstevel@tonic-gate return (ret); 27437c478bd9Sstevel@tonic-gate } 27447c478bd9Sstevel@tonic-gate 27457c478bd9Sstevel@tonic-gate /* 27467c478bd9Sstevel@tonic-gate * Select a target phyint for failing over from 'pi'. 27477c478bd9Sstevel@tonic-gate * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred 27487c478bd9Sstevel@tonic-gate * target phyint is chosen as follows, 27497c478bd9Sstevel@tonic-gate * 1. Pick any inactive standby interface. 27507c478bd9Sstevel@tonic-gate * 2. If no inactive standby is available, select any phyint in the 27517c478bd9Sstevel@tonic-gate * same group that has the least number of logints, (excluding 27527c478bd9Sstevel@tonic-gate * IFF_NOFAILOVER and !IFF_UP logints) 27537c478bd9Sstevel@tonic-gate * If we are failing over from a standby, failover_type is 27547c478bd9Sstevel@tonic-gate * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. 27557c478bd9Sstevel@tonic-gate * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, 27567c478bd9Sstevel@tonic-gate * and we won't return NULL, as long as there is at least 1 other phyint 27577c478bd9Sstevel@tonic-gate * in the group. 
27587c478bd9Sstevel@tonic-gate */ 27597c478bd9Sstevel@tonic-gate static struct phyint * 27607c478bd9Sstevel@tonic-gate get_failover_dst(struct phyint *pi, int failover_type) 27617c478bd9Sstevel@tonic-gate { 27627c478bd9Sstevel@tonic-gate struct phyint *maybe = NULL; 27637c478bd9Sstevel@tonic-gate struct phyint *pi2; 27647c478bd9Sstevel@tonic-gate struct phyint *last_choice = NULL; 27657c478bd9Sstevel@tonic-gate 27667c478bd9Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) 27677c478bd9Sstevel@tonic-gate return (NULL); 27687c478bd9Sstevel@tonic-gate 27697c478bd9Sstevel@tonic-gate /* 27707c478bd9Sstevel@tonic-gate * Loop thru the phyints in the group, and pick the preferred 27717c478bd9Sstevel@tonic-gate * phyint for the target. 27727c478bd9Sstevel@tonic-gate */ 27737c478bd9Sstevel@tonic-gate for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 27747c478bd9Sstevel@tonic-gate /* Exclude ourself and offlined interfaces */ 27757c478bd9Sstevel@tonic-gate if (pi2 == pi || pi2->pi_state == PI_OFFLINE) 27767c478bd9Sstevel@tonic-gate continue; 27777c478bd9Sstevel@tonic-gate 27787c478bd9Sstevel@tonic-gate /* 27797c478bd9Sstevel@tonic-gate * The chosen target phyint must have IPv4 instance 27807c478bd9Sstevel@tonic-gate * plumbed, if the src phyint has IPv4 plumbed. Similarly 27817c478bd9Sstevel@tonic-gate * for IPv6. 27827c478bd9Sstevel@tonic-gate */ 27837c478bd9Sstevel@tonic-gate if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || 27847c478bd9Sstevel@tonic-gate (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) 27857c478bd9Sstevel@tonic-gate continue; 27867c478bd9Sstevel@tonic-gate 27877c478bd9Sstevel@tonic-gate /* The chosen target must be PI_RUNNING. 
*/ 27887c478bd9Sstevel@tonic-gate if (pi2->pi_state != PI_RUNNING) { 27897c478bd9Sstevel@tonic-gate last_choice = pi2; 27907c478bd9Sstevel@tonic-gate continue; 27917c478bd9Sstevel@tonic-gate } 27927c478bd9Sstevel@tonic-gate 2793*49df4566Sethindra if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && 27947c478bd9Sstevel@tonic-gate (failover_type != FAILOVER_TO_NONSTANDBY)) { 27957c478bd9Sstevel@tonic-gate return (pi2); 27967c478bd9Sstevel@tonic-gate } else { 27977c478bd9Sstevel@tonic-gate if (maybe == NULL) 27987c478bd9Sstevel@tonic-gate maybe = pi2; 27997c478bd9Sstevel@tonic-gate else if (logint_upcount(pi2) < logint_upcount(maybe)) 28007c478bd9Sstevel@tonic-gate maybe = pi2; 28017c478bd9Sstevel@tonic-gate } 28027c478bd9Sstevel@tonic-gate } 28037c478bd9Sstevel@tonic-gate if (maybe == NULL && failover_type == FAILOVER_TO_ANY) 28047c478bd9Sstevel@tonic-gate return (last_choice); 28057c478bd9Sstevel@tonic-gate else 28067c478bd9Sstevel@tonic-gate return (maybe); 28077c478bd9Sstevel@tonic-gate } 28087c478bd9Sstevel@tonic-gate 28097c478bd9Sstevel@tonic-gate /* 28107c478bd9Sstevel@tonic-gate * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 
28117c478bd9Sstevel@tonic-gate */ 28127c478bd9Sstevel@tonic-gate boolean_t 28137c478bd9Sstevel@tonic-gate change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) 28147c478bd9Sstevel@tonic-gate { 28157c478bd9Sstevel@tonic-gate int ifsock; 28167c478bd9Sstevel@tonic-gate struct lifreq lifr; 28177c478bd9Sstevel@tonic-gate 28187c478bd9Sstevel@tonic-gate if (debug & D_FAILOVER) { 28197c478bd9Sstevel@tonic-gate logdebug("change_lif_flags(%s): flags %llx setfl %d\n", 28207c478bd9Sstevel@tonic-gate pi->pi_name, flags, (int)setfl); 28217c478bd9Sstevel@tonic-gate } 28227c478bd9Sstevel@tonic-gate 28237c478bd9Sstevel@tonic-gate if (pi->pi_v4 != NULL) { 28247c478bd9Sstevel@tonic-gate ifsock = ifsock_v4; 28257c478bd9Sstevel@tonic-gate } else { 28267c478bd9Sstevel@tonic-gate ifsock = ifsock_v6; 28277c478bd9Sstevel@tonic-gate } 28287c478bd9Sstevel@tonic-gate 28297c478bd9Sstevel@tonic-gate /* 28307c478bd9Sstevel@tonic-gate * Get the current flags from the kernel, and set/clear the 28317c478bd9Sstevel@tonic-gate * desired phyint flags. Since we set only phyint flags, we can 28327c478bd9Sstevel@tonic-gate * do it on either IPv4 or IPv6 instance. 
28337c478bd9Sstevel@tonic-gate */ 28347c478bd9Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 28357c478bd9Sstevel@tonic-gate lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 28367c478bd9Sstevel@tonic-gate if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 28377c478bd9Sstevel@tonic-gate if (errno != ENXIO) 28387c478bd9Sstevel@tonic-gate logperror("change_lif_flags: ioctl (get flags)"); 28397c478bd9Sstevel@tonic-gate return (_B_FALSE); 28407c478bd9Sstevel@tonic-gate } 28417c478bd9Sstevel@tonic-gate if (setfl) 28427c478bd9Sstevel@tonic-gate lifr.lifr_flags |= flags; 28437c478bd9Sstevel@tonic-gate else 28447c478bd9Sstevel@tonic-gate lifr.lifr_flags &= ~flags; 28457c478bd9Sstevel@tonic-gate if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 28467c478bd9Sstevel@tonic-gate if (errno != ENXIO) 28477c478bd9Sstevel@tonic-gate logperror("change_lif_flags: ioctl (set flags)"); 28487c478bd9Sstevel@tonic-gate return (_B_FALSE); 28497c478bd9Sstevel@tonic-gate } 28507c478bd9Sstevel@tonic-gate 28517c478bd9Sstevel@tonic-gate /* 28527c478bd9Sstevel@tonic-gate * Keep pi_flags in synch. with actual flags. Assumes flags are 28537c478bd9Sstevel@tonic-gate * phyint flags. 28547c478bd9Sstevel@tonic-gate */ 28557c478bd9Sstevel@tonic-gate if (setfl) 28567c478bd9Sstevel@tonic-gate pi->pi_flags |= flags; 28577c478bd9Sstevel@tonic-gate else 28587c478bd9Sstevel@tonic-gate pi->pi_flags &= ~flags; 28597c478bd9Sstevel@tonic-gate 28607c478bd9Sstevel@tonic-gate if (pi->pi_v4) 28617c478bd9Sstevel@tonic-gate pi->pi_v4->pii_flags = pi->pi_flags; 28627c478bd9Sstevel@tonic-gate 28637c478bd9Sstevel@tonic-gate if (pi->pi_v6) 28647c478bd9Sstevel@tonic-gate pi->pi_v6->pii_flags = pi->pi_flags; 28657c478bd9Sstevel@tonic-gate 28667c478bd9Sstevel@tonic-gate return (_B_TRUE); 28677c478bd9Sstevel@tonic-gate } 28687c478bd9Sstevel@tonic-gate 28697c478bd9Sstevel@tonic-gate /* 28707c478bd9Sstevel@tonic-gate * icmp cksum computation for IPv4. 
28717c478bd9Sstevel@tonic-gate */ 28727c478bd9Sstevel@tonic-gate static int 28737c478bd9Sstevel@tonic-gate in_cksum(ushort_t *addr, int len) 28747c478bd9Sstevel@tonic-gate { 28757c478bd9Sstevel@tonic-gate register int nleft = len; 28767c478bd9Sstevel@tonic-gate register ushort_t *w = addr; 28777c478bd9Sstevel@tonic-gate register ushort_t answer; 28787c478bd9Sstevel@tonic-gate ushort_t odd_byte = 0; 28797c478bd9Sstevel@tonic-gate register int sum = 0; 28807c478bd9Sstevel@tonic-gate 28817c478bd9Sstevel@tonic-gate /* 28827c478bd9Sstevel@tonic-gate * Our algorithm is simple, using a 32 bit accumulator (sum), 28837c478bd9Sstevel@tonic-gate * we add sequential 16 bit words to it, and at the end, fold 28847c478bd9Sstevel@tonic-gate * back all the carry bits from the top 16 bits into the lower 28857c478bd9Sstevel@tonic-gate * 16 bits. 28867c478bd9Sstevel@tonic-gate */ 28877c478bd9Sstevel@tonic-gate while (nleft > 1) { 28887c478bd9Sstevel@tonic-gate sum += *w++; 28897c478bd9Sstevel@tonic-gate nleft -= 2; 28907c478bd9Sstevel@tonic-gate } 28917c478bd9Sstevel@tonic-gate 28927c478bd9Sstevel@tonic-gate /* mop up an odd byte, if necessary */ 28937c478bd9Sstevel@tonic-gate if (nleft == 1) { 28947c478bd9Sstevel@tonic-gate *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 28957c478bd9Sstevel@tonic-gate sum += odd_byte; 28967c478bd9Sstevel@tonic-gate } 28977c478bd9Sstevel@tonic-gate 28987c478bd9Sstevel@tonic-gate /* 28997c478bd9Sstevel@tonic-gate * add back carry outs from top 16 bits to low 16 bits 29007c478bd9Sstevel@tonic-gate */ 29017c478bd9Sstevel@tonic-gate sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 29027c478bd9Sstevel@tonic-gate sum += (sum >> 16); /* add carry */ 29037c478bd9Sstevel@tonic-gate answer = ~sum; /* truncate to 16 bits */ 29047c478bd9Sstevel@tonic-gate return (answer); 29057c478bd9Sstevel@tonic-gate } 29067c478bd9Sstevel@tonic-gate 29077c478bd9Sstevel@tonic-gate static void 29087c478bd9Sstevel@tonic-gate reset_snxt_basetimes(void) 
29097c478bd9Sstevel@tonic-gate { 29107c478bd9Sstevel@tonic-gate struct phyint_instance *pii; 29117c478bd9Sstevel@tonic-gate 29127c478bd9Sstevel@tonic-gate for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 29137c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 29147c478bd9Sstevel@tonic-gate } 29157c478bd9Sstevel@tonic-gate } 29167c478bd9Sstevel@tonic-gate 29177c478bd9Sstevel@tonic-gate /* 29187c478bd9Sstevel@tonic-gate * Is the address one of our own addresses? Unfortunately, 29197c478bd9Sstevel@tonic-gate * we cannot check our phyint tables to determine if the address 29207c478bd9Sstevel@tonic-gate * is our own. This is because, we don't track interfaces that 29217c478bd9Sstevel@tonic-gate * are not part of any group. We have to either use a 'bind' or 29227c478bd9Sstevel@tonic-gate * get the complete list of all interfaces using SIOCGLIFCONF, 29237c478bd9Sstevel@tonic-gate * to do this check. We choose to use 'bind'. We could use 29247c478bd9Sstevel@tonic-gate * SIOCTMYADDR, but bind is preferred, since it is stronger. 29257c478bd9Sstevel@tonic-gate * SIOCTMYADDR excludes down interfaces, while bind includes even 29267c478bd9Sstevel@tonic-gate * down interfaces. 29277c478bd9Sstevel@tonic-gate */ 29287c478bd9Sstevel@tonic-gate boolean_t 29297c478bd9Sstevel@tonic-gate own_address(int af, struct in6_addr addr) 29307c478bd9Sstevel@tonic-gate { 29317c478bd9Sstevel@tonic-gate int sock; 29327c478bd9Sstevel@tonic-gate boolean_t ours = _B_TRUE; 29337c478bd9Sstevel@tonic-gate 29347c478bd9Sstevel@tonic-gate sock = socket(AF_INET6, SOCK_DGRAM, 0); 29357c478bd9Sstevel@tonic-gate if (sock == -1) { 29367c478bd9Sstevel@tonic-gate logperror("own_address: socket"); 29377c478bd9Sstevel@tonic-gate /* 29387c478bd9Sstevel@tonic-gate * If the socket call fails, err on the side of caution, 29397c478bd9Sstevel@tonic-gate * and return true. 
29407c478bd9Sstevel@tonic-gate */ 29417c478bd9Sstevel@tonic-gate } else { 29427c478bd9Sstevel@tonic-gate struct sockaddr_in6 sin6; 29437c478bd9Sstevel@tonic-gate 29447c478bd9Sstevel@tonic-gate (void) memset(&sin6, 0, sizeof (struct sockaddr_in6)); 29457c478bd9Sstevel@tonic-gate sin6.sin6_family = AF_INET6; 29467c478bd9Sstevel@tonic-gate sin6.sin6_addr = addr; 29477c478bd9Sstevel@tonic-gate /* 29487c478bd9Sstevel@tonic-gate * If the bind succeeds, then this address is one of our 29497c478bd9Sstevel@tonic-gate * addresses. 29507c478bd9Sstevel@tonic-gate * If bind returns error EADDRNOTAVAIL, the address is 29517c478bd9Sstevel@tonic-gate * not one of ours. 29527c478bd9Sstevel@tonic-gate * If bind returns an error other than EADDRNOTAVAIL, err 29537c478bd9Sstevel@tonic-gate * on the side of caution and report the address as one of 29547c478bd9Sstevel@tonic-gate * our own. 29557c478bd9Sstevel@tonic-gate */ 29567c478bd9Sstevel@tonic-gate if (bind(sock, (struct sockaddr *)&sin6, 29577c478bd9Sstevel@tonic-gate sizeof (struct sockaddr_in6)) == -1) { 29587c478bd9Sstevel@tonic-gate if (errno == EADDRNOTAVAIL) 29597c478bd9Sstevel@tonic-gate ours = _B_FALSE; 29607c478bd9Sstevel@tonic-gate else 29617c478bd9Sstevel@tonic-gate logperror("own_address: bind"); 29627c478bd9Sstevel@tonic-gate } 29637c478bd9Sstevel@tonic-gate (void) close(sock); 29647c478bd9Sstevel@tonic-gate } 29657c478bd9Sstevel@tonic-gate if (debug & D_TARGET) { 29667c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 29677c478bd9Sstevel@tonic-gate 29687c478bd9Sstevel@tonic-gate logdebug("own_address: addr %s is %s ours\n", 29697c478bd9Sstevel@tonic-gate pr_addr(af, addr, abuf, sizeof (abuf)), 29707c478bd9Sstevel@tonic-gate ours ? "one of" : "not"); 29717c478bd9Sstevel@tonic-gate } 29727c478bd9Sstevel@tonic-gate return (ours); 29737c478bd9Sstevel@tonic-gate } 2974