/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * nfs_cast.c : broadcast to a specific group of NFS servers * * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NFSCLIENT #include #include "automount.h" #define PENALTY_WEIGHT 100000 struct tstamps { struct tstamps *ts_next; int ts_penalty; int ts_inx; int ts_rcvd; struct timeval ts_timeval; }; /* A list of addresses - all belonging to the same transport */ struct addrs { struct addrs *addr_next; struct mapfs *addr_mfs; struct nd_addrlist *addr_addrs; struct tstamps *addr_if_tstamps; }; /* A list of connectionless transports */ struct transp { struct transp *tr_next; int tr_fd; char *tr_device; struct t_bind *tr_taddr; struct addrs *tr_addrs; }; /* A list of map entries and their roundtrip times, for sorting */ struct sm { struct mapfs *mfs; struct timeval timeval; }; static void free_transports(struct transp *); static void calc_resp_time(struct timeval *); static struct mapfs *sort_responses(struct transp *); static int host_sm(const void *, const void *b); static int time_sm(const void *, const void *b); extern struct mapfs *add_mfs(struct mapfs *, int, struct mapfs **, struct mapfs **); /* * This routine is designed to be able to "ping" * a list of hosts and create a list of responding * hosts sorted by response time. * This must be done without any prior * contact with the host - therefore the "ping" * must be to a "well-known" address. The outstanding * candidate here is the address of "rpcbind". * * A response to a ping is no guarantee that the host * is running NFS, has a mount daemon, or exports * the required filesystem. If the subsequent * mount attempt fails then the host will be marked * "ignore" and the host list will be re-pinged * (sans the bad host). This process continues * until a successful mount is achieved or until * there are no hosts left to try. */ enum clnt_stat nfs_cast(struct mapfs *mfs_in, struct mapfs **mfs_out, int timeout) { enum clnt_stat stat; AUTH *sys_auth = authsys_create_default(); XDR xdr_stream; register XDR *xdrs = &xdr_stream; int outlen; int if_inx; int tsec; int flag; int sent, addr_cnt, rcvd, if_cnt; fd_set readfds, mask; register ulong_t xid; /* xid - unique per addr */ register int i; struct rpc_msg msg; struct timeval t, rcv_timeout; char outbuf[UDPMSGSIZE], inbuf[UDPMSGSIZE]; struct t_unitdata t_udata, t_rdata; struct nd_hostserv hs; struct nd_addrlist *retaddrs; struct transp *tr_head; struct transp *trans, *prev_trans; struct addrs *a, *prev_addr; struct tstamps *ts, *prev_ts; NCONF_HANDLE *nc = NULL; struct netconfig *nconf; struct rlimit rl; int dtbsize; struct mapfs *mfs; /* * For each connectionless transport get a list of * host addresses. Any single host may have * addresses on several transports. */ addr_cnt = sent = rcvd = 0; tr_head = NULL; FD_ZERO(&mask); /* * Set the default select size to be the maximum FD_SETSIZE, unless * the current rlimit is lower. */ dtbsize = FD_SETSIZE; if (getrlimit(RLIMIT_NOFILE, &rl) == 0) { if (rl.rlim_cur < FD_SETSIZE) dtbsize = rl.rlim_cur; } prev_trans = NULL; prev_addr = NULL; prev_ts = NULL; for (mfs = mfs_in; mfs; mfs = mfs->mfs_next) { if (trace > 2) trace_prt(1, "nfs_cast: host=%s\n", mfs->mfs_host); nc = setnetconfig(); if (nc == NULL) { stat = RPC_CANTSEND; goto done_broad; } while (nconf = getnetconfig(nc)) { if (!(nconf->nc_flag & NC_VISIBLE) || nconf->nc_semantics != NC_TPI_CLTS || (strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0)) continue; trans = (struct transp *)malloc(sizeof (*trans)); if (trans == NULL) { syslog(LOG_ERR, "no memory"); stat = RPC_CANTSEND; goto done_broad; } (void) memset(trans, 0, sizeof (*trans)); if (tr_head == NULL) tr_head = trans; else prev_trans->tr_next = trans; prev_trans = trans; trans->tr_fd = t_open(nconf->nc_device, O_RDWR, NULL); if (trans->tr_fd < 0) { syslog(LOG_ERR, "nfscast: t_open: %s:%m", nconf->nc_device); stat = RPC_CANTSEND; goto done_broad; } if (t_bind(trans->tr_fd, (struct t_bind *)NULL, (struct t_bind *)NULL) < 0) { syslog(LOG_ERR, "nfscast: t_bind: %m"); stat = RPC_CANTSEND; goto done_broad; } trans->tr_taddr = /* LINTED pointer alignment */ (struct t_bind *)t_alloc(trans->tr_fd, T_BIND, T_ADDR); if (trans->tr_taddr == (struct t_bind *)NULL) { syslog(LOG_ERR, "nfscast: t_alloc: %m"); stat = RPC_SYSTEMERROR; goto done_broad; } trans->tr_device = nconf->nc_device; FD_SET(trans->tr_fd, &mask); if_inx = 0; hs.h_host = mfs->mfs_host; hs.h_serv = "rpcbind"; if (netdir_getbyname(nconf, &hs, &retaddrs) == ND_OK) { /* * If mfs->ignore is previously set for * this map, clear it. Because a host can * have either v6 or v4 address */ if (mfs->mfs_ignore == 1) mfs->mfs_ignore = 0; a = (struct addrs *)malloc(sizeof (*a)); if (a == NULL) { syslog(LOG_ERR, "no memory"); stat = RPC_CANTSEND; goto done_broad; } (void) memset(a, 0, sizeof (*a)); if (trans->tr_addrs == NULL) trans->tr_addrs = a; else prev_addr->addr_next = a; prev_addr = a; a->addr_if_tstamps = NULL; a->addr_mfs = mfs; a->addr_addrs = retaddrs; if_cnt = retaddrs->n_cnt; while (if_cnt--) { ts = (struct tstamps *) malloc(sizeof (*ts)); if (ts == NULL) { syslog(LOG_ERR, "no memory"); stat = RPC_CANTSEND; goto done_broad; } (void) memset(ts, 0, sizeof (*ts)); ts->ts_penalty = mfs->mfs_penalty; if (a->addr_if_tstamps == NULL) a->addr_if_tstamps = ts; else prev_ts->ts_next = ts; prev_ts = ts; ts->ts_inx = if_inx++; addr_cnt++; } break; } else { mfs->mfs_ignore = 1; if (verbose) syslog(LOG_ERR, "%s:%s address not known", mfs->mfs_host, strcmp(nconf->nc_proto, NC_INET)?"IPv6":"IPv4"); } } /* while */ endnetconfig(nc); nc = NULL; } /* for */ if (addr_cnt == 0) { syslog(LOG_ERR, "nfscast: couldn't find addresses"); stat = RPC_CANTSEND; goto done_broad; } (void) gettimeofday(&t, (struct timezone *)0); xid = (getpid() ^ t.tv_sec ^ t.tv_usec) & ~0xFF; t.tv_usec = 0; /* serialize the RPC header */ msg.rm_direction = CALL; msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; msg.rm_call.cb_prog = RPCBPROG; /* * we can not use RPCBVERS here since it doesn't exist in 4.X, * the fix to bug 1139883 has made the 4.X portmapper silent to * version mismatches. This causes the RPC call to the remote * portmapper to simply be ignored if it's not Version 2. */ msg.rm_call.cb_vers = PMAPVERS; msg.rm_call.cb_proc = NULLPROC; if (sys_auth == (AUTH *)NULL) { stat = RPC_SYSTEMERROR; goto done_broad; } msg.rm_call.cb_cred = sys_auth->ah_cred; msg.rm_call.cb_verf = sys_auth->ah_verf; xdrmem_create(xdrs, outbuf, sizeof (outbuf), XDR_ENCODE); if (! xdr_callmsg(xdrs, &msg)) { stat = RPC_CANTENCODEARGS; goto done_broad; } outlen = (int)xdr_getpos(xdrs); xdr_destroy(xdrs); t_udata.opt.len = 0; t_udata.udata.buf = outbuf; t_udata.udata.len = outlen; /* * Basic loop: send packet to all hosts and wait for response(s). * The response timeout grows larger per iteration. * A unique xid is assigned to each address in order to * correctly match the replies. */ for (tsec = 4; timeout > 0; tsec *= 2) { timeout -= tsec; if (timeout <= 0) tsec += timeout; rcv_timeout.tv_sec = tsec; rcv_timeout.tv_usec = 0; sent = 0; for (trans = tr_head; trans; trans = trans->tr_next) { for (a = trans->tr_addrs; a; a = a->addr_next) { struct netbuf *if_netbuf = a->addr_addrs->n_addrs; ts = a->addr_if_tstamps; if_cnt = a->addr_addrs->n_cnt; while (if_cnt--) { /* * xid is the first thing in * preserialized buffer */ /* LINTED pointer alignment */ *((ulong_t *)outbuf) = htonl(xid + ts->ts_inx); (void) gettimeofday(&(ts->ts_timeval), (struct timezone *)0); /* * Check if already received * from a previous iteration. */ if (ts->ts_rcvd) { sent++; ts = ts->ts_next; continue; } t_udata.addr = *if_netbuf++; if (t_sndudata(trans->tr_fd, &t_udata) == 0) { sent++; } ts = ts->ts_next; } } } if (sent == 0) { /* no packets sent ? */ stat = RPC_CANTSEND; goto done_broad; } /* * Have sent all the packets. Now collect the responses... */ rcvd = 0; recv_again: msg.acpted_rply.ar_verf = _null_auth; msg.acpted_rply.ar_results.proc = xdr_void; readfds = mask; switch (select(dtbsize, &readfds, (fd_set *)NULL, (fd_set *)NULL, &rcv_timeout)) { case 0: /* Timed out */ /* * If we got at least one response in the * last interval, then don't wait for any * more. In theory we should wait for * the max weighting (penalty) value so * that a very slow server has a chance to * respond but this could take a long time * if the admin has set a high weighting * value. */ if (rcvd > 0) goto done_broad; stat = RPC_TIMEDOUT; continue; case -1: /* some kind of error */ if (errno == EINTR) goto recv_again; syslog(LOG_ERR, "nfscast: select: %m"); if (rcvd == 0) stat = RPC_CANTRECV; goto done_broad; } /* end of select results switch */ for (trans = tr_head; trans; trans = trans->tr_next) { if (FD_ISSET(trans->tr_fd, &readfds)) break; } if (trans == NULL) goto recv_again; try_again: t_rdata.addr = trans->tr_taddr->addr; t_rdata.udata.buf = inbuf; t_rdata.udata.maxlen = sizeof (inbuf); t_rdata.udata.len = 0; t_rdata.opt.len = 0; if (t_rcvudata(trans->tr_fd, &t_rdata, &flag) < 0) { if (errno == EINTR) goto try_again; syslog(LOG_ERR, "nfscast: t_rcvudata: %s:%m", trans->tr_device); stat = RPC_CANTRECV; continue; } if (t_rdata.udata.len < sizeof (ulong_t)) goto recv_again; if (flag & T_MORE) { syslog(LOG_ERR, "nfscast: t_rcvudata: %s: buffer overflow", trans->tr_device); goto recv_again; } /* * see if reply transaction id matches sent id. * If so, decode the results. * Note: received addr is ignored, it could be * different from the send addr if the host has * more than one addr. */ xdrmem_create(xdrs, inbuf, (uint_t)t_rdata.udata.len, XDR_DECODE); if (xdr_replymsg(xdrs, &msg)) { if (msg.rm_reply.rp_stat == MSG_ACCEPTED && (msg.rm_xid & ~0xFF) == xid) { struct addrs *curr_addr; i = msg.rm_xid & 0xFF; for (curr_addr = trans->tr_addrs; curr_addr; curr_addr = curr_addr->addr_next) { for (ts = curr_addr->addr_if_tstamps; ts; ts = ts->ts_next) if (ts->ts_inx == i && !ts->ts_rcvd) { ts->ts_rcvd = 1; calc_resp_time(&ts->ts_timeval); stat = RPC_SUCCESS; rcvd++; break; } } } /* otherwise, we just ignore the errors ... */ } xdrs->x_op = XDR_FREE; msg.acpted_rply.ar_results.proc = xdr_void; (void) xdr_replymsg(xdrs, &msg); XDR_DESTROY(xdrs); if (rcvd == sent) goto done_broad; else goto recv_again; } if (!rcvd) stat = RPC_TIMEDOUT; done_broad: if (rcvd) { *mfs_out = sort_responses(tr_head); stat = RPC_SUCCESS; } if (nc) endnetconfig(nc); free_transports(tr_head); AUTH_DESTROY(sys_auth); return (stat); } /* * Go through all the responses and sort fastest to slowest. * Note that any penalty is added to the response time - so the * fastest response isn't necessarily the one that arrived first. */ static struct mapfs * sort_responses(trans) struct transp *trans; { struct transp *t; struct addrs *a; struct tstamps *ti; int i, size = 0, allocsize = 10; struct mapfs *p, *mfs_head = NULL, *mfs_tail = NULL; struct sm *buffer; buffer = (struct sm *)malloc(allocsize * sizeof (struct sm)); if (!buffer) { syslog(LOG_ERR, "sort_responses: malloc error.\n"); return (NULL); } for (t = trans; t; t = t->tr_next) { for (a = t->tr_addrs; a; a = a->addr_next) { for (ti = a->addr_if_tstamps; ti; ti = ti->ts_next) { if (!ti->ts_rcvd) continue; ti->ts_timeval.tv_usec += (ti->ts_penalty * PENALTY_WEIGHT); if (ti->ts_timeval.tv_usec >= 1000000) { ti->ts_timeval.tv_sec += (ti->ts_timeval.tv_usec / 1000000); ti->ts_timeval.tv_usec = (ti->ts_timeval.tv_usec % 1000000); } if (size >= allocsize) { allocsize += 10; buffer = (struct sm *)realloc(buffer, allocsize * sizeof (struct sm)); if (!buffer) { syslog(LOG_ERR, "sort_responses: malloc error.\n"); return (NULL); } } buffer[size].timeval = ti->ts_timeval; buffer[size].mfs = a->addr_mfs; size++; } } } #ifdef DEBUG if (trace > 3) { trace_prt(1, " sort_responses: before host sort:\n"); for (i = 0; i < size; i++) trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); trace_prt(0, "\n"); } #endif qsort((void *)buffer, size, sizeof (struct sm), host_sm); /* * Cope with multiply listed hosts by choosing first time */ for (i = 1; i < size; i++) { #ifdef DEBUG if (trace > 3) { trace_prt(1, " sort_responses: comparing %s and %s\n", buffer[i-1].mfs->mfs_host, buffer[i].mfs->mfs_host); } #endif if (strcmp(buffer[i-1].mfs->mfs_host, buffer[i].mfs->mfs_host) == 0) memcpy(&buffer[i].timeval, &buffer[i-1].timeval, sizeof (struct timeval)); } if (trace > 3) trace_prt(0, "\n"); #ifdef DEBUG if (trace > 3) { trace_prt(1, " sort_responses: before time sort:\n"); for (i = 0; i < size; i++) trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); trace_prt(0, "\n"); } #endif qsort((void *)buffer, size, sizeof (struct sm), time_sm); #ifdef DEBUG if (trace > 3) { trace_prt(1, " sort_responses: after sort:\n"); for (i = 0; i < size; i++) trace_prt(1, " %s %d.%d\n", buffer[i].mfs->mfs_host, buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec); trace_prt(0, "\n"); } #endif for (i = 0; i < size; i++) { #ifdef DEBUG if (trace > 3) { trace_prt(1, " sort_responses: adding %s\n", buffer[i].mfs->mfs_host); } #endif p = add_mfs(buffer[i].mfs, 0, &mfs_head, &mfs_tail); if (!p) return (NULL); } free(buffer); return (mfs_head); } /* * Comparison routines called by qsort(3). */ static int host_sm(const void *a, const void *b) { return (strcmp(((struct sm *)a)->mfs->mfs_host, ((struct sm *)b)->mfs->mfs_host)); } static int time_sm(const void *a, const void *b) { if (timercmp(&(((struct sm *)a)->timeval), &(((struct sm *)b)->timeval), < /* cstyle */)) return (-1); else if (timercmp(&(((struct sm *)a)->timeval), &(((struct sm *)b)->timeval), > /* cstyle */)) return (1); else return (0); } /* * Given send_time which is the time a request * was transmitted to a server, subtract it * from the time "now" thereby converting it * to an elapsed time. */ static void calc_resp_time(send_time) struct timeval *send_time; { struct timeval time_now; (void) gettimeofday(&time_now, (struct timezone *)0); if (time_now.tv_usec < send_time->tv_usec) { time_now.tv_sec--; time_now.tv_usec += 1000000; } send_time->tv_sec = time_now.tv_sec - send_time->tv_sec; send_time->tv_usec = time_now.tv_usec - send_time->tv_usec; } static void free_transports(trans) struct transp *trans; { struct transp *t, *tmpt = NULL; struct addrs *a, *tmpa = NULL; struct tstamps *ts, *tmpts = NULL; for (t = trans; t; t = tmpt) { if (t->tr_taddr) (void) t_free((char *)t->tr_taddr, T_BIND); if (t->tr_fd > 0) (void) t_close(t->tr_fd); for (a = t->tr_addrs; a; a = tmpa) { for (ts = a->addr_if_tstamps; ts; ts = tmpts) { tmpts = ts->ts_next; free(ts); } (void) netdir_free((char *)a->addr_addrs, ND_ADDRLIST); tmpa = a->addr_next; free(a); } tmpt = t->tr_next; free(t); } }