/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 *	nfs_cast.c : broadcast to a specific group of NFS servers
 *
 *      Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 *      Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <stdio.h>
#include <syslog.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#include <stdlib.h>
#include <rpc/rpc.h>
#include <rpc/clnt_soc.h>
#include <rpc/nettype.h>
#include <rpc/pmap_prot.h>
#include <netconfig.h>
#include <netdir.h>
#include <nfs/nfs.h>
#define	NFSCLIENT
#include <locale.h>
#include "automount.h"

#define	PENALTY_WEIGHT    100000

struct tstamps {
	struct tstamps	*ts_next;
	int		ts_penalty;
	int		ts_inx;
	int		ts_rcvd;
	struct timeval	ts_timeval;
};

/* A list of addresses - all belonging to the same transport */

struct addrs {
	struct addrs		*addr_next;
	struct mapfs		*addr_mfs;
	struct nd_addrlist	*addr_addrs;
	struct tstamps		*addr_if_tstamps;
};

/* A list of connectionless transports */

struct transp {
	struct transp		*tr_next;
	int			tr_fd;
	char			*tr_device;
	struct t_bind		*tr_taddr;
	struct addrs		*tr_addrs;
};

/* A list of map entries and their roundtrip times, for sorting */

struct sm {
	struct mapfs *mfs;
	struct timeval timeval;
};

static void free_transports(struct transp *);
static void calc_resp_time(struct timeval *);
static struct mapfs *sort_responses(struct transp *);
static int host_sm(const void *, const void *b);
static int time_sm(const void *, const void *b);
extern struct mapfs *add_mfs(struct mapfs *, int, struct mapfs **,
	struct mapfs **);

/*
 * This routine is designed to be able to "ping"
 * a list of hosts and create a list of responding
 * hosts sorted by response time.
 * This must be done without any prior
 * contact with the host - therefore the "ping"
 * must be to a "well-known" address.  The outstanding
 * candidate here is the address of "rpcbind".
 *
 * A response to a ping is no guarantee that the host
 * is running NFS, has a mount daemon, or exports
 * the required filesystem.  If the subsequent
 * mount attempt fails then the host will be marked
 * "ignore" and the host list will be re-pinged
 * (sans the bad host). This process continues
 * until a successful mount is achieved or until
 * there are no hosts left to try.
 */
enum clnt_stat
nfs_cast(struct mapfs *mfs_in, struct mapfs **mfs_out, int timeout)
{
	enum clnt_stat stat;
	AUTH *sys_auth = authsys_create_default();
	XDR xdr_stream;
	register XDR *xdrs = &xdr_stream;
	int outlen;
	int if_inx;
	int tsec;
	int flag;
	int sent, addr_cnt, rcvd, if_cnt;
	fd_set readfds, mask;
	register ulong_t xid;		/* xid - unique per addr */
	register int i;
	struct rpc_msg msg;
	struct timeval t, rcv_timeout;
	char outbuf[UDPMSGSIZE], inbuf[UDPMSGSIZE];
	struct t_unitdata t_udata, t_rdata;
	struct nd_hostserv hs;
	struct nd_addrlist *retaddrs;
	struct transp *tr_head;
	struct transp *trans, *prev_trans;
	struct addrs *a, *prev_addr;
	struct tstamps *ts, *prev_ts;
	NCONF_HANDLE *nc = NULL;
	struct netconfig *nconf;
	struct rlimit rl;
	int dtbsize;
	struct mapfs *mfs;

	/*
	 * For each connectionless transport get a list of
	 * host addresses.  Any single host may have
	 * addresses on several transports.
	 */
	addr_cnt = sent = rcvd = 0;
	tr_head = NULL;
	FD_ZERO(&mask);

	/*
	 * Set the default select size to be the maximum FD_SETSIZE, unless
	 * the current rlimit is lower.
	 */
	dtbsize = FD_SETSIZE;
	if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
		if (rl.rlim_cur < FD_SETSIZE)
			dtbsize = rl.rlim_cur;
	}

	prev_trans = NULL;
	prev_addr = NULL;
	prev_ts = NULL;
	for (mfs = mfs_in; mfs; mfs = mfs->mfs_next) {

		if (trace > 2)
			trace_prt(1, "nfs_cast: host=%s\n", mfs->mfs_host);

		nc = setnetconfig();
		if (nc == NULL) {
			stat = RPC_CANTSEND;
			goto done_broad;
		}
		while (nconf = getnetconfig(nc)) {
			if (!(nconf->nc_flag & NC_VISIBLE) ||
			    nconf->nc_semantics != NC_TPI_CLTS ||
			    (strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0))
				continue;
			trans = (struct transp *)malloc(sizeof (*trans));
			if (trans == NULL) {
				syslog(LOG_ERR, "no memory");
				stat = RPC_CANTSEND;
				goto done_broad;
			}
			(void) memset(trans, 0, sizeof (*trans));
			if (tr_head == NULL)
				tr_head = trans;
			else
				prev_trans->tr_next = trans;
			prev_trans = trans;

			trans->tr_fd = t_open(nconf->nc_device, O_RDWR, NULL);
			if (trans->tr_fd < 0) {
				syslog(LOG_ERR, "nfscast: t_open: %s:%m",
					nconf->nc_device);
				stat = RPC_CANTSEND;
				goto done_broad;
			}
			if (t_bind(trans->tr_fd, (struct t_bind *)NULL,
				(struct t_bind *)NULL) < 0) {
				syslog(LOG_ERR, "nfscast: t_bind: %m");
				stat = RPC_CANTSEND;
				goto done_broad;
			}
			trans->tr_taddr =
				/* LINTED pointer alignment */
			(struct t_bind *)t_alloc(trans->tr_fd, T_BIND, T_ADDR);
			if (trans->tr_taddr == (struct t_bind *)NULL) {
				syslog(LOG_ERR, "nfscast: t_alloc: %m");
				stat = RPC_SYSTEMERROR;
				goto done_broad;
			}

			trans->tr_device = nconf->nc_device;
			FD_SET(trans->tr_fd, &mask);

			if_inx = 0;
			hs.h_host = mfs->mfs_host;
			hs.h_serv = "rpcbind";
			if (netdir_getbyname(nconf, &hs, &retaddrs) == ND_OK) {

				/*
				 * If mfs->ignore is previously set for
				 * this map, clear it. Because a host can
				 * have either v6 or v4 address
				 */
				if (mfs->mfs_ignore == 1)
					mfs->mfs_ignore = 0;

				a = (struct addrs *)malloc(sizeof (*a));
				if (a == NULL) {
					syslog(LOG_ERR, "no memory");
					stat = RPC_CANTSEND;
					goto done_broad;
				}
				(void) memset(a, 0, sizeof (*a));
				if (trans->tr_addrs == NULL)
					trans->tr_addrs = a;
				else
					prev_addr->addr_next = a;
				prev_addr = a;
				a->addr_if_tstamps = NULL;
				a->addr_mfs = mfs;
				a->addr_addrs = retaddrs;
				if_cnt = retaddrs->n_cnt;
				while (if_cnt--) {
					ts = (struct tstamps *)
						malloc(sizeof (*ts));
					if (ts == NULL) {
						syslog(LOG_ERR, "no memory");
						stat = RPC_CANTSEND;
						goto done_broad;
					}
					(void) memset(ts, 0, sizeof (*ts));
					ts->ts_penalty = mfs->mfs_penalty;
					if (a->addr_if_tstamps == NULL)
						a->addr_if_tstamps = ts;
					else
						prev_ts->ts_next = ts;
					prev_ts = ts;
					ts->ts_inx = if_inx++;
					addr_cnt++;
				}
				break;
			} else {
				mfs->mfs_ignore = 1;
				if (verbose)
					syslog(LOG_ERR,
				"%s:%s address not known",
				mfs->mfs_host,
				strcmp(nconf->nc_proto, NC_INET)?"IPv6":"IPv4");
			}
		} /* while */

		endnetconfig(nc);
		nc = NULL;
	} /* for */
	if (addr_cnt == 0) {
		syslog(LOG_ERR, "nfscast: couldn't find addresses");
		stat = RPC_CANTSEND;
		goto done_broad;
	}

	(void) gettimeofday(&t, (struct timezone *)0);
	xid = (getpid() ^ t.tv_sec ^ t.tv_usec) & ~0xFF;
	t.tv_usec = 0;

	/* serialize the RPC header */

	msg.rm_direction = CALL;
	msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
	msg.rm_call.cb_prog = RPCBPROG;
	/*
	 * we can not use RPCBVERS here since it doesn't exist in 4.X,
	 * the fix to bug 1139883 has made the 4.X portmapper silent to
	 * version mismatches. This causes the RPC call to the remote
	 * portmapper to simply be ignored if it's not Version 2.
	 */
	msg.rm_call.cb_vers = PMAPVERS;
	msg.rm_call.cb_proc = NULLPROC;
	if (sys_auth == (AUTH *)NULL) {
		stat = RPC_SYSTEMERROR;
		goto done_broad;
	}
	msg.rm_call.cb_cred = sys_auth->ah_cred;
	msg.rm_call.cb_verf = sys_auth->ah_verf;
	xdrmem_create(xdrs, outbuf, sizeof (outbuf), XDR_ENCODE);
	if (! xdr_callmsg(xdrs, &msg)) {
		stat = RPC_CANTENCODEARGS;
		goto done_broad;
	}
	outlen = (int)xdr_getpos(xdrs);
	xdr_destroy(xdrs);

	t_udata.opt.len = 0;
	t_udata.udata.buf = outbuf;
	t_udata.udata.len = outlen;

	/*
	 * Basic loop: send packet to all hosts and wait for response(s).
	 * The response timeout grows larger per iteration.
	 * A unique xid is assigned to each address in order to
	 * correctly match the replies.
	 */
	for (tsec = 4; timeout > 0; tsec *= 2) {

		timeout -= tsec;
		if (timeout <= 0)
			tsec += timeout;

		rcv_timeout.tv_sec = tsec;
		rcv_timeout.tv_usec = 0;

		sent = 0;
		for (trans = tr_head; trans; trans = trans->tr_next) {
			for (a = trans->tr_addrs; a; a = a->addr_next) {
				struct netbuf *if_netbuf =
					a->addr_addrs->n_addrs;
				ts = a->addr_if_tstamps;
				if_cnt = a->addr_addrs->n_cnt;
				while (if_cnt--) {

					/*
					 * xid is the first thing in
					 * preserialized buffer
					 */
					/* LINTED pointer alignment */
					*((ulong_t *)outbuf) =
						htonl(xid + ts->ts_inx);
					(void) gettimeofday(&(ts->ts_timeval),
						(struct timezone *)0);
					/*
					 * Check if already received
					 * from a previous iteration.
					 */
					if (ts->ts_rcvd) {
						sent++;
						ts = ts->ts_next;
						continue;
					}

					t_udata.addr = *if_netbuf++;

					if (t_sndudata(trans->tr_fd,
							&t_udata) == 0) {
						sent++;
					}

					ts = ts->ts_next;
				}
			}
		}
		if (sent == 0) {		/* no packets sent ? */
			stat = RPC_CANTSEND;
			goto done_broad;
		}

		/*
		 * Have sent all the packets.  Now collect the responses...
		 */
		rcvd = 0;
	recv_again:
		msg.acpted_rply.ar_verf = _null_auth;
		msg.acpted_rply.ar_results.proc = xdr_void;
		readfds = mask;

		switch (select(dtbsize, &readfds,
			(fd_set *)NULL, (fd_set *)NULL, &rcv_timeout)) {

		case 0: /* Timed out */
			/*
			 * If we got at least one response in the
			 * last interval, then don't wait for any
			 * more.  In theory we should wait for
			 * the max weighting (penalty) value so
			 * that a very slow server has a chance to
			 * respond but this could take a long time
			 * if the admin has set a high weighting
			 * value.
			 */
			if (rcvd > 0)
				goto done_broad;

			stat = RPC_TIMEDOUT;
			continue;

		case -1:  /* some kind of error */
			if (errno == EINTR)
				goto recv_again;
			syslog(LOG_ERR, "nfscast: select: %m");
			if (rcvd == 0)
				stat = RPC_CANTRECV;
			goto done_broad;

		}  /* end of select results switch */

		for (trans = tr_head; trans; trans = trans->tr_next) {
			if (FD_ISSET(trans->tr_fd, &readfds))
				break;
		}
		if (trans == NULL)
			goto recv_again;

	try_again:
		t_rdata.addr = trans->tr_taddr->addr;
		t_rdata.udata.buf = inbuf;
		t_rdata.udata.maxlen = sizeof (inbuf);
		t_rdata.udata.len = 0;
		t_rdata.opt.len = 0;
		if (t_rcvudata(trans->tr_fd, &t_rdata, &flag) < 0) {
			if (errno == EINTR)
				goto try_again;
			syslog(LOG_ERR, "nfscast: t_rcvudata: %s:%m",
				trans->tr_device);
			stat = RPC_CANTRECV;
			continue;
		}
		if (t_rdata.udata.len < sizeof (ulong_t))
			goto recv_again;
		if (flag & T_MORE) {
			syslog(LOG_ERR,
				"nfscast: t_rcvudata: %s: buffer overflow",
				trans->tr_device);
			goto recv_again;
		}

		/*
		 * see if reply transaction id matches sent id.
		 * If so, decode the results.
		 * Note: received addr is ignored, it could be
		 * different from the send addr if the host has
		 * more than one addr.
		 */
		xdrmem_create(xdrs, inbuf, (uint_t)t_rdata.udata.len,
								XDR_DECODE);
		if (xdr_replymsg(xdrs, &msg)) {
		    if (msg.rm_reply.rp_stat == MSG_ACCEPTED &&
			(msg.rm_xid & ~0xFF) == xid) {
			struct addrs *curr_addr;

			i = msg.rm_xid & 0xFF;
			for (curr_addr = trans->tr_addrs; curr_addr;
			    curr_addr = curr_addr->addr_next) {
			    for (ts = curr_addr->addr_if_tstamps; ts;
				ts = ts->ts_next)
				if (ts->ts_inx == i && !ts->ts_rcvd) {
					ts->ts_rcvd = 1;
					calc_resp_time(&ts->ts_timeval);
					stat = RPC_SUCCESS;
					rcvd++;
					break;
				}
			}
		    } /* otherwise, we just ignore the errors ... */
		}
		xdrs->x_op = XDR_FREE;
		msg.acpted_rply.ar_results.proc = xdr_void;
		(void) xdr_replymsg(xdrs, &msg);
		XDR_DESTROY(xdrs);
		if (rcvd == sent)
			goto done_broad;
		else
			goto recv_again;
	}
	if (!rcvd)
		stat = RPC_TIMEDOUT;

done_broad:
	if (rcvd) {
		*mfs_out = sort_responses(tr_head);
		stat = RPC_SUCCESS;
	}
	if (nc)
		endnetconfig(nc);
	free_transports(tr_head);
	AUTH_DESTROY(sys_auth);
	return (stat);
}

/*
 * Go through all the responses and sort fastest to slowest.
 * Note that any penalty is added to the response time - so the
 * fastest response isn't necessarily the one that arrived first.
 */
static struct mapfs *
sort_responses(trans)
	struct transp *trans;
{
	struct transp *t;
	struct addrs *a;
	struct tstamps *ti;
	int i, size = 0, allocsize = 10;
	struct mapfs *p, *mfs_head = NULL, *mfs_tail = NULL;
	struct sm *buffer;

	buffer = (struct sm *)malloc(allocsize * sizeof (struct sm));
	if (!buffer) {
		syslog(LOG_ERR, "sort_responses: malloc error.\n");
		return (NULL);
	}

	for (t = trans; t; t = t->tr_next) {
		for (a = t->tr_addrs; a; a = a->addr_next) {
			for (ti = a->addr_if_tstamps;
				ti; ti = ti->ts_next) {
				if (!ti->ts_rcvd)
					continue;
				ti->ts_timeval.tv_usec +=
					(ti->ts_penalty * PENALTY_WEIGHT);
				if (ti->ts_timeval.tv_usec >= 1000000) {
					ti->ts_timeval.tv_sec +=
					(ti->ts_timeval.tv_usec / 1000000);
					ti->ts_timeval.tv_usec =
					(ti->ts_timeval.tv_usec % 1000000);
				}

				if (size >= allocsize) {
					allocsize += 10;
					buffer = (struct sm *)realloc(buffer,
					    allocsize * sizeof (struct sm));
					if (!buffer) {
						syslog(LOG_ERR,
					    "sort_responses: malloc error.\n");
						return (NULL);
					}
				}
				buffer[size].timeval = ti->ts_timeval;
				buffer[size].mfs = a->addr_mfs;
				size++;
			}
		}
	}

#ifdef DEBUG
	if (trace > 3) {
		trace_prt(1, "  sort_responses: before host sort:\n");
		for (i = 0; i < size; i++)
			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
		trace_prt(0, "\n");
	}
#endif

	qsort((void *)buffer, size, sizeof (struct sm), host_sm);

	/*
	 * Cope with multiply listed hosts  by choosing first time
	 */
	for (i = 1; i < size; i++) {
#ifdef DEBUG
		if (trace > 3) {
			trace_prt(1, "  sort_responses: comparing %s and %s\n",
				buffer[i-1].mfs->mfs_host,
				buffer[i].mfs->mfs_host);
		}
#endif
		if (strcmp(buffer[i-1].mfs->mfs_host,
		    buffer[i].mfs->mfs_host) == 0)
			memcpy(&buffer[i].timeval, &buffer[i-1].timeval,
				sizeof (struct timeval));
	}
	if (trace > 3)
		trace_prt(0, "\n");

#ifdef DEBUG
	if (trace > 3) {
		trace_prt(1, "  sort_responses: before time sort:\n");
		for (i = 0; i < size; i++)
			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
		trace_prt(0, "\n");
	}
#endif

	qsort((void *)buffer, size, sizeof (struct sm), time_sm);

#ifdef DEBUG
	if (trace > 3) {
		trace_prt(1, "  sort_responses: after sort:\n");
		for (i = 0; i < size; i++)
			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
		trace_prt(0, "\n");
	}
#endif

	for (i = 0; i < size; i++) {
#ifdef DEBUG
		if (trace > 3) {
			trace_prt(1, "  sort_responses: adding %s\n",
				buffer[i].mfs->mfs_host);
		}
#endif
		p = add_mfs(buffer[i].mfs, 0, &mfs_head, &mfs_tail);
		if (!p)
			return (NULL);
	}
	free(buffer);

	return (mfs_head);
}


/*
 * Comparison routines called by qsort(3).
 */
static int host_sm(const void *a, const void *b)
{
	return (strcmp(((struct sm *)a)->mfs->mfs_host,
			((struct sm *)b)->mfs->mfs_host));
}

static int time_sm(const void *a, const void *b)
{
	if (timercmp(&(((struct sm *)a)->timeval),
	    &(((struct sm *)b)->timeval), < /* cstyle */))
		return (-1);
	else if (timercmp(&(((struct sm *)a)->timeval),
	    &(((struct sm *)b)->timeval), > /* cstyle */))
		return (1);
	else
		return (0);
}

/*
 * Given send_time which is the time a request
 * was transmitted to a server, subtract it
 * from the time "now" thereby converting it
 * to an elapsed time.
 */
static void
calc_resp_time(send_time)
struct timeval *send_time;
{
	struct timeval time_now;

	(void) gettimeofday(&time_now, (struct timezone *)0);
	if (time_now.tv_usec <  send_time->tv_usec) {
		time_now.tv_sec--;
		time_now.tv_usec += 1000000;
	}
	send_time->tv_sec = time_now.tv_sec - send_time->tv_sec;
	send_time->tv_usec = time_now.tv_usec - send_time->tv_usec;
}

static void
free_transports(trans)
	struct transp *trans;
{
	struct transp *t, *tmpt = NULL;
	struct addrs *a, *tmpa = NULL;
	struct tstamps *ts, *tmpts = NULL;

	for (t = trans; t; t = tmpt) {
		if (t->tr_taddr)
			(void) t_free((char *)t->tr_taddr, T_BIND);
		if (t->tr_fd > 0)
			(void) t_close(t->tr_fd);
		for (a = t->tr_addrs; a; a = tmpa) {
			for (ts = a->addr_if_tstamps; ts; ts = tmpts) {
				tmpts = ts->ts_next;
				free(ts);
			}
			(void) netdir_free((char *)a->addr_addrs, ND_ADDRLIST);
			tmpa = a->addr_next;
			free(a);
		}
		tmpt = t->tr_next;
		free(t);
	}
}