1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 *	nfs_cast.c : broadcast to a specific group of NFS servers
24 *
25 *      Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
26 *      Use is subject to license terms.
27 */
28
29#pragma ident	"%Z%%M%	%I%	%E% SMI"
30
31#include <stdio.h>
32#include <syslog.h>
33#include <errno.h>
34#include <string.h>
35#include <sys/types.h>
36#include <sys/time.h>
37#include <sys/resource.h>
38#include <unistd.h>
39#include <stdlib.h>
40#include <rpc/rpc.h>
41#include <rpc/clnt_soc.h>
42#include <rpc/nettype.h>
43#include <rpc/pmap_prot.h>
44#include <netconfig.h>
45#include <netdir.h>
46#include <nfs/nfs.h>
47#define	NFSCLIENT
48#include <locale.h>
49#include "automount.h"
50
51#define	PENALTY_WEIGHT    100000
52
/*
 * Per-address bookkeeping: one entry for every address returned for a host
 * on a given transport.
 */
struct tstamps {
	struct tstamps	*ts_next;	/* next entry for the same host */
	int		ts_penalty;	/* copied from the host's mfs_penalty */
	int		ts_inx;		/* address index; added to the base xid */
	int		ts_rcvd;	/* non-zero once a matching reply arrived */
	/*
	 * Time the request was sent; calc_resp_time() turns it into the
	 * elapsed response time when the reply is matched.
	 */
	struct timeval	ts_timeval;
};
60
61/* A list of addresses - all belonging to the same transport */
62
struct addrs {
	struct addrs		*addr_next;	/* next host on this transport */
	struct mapfs		*addr_mfs;	/* map entry these addresses belong to */
	struct nd_addrlist	*addr_addrs;	/* result of netdir_getbyname() */
	struct tstamps		*addr_if_tstamps; /* one tstamps per address */
};
69
70/* A list of connectionless transports */
71
struct transp {
	struct transp		*tr_next;	/* next transport in the list */
	int			tr_fd;		/* endpoint returned by t_open() */
	char			*tr_device;	/* device name, used in log messages */
	struct t_bind		*tr_taddr;	/* t_alloc()ed T_BIND; receive address */
	struct addrs		*tr_addrs;	/* hosts reached through this endpoint */
};
79
80/* A list of map entries and their roundtrip times, for sorting */
81
struct sm {
	struct mapfs *mfs;		/* map entry that answered */
	struct timeval timeval;		/* its response time, penalty included */
};
86
/* Helpers private to this file. */
static struct mapfs *sort_responses(struct transp *trans);
static void calc_resp_time(struct timeval *send_time);
static void free_transports(struct transp *trans);

/* qsort(3C) comparison callbacks operating on struct sm elements. */
static int host_sm(const void *a, const void *b);
static int time_sm(const void *a, const void *b);

/* Not defined in this file; used by sort_responses() to build its result. */
extern struct mapfs *add_mfs(struct mapfs *, int, struct mapfs **,
	struct mapfs **);
94
95/*
96 * This routine is designed to be able to "ping"
97 * a list of hosts and create a list of responding
98 * hosts sorted by response time.
99 * This must be done without any prior
100 * contact with the host - therefore the "ping"
101 * must be to a "well-known" address.  The outstanding
102 * candidate here is the address of "rpcbind".
103 *
104 * A response to a ping is no guarantee that the host
105 * is running NFS, has a mount daemon, or exports
106 * the required filesystem.  If the subsequent
107 * mount attempt fails then the host will be marked
108 * "ignore" and the host list will be re-pinged
109 * (sans the bad host). This process continues
110 * until a successful mount is achieved or until
111 * there are no hosts left to try.
112 */
113enum clnt_stat
114nfs_cast(struct mapfs *mfs_in, struct mapfs **mfs_out, int timeout)
115{
116	enum clnt_stat stat;
117	AUTH *sys_auth = authsys_create_default();
118	XDR xdr_stream;
119	register XDR *xdrs = &xdr_stream;
120	int outlen;
121	int if_inx;
122	int tsec;
123	int flag;
124	int sent, addr_cnt, rcvd, if_cnt;
125	fd_set readfds, mask;
126	register ulong_t xid;		/* xid - unique per addr */
127	register int i;
128	struct rpc_msg msg;
129	struct timeval t, rcv_timeout;
130	char outbuf[UDPMSGSIZE], inbuf[UDPMSGSIZE];
131	struct t_unitdata t_udata, t_rdata;
132	struct nd_hostserv hs;
133	struct nd_addrlist *retaddrs;
134	struct transp *tr_head;
135	struct transp *trans, *prev_trans;
136	struct addrs *a, *prev_addr;
137	struct tstamps *ts, *prev_ts;
138	NCONF_HANDLE *nc = NULL;
139	struct netconfig *nconf;
140	struct rlimit rl;
141	int dtbsize;
142	struct mapfs *mfs;
143
144	/*
145	 * For each connectionless transport get a list of
146	 * host addresses.  Any single host may have
147	 * addresses on several transports.
148	 */
149	addr_cnt = sent = rcvd = 0;
150	tr_head = NULL;
151	FD_ZERO(&mask);
152
153	/*
154	 * Set the default select size to be the maximum FD_SETSIZE, unless
155	 * the current rlimit is lower.
156	 */
157	dtbsize = FD_SETSIZE;
158	if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
159		if (rl.rlim_cur < FD_SETSIZE)
160			dtbsize = rl.rlim_cur;
161	}
162
163	prev_trans = NULL;
164	prev_addr = NULL;
165	prev_ts = NULL;
166	for (mfs = mfs_in; mfs; mfs = mfs->mfs_next) {
167
168		if (trace > 2)
169			trace_prt(1, "nfs_cast: host=%s\n", mfs->mfs_host);
170
171		nc = setnetconfig();
172		if (nc == NULL) {
173			stat = RPC_CANTSEND;
174			goto done_broad;
175		}
176		while (nconf = getnetconfig(nc)) {
177			if (!(nconf->nc_flag & NC_VISIBLE) ||
178			    nconf->nc_semantics != NC_TPI_CLTS ||
179			    (strcmp(nconf->nc_protofmly, NC_LOOPBACK) == 0))
180				continue;
181			trans = (struct transp *)malloc(sizeof (*trans));
182			if (trans == NULL) {
183				syslog(LOG_ERR, "no memory");
184				stat = RPC_CANTSEND;
185				goto done_broad;
186			}
187			(void) memset(trans, 0, sizeof (*trans));
188			if (tr_head == NULL)
189				tr_head = trans;
190			else
191				prev_trans->tr_next = trans;
192			prev_trans = trans;
193
194			trans->tr_fd = t_open(nconf->nc_device, O_RDWR, NULL);
195			if (trans->tr_fd < 0) {
196				syslog(LOG_ERR, "nfscast: t_open: %s:%m",
197					nconf->nc_device);
198				stat = RPC_CANTSEND;
199				goto done_broad;
200			}
201			if (t_bind(trans->tr_fd, (struct t_bind *)NULL,
202				(struct t_bind *)NULL) < 0) {
203				syslog(LOG_ERR, "nfscast: t_bind: %m");
204				stat = RPC_CANTSEND;
205				goto done_broad;
206			}
207			trans->tr_taddr =
208				/* LINTED pointer alignment */
209			(struct t_bind *)t_alloc(trans->tr_fd, T_BIND, T_ADDR);
210			if (trans->tr_taddr == (struct t_bind *)NULL) {
211				syslog(LOG_ERR, "nfscast: t_alloc: %m");
212				stat = RPC_SYSTEMERROR;
213				goto done_broad;
214			}
215
216			trans->tr_device = nconf->nc_device;
217			FD_SET(trans->tr_fd, &mask);
218
219			if_inx = 0;
220			hs.h_host = mfs->mfs_host;
221			hs.h_serv = "rpcbind";
222			if (netdir_getbyname(nconf, &hs, &retaddrs) == ND_OK) {
223
224				/*
225				 * If mfs->ignore is previously set for
226				 * this map, clear it. Because a host can
227				 * have either v6 or v4 address
228				 */
229				if (mfs->mfs_ignore == 1)
230					mfs->mfs_ignore = 0;
231
232				a = (struct addrs *)malloc(sizeof (*a));
233				if (a == NULL) {
234					syslog(LOG_ERR, "no memory");
235					stat = RPC_CANTSEND;
236					goto done_broad;
237				}
238				(void) memset(a, 0, sizeof (*a));
239				if (trans->tr_addrs == NULL)
240					trans->tr_addrs = a;
241				else
242					prev_addr->addr_next = a;
243				prev_addr = a;
244				a->addr_if_tstamps = NULL;
245				a->addr_mfs = mfs;
246				a->addr_addrs = retaddrs;
247				if_cnt = retaddrs->n_cnt;
248				while (if_cnt--) {
249					ts = (struct tstamps *)
250						malloc(sizeof (*ts));
251					if (ts == NULL) {
252						syslog(LOG_ERR, "no memory");
253						stat = RPC_CANTSEND;
254						goto done_broad;
255					}
256					(void) memset(ts, 0, sizeof (*ts));
257					ts->ts_penalty = mfs->mfs_penalty;
258					if (a->addr_if_tstamps == NULL)
259						a->addr_if_tstamps = ts;
260					else
261						prev_ts->ts_next = ts;
262					prev_ts = ts;
263					ts->ts_inx = if_inx++;
264					addr_cnt++;
265				}
266				break;
267			} else {
268				mfs->mfs_ignore = 1;
269				if (verbose)
270					syslog(LOG_ERR,
271				"%s:%s address not known",
272				mfs->mfs_host,
273				strcmp(nconf->nc_proto, NC_INET)?"IPv6":"IPv4");
274			}
275		} /* while */
276
277		endnetconfig(nc);
278		nc = NULL;
279	} /* for */
280	if (addr_cnt == 0) {
281		syslog(LOG_ERR, "nfscast: couldn't find addresses");
282		stat = RPC_CANTSEND;
283		goto done_broad;
284	}
285
286	(void) gettimeofday(&t, (struct timezone *)0);
287	xid = (getpid() ^ t.tv_sec ^ t.tv_usec) & ~0xFF;
288	t.tv_usec = 0;
289
290	/* serialize the RPC header */
291
292	msg.rm_direction = CALL;
293	msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
294	msg.rm_call.cb_prog = RPCBPROG;
295	/*
296	 * we can not use RPCBVERS here since it doesn't exist in 4.X,
297	 * the fix to bug 1139883 has made the 4.X portmapper silent to
298	 * version mismatches. This causes the RPC call to the remote
299	 * portmapper to simply be ignored if it's not Version 2.
300	 */
301	msg.rm_call.cb_vers = PMAPVERS;
302	msg.rm_call.cb_proc = NULLPROC;
303	if (sys_auth == (AUTH *)NULL) {
304		stat = RPC_SYSTEMERROR;
305		goto done_broad;
306	}
307	msg.rm_call.cb_cred = sys_auth->ah_cred;
308	msg.rm_call.cb_verf = sys_auth->ah_verf;
309	xdrmem_create(xdrs, outbuf, sizeof (outbuf), XDR_ENCODE);
310	if (! xdr_callmsg(xdrs, &msg)) {
311		stat = RPC_CANTENCODEARGS;
312		goto done_broad;
313	}
314	outlen = (int)xdr_getpos(xdrs);
315	xdr_destroy(xdrs);
316
317	t_udata.opt.len = 0;
318	t_udata.udata.buf = outbuf;
319	t_udata.udata.len = outlen;
320
321	/*
322	 * Basic loop: send packet to all hosts and wait for response(s).
323	 * The response timeout grows larger per iteration.
324	 * A unique xid is assigned to each address in order to
325	 * correctly match the replies.
326	 */
327	for (tsec = 4; timeout > 0; tsec *= 2) {
328
329		timeout -= tsec;
330		if (timeout <= 0)
331			tsec += timeout;
332
333		rcv_timeout.tv_sec = tsec;
334		rcv_timeout.tv_usec = 0;
335
336		sent = 0;
337		for (trans = tr_head; trans; trans = trans->tr_next) {
338			for (a = trans->tr_addrs; a; a = a->addr_next) {
339				struct netbuf *if_netbuf =
340					a->addr_addrs->n_addrs;
341				ts = a->addr_if_tstamps;
342				if_cnt = a->addr_addrs->n_cnt;
343				while (if_cnt--) {
344
345					/*
346					 * xid is the first thing in
347					 * preserialized buffer
348					 */
349					/* LINTED pointer alignment */
350					*((ulong_t *)outbuf) =
351						htonl(xid + ts->ts_inx);
352					(void) gettimeofday(&(ts->ts_timeval),
353						(struct timezone *)0);
354					/*
355					 * Check if already received
356					 * from a previous iteration.
357					 */
358					if (ts->ts_rcvd) {
359						sent++;
360						ts = ts->ts_next;
361						continue;
362					}
363
364					t_udata.addr = *if_netbuf++;
365
366					if (t_sndudata(trans->tr_fd,
367							&t_udata) == 0) {
368						sent++;
369					}
370
371					ts = ts->ts_next;
372				}
373			}
374		}
375		if (sent == 0) {		/* no packets sent ? */
376			stat = RPC_CANTSEND;
377			goto done_broad;
378		}
379
380		/*
381		 * Have sent all the packets.  Now collect the responses...
382		 */
383		rcvd = 0;
384	recv_again:
385		msg.acpted_rply.ar_verf = _null_auth;
386		msg.acpted_rply.ar_results.proc = xdr_void;
387		readfds = mask;
388
389		switch (select(dtbsize, &readfds,
390			(fd_set *)NULL, (fd_set *)NULL, &rcv_timeout)) {
391
392		case 0: /* Timed out */
393			/*
394			 * If we got at least one response in the
395			 * last interval, then don't wait for any
396			 * more.  In theory we should wait for
397			 * the max weighting (penalty) value so
398			 * that a very slow server has a chance to
399			 * respond but this could take a long time
400			 * if the admin has set a high weighting
401			 * value.
402			 */
403			if (rcvd > 0)
404				goto done_broad;
405
406			stat = RPC_TIMEDOUT;
407			continue;
408
409		case -1:  /* some kind of error */
410			if (errno == EINTR)
411				goto recv_again;
412			syslog(LOG_ERR, "nfscast: select: %m");
413			if (rcvd == 0)
414				stat = RPC_CANTRECV;
415			goto done_broad;
416
417		}  /* end of select results switch */
418
419		for (trans = tr_head; trans; trans = trans->tr_next) {
420			if (FD_ISSET(trans->tr_fd, &readfds))
421				break;
422		}
423		if (trans == NULL)
424			goto recv_again;
425
426	try_again:
427		t_rdata.addr = trans->tr_taddr->addr;
428		t_rdata.udata.buf = inbuf;
429		t_rdata.udata.maxlen = sizeof (inbuf);
430		t_rdata.udata.len = 0;
431		t_rdata.opt.len = 0;
432		if (t_rcvudata(trans->tr_fd, &t_rdata, &flag) < 0) {
433			if (errno == EINTR)
434				goto try_again;
435			syslog(LOG_ERR, "nfscast: t_rcvudata: %s:%m",
436				trans->tr_device);
437			stat = RPC_CANTRECV;
438			continue;
439		}
440		if (t_rdata.udata.len < sizeof (ulong_t))
441			goto recv_again;
442		if (flag & T_MORE) {
443			syslog(LOG_ERR,
444				"nfscast: t_rcvudata: %s: buffer overflow",
445				trans->tr_device);
446			goto recv_again;
447		}
448
449		/*
450		 * see if reply transaction id matches sent id.
451		 * If so, decode the results.
452		 * Note: received addr is ignored, it could be
453		 * different from the send addr if the host has
454		 * more than one addr.
455		 */
456		xdrmem_create(xdrs, inbuf, (uint_t)t_rdata.udata.len,
457								XDR_DECODE);
458		if (xdr_replymsg(xdrs, &msg)) {
459		    if (msg.rm_reply.rp_stat == MSG_ACCEPTED &&
460			(msg.rm_xid & ~0xFF) == xid) {
461			struct addrs *curr_addr;
462
463			i = msg.rm_xid & 0xFF;
464			for (curr_addr = trans->tr_addrs; curr_addr;
465			    curr_addr = curr_addr->addr_next) {
466			    for (ts = curr_addr->addr_if_tstamps; ts;
467				ts = ts->ts_next)
468				if (ts->ts_inx == i && !ts->ts_rcvd) {
469					ts->ts_rcvd = 1;
470					calc_resp_time(&ts->ts_timeval);
471					stat = RPC_SUCCESS;
472					rcvd++;
473					break;
474				}
475			}
476		    } /* otherwise, we just ignore the errors ... */
477		}
478		xdrs->x_op = XDR_FREE;
479		msg.acpted_rply.ar_results.proc = xdr_void;
480		(void) xdr_replymsg(xdrs, &msg);
481		XDR_DESTROY(xdrs);
482		if (rcvd == sent)
483			goto done_broad;
484		else
485			goto recv_again;
486	}
487	if (!rcvd)
488		stat = RPC_TIMEDOUT;
489
490done_broad:
491	if (rcvd) {
492		*mfs_out = sort_responses(tr_head);
493		stat = RPC_SUCCESS;
494	}
495	if (nc)
496		endnetconfig(nc);
497	free_transports(tr_head);
498	AUTH_DESTROY(sys_auth);
499	return (stat);
500}
501
502/*
503 * Go through all the responses and sort fastest to slowest.
504 * Note that any penalty is added to the response time - so the
505 * fastest response isn't necessarily the one that arrived first.
506 */
507static struct mapfs *
508sort_responses(trans)
509	struct transp *trans;
510{
511	struct transp *t;
512	struct addrs *a;
513	struct tstamps *ti;
514	int i, size = 0, allocsize = 10;
515	struct mapfs *p, *mfs_head = NULL, *mfs_tail = NULL;
516	struct sm *buffer;
517
518	buffer = (struct sm *)malloc(allocsize * sizeof (struct sm));
519	if (!buffer) {
520		syslog(LOG_ERR, "sort_responses: malloc error.\n");
521		return (NULL);
522	}
523
524	for (t = trans; t; t = t->tr_next) {
525		for (a = t->tr_addrs; a; a = a->addr_next) {
526			for (ti = a->addr_if_tstamps;
527				ti; ti = ti->ts_next) {
528				if (!ti->ts_rcvd)
529					continue;
530				ti->ts_timeval.tv_usec +=
531					(ti->ts_penalty * PENALTY_WEIGHT);
532				if (ti->ts_timeval.tv_usec >= 1000000) {
533					ti->ts_timeval.tv_sec +=
534					(ti->ts_timeval.tv_usec / 1000000);
535					ti->ts_timeval.tv_usec =
536					(ti->ts_timeval.tv_usec % 1000000);
537				}
538
539				if (size >= allocsize) {
540					allocsize += 10;
541					buffer = (struct sm *)realloc(buffer,
542					    allocsize * sizeof (struct sm));
543					if (!buffer) {
544						syslog(LOG_ERR,
545					    "sort_responses: malloc error.\n");
546						return (NULL);
547					}
548				}
549				buffer[size].timeval = ti->ts_timeval;
550				buffer[size].mfs = a->addr_mfs;
551				size++;
552			}
553		}
554	}
555
556#ifdef DEBUG
557	if (trace > 3) {
558		trace_prt(1, "  sort_responses: before host sort:\n");
559		for (i = 0; i < size; i++)
560			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
561			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
562		trace_prt(0, "\n");
563	}
564#endif
565
566	qsort((void *)buffer, size, sizeof (struct sm), host_sm);
567
568	/*
569	 * Cope with multiply listed hosts  by choosing first time
570	 */
571	for (i = 1; i < size; i++) {
572#ifdef DEBUG
573		if (trace > 3) {
574			trace_prt(1, "  sort_responses: comparing %s and %s\n",
575				buffer[i-1].mfs->mfs_host,
576				buffer[i].mfs->mfs_host);
577		}
578#endif
579		if (strcmp(buffer[i-1].mfs->mfs_host,
580		    buffer[i].mfs->mfs_host) == 0)
581			memcpy(&buffer[i].timeval, &buffer[i-1].timeval,
582				sizeof (struct timeval));
583	}
584	if (trace > 3)
585		trace_prt(0, "\n");
586
587#ifdef DEBUG
588	if (trace > 3) {
589		trace_prt(1, "  sort_responses: before time sort:\n");
590		for (i = 0; i < size; i++)
591			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
592			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
593		trace_prt(0, "\n");
594	}
595#endif
596
597	qsort((void *)buffer, size, sizeof (struct sm), time_sm);
598
599#ifdef DEBUG
600	if (trace > 3) {
601		trace_prt(1, "  sort_responses: after sort:\n");
602		for (i = 0; i < size; i++)
603			trace_prt(1, "    %s %d.%d\n", buffer[i].mfs->mfs_host,
604			buffer[i].timeval.tv_sec, buffer[i].timeval.tv_usec);
605		trace_prt(0, "\n");
606	}
607#endif
608
609	for (i = 0; i < size; i++) {
610#ifdef DEBUG
611		if (trace > 3) {
612			trace_prt(1, "  sort_responses: adding %s\n",
613				buffer[i].mfs->mfs_host);
614		}
615#endif
616		p = add_mfs(buffer[i].mfs, 0, &mfs_head, &mfs_tail);
617		if (!p)
618			return (NULL);
619	}
620	free(buffer);
621
622	return (mfs_head);
623}
624
625
626/*
627 * Comparison routines called by qsort(3).
628 */
629static int host_sm(const void *a, const void *b)
630{
631	return (strcmp(((struct sm *)a)->mfs->mfs_host,
632			((struct sm *)b)->mfs->mfs_host));
633}
634
635static int time_sm(const void *a, const void *b)
636{
637	if (timercmp(&(((struct sm *)a)->timeval),
638	    &(((struct sm *)b)->timeval), < /* cstyle */))
639		return (-1);
640	else if (timercmp(&(((struct sm *)a)->timeval),
641	    &(((struct sm *)b)->timeval), > /* cstyle */))
642		return (1);
643	else
644		return (0);
645}
646
/*
 * On entry *send_time holds the moment a request was transmitted.
 * On return it holds the time elapsed between that moment and "now",
 * with tv_usec normalized by borrowing one second when needed.
 */
static void
calc_resp_time(struct timeval *send_time)
{
	struct timeval now;
	time_t secs;
	long usecs;

	(void) gettimeofday(&now, NULL);

	secs = now.tv_sec - send_time->tv_sec;
	usecs = now.tv_usec - send_time->tv_usec;
	if (usecs < 0) {
		/* borrow a second so the microsecond part stays positive */
		secs--;
		usecs += 1000000;
	}

	send_time->tv_sec = secs;
	send_time->tv_usec = usecs;
}
667
668static void
669free_transports(trans)
670	struct transp *trans;
671{
672	struct transp *t, *tmpt = NULL;
673	struct addrs *a, *tmpa = NULL;
674	struct tstamps *ts, *tmpts = NULL;
675
676	for (t = trans; t; t = tmpt) {
677		if (t->tr_taddr)
678			(void) t_free((char *)t->tr_taddr, T_BIND);
679		if (t->tr_fd > 0)
680			(void) t_close(t->tr_fd);
681		for (a = t->tr_addrs; a; a = tmpa) {
682			for (ts = a->addr_if_tstamps; ts; ts = tmpts) {
683				tmpts = ts->ts_next;
684				free(ts);
685			}
686			(void) netdir_free((char *)a->addr_addrs, ND_ADDRLIST);
687			tmpa = a->addr_next;
688			free(a);
689		}
690		tmpt = t->tr_next;
691		free(t);
692	}
693}
694