xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_mroute.c (revision 45916cd2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.
23  * All rights reserved.  Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Procedures for the kernel part of DVMRP,
31  * a Distance-Vector Multicast Routing Protocol.
32  * (See RFC-1075)
33  * Written by David Waitzman, BBN Labs, August 1988.
34  * Modified by Steve Deering, Stanford, February 1989.
35  * Modified by Mark J. Steiglitz, Stanford, May, 1991
36  * Modified by Van Jacobson, LBL, January 1993
37  * Modified by Ajit Thyagarajan, PARC, August 1993
38  * Modified by Bill Fenner, PARC, April 1995
39  *
40  * MROUTING 3.5
41  */
42 
43 /*
44  * TODO
45  * - function pointer field in vif, void *vif_sendit()
46  */
47 
48 #include <sys/types.h>
49 #include <sys/stream.h>
50 #include <sys/stropts.h>
51 #include <sys/strlog.h>
52 #include <sys/systm.h>
53 #include <sys/ddi.h>
54 #include <sys/cmn_err.h>
55 #include <sys/zone.h>
56 
57 #include <sys/param.h>
58 #include <sys/socket.h>
59 #include <sys/vtrace.h>
60 #include <sys/debug.h>
61 #include <net/if.h>
62 #include <sys/sockio.h>
63 #include <netinet/in.h>
64 #include <net/if_dl.h>
65 
66 #include <inet/common.h>
67 #include <inet/mi.h>
68 #include <inet/nd.h>
69 #include <inet/mib2.h>
70 #include <netinet/ip6.h>
71 #include <inet/ip.h>
72 #include <inet/snmpcom.h>
73 
74 #include <netinet/igmp.h>
75 #include <netinet/igmp_var.h>
76 #include <netinet/udp.h>
77 #include <netinet/ip_mroute.h>
78 #include <inet/ip_multi.h>
79 #include <inet/ip_ire.h>
80 #include <inet/ip_if.h>
81 #include <inet/ipclassifier.h>
82 
83 #include <netinet/pim.h>
84 
85 
86 /*
87  * MT Design:
88  *
89  * There are three main data structures viftable, mfctable and tbftable that
90  * need to be protected against MT races.
91  *
 * viftable is a fixed length array of vif structs. There is no lock to protect
 * the whole array, instead each struct is protected by its own individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines
 * the current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that the vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use.  When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * The vif struct stores a pointer to the ipif in v_ipif; to prevent the
 * ipif/ill from going away a refhold is put on the ipif before using it. See
 * lock_good_vif() and unlock_good_vif().
105  *
106  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
107  * of the vif struct.
108  *
109  * tbftable is also a fixed length array of tbf structs and is only accessed
110  * via v_tbf.  It is protected by its own lock tbf_lock.
111  *
112  * Lock Ordering is
113  * v_lock --> tbf_lock
 * v_lock --> ill_lock
115  *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits, all the mfc structs
 * marked condemned are freed.
127  *
128  * Locking Hierarchy:
129  * The bucket lock should be acquired before the mfc struct lock.
130  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
131  * operations on the bucket struct.
132  *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock,
 * causing the stats to be approximate, not exact.
138  */
139 
140 /*
141  * Globals
142  * All but ip_g_mrouter and ip_mrtproto could be static,
143  * except for netstat or debugging purposes.
144  */
/*
 * Queue registered by MRT_INIT (see ip_mrouter_init()); reset to NULL at the
 * end of ip_mrouter_done().  A NULL value is treated as "multicast routing
 * is off" by is_mrouter_off().
 */
queue_t		*ip_g_mrouter	= NULL;
/* Held around the updates of ip_g_mrouter in init/done. */
static kmutex_t	ip_g_mrouter_mutex;

int		ip_mrtproto	= IGMP_DVMRP;	/* for netstat only */
struct mrtstat	mrtstat;	/* Stats for netstat */

#define	NO_VIF	MAXVIFS 	/* from mrouted, no route for src */
152 
153 /*
154  * Timeouts:
155  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
156  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
157  *	SunOS 5.x uses mfc->timeout for each mfc.
158  *	Some Unixes are limited in the number of simultaneous timeouts
159  * 	that can be run, SunOS 5.x does not have this restriction.
160  */
161 
162 /*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE.
166  */
/*
 * EXPIRE_TIMEOUT is expressed in terms of hz (presumably clock ticks per
 * second, making it a quarter of a second -- confirm against the timeout()
 * callers, which are outside this part of the file).
 */
#define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
#define		UPCALL_EXPIRE	6	/* number of timeouts	*/
169 
/*
 * Bucket index for a (source, group) pair: the two addresses and their
 * right-shifts by 10 and 20 bits are XOR-folded together and the result is
 * reduced with MFCHASHMOD().
 */
#define	MFCHASH(a, g) MFCHASHMOD((a) ^ ((a) >> 10) ^ ((a) >> 20) ^ \
	(g) ^ ((g) >> 10) ^ ((g) >> 20))
175 
176 /*
177  * mfctable:
178  * Includes all mfcs, including waiting upcalls.
179  * Multiple mfcs per bucket.
180  */
/* Indexed by MFCHASH(origin, group); each bucket heads a chain of mfcs. */
static struct mfcb	mfctable[MFCTBLSIZ];	/* kernel routing table	*/

/*
 * Define the token bucket filter structures.
 * tbftable -> each vif has one of these for storing info.
 * add_vif() points vif N's v_tbf at tbftable[N].
 */
struct tbf 		tbftable[MAXVIFS];
#define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/

/* Identify PIM packet that came on a Register interface */
#define	PIM_REGISTER_MARKER	0xffffffff
192 
/*
 * Function declarations
 * Forward declarations for the file-local routines; the add/del/get/set
 * entries are dispatched from ip_mrouter_set(), ip_mrouter_get() and
 * mrt_ioctl() below.
 */
static int	add_mfc(struct mfcctl *);
static int	add_vif(struct vifctl *, queue_t *, mblk_t *);
static int	del_mfc(struct mfcctl *);
static int	del_vif(vifi_t *, queue_t *, mblk_t *);
static void	del_vifp(struct vif *);
static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void	expire_upcalls(void *);
static void	fill_route(struct mfc *, struct mfcctl *);
static int	get_assert(uchar_t *);
static int	get_lsg_cnt(struct sioc_lsg_req *);
static int	get_sg_cnt(struct sioc_sg_req *);
static int	get_version(uchar_t *);
static int	get_vif_cnt(struct sioc_vif_req *);
static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
		    ipaddr_t, struct mfc *);
static int	ip_mrouter_init(queue_t *, uchar_t *, int);
static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	register_mforward(queue_t *, mblk_t *);
static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	set_assert(int *);

/*
 * Token Bucket Filter functions
 */
static int  priority(struct vif *, ipha_t *);
static void tbf_control(struct vif *, mblk_t *, ipha_t *);
static int  tbf_dq_sel(struct vif *, ipha_t *);
static void tbf_process_q(struct vif *);
static void tbf_queue(struct vif *, mblk_t *);
static void tbf_reprocess_q(void *);
static void tbf_send_packet(struct vif *, mblk_t *);
static void tbf_update_tokens(struct vif *);
/* Invoked by MFCB_REFRELE() when the last walker leaves a condemned bucket. */
static void release_mfc(struct mfcb *);

/* B_TRUE when no mrouter queue is registered or its conn flag is cleared. */
static boolean_t is_mrouter_off(void);
/*
 * Encapsulation packets
 */

#define	ENCAP_TTL	64

/*
 * prototype IP hdr for encapsulated packets
 *
 * The initializer is positional, so it relies on the member order of
 * ipha_t (declared in a header, not shown here).  Members after the
 * checksum are not listed and are therefore zero-initialized.
 */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,	/* presumably ttl, protocol -- confirm */
	0,				/* checksum */
};
245 
/*
 * Private variables.
 */
/*
 * Value of ip_g_forward saved by ip_mrouter_init() when it forces forwarding
 * on; -1 means nothing is saved.  Restored by ip_mrouter_done().
 */
static int		saved_ip_g_forward = -1;

/*
 * numvifs is only a hint about the max interface being used.
 * add_vif() raises it to (vifi + 1); del_vifp() tries to trim it and
 * ip_mrouter_done() resets it to 0.  Updates are made under numvifs_mutex.
 */
static vifi_t		numvifs = 0;
static kmutex_t		numvifs_mutex;

static struct vif	viftable[MAXVIFS+1];	/* Index needs to accommodate */
/* the value of NO_VIF, which */
/* is MAXVIFS. */

/*
 * One-back cache used to locate a tunnel's vif,
 * given a datagram's src ip address.
 * Cleared whenever a vif is added or deleted and when routing is disabled.
 */
static ipaddr_t		last_encap_src;
static struct vif	*last_encap_vif;
static kmutex_t		last_encap_lock;	/* Protects the above */
268 
/*
 * reg_vif_num is protected by numvifs_mutex.
 * It holds the index of the single Register vif, or ALL_VIFS when no
 * Register vif has been configured (see add_vif()).
 */
static vifi_t reg_vif_num = ALL_VIFS; 	/* Index to Register vif */
/*
 * Whether or not special PIM assert processing is enabled.
 * Set to 0 or 1 by set_assert(); reset to 0 by ip_mrouter_init() and
 * ip_mrouter_done().
 */
static int pim_assert;

/*
 * Rate limit for assert notification messages, in nsec.
 */
#define	ASSERT_MSG_TIME		3000000000
282 
283 
/*
 * Take a reference on a vif.  The macro acquires and drops v_lock itself,
 * so it must not be used by a caller that already holds v_lock.
 */
#define	VIF_REFHOLD(vifp) {			\
	mutex_enter(&(vifp)->v_lock);		\
	(vifp)->v_refcnt++;			\
	mutex_exit(&(vifp)->v_lock);		\
}
289 
/*
 * Drop a reference on a vif whose v_lock is already held by the caller.
 * If this was the last reference and the vif is condemned, the vif is torn
 * down by del_vifp() and no mutex_exit() is issued here (del_vifp() ends by
 * zeroing the whole vif).  Otherwise v_lock is released.  Either way the
 * caller must treat v_lock as no longer held afterwards.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	(vifp)->v_refcnt--;					\
	if ((vifp)->v_refcnt == 0 &&				\
		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
			del_vifp(vifp);				\
	} else {						\
		mutex_exit(&(vifp)->v_lock);			\
	}							\
}
299 
/*
 * Drop a reference on a vif from a context that does not hold v_lock:
 * take the lock, then let VIF_REFRELE_LOCKED() do the decrement, the
 * condemned-vif teardown and the unlock.
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	VIF_REFRELE_LOCKED(vifp);				\
}
310 
/*
 * Register a walker on an mfc hash bucket by bumping mfcb_refcnt under
 * mfcb_lock.  The ASSERT catches the counter wrapping around to zero.
 */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	(mfcb)->mfcb_refcnt++;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}
317 
/*
 * Unregister a walker from an mfc hash bucket.  When the last walker leaves
 * a bucket marked MFCB_MARK_CONDEMNED, release_mfc() is called with
 * mfcb_lock still held.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	if (--(mfcb)->mfcb_refcnt == 0 &&			\
		((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
			release_mfc(mfcb);			\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
327 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries that are
 * marked condemned.
 * Type of service parameter to be added in the future!
 *
 *	mfcbp	pointer to the hash bucket to search
 *	o, g	origin and group addresses to match (compared against s_addr)
 *	rt	lvalue that receives the matching mfc, or NULL if none
 *
 * Every macro parameter is parenthesized in the expansion so that
 * expression arguments (e.g. "&bucket", "a | b", "c ? x : y") are parsed
 * as a unit.  Note that o and g are evaluated once per chain entry.
 * The macro takes no locks itself.
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt; \
	(rt) = NULL; \
	_mb_rt = (mfcbp)->mfcb_mfc; \
	while (_mb_rt != NULL) { \
		if ((_mb_rt->mfc_origin.s_addr == (o)) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \
			(rt) = _mb_rt; \
			break; \
		} \
		_mb_rt = _mb_rt->mfc_next; \
	} \
}
349 
350 /*
351  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
352  * are inefficient. We use gethrestime() which returns a timespec_t with
353  * sec and nsec, the resolution is machine dependent.
354  * The following 2 macros have been changed to use nsec instead of usec.
355  */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 * TV_DELTA(a, b, delta) stores (a - b) in nanoseconds into delta, where a
 * and b are timespec-like objects with tv_sec and tv_nsec members.
 * Delta should be a hrtime_t (a 64-bit signed type).
 *
 * Differences of one or two seconds are handled with plain additions; any
 * other difference uses a multiplication.  That multiplication is done in
 * 64-bit arithmetic: 1000000000 * xxs in plain int overflows as soon as the
 * two times are three or more seconds apart.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
		    (delta) += 1000000000; \
		    /*FALLTHROUGH*/ \
		case 1: \
		    (delta) += 1000000000; \
		    break; \
		default: \
		    (delta) += (1000000000LL * xxs); \
		} \
	} \
}
378 
/*
 * True when time a is strictly earlier than time b: either the seconds are
 * smaller, or the seconds are equal and the nanoseconds are smaller.
 */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
381 
382 /*
383  * Handle MRT setsockopt commands to modify the multicast routing tables.
384  */
385 int
386 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
387     int datalen, mblk_t *first_mp)
388 {
389 	mutex_enter(&ip_g_mrouter_mutex);
390 	if (cmd != MRT_INIT && q != ip_g_mrouter) {
391 		mutex_exit(&ip_g_mrouter_mutex);
392 		return (EACCES);
393 	}
394 	mutex_exit(&ip_g_mrouter_mutex);
395 
396 	if (checkonly) {
397 		/*
398 		 * do not do operation, just pretend to - new T_CHECK
399 		 * Note: Even routines further on can probably fail but
400 		 * this T_CHECK stuff is only to please XTI so it not
401 		 * necessary to be perfect.
402 		 */
403 		switch (cmd) {
404 		case MRT_INIT:
405 		case MRT_DONE:
406 		case MRT_ADD_VIF:
407 		case MRT_DEL_VIF:
408 		case MRT_ADD_MFC:
409 		case MRT_DEL_MFC:
410 		case MRT_ASSERT:
411 		    return (0);
412 		default:
413 		    return (EOPNOTSUPP);
414 		}
415 	}
416 
417 	/*
418 	 * make sure no command is issued after multicast routing has been
419 	 * turned off.
420 	 */
421 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
422 		if (is_mrouter_off())
423 			return (EINVAL);
424 	}
425 
426 	switch (cmd) {
427 	case MRT_INIT:	return (ip_mrouter_init(q, data, datalen));
428 	case MRT_DONE:	return (ip_mrouter_done(first_mp));
429 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, q, first_mp));
430 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, q, first_mp));
431 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data));
432 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data));
433 	case MRT_ASSERT:   return (set_assert((int *)data));
434 	default:	   return (EOPNOTSUPP);
435 	}
436 }
437 
438 /*
439  * Handle MRT getsockopt commands
440  */
441 int
442 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
443 {
444 	if (q != ip_g_mrouter)
445 		return (EACCES);
446 
447 	switch (cmd) {
448 	case MRT_VERSION:	return (get_version((uchar_t *)data));
449 	case MRT_ASSERT:	return (get_assert((uchar_t *)data));
450 	default:		return (EOPNOTSUPP);
451 	}
452 }
453 
454 /*
455  * Handle ioctl commands to obtain information from the cache.
456  * Called with shared access to IP. These are read_only ioctls.
457  */
458 /* ARGSUSED */
459 int
460 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
461     ip_ioctl_cmd_t *ipip, void *if_req)
462 {
463 	mblk_t	*mp1;
464 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
465 
466 	/* Existence verified in ip_wput_nondata */
467 	mp1 = mp->b_cont->b_cont;
468 
469 	switch (iocp->ioc_cmd) {
470 	case (SIOCGETVIFCNT):
471 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr));
472 	case (SIOCGETSGCNT):
473 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr));
474 	case (SIOCGETLSGCNT):
475 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr));
476 	default:
477 		return (EINVAL);
478 	}
479 }
480 
481 /*
482  * Returns the packet, byte, rpf-failure count for the source, group provided.
483  */
484 static int
485 get_sg_cnt(struct sioc_sg_req *req)
486 {
487 	struct mfc *rt;
488 	struct mfcb *mfcbp;
489 
490 	mfcbp = &mfctable[MFCHASH(req->src.s_addr, req->grp.s_addr)];
491 	MFCB_REFHOLD(mfcbp);
492 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
493 
494 	if (rt != NULL) {
495 		mutex_enter(&rt->mfc_mutex);
496 		req->pktcnt   = rt->mfc_pkt_cnt;
497 		req->bytecnt  = rt->mfc_byte_cnt;
498 		req->wrong_if = rt->mfc_wrong_if;
499 		mutex_exit(&rt->mfc_mutex);
500 	} else
501 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
502 
503 	MFCB_REFRELE(mfcbp);
504 	return (0);
505 }
506 
/*
 * Returns the packet, byte, rpf-failure count for the source, group provided.
 * Uses larger counters and IPv6 addresses.
 *
 * Not implemented yet: req is ignored and ENXIO is always returned.
 */
/* ARGSUSED XXX until implemented */
static int
get_lsg_cnt(struct sioc_lsg_req *req)
{
	/* XXX TODO SIOCGETLSGCNT */
	return (ENXIO);
}
518 
519 /*
520  * Returns the input and output packet and byte counts on the vif provided.
521  */
522 static int
523 get_vif_cnt(struct sioc_vif_req *req)
524 {
525 	vifi_t vifi = req->vifi;
526 
527 	if (vifi >= numvifs)
528 		return (EINVAL);
529 
530 	/*
531 	 * No locks here, an approximation is fine.
532 	 */
533 	req->icount = viftable[vifi].v_pkt_in;
534 	req->ocount = viftable[vifi].v_pkt_out;
535 	req->ibytes = viftable[vifi].v_bytes_in;
536 	req->obytes = viftable[vifi].v_bytes_out;
537 
538 	return (0);
539 }
540 
541 static int
542 get_version(uchar_t *data)
543 {
544 	int *v = (int *)data;
545 
546 	*v = 0x0305;	/* XXX !!!! */
547 
548 	return (0);
549 }
550 
551 /*
552  * Set PIM assert processing global.
553  */
554 static int
555 set_assert(int *i)
556 {
557 	if ((*i != 1) && (*i != 0))
558 		return (EINVAL);
559 
560 	pim_assert = *i;
561 
562 	return (0);
563 }
564 
565 /*
566  * Get PIM assert processing global.
567  */
568 static int
569 get_assert(uchar_t *data)
570 {
571 	int *i = (int *)data;
572 
573 	*i = pim_assert;
574 
575 	return (0);
576 }
577 
578 /*
579  * Enable multicast routing.
580  */
581 static int
582 ip_mrouter_init(queue_t *q, uchar_t *data, int datalen)
583 {
584 	conn_t	*connp = Q_TO_CONN(q);
585 	int	*v;
586 
587 	if (data == NULL || (datalen != sizeof (int)))
588 		return (ENOPROTOOPT);
589 
590 	v = (int *)data;
591 	if (*v != 1)
592 		return (ENOPROTOOPT);
593 
594 	mutex_enter(&ip_g_mrouter_mutex);
595 	if (ip_g_mrouter != NULL) {
596 		mutex_exit(&ip_g_mrouter_mutex);
597 		return (EADDRINUSE);
598 	}
599 
600 	ip_g_mrouter = q;
601 	connp->conn_multi_router = 1;
602 
603 	mutex_init(&last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
604 
605 	mrtstat.mrts_vifctlSize = sizeof (struct vifctl);
606 	mrtstat.mrts_mfcctlSize = sizeof (struct mfcctl);
607 
608 	pim_assert = 0;
609 
610 	/* In order for tunnels to work we have to turn ip_g_forward on */
611 	if (!WE_ARE_FORWARDING) {
612 		if (ip_mrtdebug > 1) {
613 			(void) mi_strlog(q, 1, SL_TRACE,
614 			    "ip_mrouter_init: turning on forwarding");
615 		}
616 		saved_ip_g_forward = ip_g_forward;
617 		ip_g_forward = IP_FORWARD_ALWAYS;
618 	}
619 
620 	mutex_exit(&ip_g_mrouter_mutex);
621 	return (0);
622 }
623 
624 /*
625  * Disable multicast routing.
626  * Didn't use global timeout_val (BSD version), instead check the mfctable.
627  */
628 int
629 ip_mrouter_done(mblk_t *mp)
630 {
631 	conn_t		*connp;
632 	vifi_t 		vifi;
633 	struct mfc	*mfc_rt;
634 	int		i;
635 
636 	mutex_enter(&ip_g_mrouter_mutex);
637 	if (ip_g_mrouter == NULL) {
638 		mutex_exit(&ip_g_mrouter_mutex);
639 		return (EINVAL);
640 	}
641 
642 	connp = Q_TO_CONN(ip_g_mrouter);
643 
644 	if (saved_ip_g_forward != -1) {
645 		if (ip_mrtdebug > 1) {
646 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
647 			    "ip_mrouter_done: turning off forwarding");
648 		}
649 		ip_g_forward = saved_ip_g_forward;
650 		saved_ip_g_forward = -1;
651 	}
652 
653 	/*
654 	 * Always clear cache when vifs change.
655 	 * No need to get last_encap_lock since we are running as a writer.
656 	 */
657 	mutex_enter(&last_encap_lock);
658 	last_encap_src = 0;
659 	last_encap_vif = NULL;
660 	mutex_exit(&last_encap_lock);
661 	connp->conn_multi_router = 0;
662 
663 	mutex_exit(&ip_g_mrouter_mutex);
664 
665 	/*
666 	 * For each phyint in use,
667 	 * disable promiscuous reception of all IP multicasts.
668 	 */
669 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
670 		struct vif *vifp = viftable + vifi;
671 
672 		mutex_enter(&vifp->v_lock);
673 		/*
674 		 * if the vif is active mark it condemned.
675 		 */
676 		if (vifp->v_marks & VIF_MARK_GOOD) {
677 			ASSERT(vifp->v_ipif != NULL);
678 			ipif_refhold(vifp->v_ipif);
679 			/* Phyint only */
680 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
681 				ipif_t *ipif = vifp->v_ipif;
682 				ipsq_t  *ipsq;
683 				boolean_t suc;
684 				ill_t *ill;
685 
686 				ill = ipif->ipif_ill;
687 				suc = B_FALSE;
688 				if (mp == NULL) {
689 					/*
690 					 * being called from ip_close,
691 					 * lets do it synchronously.
692 					 * Clear VIF_MARK_GOOD and
693 					 * set VIF_MARK_CONDEMNED.
694 					 */
695 					vifp->v_marks &= ~VIF_MARK_GOOD;
696 					vifp->v_marks |= VIF_MARK_CONDEMNED;
697 					mutex_exit(&(vifp)->v_lock);
698 					suc = ipsq_enter(ill, B_FALSE);
699 					ipsq = ill->ill_phyint->phyint_ipsq;
700 				} else {
701 					ipsq = ipsq_try_enter(ipif, NULL,
702 					    ip_g_mrouter, mp,
703 					    ip_restart_optmgmt, NEW_OP, B_TRUE);
704 					if (ipsq == NULL) {
705 						mutex_exit(&(vifp)->v_lock);
706 						return (EINPROGRESS);
707 					}
708 					/*
709 					 * Clear VIF_MARK_GOOD and
710 					 * set VIF_MARK_CONDEMNED.
711 					 */
712 					vifp->v_marks &= ~VIF_MARK_GOOD;
713 					vifp->v_marks |= VIF_MARK_CONDEMNED;
714 						mutex_exit(&(vifp)->v_lock);
715 					suc = B_TRUE;
716 				}
717 
718 				if (suc) {
719 					(void) ip_delmulti(INADDR_ANY, ipif,
720 					    B_TRUE, B_TRUE);
721 					ipsq_exit(ipsq, B_TRUE, B_TRUE);
722 				}
723 				mutex_enter(&vifp->v_lock);
724 			}
725 			/*
726 			 * decreases the refcnt added in add_vif.
727 			 * and release v_lock.
728 			 */
729 			VIF_REFRELE_LOCKED(vifp);
730 		} else {
731 			mutex_exit(&vifp->v_lock);
732 			continue;
733 		}
734 	}
735 
736 	mutex_enter(&numvifs_mutex);
737 	numvifs = 0;
738 	pim_assert = 0;
739 	reg_vif_num = ALL_VIFS;
740 	mutex_exit(&numvifs_mutex);
741 
742 	/*
743 	 * Free upcall msgs.
744 	 * Go through mfctable and stop any outstanding upcall
745 	 * timeouts remaining on mfcs.
746 	 */
747 	for (i = 0; i < MFCTBLSIZ; i++) {
748 		mutex_enter(&mfctable[i].mfcb_lock);
749 		mfctable[i].mfcb_refcnt++;
750 		mfctable[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
751 		mutex_exit(&mfctable[i].mfcb_lock);
752 		mfc_rt = mfctable[i].mfcb_mfc;
753 		while (mfc_rt) {
754 			/* Free upcalls */
755 			mutex_enter(&mfc_rt->mfc_mutex);
756 			if (mfc_rt->mfc_rte != NULL) {
757 				if (mfc_rt->mfc_timeout_id != 0) {
758 					/*
759 					 * OK to drop the lock as we have
760 					 * a refcnt on the bucket. timeout
761 					 * can fire but it will see that
762 					 * mfc_timeout_id == 0 and not do
763 					 * anything. see expire_upcalls().
764 					 */
765 					mfc_rt->mfc_timeout_id = 0;
766 					mutex_exit(&mfc_rt->mfc_mutex);
767 					(void) untimeout(
768 					    mfc_rt->mfc_timeout_id);
769 						mfc_rt->mfc_timeout_id = 0;
770 					mutex_enter(&mfc_rt->mfc_mutex);
771 
772 					/*
773 					 * all queued upcall packets
774 					 * and mblk will be freed in
775 					 * release_mfc().
776 					 */
777 				}
778 			}
779 
780 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
781 
782 			mutex_exit(&mfc_rt->mfc_mutex);
783 			mfc_rt = mfc_rt->mfc_next;
784 		}
785 		MFCB_REFRELE(&mfctable[i]);
786 	}
787 
788 	mutex_enter(&ip_g_mrouter_mutex);
789 	ip_g_mrouter = NULL;
790 	mutex_exit(&ip_g_mrouter_mutex);
791 	return (0);
792 }
793 
794 static boolean_t
795 is_mrouter_off(void)
796 {
797 	conn_t	*connp;
798 
799 	mutex_enter(&ip_g_mrouter_mutex);
800 	if (ip_g_mrouter == NULL) {
801 		mutex_exit(&ip_g_mrouter_mutex);
802 		return (B_TRUE);
803 	}
804 
805 	connp = Q_TO_CONN(ip_g_mrouter);
806 	if (connp->conn_multi_router == 0) {
807 		mutex_exit(&ip_g_mrouter_mutex);
808 		return (B_TRUE);
809 	}
810 	mutex_exit(&ip_g_mrouter_mutex);
811 	return (B_FALSE);
812 }
813 
/*
 * Undo a successful lock_good_vif(): drop the ipif hold and the vif
 * reference.
 */
static void
unlock_good_vif(struct vif *vifp)
{
	ASSERT(vifp->v_ipif != NULL);
	/*
	 * The ipif must be released first: VIF_REFRELE() may end up in
	 * del_vifp(), which zeroes the vif (including v_ipif).
	 */
	ipif_refrele(vifp->v_ipif);
	VIF_REFRELE(vifp);
}
821 
822 static boolean_t
823 lock_good_vif(struct vif *vifp)
824 {
825 	mutex_enter(&vifp->v_lock);
826 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
827 		mutex_exit(&vifp->v_lock);
828 		return (B_FALSE);
829 	}
830 
831 	ASSERT(vifp->v_ipif != NULL);
832 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
833 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
834 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
835 		mutex_exit(&vifp->v_lock);
836 		return (B_FALSE);
837 	}
838 	ipif_refhold_locked(vifp->v_ipif);
839 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
840 	vifp->v_refcnt++;
841 	mutex_exit(&vifp->v_lock);
842 	return (B_TRUE);
843 }
844 
845 /*
846  * Add a vif to the vif table.
847  */
848 static int
849 add_vif(struct vifctl *vifcp, queue_t *q, mblk_t *first_mp)
850 {
851 	struct vif	*vifp = viftable + vifcp->vifc_vifi;
852 	ipif_t		*ipif;
853 	int		error;
854 	struct tbf	*v_tbf = tbftable + vifcp->vifc_vifi;
855 	conn_t   	*connp = Q_TO_CONN(q);
856 	ipsq_t  	*ipsq;
857 
858 	ASSERT(connp != NULL);
859 
860 	if (vifcp->vifc_vifi >= MAXVIFS)
861 		return (EINVAL);
862 
863 	if (is_mrouter_off())
864 		return (EINVAL);
865 
866 	mutex_enter(&vifp->v_lock);
867 	/*
868 	 * Viftable entry should be 0.
869 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
870 	 * initialized.
871 	 *
872 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
873 	 * request while the delete is in progress, mrouted only sends add
874 	 * requests when a new interface is added and the new interface cannot
875 	 * have the same vifi as an existing interface. We make sure that
876 	 * ill_delete will block till the vif is deleted by adding a refcnt
877 	 * to ipif in del_vif().
878 	 */
879 	if (vifp->v_lcl_addr.s_addr != 0 ||
880 	    vifp->v_marks != 0 ||
881 	    vifp->v_refcnt != 0) {
882 		mutex_exit(&vifp->v_lock);
883 		return (EADDRINUSE);
884 	}
885 
886 	/* Incoming vif should not be 0 */
887 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
888 		mutex_exit(&vifp->v_lock);
889 		return (EINVAL);
890 	}
891 
892 	vifp->v_refcnt++;
893 	mutex_exit(&vifp->v_lock);
894 	/* Find the interface with the local address */
895 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
896 	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
897 	    ip_restart_optmgmt, &error);
898 	if (ipif == NULL) {
899 		VIF_REFRELE(vifp);
900 		if (error == EINPROGRESS)
901 			return (error);
902 		return (EADDRNOTAVAIL);
903 	}
904 
905 	/*
906 	 * We have to be exclusive as we have to call ip_addmulti()
907 	 * This is the best position to try to be exclusive in case
908 	 * we have to wait.
909 	 */
910 	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
911 	    ip_restart_optmgmt, NEW_OP, B_TRUE);
912 	if ((ipsq) == NULL) {
913 		VIF_REFRELE(vifp);
914 		ipif_refrele(ipif);
915 		return (EINPROGRESS);
916 	}
917 
918 	if (ip_mrtdebug > 1) {
919 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
920 		    "add_vif: src 0x%x enter",
921 		    vifcp->vifc_lcl_addr.s_addr);
922 	}
923 
924 	mutex_enter(&vifp->v_lock);
925 	/*
926 	 * Always clear cache when vifs change.
927 	 * Needed to ensure that src isn't left over from before vif was added.
928 	 * No need to get last_encap_lock, since we are running as a writer.
929 	 */
930 
931 	mutex_enter(&last_encap_lock);
932 	last_encap_src = 0;
933 	last_encap_vif = NULL;
934 	mutex_exit(&last_encap_lock);
935 
936 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
937 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
938 			cmn_err(CE_WARN,
939 			    "add_vif: source route tunnels not supported\n");
940 			VIF_REFRELE_LOCKED(vifp);
941 			ipif_refrele(ipif);
942 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
943 			return (EOPNOTSUPP);
944 		}
945 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
946 
947 	} else {
948 		/* Phyint or Register vif */
949 		if (vifcp->vifc_flags & VIFF_REGISTER) {
950 			/*
951 			 * Note: Since all IPPROTO_IP level options (including
952 			 * MRT_ADD_VIF) are done exclusively via
953 			 * ip_optmgmt_writer(), a lock is not necessary to
954 			 * protect reg_vif_num.
955 			 */
956 			mutex_enter(&numvifs_mutex);
957 			if (reg_vif_num == ALL_VIFS) {
958 				reg_vif_num = vifcp->vifc_vifi;
959 				mutex_exit(&numvifs_mutex);
960 			} else {
961 				mutex_exit(&numvifs_mutex);
962 				VIF_REFRELE_LOCKED(vifp);
963 				ipif_refrele(ipif);
964 				ipsq_exit(ipsq, B_TRUE, B_TRUE);
965 				return (EADDRINUSE);
966 			}
967 		}
968 
969 		/* Make sure the interface supports multicast */
970 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
971 			VIF_REFRELE_LOCKED(vifp);
972 			ipif_refrele(ipif);
973 			if (vifcp->vifc_flags & VIFF_REGISTER) {
974 				mutex_enter(&numvifs_mutex);
975 				reg_vif_num = ALL_VIFS;
976 				mutex_exit(&numvifs_mutex);
977 			}
978 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
979 			return (EOPNOTSUPP);
980 		}
981 		/* Enable promiscuous reception of all IP mcasts from the if */
982 		mutex_exit(&vifp->v_lock);
983 		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
984 		    MODE_IS_EXCLUDE, NULL);
985 		mutex_enter(&vifp->v_lock);
986 		/*
987 		 * since we released the lock lets make sure that
988 		 * ip_mrouter_done() has not been called.
989 		 */
990 		if (error != 0 || is_mrouter_off()) {
991 			if (error == 0)
992 				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
993 				    B_TRUE);
994 			if (vifcp->vifc_flags & VIFF_REGISTER) {
995 				mutex_enter(&numvifs_mutex);
996 				reg_vif_num = ALL_VIFS;
997 				mutex_exit(&numvifs_mutex);
998 			}
999 			VIF_REFRELE_LOCKED(vifp);
1000 			ipif_refrele(ipif);
1001 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1002 			return (error?error:EINVAL);
1003 		}
1004 	}
1005 	/* Define parameters for the tbf structure */
1006 	vifp->v_tbf = v_tbf;
1007 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
1008 	vifp->v_tbf->tbf_n_tok = 0;
1009 	vifp->v_tbf->tbf_q_len = 0;
1010 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1011 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1012 
1013 	vifp->v_flags = vifcp->vifc_flags;
1014 	vifp->v_threshold = vifcp->vifc_threshold;
1015 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1016 	vifp->v_ipif = ipif;
1017 	ipif_refrele(ipif);
1018 	/* Scaling up here, allows division by 1024 in critical code.	*/
1019 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1020 	vifp->v_timeout_id = 0;
1021 	/* initialize per vif pkt counters */
1022 	vifp->v_pkt_in = 0;
1023 	vifp->v_pkt_out = 0;
1024 	vifp->v_bytes_in = 0;
1025 	vifp->v_bytes_out = 0;
1026 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1027 
1028 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1029 	mutex_enter(&numvifs_mutex);
1030 	if (numvifs <= vifcp->vifc_vifi)
1031 		numvifs = vifcp->vifc_vifi + 1;
1032 	mutex_exit(&numvifs_mutex);
1033 
1034 	if (ip_mrtdebug > 1) {
1035 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1036 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1037 		    vifcp->vifc_vifi,
1038 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1039 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1040 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1041 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1042 	}
1043 
1044 	vifp->v_marks = VIF_MARK_GOOD;
1045 	mutex_exit(&vifp->v_lock);
1046 	ipsq_exit(ipsq, B_TRUE, B_TRUE);
1047 	return (0);
1048 }
1049 
1050 
1051 /* Delete a vif from the vif table. */
1052 static void
1053 del_vifp(struct vif *vifp)
1054 {
1055 	struct tbf	*t = vifp->v_tbf;
1056 	mblk_t  *mp0;
1057 	vifi_t  vifi;
1058 
1059 
1060 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1061 	ASSERT(t != NULL);
1062 
1063 	/*
1064 	 * release the ref we put in vif_del.
1065 	 */
1066 	ASSERT(vifp->v_ipif != NULL);
1067 	ipif_refrele(vifp->v_ipif);
1068 
1069 	if (ip_mrtdebug > 1) {
1070 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1071 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1072 	}
1073 
1074 	if (vifp->v_timeout_id != 0) {
1075 		(void) untimeout(vifp->v_timeout_id);
1076 		vifp->v_timeout_id = 0;
1077 	}
1078 
1079 	/*
1080 	 * Free packets queued at the interface.
1081 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1082 	 */
1083 	mutex_enter(&t->tbf_lock);
1084 	while (t->tbf_q != NULL) {
1085 		mp0 = t->tbf_q;
1086 		t->tbf_q = t->tbf_q->b_next;
1087 		mp0->b_prev = mp0->b_next = NULL;
1088 		freemsg(mp0);
1089 	}
1090 	mutex_exit(&t->tbf_lock);
1091 
1092 	/*
1093 	 * Always clear cache when vifs change.
1094 	 * No need to get last_encap_lock since we are running as a writer.
1095 	 */
1096 	mutex_enter(&last_encap_lock);
1097 	if (vifp == last_encap_vif) {
1098 		last_encap_vif = NULL;
1099 		last_encap_src = 0;
1100 	}
1101 	mutex_exit(&last_encap_lock);
1102 
1103 	mutex_destroy(&t->tbf_lock);
1104 
1105 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1106 
1107 	/* Adjust numvifs down */
1108 	mutex_enter(&numvifs_mutex);
1109 	for (vifi = numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1110 		if (viftable[vifi - 1].v_lcl_addr.s_addr != 0)
1111 			break;
1112 	numvifs = vifi;
1113 	mutex_exit(&numvifs_mutex);
1114 
1115 	bzero(vifp, sizeof (*vifp));
1116 }
1117 
1118 static int
1119 del_vif(vifi_t *vifip, queue_t *q, mblk_t *first_mp)
1120 {
1121 	struct vif	*vifp = viftable + *vifip;
1122 	conn_t		*connp;
1123 	ipsq_t  	*ipsq;
1124 
1125 	if (*vifip >= numvifs)
1126 		return (EINVAL);
1127 
1128 
1129 	mutex_enter(&vifp->v_lock);
1130 	/*
1131 	 * Not initialized
1132 	 * Here we are not looking at the vif that is being initialized
1133 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1134 	 */
1135 	if (vifp->v_lcl_addr.s_addr == 0 ||
1136 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1137 		mutex_exit(&vifp->v_lock);
1138 		return (EADDRNOTAVAIL);
1139 	}
1140 
1141 	/*
1142 	 * This is an optimization, if first_mp == NULL
1143 	 * than we are being called from reset_mrt_vif_ipif()
1144 	 * so we already have exclusive access to the ipsq.
1145 	 * the ASSERT below is a check for this condition.
1146 	 */
1147 	if (first_mp != NULL &&
1148 	    !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1149 		connp = Q_TO_CONN(q);
1150 		ASSERT(connp != NULL);
1151 		/*
1152 		 * We have to be exclusive as we have to call ip_delmulti()
1153 		 * This is the best position to try to be exclusive in case
1154 		 * we have to wait.
1155 		 */
1156 		ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
1157 		    first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
1158 		if ((ipsq) == NULL) {
1159 			mutex_exit(&vifp->v_lock);
1160 			return (EINPROGRESS);
1161 		}
1162 		/* recheck after being exclusive */
1163 		if (vifp->v_lcl_addr.s_addr == 0 ||
1164 		    !vifp->v_marks & VIF_MARK_GOOD) {
1165 			/*
1166 			 * someone beat us.
1167 			 */
1168 			mutex_exit(&vifp->v_lock);
1169 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1170 			return (EADDRNOTAVAIL);
1171 		}
1172 	}
1173 
1174 
1175 	ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));
1176 
1177 
1178 	/*
1179 	 * add a refhold so that ipif does not go away while
1180 	 * there are still users, this will be released in del_vifp
1181 	 * when we free the vif.
1182 	 */
1183 	ipif_refhold(vifp->v_ipif);
1184 
1185 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1186 	vifp->v_marks &= ~VIF_MARK_GOOD;
1187 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1188 
1189 	/* Phyint only */
1190 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1191 		ipif_t *ipif = vifp->v_ipif;
1192 		ASSERT(ipif != NULL);
1193 		/*
1194 		 * should be OK to drop the lock as we
1195 		 * have marked this as CONDEMNED.
1196 		 */
1197 		mutex_exit(&(vifp)->v_lock);
1198 		(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
1199 		if (first_mp != NULL)
1200 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1201 		mutex_enter(&(vifp)->v_lock);
1202 	}
1203 
1204 	/*
1205 	 * decreases the refcnt added in add_vif.
1206 	 */
1207 	VIF_REFRELE_LOCKED(vifp);
1208 	return (0);
1209 }
1210 
1211 /*
1212  * Add an mfc entry.
1213  */
1214 static int
1215 add_mfc(struct mfcctl *mfccp)
1216 {
1217 	struct mfc *rt;
1218 	struct rtdetq *rte;
1219 	ushort_t nstl;
1220 	int i;
1221 	struct mfcb *mfcbp;
1222 
1223 	/*
1224 	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
1225 	 * did not have a real route for pkt.
1226 	 * We want this pkt without rt installed in the mfctable to prevent
1227 	 * multiiple tries, so go ahead and put it in mfctable, it will
1228 	 * be discarded later in ip_mdq() because the child is NULL.
1229 	 */
1230 
1231 	/* Error checking, out of bounds? */
1232 	if (mfccp->mfcc_parent > MAXVIFS) {
1233 		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
1234 		    (int)mfccp->mfcc_parent));
1235 		return (EINVAL);
1236 	}
1237 
1238 	if ((mfccp->mfcc_parent != NO_VIF) &&
1239 	    (viftable[mfccp->mfcc_parent].v_ipif == NULL)) {
1240 		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
1241 		    (int)mfccp->mfcc_parent));
1242 		return (EINVAL);
1243 	}
1244 
1245 	if (is_mrouter_off()) {
1246 		return (EINVAL);
1247 	}
1248 
1249 	mfcbp = &mfctable[MFCHASH(mfccp->mfcc_origin.s_addr,
1250 	    mfccp->mfcc_mcastgrp.s_addr)];
1251 	MFCB_REFHOLD(mfcbp);
1252 	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
1253 	    mfccp->mfcc_mcastgrp.s_addr, rt);
1254 
1255 	/* If an entry already exists, just update the fields */
1256 	if (rt) {
1257 		if (ip_mrtdebug > 1) {
1258 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1259 			    "add_mfc: update o %x grp %x parent %x",
1260 			    ntohl(mfccp->mfcc_origin.s_addr),
1261 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1262 			    mfccp->mfcc_parent);
1263 		}
1264 		mutex_enter(&rt->mfc_mutex);
1265 		rt->mfc_parent = mfccp->mfcc_parent;
1266 
1267 		mutex_enter(&numvifs_mutex);
1268 		for (i = 0; i < (int)numvifs; i++)
1269 			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1270 		mutex_exit(&numvifs_mutex);
1271 		mutex_exit(&rt->mfc_mutex);
1272 
1273 		MFCB_REFRELE(mfcbp);
1274 		return (0);
1275 	}
1276 
1277 	/*
1278 	 * Find the entry for which the upcall was made and update.
1279 	 */
1280 	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
1281 		mutex_enter(&rt->mfc_mutex);
1282 		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
1283 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
1284 		    (rt->mfc_rte != NULL) &&
1285 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1286 			if (nstl++ != 0)
1287 				cmn_err(CE_WARN,
1288 				    "add_mfc: %s o %x g %x p %x",
1289 				    "multiple kernel entries",
1290 				    ntohl(mfccp->mfcc_origin.s_addr),
1291 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1292 				    mfccp->mfcc_parent);
1293 
1294 			if (ip_mrtdebug > 1) {
1295 				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1296 				    "add_mfc: o %x g %x p %x",
1297 				    ntohl(mfccp->mfcc_origin.s_addr),
1298 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1299 				    mfccp->mfcc_parent);
1300 			}
1301 			fill_route(rt, mfccp);
1302 
1303 			/*
1304 			 * Prevent cleanup of cache entry.
1305 			 * Timer starts in ip_mforward.
1306 			 */
1307 			if (rt->mfc_timeout_id != 0) {
1308 				timeout_id_t id;
1309 				id = rt->mfc_timeout_id;
1310 				/*
1311 				 * setting id to zero will avoid this
1312 				 * entry from being cleaned up in
1313 				 * expire_up_calls().
1314 				 */
1315 				rt->mfc_timeout_id = 0;
1316 				/*
1317 				 * dropping the lock is fine as we
1318 				 * have a refhold on the bucket.
1319 				 * so mfc cannot be freed.
1320 				 * The timeout can fire but it will see
1321 				 * that mfc_timeout_id == 0 and not cleanup.
1322 				 */
1323 				mutex_exit(&rt->mfc_mutex);
1324 				(void) untimeout(id);
1325 				mutex_enter(&rt->mfc_mutex);
1326 			}
1327 
1328 			/*
1329 			 * Send all pkts that are queued waiting for the upcall.
1330 			 * ip_mdq param tun set to 0 -
1331 			 * the return value of ip_mdq() isn't used here,
1332 			 * so value we send doesn't matter.
1333 			 */
1334 			while (rt->mfc_rte != NULL) {
1335 				rte = rt->mfc_rte;
1336 				rt->mfc_rte = rte->rte_next;
1337 				mutex_exit(&rt->mfc_mutex);
1338 				(void) ip_mdq(rte->mp, (ipha_t *)
1339 				    rte->mp->b_rptr, rte->ill, 0, rt);
1340 				freemsg(rte->mp);
1341 				mi_free((char *)rte);
1342 				mutex_enter(&rt->mfc_mutex);
1343 			}
1344 		}
1345 		mutex_exit(&rt->mfc_mutex);
1346 	}
1347 
1348 
1349 	/*
1350 	 * It is possible that an entry is being inserted without an upcall
1351 	 */
1352 	if (nstl == 0) {
1353 		mutex_enter(&(mfcbp->mfcb_lock));
1354 		if (ip_mrtdebug > 1) {
1355 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1356 			    "add_mfc: no upcall o %x g %x p %x",
1357 			    ntohl(mfccp->mfcc_origin.s_addr),
1358 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1359 			    mfccp->mfcc_parent);
1360 		}
1361 		if (is_mrouter_off()) {
1362 			mutex_exit(&mfcbp->mfcb_lock);
1363 			MFCB_REFRELE(mfcbp);
1364 			return (EINVAL);
1365 		}
1366 
1367 		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
1368 
1369 			mutex_enter(&rt->mfc_mutex);
1370 			if ((rt->mfc_origin.s_addr ==
1371 			    mfccp->mfcc_origin.s_addr) &&
1372 			    (rt->mfc_mcastgrp.s_addr ==
1373 				mfccp->mfcc_mcastgrp.s_addr) &&
1374 				(!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
1375 				fill_route(rt, mfccp);
1376 				mutex_exit(&rt->mfc_mutex);
1377 				break;
1378 			}
1379 			mutex_exit(&rt->mfc_mutex);
1380 		}
1381 
1382 		/* No upcall, so make a new entry into mfctable */
1383 		if (rt == NULL) {
1384 			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1385 			if (rt == NULL) {
1386 				ip1dbg(("add_mfc: out of memory\n"));
1387 				mutex_exit(&mfcbp->mfcb_lock);
1388 				MFCB_REFRELE(mfcbp);
1389 				return (ENOBUFS);
1390 			}
1391 
1392 			/* Insert new entry at head of hash chain */
1393 			mutex_enter(&rt->mfc_mutex);
1394 			fill_route(rt, mfccp);
1395 
1396 			/* Link into table */
1397 			rt->mfc_next   = mfcbp->mfcb_mfc;
1398 			mfcbp->mfcb_mfc = rt;
1399 			mutex_exit(&rt->mfc_mutex);
1400 		}
1401 		mutex_exit(&mfcbp->mfcb_lock);
1402 	}
1403 
1404 	MFCB_REFRELE(mfcbp);
1405 	return (0);
1406 }
1407 
1408 /*
1409  * Fills in mfc structure from mrouted mfcctl.
1410  */
1411 static void
1412 fill_route(struct mfc *rt, struct mfcctl *mfccp)
1413 {
1414 	int i;
1415 
1416 	rt->mfc_origin		= mfccp->mfcc_origin;
1417 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1418 	rt->mfc_parent		= mfccp->mfcc_parent;
1419 	mutex_enter(&numvifs_mutex);
1420 	for (i = 0; i < (int)numvifs; i++) {
1421 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1422 	}
1423 	mutex_exit(&numvifs_mutex);
1424 	/* Initialize pkt counters per src-grp */
1425 	rt->mfc_pkt_cnt	= 0;
1426 	rt->mfc_byte_cnt	= 0;
1427 	rt->mfc_wrong_if	= 0;
1428 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1429 
1430 }
1431 
1432 static void
1433 free_queue(struct mfc *mfcp)
1434 {
1435 	struct rtdetq *rte0;
1436 
1437 	/*
1438 	 * Drop all queued upcall packets.
1439 	 * Free the mbuf with the pkt.
1440 	 */
1441 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1442 		mfcp->mfc_rte = rte0->rte_next;
1443 		freemsg(rte0->mp);
1444 		mi_free((char *)rte0);
1445 	}
1446 }
1447 /*
1448  * go thorugh the hash bucket and free all the entries marked condemned.
1449  */
1450 void
1451 release_mfc(struct mfcb *mfcbp)
1452 {
1453 	struct mfc *current_mfcp;
1454 	struct mfc *prev_mfcp;
1455 
1456 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1457 
1458 	while (current_mfcp != NULL) {
1459 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1460 			if (current_mfcp == mfcbp->mfcb_mfc) {
1461 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1462 				free_queue(current_mfcp);
1463 				mi_free(current_mfcp);
1464 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1465 				continue;
1466 			}
1467 			ASSERT(prev_mfcp != NULL);
1468 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1469 			free_queue(current_mfcp);
1470 			mi_free(current_mfcp);
1471 			current_mfcp = NULL;
1472 		} else {
1473 			prev_mfcp = current_mfcp;
1474 		}
1475 
1476 		current_mfcp = prev_mfcp->mfc_next;
1477 
1478 	}
1479 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1480 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1481 }
1482 
1483 /*
1484  * Delete an mfc entry.
1485  */
1486 static int
1487 del_mfc(struct mfcctl *mfccp)
1488 {
1489 	struct in_addr	origin;
1490 	struct in_addr	mcastgrp;
1491 	struct mfc 		*rt;
1492 	uint_t			hash;
1493 
1494 	origin = mfccp->mfcc_origin;
1495 	mcastgrp = mfccp->mfcc_mcastgrp;
1496 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1497 
1498 	if (ip_mrtdebug > 1) {
1499 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1500 		    "del_mfc: o %x g %x",
1501 		    ntohl(origin.s_addr),
1502 		    ntohl(mcastgrp.s_addr));
1503 	}
1504 
1505 	MFCB_REFHOLD(&mfctable[hash]);
1506 
1507 	/* Find mfc in mfctable, finds only entries without upcalls */
1508 	for (rt = mfctable[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1509 		mutex_enter(&rt->mfc_mutex);
1510 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1511 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1512 		    rt->mfc_rte == NULL &&
1513 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1514 			break;
1515 		mutex_exit(&rt->mfc_mutex);
1516 	}
1517 
1518 	/*
1519 	 * Return if there was an upcall (mfc_rte != NULL,
1520 	 * or rt not in mfctable.
1521 	 */
1522 	if (rt == NULL) {
1523 		MFCB_REFRELE(&mfctable[hash]);
1524 		return (EADDRNOTAVAIL);
1525 	}
1526 
1527 
1528 	/*
1529 	 * no need to hold lock as we have a reference.
1530 	 */
1531 	mfctable[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1532 	/* error checking */
1533 	if (rt->mfc_timeout_id != 0) {
1534 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1535 		/*
1536 		 * Its ok to drop the lock,  the struct cannot be freed
1537 		 * since we have a ref on the hash bucket.
1538 		 */
1539 		rt->mfc_timeout_id = 0;
1540 		mutex_exit(&rt->mfc_mutex);
1541 		(void) untimeout(rt->mfc_timeout_id);
1542 		mutex_enter(&rt->mfc_mutex);
1543 	}
1544 
1545 	ASSERT(rt->mfc_rte == NULL);
1546 
1547 
1548 	/*
1549 	 * Delete the entry from the cache
1550 	 */
1551 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1552 	mutex_exit(&rt->mfc_mutex);
1553 
1554 	MFCB_REFRELE(&mfctable[hash]);
1555 
1556 	return (0);
1557 }
1558 
#define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 *
 * Arguments:
 *	ill	interface the packet is associated with
 *	ipha	IP header of the packet
 *	mp	the message; mp->b_prev is read on entry as either
 *		PIM_REGISTER_MARKER or an encapsulating tunnel's source
 *		address (0 for neither) and is reset to NULL on the paths
 *		that consume that information
 *
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *                   1 - pkt came in on tunnel
 * -1 is also returned when the multicast router is off, when memory for an
 * upcall cannot be allocated, and when the upcall queue has overflowed.
 * On a cache hit the value returned by ip_mdq() is passed straight back.
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc 	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	/* Counts source-routed packets; updated without any lock here. */
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;

	if (ip_mrtdebug > 1) {
		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/*
	 * Decode what the earlier input stage left in mp->b_prev: either the
	 * PIM register marker or a tunnel source address.
	 */
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
			(ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    "  REGISTER VIF",
				    ill->ill_name);
			}
		}
	/*
	 * The header length field is in 32-bit words, so it is compared with
	 * (IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) bytes shifted right by two.
	 * A header too short to hold the tunnel option, or whose second
	 * option byte is not IPOPT_LSRR, is an ordinary phyint packet.
	 */
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	mrtstat.mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off())
		return (-1);

	/* Hold the hash bucket for the remainder of the function. */
	mfcbp = &mfctable[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		mrtstat.mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are attributed to the register vif. */
			ASSERT(reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    viftable[reg_vif_num].v_ipif->ipif_ill, 0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		mrtstat.mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)mrtstat.mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/*
		 * Lock mfctable.  mfctable[hash] is the same bucket as mfcbp
		 * (both are indexed by MFCHASH(src, dst)); the bucket lock is
		 * taken before any individual mfc_mutex.
		 */
		hash = MFCHASH(src, dst);
		mutex_enter(&(mfctable[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off()) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * If a match is found the loop exits with that entry's
		 * mfc_mutex still held.
		 */
		for (mfc_rt = mfctable[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ip_mrtdebug > 1) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				mrtstat.mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			/* mp_copy becomes the upcall message sent to mrouted. */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				mrtstat.mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			/*
			 * From here on mfc_mutex is held for the new entry
			 * too, matching the found-entry case above.
			 */
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			mrtstat.mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		/* mp0 is the copy that waits on the queue for the route. */
		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			mrtstat.mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(reg_vif_num != ALL_VIFS);
			rte->ill = viftable[reg_vif_num].v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			mrtstat.mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&numvifs_mutex);
			for (i = 0; i < (int)numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ip_mrtdebug > 1) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = mfctable[hash].mfcb_mfc;
			mfctable[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next);
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The igmpmsg is overlaid on the copied packet's data. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)reg_vif_num;
				mutex_exit(&numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				/* Report the first vif whose ipif is on ill. */
				for (vifi = 0; vifi < numvifs; vifi++) {
					if (viftable[vifi].v_ipif == NULL)
						continue;
					if (viftable[vifi].v_ipif->ipif_ill ==
					    ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&numvifs_mutex);
				/*
				 * NOTE(review): if no vif matched, im_vif is
				 * left unset and only this ASSERT notices.
				 */
				ASSERT(vifi < numvifs);
			}

			mrtstat.mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			/* Drop both locks before passing the upcall upstream. */
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(mfctable[hash].mfcb_lock));
			putnext(RD(ip_g_mrouter), mp_copy);

		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(mfctable[hash].mfcb_lock));
			/*
			 * mp_copy is only allocated when a new mfc is
			 * created, so it is still NULL on this path.
			 * NOTE(review): relies on freemsg() accepting NULL -
			 * confirm.
			 */
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	/*
	 * Common failure exit.  Every jump here has already released
	 * mfc_mutex (if it was taken) but still holds the bucket lock and
	 * the bucket reference.  Whatever was allocated so far is freed;
	 * mfc_rt is freed only when it was allocated by this call.
	 */
	error_return:
		mutex_exit(&(mfctable[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1935 
1936 /*
1937  * Clean up the mfctable cache entry if upcall is not serviced.
1938  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1939  */
1940 static void
1941 expire_upcalls(void *arg)
1942 {
1943 	struct mfc *mfc_rt = arg;
1944 	uint_t hash;
1945 	struct mfc *prev_mfc, *mfc0;
1946 
1947 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1948 	if (ip_mrtdebug > 1) {
1949 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1950 		    "expire_upcalls: hash %d s %x g %x",
1951 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1952 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1953 	}
1954 	MFCB_REFHOLD(&mfctable[hash]);
1955 	mutex_enter(&mfc_rt->mfc_mutex);
1956 	/*
1957 	 * if timeout has been set to zero, than the
1958 	 * entry has been filled, no need to delete it.
1959 	 */
1960 	if (mfc_rt->mfc_timeout_id == 0)
1961 		goto done;
1962 	mrtstat.mrts_cache_cleanups++;
1963 	mfc_rt->mfc_timeout_id = 0;
1964 
1965 	/* Determine entry to be cleaned up in cache table. */
1966 	for (prev_mfc = mfc0 = mfctable[hash].mfcb_mfc; mfc0;
1967 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1968 		if (mfc0 == mfc_rt)
1969 			break;
1970 
1971 	/* del_mfc takes care of gone mfcs */
1972 	ASSERT(prev_mfc != NULL);
1973 	ASSERT(mfc0 != NULL);
1974 
1975 	/*
1976 	 * Delete the entry from the cache
1977 	 */
1978 	mfctable[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1979 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1980 
1981 	/*
1982 	 * release_mfc will drop all queued upcall packets.
1983 	 * and will free the mbuf with the pkt, if, timing info.
1984 	 */
1985 done:
1986 	mutex_exit(&mfc_rt->mfc_mutex);
1987 	MFCB_REFRELE(&mfctable[hash]);
1988 }
1989 
1990 /*
1991  * Packet forwarding routine once entry in the cache is made.
1992  */
1993 static int
1994 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
1995     struct mfc *rt)
1996 {
1997 	vifi_t vifi;
1998 	struct vif *vifp;
1999 	ipaddr_t dst = ipha->ipha_dst;
2000 	size_t  plen = msgdsize(mp);
2001 	vifi_t num_of_vifs;
2002 
2003 	if (ip_mrtdebug > 1) {
2004 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2005 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
2006 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
2007 		    ill->ill_name);
2008 	}
2009 
2010 	/* Macro to send packet on vif */
2011 #define	MC_SEND(ipha, mp, vifp, dst) { \
2012 	if ((vifp)->v_flags & VIFF_TUNNEL) \
2013 		encap_send((ipha), (mp), (vifp), (dst)); \
2014 	else if ((vifp)->v_flags & VIFF_REGISTER) \
2015 		register_send((ipha), (mp), (vifp), (dst)); \
2016 	else \
2017 		phyint_send((ipha), (mp), (vifp), (dst)); \
2018 }
2019 
2020 	vifi = rt->mfc_parent;
2021 
2022 	/*
2023 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2024 	 * Mrouted had no route.
2025 	 * We wanted the route installed in the mfctable to prevent multiple
2026 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2027 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
2028 	 * 3.6.
2029 	 */
2030 	if (vifi == NO_VIF) {
2031 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2032 		    ill->ill_name));
2033 		if (ip_mrtdebug > 1) {
2034 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2035 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2036 		}
2037 		return (-1);	/* drop pkt */
2038 	}
2039 
2040 	if (!lock_good_vif(&viftable[vifi]))
2041 		return (-1);
2042 	/*
2043 	 * The MFC entries are not cleaned up when an ipif goes
2044 	 * away thus this code has to guard against an MFC referencing
2045 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
2046 	 * sets the v_ipif to NULL when the ipif disappears.
2047 	 */
2048 	ASSERT(viftable[vifi].v_ipif != NULL);
2049 
2050 	if (vifi >= numvifs) {
2051 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2052 		    "%d ill %s viftable ill %s\n",
2053 		    (int)vifi, (int)numvifs, ill->ill_name,
2054 		    viftable[vifi].v_ipif->ipif_ill->ill_name);
2055 		unlock_good_vif(&viftable[vifi]);
2056 		return (-1);
2057 	}
2058 	/*
2059 	 * Don't forward if it didn't arrive from the parent vif for its
2060 	 * origin. But do match on the groups as we nominate only one
2061 	 * ill in the group for receiving allmulti packets.
2062 	 */
2063 	if ((viftable[vifi].v_ipif->ipif_ill != ill &&
2064 	    (ill->ill_group == NULL ||
2065 	    viftable[vifi].v_ipif->ipif_ill->ill_group != ill->ill_group)) ||
2066 	    (viftable[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2067 		/* Came in the wrong interface */
2068 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2069 			"numvifs %d ill %s viftable ill %s\n",
2070 			(int)vifi, (int)numvifs, ill->ill_name,
2071 			viftable[vifi].v_ipif->ipif_ill->ill_name));
2072 		if (ip_mrtdebug > 1) {
2073 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2074 			    "ip_mdq: arrived wrong if, vifi %d ill "
2075 			    "%s viftable ill %s\n",
2076 			    (int)vifi, ill->ill_name,
2077 			    viftable[vifi].v_ipif->ipif_ill->ill_name);
2078 		}
2079 		mrtstat.mrts_wrong_if++;
2080 		rt->mfc_wrong_if++;
2081 
2082 		/*
2083 		 * If we are doing PIM assert processing and we are forwarding
2084 		 * packets on this interface, and it is a broadcast medium
2085 		 * interface (and not a tunnel), send a message to the routing.
2086 		 *
2087 		 * We use the first ipif on the list, since it's all we have.
2088 		 * Chances are the ipif_flags are the same for ipifs on the ill.
2089 		 */
2090 		if (pim_assert && rt->mfc_ttls[vifi] > 0 &&
2091 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2092 		    !(viftable[vifi].v_flags & VIFF_TUNNEL)) {
2093 			mblk_t		*mp_copy;
2094 			struct igmpmsg	*im;
2095 
2096 			/* TODO could copy header and dup rest */
2097 			mp_copy = copymsg(mp);
2098 			if (mp_copy == NULL) {
2099 				mrtstat.mrts_fwd_drop++;
2100 				ip1dbg(("ip_mdq: out of memory "
2101 				    "for mblk, mp_copy\n"));
2102 				unlock_good_vif(&viftable[vifi]);
2103 				return (-1);
2104 			}
2105 
2106 			im = (struct igmpmsg *)mp_copy->b_rptr;
2107 			im->im_msgtype = IGMPMSG_WRONGVIF;
2108 			im->im_mbz = 0;
2109 			im->im_vif = (ushort_t)vifi;
2110 			putnext(RD(ip_g_mrouter), mp_copy);
2111 		}
2112 		unlock_good_vif(&viftable[vifi]);
2113 		if (tunnel_src != 0)
2114 			return (1);
2115 		else
2116 			return (0);
2117 	}
2118 	/*
2119 	 * If I sourced this packet, it counts as output, else it was input.
2120 	 */
2121 	if (ipha->ipha_src == viftable[vifi].v_lcl_addr.s_addr) {
2122 		viftable[vifi].v_pkt_out++;
2123 		viftable[vifi].v_bytes_out += plen;
2124 	} else {
2125 		viftable[vifi].v_pkt_in++;
2126 		viftable[vifi].v_bytes_in += plen;
2127 	}
2128 	mutex_enter(&rt->mfc_mutex);
2129 	rt->mfc_pkt_cnt++;
2130 	rt->mfc_byte_cnt += plen;
2131 	mutex_exit(&rt->mfc_mutex);
2132 	unlock_good_vif(&viftable[vifi]);
2133 	/*
2134 	 * For each vif, decide if a copy of the packet should be forwarded.
2135 	 * Forward if:
2136 	 *		- the vif threshold ttl is non-zero AND
2137 	 *		- the pkt ttl exceeds the vif's threshold
2138 	 * A non-zero mfc_ttl indicates that the vif is part of
2139 	 * the output set for the mfc entry.
2140 	 */
2141 	mutex_enter(&numvifs_mutex);
2142 	num_of_vifs = numvifs;
2143 	mutex_exit(&numvifs_mutex);
2144 	for (vifp = viftable, vifi = 0; vifi < num_of_vifs; vifp++, vifi++) {
2145 		if (!lock_good_vif(vifp))
2146 			continue;
2147 		if ((rt->mfc_ttls[vifi] > 0) &&
2148 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2149 			/*
2150 			 * lock_good_vif should not have succedded if
2151 			 * v_ipif is null.
2152 			 */
2153 			ASSERT(vifp->v_ipif != NULL);
2154 			vifp->v_pkt_out++;
2155 			vifp->v_bytes_out += plen;
2156 			MC_SEND(ipha, mp, vifp, dst);
2157 			mrtstat.mrts_fwd_out++;
2158 		}
2159 		unlock_good_vif(vifp);
2160 	}
2161 	if (tunnel_src != 0)
2162 		return (1);
2163 	else
2164 		return (0);
2165 }
2166 
2167 /*
2168  * Send the packet on physical interface.
2169  * Caller assumes can continue to use mp on return.
2170  */
2171 /* ARGSUSED */
2172 static void
2173 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2174 {
2175 	mblk_t 	*mp_copy;
2176 
2177 	/* Make a new reference to the packet */
2178 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2179 	if (mp_copy == NULL) {
2180 		mrtstat.mrts_fwd_drop++;
2181 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2182 		return;
2183 	}
2184 	if (vifp->v_rate_limit <= 0)
2185 		tbf_send_packet(vifp, mp_copy);
2186 	else  {
2187 		if (ip_mrtdebug > 1) {
2188 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2189 			    "phyint_send: tbf_contr rate %d "
2190 			    "vifp 0x%p mp 0x%p dst 0x%x",
2191 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2192 		}
2193 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2194 	}
2195 }
2196 
2197 /*
2198  * Send the whole packet for REGISTER encapsulation to PIM daemon
2199  * Caller assumes it can continue to use mp on return.
2200  */
2201 /* ARGSUSED */
2202 static void
2203 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2204 {
2205 	struct igmpmsg	*im;
2206 	mblk_t		*mp_copy;
2207 	ipha_t		*ipha_copy;
2208 
2209 	if (ip_mrtdebug > 1) {
2210 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2211 		    "register_send: src %x, dst %x\n",
2212 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2213 	}
2214 
2215 	/*
2216 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2217 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2218 	 * ethernet driver will.
2219 	 */
2220 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2221 	if (mp_copy == NULL) {
2222 		++mrtstat.mrts_pim_nomemory;
2223 		if (ip_mrtdebug > 3) {
2224 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2225 			    "register_send: allocb failure.");
2226 		}
2227 		return;
2228 	}
2229 
2230 	/*
2231 	 * Bump write pointer to account for igmpmsg being added.
2232 	 */
2233 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2234 
2235 	/*
2236 	 * Chain packet to new mblk_t.
2237 	 */
2238 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2239 		++mrtstat.mrts_pim_nomemory;
2240 		if (ip_mrtdebug > 3) {
2241 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2242 			    "register_send: copymsg failure.");
2243 		}
2244 		freeb(mp_copy);
2245 		return;
2246 	}
2247 
2248 	/*
2249 	 * icmp_rput() asserts that IP version field is set to an
2250 	 * appropriate version. Hence, the struct igmpmsg that this really
2251 	 * becomes, needs to have the correct IP version field.
2252 	 */
2253 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2254 	*ipha_copy = multicast_encap_iphdr;
2255 
2256 	/*
2257 	 * The kernel uses the struct igmpmsg header to encode the messages to
2258 	 * the multicast routing daemon. Fill in the fields in the header
2259 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2260 	 */
2261 	im = (struct igmpmsg *)mp_copy->b_rptr;
2262 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2263 	im->im_src.s_addr = ipha->ipha_src;
2264 	im->im_dst.s_addr = ipha->ipha_dst;
2265 
2266 	/*
2267 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2268 	 * header with renamed fields and the multicast routing daemon uses
2269 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2270 	 */
2271 	im->im_mbz = 0;
2272 
2273 	++mrtstat.mrts_upcalls;
2274 	if (!canputnext(RD(ip_g_mrouter))) {
2275 		++mrtstat.mrts_pim_regsend_drops;
2276 		if (ip_mrtdebug > 3) {
2277 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2278 			    "register_send: register upcall failure.");
2279 		}
2280 		freemsg(mp_copy);
2281 	} else {
2282 		putnext(RD(ip_g_mrouter), mp_copy);
2283 	}
2284 }
2285 
2286 /*
2287  * pim_validate_cksum handles verification of the checksum in the
2288  * pim header.  For PIM Register packets, the checksum is calculated
2289  * across the PIM header only.  For all other packets, the checksum
2290  * is for the PIM header and remainder of the packet.
2291  *
2292  * returns: B_TRUE, if checksum is okay.
2293  *          B_FALSE, if checksum is not valid.
2294  */
2295 static boolean_t
2296 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2297 {
2298 	mblk_t *mp_dup;
2299 
2300 	if ((mp_dup = dupmsg(mp)) == NULL)
2301 		return (B_FALSE);
2302 
2303 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2304 	if (pimp->pim_type == PIM_REGISTER)
2305 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2306 	if (IP_CSUM(mp_dup, 0, 0)) {
2307 		freemsg(mp_dup);
2308 		return (B_FALSE);
2309 	}
2310 	freemsg(mp_dup);
2311 	return (B_TRUE);
2312 }
2313 
2314 /*
2315  * int
2316  * pim_input(queue_t *, mblk_t *) - Process PIM protocol packets.
2317  *	IP Protocol 103. Register messages are decapsulated and sent
2318  *	onto multicast forwarding.
2319  */
2320 int
2321 pim_input(queue_t *q, mblk_t *mp)
2322 {
2323 	ipha_t		*eip, *ip;
2324 	int		iplen, pimlen, iphlen;
2325 	struct pim	*pimp;	/* pointer to a pim struct */
2326 	uint32_t	*reghdr;
2327 
2328 	/*
2329 	 * Pullup the msg for PIM protocol processing.
2330 	 */
2331 	if (pullupmsg(mp, -1) == 0) {
2332 		++mrtstat.mrts_pim_nomemory;
2333 		freemsg(mp);
2334 		return (-1);
2335 	}
2336 
2337 	ip = (ipha_t *)mp->b_rptr;
2338 	iplen = ip->ipha_length;
2339 	iphlen = IPH_HDR_LENGTH(ip);
2340 	pimlen = ntohs(iplen) - iphlen;
2341 
2342 	/*
2343 	 * Validate lengths
2344 	 */
2345 	if (pimlen < PIM_MINLEN) {
2346 		++mrtstat.mrts_pim_malformed;
2347 		if (ip_mrtdebug > 1) {
2348 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2349 			    "pim_input: length not at least minlen");
2350 		}
2351 		freemsg(mp);
2352 		return (-1);
2353 	}
2354 
2355 	/*
2356 	 * Point to the PIM header.
2357 	 */
2358 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2359 
2360 	/*
2361 	 * Check the version number.
2362 	 */
2363 	if (pimp->pim_vers != PIM_VERSION) {
2364 		++mrtstat.mrts_pim_badversion;
2365 		if (ip_mrtdebug > 1) {
2366 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2367 			    "pim_input: unknown version of PIM");
2368 		}
2369 		freemsg(mp);
2370 		return (-1);
2371 	}
2372 
2373 	/*
2374 	 * Validate the checksum
2375 	 */
2376 	if (!pim_validate_cksum(mp, ip, pimp)) {
2377 		++mrtstat.mrts_pim_rcv_badcsum;
2378 		if (ip_mrtdebug > 1) {
2379 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2380 			    "pim_input: invalid checksum");
2381 		}
2382 		freemsg(mp);
2383 		return (-1);
2384 	}
2385 
2386 	if (pimp->pim_type != PIM_REGISTER)
2387 		return (0);
2388 
2389 	reghdr = (uint32_t *)(pimp + 1);
2390 	eip = (ipha_t *)(reghdr + 1);
2391 
2392 	/*
2393 	 * check if the inner packet is destined to mcast group
2394 	 */
2395 	if (!CLASSD(eip->ipha_dst)) {
2396 		++mrtstat.mrts_pim_badregisters;
2397 		if (ip_mrtdebug > 1) {
2398 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2399 			    "pim_input: Inner pkt not mcast .. !");
2400 		}
2401 		freemsg(mp);
2402 		return (-1);
2403 	}
2404 	if (ip_mrtdebug > 1) {
2405 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2406 		    "register from %x, to %x, len %d",
2407 		    ntohl(eip->ipha_src),
2408 		    ntohl(eip->ipha_dst),
2409 		    ntohs(eip->ipha_length));
2410 	}
2411 	/*
2412 	 * If the null register bit is not set, decapsulate
2413 	 * the packet before forwarding it.
2414 	 */
2415 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
2416 		mblk_t *mp_copy;
2417 
2418 		/* Copy the message */
2419 		if ((mp_copy = copymsg(mp)) == NULL) {
2420 			++mrtstat.mrts_pim_nomemory;
2421 			freemsg(mp);
2422 			return (-1);
2423 		}
2424 
2425 		/*
2426 		 * Decapsulate the packet and give it to
2427 		 * register_mforward.
2428 		 */
2429 		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
2430 		    sizeof (*reghdr);
2431 		if (register_mforward(q, mp_copy) != 0) {
2432 			freemsg(mp);
2433 			return (-1);
2434 		}
2435 	}
2436 
2437 	/*
2438 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2439 	 * PIM socket. For Solaris it is done right after pim_input() is
2440 	 * called.
2441 	 */
2442 	return (0);
2443 }
2444 
2445 /*
2446  * PIM sparse mode hook.  Called by pim_input after decapsulating
2447  * the packet. Loop back the packet, as if we have received it.
2448  * In pim_input() we have to check if the destination is a multicast address.
2449  */
2450 /* ARGSUSED */
2451 static int
2452 register_mforward(queue_t *q, mblk_t *mp)
2453 {
2454 	ASSERT(reg_vif_num <= numvifs);
2455 
2456 	if (ip_mrtdebug > 3) {
2457 		ipha_t *ipha;
2458 
2459 		ipha = (ipha_t *)mp->b_rptr;
2460 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2461 		    "register_mforward: src %x, dst %x\n",
2462 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2463 	}
2464 	/*
2465 	 * Need to pass in to ip_mforward() the information that the
2466 	 * packet has arrived on the register_vif. We use the solution that
2467 	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
2468 	 * to ip_mforward(). Nonzero value means the packet has arrived on a
2469 	 * tunnel (ip_mroute_decap() puts the address of the other side of the
2470 	 * tunnel there.) This is safe since ip_rput() either frees the packet
2471 	 * or passes it to ip_mforward(). We use
2472 	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
2473 	 * register vif. If in the future we have more than one register vifs,
2474 	 * then this will need re-examination.
2475 	 */
2476 	mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
2477 	++mrtstat.mrts_pim_regforwards;
2478 	ip_rput(q, mp);
2479 	return (0);
2480 }
2481 
2482 /*
2483  * Send an encapsulated packet.
2484  * Caller assumes can continue to use mp when routine returns.
2485  */
2486 /* ARGSUSED */
2487 static void
2488 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2489 {
2490 	mblk_t 	*mp_copy;
2491 	ipha_t 	*ipha_copy;
2492 	size_t	len;
2493 
2494 	if (ip_mrtdebug > 1) {
2495 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2496 		    "encap_send: vif %ld enter", (ptrdiff_t)(vifp - viftable));
2497 	}
2498 	len = ntohs(ipha->ipha_length);
2499 
2500 	/*
2501 	 * Copy the old packet & pullup it's IP header into the
2502 	 * new mbuf so we can modify it.  Try to fill the new
2503 	 * mbuf since if we don't the ethernet driver will.
2504 	 */
2505 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2506 	if (mp_copy == NULL)
2507 		return;
2508 	mp_copy->b_rptr += 32;
2509 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2510 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2511 		freeb(mp_copy);
2512 		return;
2513 	}
2514 
2515 	/*
2516 	 * Fill in the encapsulating IP header.
2517 	 * Remote tunnel dst in rmt_addr, from add_vif().
2518 	 */
2519 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2520 	*ipha_copy = multicast_encap_iphdr;
2521 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2522 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2523 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2524 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2525 	ASSERT(ipha_copy->ipha_ident == 0);
2526 
2527 	/* Turn the encapsulated IP header back into a valid one. */
2528 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2529 	ipha->ipha_ttl--;
2530 	ipha->ipha_hdr_checksum = 0;
2531 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2532 
2533 	if (ip_mrtdebug > 1) {
2534 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2535 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2536 	}
2537 	if (vifp->v_rate_limit <= 0)
2538 		tbf_send_packet(vifp, mp_copy);
2539 	else
2540 		/* ipha is from the original header */
2541 		tbf_control(vifp, mp_copy, ipha);
2542 }
2543 
2544 /*
2545  * De-encapsulate a packet and feed it back through IP input.
2546  * This routine is called whenever IP gets a packet with prototype
2547  * IPPROTO_ENCAP and a local destination address.
2548  */
2549 void
2550 ip_mroute_decap(queue_t *q, mblk_t *mp)
2551 {
2552 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2553 	ipha_t		*ipha_encap;
2554 	int		hlen = IPH_HDR_LENGTH(ipha);
2555 	ipaddr_t	src;
2556 	struct vif	*vifp;
2557 
2558 	/*
2559 	 * Dump the packet if it's not to a multicast destination or if
2560 	 * we don't have an encapsulating tunnel with the source.
2561 	 * Note:  This code assumes that the remote site IP address
2562 	 * uniquely identifies the tunnel (i.e., that this site has
2563 	 * at most one tunnel with the remote site).
2564 	 */
2565 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2566 	if (!CLASSD(ipha_encap->ipha_dst)) {
2567 		mrtstat.mrts_bad_tunnel++;
2568 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2569 		freemsg(mp);
2570 		return;
2571 	}
2572 	src = (ipaddr_t)ipha->ipha_src;
2573 	mutex_enter(&last_encap_lock);
2574 	if (src != last_encap_src) {
2575 		struct vif *vife;
2576 
2577 		vifp = viftable;
2578 		vife = vifp + numvifs;
2579 		last_encap_src = src;
2580 		last_encap_vif = 0;
2581 		for (; vifp < vife; ++vifp) {
2582 			if (!lock_good_vif(vifp))
2583 				continue;
2584 			if (vifp->v_rmt_addr.s_addr == src) {
2585 				if (vifp->v_flags & VIFF_TUNNEL)
2586 					last_encap_vif = vifp;
2587 				if (ip_mrtdebug > 1) {
2588 					(void) mi_strlog(ip_g_mrouter,
2589 					    1, SL_TRACE,
2590 					    "ip_mroute_decap: good tun "
2591 					    "vif %ld with %x",
2592 					    (ptrdiff_t)(vifp - viftable),
2593 					    ntohl(src));
2594 				}
2595 				unlock_good_vif(vifp);
2596 				break;
2597 			}
2598 			unlock_good_vif(vifp);
2599 		}
2600 	}
2601 	if ((vifp = last_encap_vif) == 0) {
2602 		mutex_exit(&last_encap_lock);
2603 		mrtstat.mrts_bad_tunnel++;
2604 		freemsg(mp);
2605 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2606 		    (ptrdiff_t)(vifp - viftable), ntohl(src)));
2607 		return;
2608 	}
2609 	mutex_exit(&last_encap_lock);
2610 
2611 	/*
2612 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2613 	 * verify that the packet arrived over the correct vif.)  We use b_prev
2614 	 * to pass this information. This is safe since the ip_rput either
2615 	 * frees the packet or passes it to ip_mforward.
2616 	 */
2617 	mp->b_prev = (mblk_t *)(uintptr_t)src;
2618 	mp->b_rptr += hlen;
2619 	/* Feed back into ip_rput as an M_DATA. */
2620 	ip_rput(q, mp);
2621 }
2622 
2623 /*
2624  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2625  * (stream closed).  Called as writer.
2626  */
2627 void
2628 reset_mrt_vif_ipif(ipif_t *ipif)
2629 {
2630 	vifi_t vifi, tmp_vifi;
2631 	vifi_t num_of_vifs;
2632 
2633 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2634 
2635 	mutex_enter(&numvifs_mutex);
2636 	num_of_vifs = numvifs;
2637 	mutex_exit(&numvifs_mutex);
2638 
2639 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2640 		tmp_vifi = vifi - 1;
2641 		if (viftable[tmp_vifi].v_ipif == ipif) {
2642 			(void) del_vif(&tmp_vifi, NULL, NULL);
2643 		}
2644 	}
2645 }
2646 
2647 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2648 void
2649 reset_mrt_ill(ill_t *ill)
2650 {
2651 	struct mfc		*rt;
2652 	struct rtdetq	*rte;
2653 	int			i;
2654 
2655 	for (i = 0; i < MFCTBLSIZ; i++) {
2656 		MFCB_REFHOLD(&mfctable[i]);
2657 		if ((rt = mfctable[i].mfcb_mfc) != NULL) {
2658 			if (ip_mrtdebug > 1) {
2659 				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2660 				    "reset_mrt_ill: mfctable [%d]", i);
2661 			}
2662 			while (rt != NULL) {
2663 				mutex_enter(&rt->mfc_mutex);
2664 				while ((rte = rt->mfc_rte) != NULL) {
2665 					if (rte->ill == ill) {
2666 						if (ip_mrtdebug > 1) {
2667 							(void) mi_strlog(
2668 							    ip_g_mrouter,
2669 							    1, SL_TRACE,
2670 							    "reset_mrt_ill: "
2671 							    "ill 0x%p", ill);
2672 						}
2673 						rt->mfc_rte = rte->rte_next;
2674 						freemsg(rte->mp);
2675 						mi_free((char *)rte);
2676 					}
2677 				}
2678 				mutex_exit(&rt->mfc_mutex);
2679 				rt = rt->mfc_next;
2680 			}
2681 		}
2682 		MFCB_REFRELE(&mfctable[i]);
2683 	}
2684 }
2685 
2686 /*
2687  * Token bucket filter module.
2688  * The ipha is for mcastgrp destination for phyint and encap.
2689  */
2690 static void
2691 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2692 {
2693 	size_t 	p_len =  msgdsize(mp);
2694 	struct tbf	*t    = vifp->v_tbf;
2695 	timeout_id_t id = 0;
2696 
2697 	/* Drop if packet is too large */
2698 	if (p_len > MAX_BKT_SIZE) {
2699 		mrtstat.mrts_pkt2large++;
2700 		freemsg(mp);
2701 		return;
2702 	}
2703 	if (ip_mrtdebug > 1) {
2704 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2705 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2706 		    (ptrdiff_t)(vifp - viftable), t->tbf_q_len,
2707 		    ntohl(ipha->ipha_dst));
2708 	}
2709 
2710 	mutex_enter(&t->tbf_lock);
2711 
2712 	tbf_update_tokens(vifp);
2713 
2714 	/*
2715 	 * If there are enough tokens,
2716 	 * and the queue is empty, send this packet out.
2717 	 */
2718 	if (ip_mrtdebug > 1) {
2719 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2720 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2721 		    (ptrdiff_t)(vifp - viftable), t->tbf_n_tok, p_len,
2722 		    t->tbf_q_len);
2723 	}
2724 	/* No packets are queued */
2725 	if (t->tbf_q_len == 0) {
2726 		/* queue empty, send packet if enough tokens */
2727 		if (p_len <= t->tbf_n_tok) {
2728 			t->tbf_n_tok -= p_len;
2729 			mutex_exit(&t->tbf_lock);
2730 			tbf_send_packet(vifp, mp);
2731 			return;
2732 		} else {
2733 			/* Queue packet and timeout till later */
2734 			tbf_queue(vifp, mp);
2735 			ASSERT(vifp->v_timeout_id == 0);
2736 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2737 			    TBF_REPROCESS);
2738 		}
2739 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2740 		/* Finite queue length, so queue pkts and process queue */
2741 		tbf_queue(vifp, mp);
2742 		tbf_process_q(vifp);
2743 	} else {
2744 		/* Check that we have UDP header with IP header */
2745 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2746 					sizeof (struct udphdr);
2747 
2748 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2749 			if (!pullupmsg(mp, hdr_length)) {
2750 				freemsg(mp);
2751 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2752 				    "vif %ld src 0x%x dst 0x%x\n",
2753 				    (ptrdiff_t)(vifp - viftable),
2754 				    ntohl(ipha->ipha_src),
2755 				    ntohl(ipha->ipha_dst)));
2756 				mutex_exit(&vifp->v_tbf->tbf_lock);
2757 				return;
2758 			} else
2759 				/* Have to reassign ipha after pullupmsg */
2760 				ipha = (ipha_t *)mp->b_rptr;
2761 		}
2762 		/*
2763 		 * Queue length too much,
2764 		 * try to selectively dq, or queue and process
2765 		 */
2766 		if (!tbf_dq_sel(vifp, ipha)) {
2767 			mrtstat.mrts_q_overflow++;
2768 			freemsg(mp);
2769 		} else {
2770 			tbf_queue(vifp, mp);
2771 			tbf_process_q(vifp);
2772 		}
2773 	}
2774 	if (t->tbf_q_len == 0) {
2775 		id = vifp->v_timeout_id;
2776 		vifp->v_timeout_id = 0;
2777 	}
2778 	mutex_exit(&vifp->v_tbf->tbf_lock);
2779 	if (id != 0)
2780 		(void) untimeout(id);
2781 }
2782 
2783 /*
2784  * Adds a packet to the tbf queue at the interface.
2785  * The ipha is for mcastgrp destination for phyint and encap.
2786  */
2787 static void
2788 tbf_queue(struct vif *vifp, mblk_t *mp)
2789 {
2790 	struct tbf	*t = vifp->v_tbf;
2791 
2792 	if (ip_mrtdebug > 1) {
2793 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2794 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - viftable));
2795 	}
2796 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2797 
2798 	if (t->tbf_t == NULL) {
2799 		/* Queue was empty */
2800 		t->tbf_q = mp;
2801 	} else {
2802 		/* Insert at tail */
2803 		t->tbf_t->b_next = mp;
2804 	}
2805 	/* set new tail pointer */
2806 	t->tbf_t = mp;
2807 
2808 	mp->b_next = mp->b_prev = NULL;
2809 
2810 	t->tbf_q_len++;
2811 }
2812 
2813 /*
2814  * Process the queue at the vif interface.
2815  * Drops the tbf_lock when sending packets.
2816  *
2817  * NOTE : The caller should quntimeout if the queue length is 0.
2818  */
2819 static void
2820 tbf_process_q(struct vif *vifp)
2821 {
2822 	mblk_t	*mp;
2823 	struct tbf	*t = vifp->v_tbf;
2824 	size_t	len;
2825 
2826 	if (ip_mrtdebug > 1) {
2827 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2828 		    "tbf_process_q 1: vif %ld qlen = %d",
2829 		    (ptrdiff_t)(vifp - viftable), t->tbf_q_len);
2830 	}
2831 
2832 	/*
2833 	 * Loop through the queue at the interface and send
2834 	 * as many packets as possible.
2835 	 */
2836 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2837 
2838 	while (t->tbf_q_len > 0) {
2839 		mp = t->tbf_q;
2840 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2841 
2842 		/* Determine if the packet can be sent */
2843 		if (len <= t->tbf_n_tok) {
2844 			/*
2845 			 * If so, reduce no. of tokens, dequeue the packet,
2846 			 * send the packet.
2847 			 */
2848 			t->tbf_n_tok -= len;
2849 
2850 			t->tbf_q = mp->b_next;
2851 			if (--t->tbf_q_len == 0) {
2852 				t->tbf_t = NULL;
2853 			}
2854 			mp->b_next = NULL;
2855 			/* Exit mutex before sending packet, then re-enter */
2856 			mutex_exit(&t->tbf_lock);
2857 			tbf_send_packet(vifp, mp);
2858 			mutex_enter(&t->tbf_lock);
2859 		} else
2860 			break;
2861 	}
2862 }
2863 
2864 /* Called at tbf timeout to update tokens, process q and reset timer.  */
2865 static void
2866 tbf_reprocess_q(void *arg)
2867 {
2868 	struct vif *vifp = arg;
2869 
2870 	mutex_enter(&vifp->v_tbf->tbf_lock);
2871 	vifp->v_timeout_id = 0;
2872 	tbf_update_tokens(vifp);
2873 
2874 	tbf_process_q(vifp);
2875 
2876 	if (vifp->v_tbf->tbf_q_len > 0) {
2877 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2878 		    TBF_REPROCESS);
2879 	}
2880 	mutex_exit(&vifp->v_tbf->tbf_lock);
2881 
2882 	if (ip_mrtdebug > 1) {
2883 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2884 		    "tbf_reprcess_q: vif %ld timeout id = %p",
2885 		    (ptrdiff_t)(vifp - viftable), vifp->v_timeout_id);
2886 	}
2887 }
2888 
2889 /*
2890  * Function that will selectively discard a member of the tbf queue,
2891  * based on the precedence value and the priority.
2892  *
2893  * NOTE : The caller should quntimeout if the queue length is 0.
2894  */
2895 static int
2896 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
2897 {
2898 	uint_t		p;
2899 	struct tbf		*t = vifp->v_tbf;
2900 	mblk_t		**np;
2901 	mblk_t		*last, *mp;
2902 
2903 	if (ip_mrtdebug > 1) {
2904 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2905 		    "dq_sel: vif %ld dst 0x%x",
2906 		    (ptrdiff_t)(vifp - viftable), ntohl(ipha->ipha_dst));
2907 	}
2908 
2909 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2910 	p = priority(vifp, ipha);
2911 
2912 	np = &t->tbf_q;
2913 	last = NULL;
2914 	while ((mp = *np) != NULL) {
2915 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
2916 			*np = mp->b_next;
2917 			/* If removing the last packet, fix the tail pointer */
2918 			if (mp == t->tbf_t)
2919 				t->tbf_t = last;
2920 			mp->b_prev = mp->b_next = NULL;
2921 			freemsg(mp);
2922 			/*
2923 			 * It's impossible for the queue to be empty, but
2924 			 * we check anyway.
2925 			 */
2926 			if (--t->tbf_q_len == 0) {
2927 				t->tbf_t = NULL;
2928 			}
2929 			mrtstat.mrts_drop_sel++;
2930 			return (1);
2931 		}
2932 		np = &mp->b_next;
2933 		last = mp;
2934 	}
2935 	return (0);
2936 }
2937 
2938 /* Sends packet, 2 cases - encap tunnel, phyint.  */
2939 static void
2940 tbf_send_packet(struct vif *vifp, mblk_t *mp)
2941 {
2942 	ipif_t  *ipif;
2943 
2944 	/* If encap tunnel options */
2945 	if (vifp->v_flags & VIFF_TUNNEL)  {
2946 		if (ip_mrtdebug > 1) {
2947 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2948 			    "tbf_send_pkt: ENCAP tunnel vif %ld",
2949 			    (ptrdiff_t)(vifp - viftable));
2950 		}
2951 
2952 		/*
2953 		 * Feed into ip_wput which will set the ident field and
2954 		 * checksum the encapsulating header.
2955 		 * BSD gets the cached route vifp->v_route from ip_output()
2956 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
2957 		 */
2958 		put(vifp->v_ipif->ipif_wq, mp);
2959 		return;
2960 
2961 		/* phyint */
2962 	} else {
2963 		/* Need to loop back to members on the outgoing interface. */
2964 		ipha_t  *ipha;
2965 		ipaddr_t    dst;
2966 		ipha  = (ipha_t *)mp->b_rptr;
2967 		dst  = ipha->ipha_dst;
2968 		ipif = vifp->v_ipif;
2969 
2970 		mutex_enter(&ipif->ipif_ill->ill_lock);
2971 		if (ilm_lookup_ipif(ipif, dst) != NULL) {
2972 			/*
2973 			 * The packet is not yet reassembled, thus we need to
2974 			 * pass it to ip_rput_local for checksum verification
2975 			 * and reassembly (and fanout the user stream).
2976 			 */
2977 			mblk_t 	*mp_loop;
2978 			ire_t	*ire;
2979 
2980 			mutex_exit(&ipif->ipif_ill->ill_lock);
2981 			if (ip_mrtdebug > 1) {
2982 				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2983 				    "tbf_send_pkt: loopback vif %ld",
2984 				    (ptrdiff_t)(vifp - viftable));
2985 			}
2986 			mp_loop = copymsg(mp);
2987 			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
2988 			    ALL_ZONES, NULL, MATCH_IRE_TYPE);
2989 
2990 			if (mp_loop != NULL && ire != NULL) {
2991 				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
2992 				    ((ipha_t *)mp_loop->b_rptr),
2993 				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
2994 			} else {
2995 				/* Either copymsg failed or no ire */
2996 				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2997 				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
2998 				    "vif %ld\n", mp_loop, ire,
2999 				    (ptrdiff_t)(vifp - viftable));
3000 			}
3001 			if (ire != NULL)
3002 				ire_refrele(ire);
3003 		} else {
3004 			mutex_exit(&ipif->ipif_ill->ill_lock);
3005 		}
3006 		if (ip_mrtdebug > 1) {
3007 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
3008 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3009 			    (ptrdiff_t)(vifp - viftable), ntohl(dst));
3010 		}
3011 		ip_rput_forward_multicast(dst, mp, ipif);
3012 	}
3013 }
3014 
3015 /*
3016  * Determine the current time and then the elapsed time (between the last time
3017  * and time now).  Update the no. of tokens in the bucket.
3018  */
3019 static void
3020 tbf_update_tokens(struct vif *vifp)
3021 {
3022 	timespec_t	tp;
3023 	hrtime_t	tm;
3024 	struct tbf	*t = vifp->v_tbf;
3025 
3026 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3027 
3028 	/* Time in secs and nsecs, rate limit in kbits/sec */
3029 	gethrestime(&tp);
3030 
3031 	/*LINTED*/
3032 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3033 
3034 	/*
3035 	 * This formula is actually
3036 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3037 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3038 	 *
3039 	 * The (1000/1024) was introduced in add_vif to optimize
3040 	 * this divide into a shift.
3041 	 */
3042 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3043 	t->tbf_last_pkt_t = tp;
3044 
3045 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3046 		t->tbf_n_tok = MAX_BKT_SIZE;
3047 	if (ip_mrtdebug > 1) {
3048 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
3049 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3050 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - viftable));
3051 	}
3052 }
3053 
3054 /*
3055  * Priority currently is based on port nos.
3056  * Different forwarding mechanisms have different ways
3057  * of obtaining the port no. Hence, the vif must be
3058  * given along with the packet itself.
3059  *
3060  */
3061 static int
3062 priority(struct vif *vifp, ipha_t *ipha)
3063 {
3064 	int prio;
3065 
3066 	/* Temporary hack; may add general packet classifier some day */
3067 
3068 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3069 
3070 	/*
3071 	 * The UDP port space is divided up into four priority ranges:
3072 	 * [0, 16384)	: unclassified - lowest priority
3073 	 * [16384, 32768)	: audio - highest priority
3074 	 * [32768, 49152)	: whiteboard - medium priority
3075 	 * [49152, 65536)	: video - low priority
3076 	 */
3077 
3078 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3079 		struct udphdr *udp =
3080 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3081 		switch (ntohs(udp->uh_dport) & 0xc000) {
3082 		case 0x4000:
3083 			prio = 70;
3084 			break;
3085 		case 0x8000:
3086 			prio = 60;
3087 			break;
3088 		case 0xc000:
3089 			prio = 55;
3090 			break;
3091 		default:
3092 			prio = 50;
3093 			break;
3094 		}
3095 		if (ip_mrtdebug > 1) {
3096 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
3097 			    "priority: port %x prio %d\n",
3098 			    ntohs(udp->uh_dport), prio);
3099 		}
3100 	} else
3101 		prio = 50;  /* default priority */
3102 	return (prio);
3103 }
3104 
3105 /*
3106  * End of token bucket filter modifications
3107  */
3108 
3109 
3110 
3111 /*
3112  * Produces data for netstat -M.
3113  */
3114 int
3115 ip_mroute_stats(mblk_t *mp)
3116 {
3117 	mrtstat.mrts_vifctlSize = sizeof (struct vifctl);
3118 	mrtstat.mrts_mfcctlSize = sizeof (struct mfcctl);
3119 	if (!snmp_append_data(mp, (char *)&mrtstat, sizeof (mrtstat))) {
3120 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3121 		    (size_t)sizeof (mrtstat)));
3122 		return (0);
3123 	}
3124 	return (1);
3125 }
3126 
3127 /*
3128  * Sends info for SNMP's MIB.
3129  */
3130 int
3131 ip_mroute_vif(mblk_t *mp)
3132 {
3133 	struct vifctl 	vi;
3134 	vifi_t		vifi;
3135 
3136 	mutex_enter(&numvifs_mutex);
3137 	for (vifi = 0; vifi < numvifs; vifi++) {
3138 		if (viftable[vifi].v_lcl_addr.s_addr == 0)
3139 			continue;
3140 		/*
3141 		 * No locks here, an approximation is fine.
3142 		 */
3143 		vi.vifc_vifi = vifi;
3144 		vi.vifc_flags = viftable[vifi].v_flags;
3145 		vi.vifc_threshold = viftable[vifi].v_threshold;
3146 		vi.vifc_rate_limit	= viftable[vifi].v_rate_limit;
3147 		vi.vifc_lcl_addr	= viftable[vifi].v_lcl_addr;
3148 		vi.vifc_rmt_addr	= viftable[vifi].v_rmt_addr;
3149 		vi.vifc_pkt_in		= viftable[vifi].v_pkt_in;
3150 		vi.vifc_pkt_out		= viftable[vifi].v_pkt_out;
3151 
3152 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3153 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3154 			    (size_t)sizeof (vi)));
3155 			return (0);
3156 		}
3157 	}
3158 	mutex_exit(&numvifs_mutex);
3159 	return (1);
3160 }
3161 
3162 /*
3163  * Called by ip_snmp_get to send up multicast routing table.
3164  */
3165 int
3166 ip_mroute_mrt(mblk_t *mp)
3167 {
3168 	int			i, j;
3169 	struct mfc		*rt;
3170 	struct mfcctl	mfcc;
3171 
3172 	/*
3173 	 * Make sure multicast has not been turned off.
3174 	 */
3175 	if (is_mrouter_off())
3176 		return (1);
3177 
3178 	/* Loop over all hash buckets and their chains */
3179 	for (i = 0; i < MFCTBLSIZ; i++) {
3180 		MFCB_REFHOLD(&mfctable[i]);
3181 		for (rt = mfctable[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3182 			mutex_enter(&rt->mfc_mutex);
3183 			if (rt->mfc_rte != NULL ||
3184 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3185 				mutex_exit(&rt->mfc_mutex);
3186 				continue;
3187 			}
3188 			mfcc.mfcc_origin = rt->mfc_origin;
3189 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3190 			mfcc.mfcc_parent = rt->mfc_parent;
3191 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3192 			mutex_enter(&numvifs_mutex);
3193 			for (j = 0; j < (int)numvifs; j++)
3194 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3195 			for (j = (int)numvifs; j < MAXVIFS; j++)
3196 				mfcc.mfcc_ttls[j] = 0;
3197 			mutex_exit(&numvifs_mutex);
3198 
3199 			mutex_exit(&rt->mfc_mutex);
3200 			if (!snmp_append_data(mp, (char *)&mfcc,
3201 			    sizeof (mfcc))) {
3202 				MFCB_REFRELE(&mfctable[i]);
3203 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3204 				    (size_t)sizeof (mfcc)));
3205 				return (0);
3206 			}
3207 		}
3208 		MFCB_REFRELE(&mfctable[i]);
3209 	}
3210 	return (1);
3211 }
3212