xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_mroute.c (revision 15c07adc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /* Copyright (c) 1990 Mentat Inc. */
25 
26 /*
27  * Copyright (c) 2018, Joyent, Inc.
28  */
29 
30 /*
31  * Procedures for the kernel part of DVMRP,
32  * a Distance-Vector Multicast Routing Protocol.
33  * (See RFC-1075)
34  * Written by David Waitzman, BBN Labs, August 1988.
35  * Modified by Steve Deering, Stanford, February 1989.
36  * Modified by Mark J. Steiglitz, Stanford, May, 1991
37  * Modified by Van Jacobson, LBL, January 1993
38  * Modified by Ajit Thyagarajan, PARC, August 1993
39  * Modified by Bill Fenner, PARC, April 1995
40  *
41  * MROUTING 3.5
42  */
43 
44 /*
45  * TODO
46  * - function pointer field in vif, void *vif_sendit()
47  */
48 
49 #include <sys/types.h>
50 #include <sys/stream.h>
51 #include <sys/stropts.h>
52 #include <sys/strlog.h>
53 #include <sys/systm.h>
54 #include <sys/ddi.h>
55 #include <sys/cmn_err.h>
56 #include <sys/zone.h>
57 
58 #include <sys/param.h>
59 #include <sys/socket.h>
60 #include <sys/vtrace.h>
61 #include <sys/debug.h>
62 #include <net/if.h>
63 #include <sys/sockio.h>
64 #include <netinet/in.h>
65 #include <net/if_dl.h>
66 
67 #include <inet/ipsec_impl.h>
68 #include <inet/common.h>
69 #include <inet/mi.h>
70 #include <inet/nd.h>
71 #include <inet/tunables.h>
72 #include <inet/mib2.h>
73 #include <netinet/ip6.h>
74 #include <inet/ip.h>
75 #include <inet/snmpcom.h>
76 
77 #include <netinet/igmp.h>
78 #include <netinet/igmp_var.h>
79 #include <netinet/udp.h>
80 #include <netinet/ip_mroute.h>
81 #include <inet/ip_multi.h>
82 #include <inet/ip_ire.h>
83 #include <inet/ip_ndp.h>
84 #include <inet/ip_if.h>
85 #include <inet/ipclassifier.h>
86 
87 #include <netinet/pim.h>
88 
89 
90 /*
91  * MT Design:
92  *
93  * There are three main data structures viftable, mfctable and tbftable that
94  * need to be protected against MT races.
95  *
 * viftable is a fixed length array of vif structs. There is no lock to protect
 * the whole array, instead each struct is protected by its own individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines the
 * current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use.  When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
 * from going away a refhold is put on the ipif before using it. See
 * lock_good_vif() and unlock_good_vif().
109  *
110  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
111  * of the vif struct.
112  *
113  * tbftable is also a fixed length array of tbf structs and is only accessed
114  * via v_tbf.  It is protected by its own lock tbf_lock.
115  *
116  * Lock Ordering is
117  * v_lock --> tbf_lock
 * v_lock --> ill_lock
119  *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
121  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
122  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
123  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
124  * protect the struct elements.
125  *
126  * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
131  *
132  * Locking Hierarchy:
133  * The bucket lock should be acquired before the mfc struct lock.
134  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
135  * operations on the bucket struct.
136  *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock
 * causing the stats to be approximate, not exact.
142  */
143 
144 #define	NO_VIF	MAXVIFS 	/* from mrouted, no route for src */
145 
146 /*
147  * Timeouts:
148  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
149  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
150  *	SunOS 5.x uses mfc->timeout for each mfc.
151  *	Some Unixes are limited in the number of simultaneous timeouts
152  * 	that can be run, SunOS 5.x does not have this restriction.
153  */
154 
155 /*
156  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
158  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
159  */
160 #define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
161 #define		UPCALL_EXPIRE	6	/* number of timeouts	*/
162 
/*
 * Bucket index for a (source, group) pair: the two addresses and their
 * 10- and 20-bit right shifts are XORed together and the result is reduced
 * by MFCHASHMOD().
 */
#define	MFCHASH(a, g) MFCHASHMOD((a) ^ (g) ^ \
	((a) >> 10) ^ ((g) >> 10) ^ ((a) >> 20) ^ ((g) >> 20))
168 
169 #define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/
170 
171 /* Identify PIM packet that came on a Register interface */
172 #define	PIM_REGISTER_MARKER	0xffffffff
173 
174 /* Function declarations */
175 static int	add_mfc(struct mfcctl *, ip_stack_t *);
176 static int	add_vif(struct vifctl *, conn_t *, ip_stack_t *);
177 static int	del_mfc(struct mfcctl *, ip_stack_t *);
178 static int	del_vif(vifi_t *, ip_stack_t *);
179 static void	del_vifp(struct vif *);
180 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
181 static void	expire_upcalls(void *);
182 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
183 static void	free_queue(struct mfc *);
184 static int	get_assert(uchar_t *, ip_stack_t *);
185 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
186 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
187 static int	get_version(uchar_t *);
188 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
189 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
190 		    ipaddr_t, struct mfc *);
191 static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
192 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
193 static int	register_mforward(mblk_t *, ip_recv_attr_t *);
194 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
195 static int	set_assert(int *, ip_stack_t *);
196 
197 /*
198  * Token Bucket Filter functions
199  */
200 static int  priority(struct vif *, ipha_t *);
201 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
202 static int  tbf_dq_sel(struct vif *, ipha_t *);
203 static void tbf_process_q(struct vif *);
204 static void tbf_queue(struct vif *, mblk_t *);
205 static void tbf_reprocess_q(void *);
206 static void tbf_send_packet(struct vif *, mblk_t *);
207 static void tbf_update_tokens(struct vif *);
208 static void release_mfc(struct mfcb *);
209 
210 static boolean_t is_mrouter_off(ip_stack_t *);
211 /*
212  * Encapsulation packets
213  */
214 
215 #define	ENCAP_TTL	64
216 
217 /* prototype IP hdr for encapsulated packets */
218 static ipha_t multicast_encap_iphdr = {
219 	IP_SIMPLE_HDR_VERSION,
220 	0,				/* tos */
221 	sizeof (ipha_t),		/* total length */
222 	0,				/* id */
223 	0,				/* frag offset */
224 	ENCAP_TTL, IPPROTO_ENCAP,
225 	0,				/* checksum */
226 };
227 
228 /*
229  * Rate limit for assert notification messages, in nsec.
230  */
231 #define	ASSERT_MSG_TIME		3000000000
232 
233 
/*
 * Take a reference on a vif.  Acquires and drops v_lock around the
 * increment.
 */
#define	VIF_REFHOLD(vifp) {			\
	mutex_enter(&(vifp)->v_lock);		\
	++(vifp)->v_refcnt;			\
	mutex_exit(&(vifp)->v_lock);		\
}

/*
 * Drop a reference on a vif whose v_lock is already held by the caller.
 * If this was the last reference and the vif is marked condemned, the vif
 * is handed to del_vifp() (v_lock is not released here in that case);
 * otherwise v_lock is released.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt != 0 ||				\
	    !((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		mutex_exit(&(vifp)->v_lock);			\
	} else {						\
		del_vifp(vifp);					\
	}							\
}

/*
 * Drop a reference on a vif: acquire v_lock, then proceed exactly as
 * VIF_REFRELE_LOCKED().
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	VIF_REFRELE_LOCKED(vifp);				\
}
260 
/*
 * Register a walker on an mfc hash bucket.  The ASSERT catches the
 * reference count wrapping around to zero.
 */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	++(mfcb)->mfcb_refcnt;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}

/*
 * Unregister a walker from an mfc hash bucket.  When the last walker
 * leaves a bucket marked condemned, release_mfc() is called with
 * mfcb_lock still held.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED) != 0) {	\
		release_mfc(mfcb);				\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
277 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries that are
 * marked condemned.
 * Type of service parameter to be added in the future!
 *
 *	mfcbp	pointer to the hash bucket whose chain is searched
 *	o, g	origin and group addresses, compared against s_addr
 *	rt	lvalue; set to the first matching entry, or NULL if none
 *
 * Every argument is parenthesized in the expansion so that callers may pass
 * arbitrary expressions (e.g. "&bucket" or "a | b").  "o" and "g" are
 * evaluated once per chain entry examined.
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt; \
	(rt) = NULL; \
	for (_mb_rt = (mfcbp)->mfcb_mfc; _mb_rt != NULL; \
	    _mb_rt = _mb_rt->mfc_next) { \
		if (_mb_rt->mfc_origin.s_addr == (o) && \
		    _mb_rt->mfc_mcastgrp.s_addr == (g) && \
		    _mb_rt->mfc_rte == NULL && \
		    !(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED)) { \
			(rt) = _mb_rt; \
			break; \
		} \
	} \
}
299 
300 /*
301  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
302  * are inefficient. We use gethrestime() which returns a timespec_t with
303  * sec and nsec, the resolution is machine dependent.
304  * The following 2 macros have been changed to use nsec instead of usec.
305  */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 * TV_DELTA(a, b, delta) stores (a - b), in nanoseconds, into delta.
 * Delta should be a hrtime_t (a 64-bit type).  Differences of one or two
 * seconds take the addition-only fast path; any other difference is
 * multiplied out.  The multiplication is done in 64 bits: with a plain int
 * constant it would overflow as soon as the difference reached 3 seconds.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
		    (delta) += 1000000000; \
		    /*FALLTHROUGH*/ \
		case 1: \
		    (delta) += 1000000000; \
		    break; \
		default: \
		    (delta) += (1000000000LL * xxs); \
		} \
	} \
}
328 
/* True when time a is strictly earlier than time b (tv_sec, then tv_nsec). */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
331 
332 /*
333  * Handle MRT setsockopt commands to modify the multicast routing tables.
334  */
335 int
ip_mrouter_set(int cmd,conn_t * connp,int checkonly,uchar_t * data,int datalen)336 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
337     int datalen)
338 {
339 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
340 
341 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
342 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
343 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
344 		return (EACCES);
345 	}
346 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
347 
348 	if (checkonly) {
349 		/*
350 		 * do not do operation, just pretend to - new T_CHECK
351 		 * Note: Even routines further on can probably fail but
352 		 * this T_CHECK stuff is only to please XTI so it not
353 		 * necessary to be perfect.
354 		 */
355 		switch (cmd) {
356 		case MRT_INIT:
357 		case MRT_DONE:
358 		case MRT_ADD_VIF:
359 		case MRT_DEL_VIF:
360 		case MRT_ADD_MFC:
361 		case MRT_DEL_MFC:
362 		case MRT_ASSERT:
363 			return (0);
364 		default:
365 			return (EOPNOTSUPP);
366 		}
367 	}
368 
369 	/*
370 	 * make sure no command is issued after multicast routing has been
371 	 * turned off.
372 	 */
373 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
374 		if (is_mrouter_off(ipst))
375 			return (EINVAL);
376 	}
377 
378 	switch (cmd) {
379 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
380 	case MRT_DONE:	return (ip_mrouter_done(ipst));
381 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp, ipst));
382 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, ipst));
383 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
384 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
385 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
386 	default:	   return (EOPNOTSUPP);
387 	}
388 }
389 
390 /*
391  * Handle MRT getsockopt commands
392  */
393 int
ip_mrouter_get(int cmd,conn_t * connp,uchar_t * data)394 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
395 {
396 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
397 
398 	if (connp != ipst->ips_ip_g_mrouter)
399 		return (EACCES);
400 
401 	switch (cmd) {
402 	case MRT_VERSION:	return (get_version((uchar_t *)data));
403 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
404 	default:		return (EOPNOTSUPP);
405 	}
406 }
407 
408 /*
409  * Handle ioctl commands to obtain information from the cache.
410  * Called with shared access to IP. These are read_only ioctls.
411  */
412 /* ARGSUSED */
413 int
mrt_ioctl(ipif_t * ipif,sin_t * sin,queue_t * q,mblk_t * mp,ip_ioctl_cmd_t * ipip,void * if_req)414 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
415     ip_ioctl_cmd_t *ipip, void *if_req)
416 {
417 	mblk_t	*mp1;
418 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
419 	conn_t		*connp = Q_TO_CONN(q);
420 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
421 
422 	/* Existence verified in ip_wput_nondata */
423 	mp1 = mp->b_cont->b_cont;
424 
425 	switch (iocp->ioc_cmd) {
426 	case (SIOCGETVIFCNT):
427 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
428 	case (SIOCGETSGCNT):
429 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
430 	case (SIOCGETLSGCNT):
431 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
432 	default:
433 		return (EINVAL);
434 	}
435 }
436 
437 /*
438  * Returns the packet, byte, rpf-failure count for the source, group provided.
439  */
440 static int
get_sg_cnt(struct sioc_sg_req * req,ip_stack_t * ipst)441 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
442 {
443 	struct mfc *rt;
444 	struct mfcb *mfcbp;
445 
446 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
447 	MFCB_REFHOLD(mfcbp);
448 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
449 
450 	if (rt != NULL) {
451 		mutex_enter(&rt->mfc_mutex);
452 		req->pktcnt   = rt->mfc_pkt_cnt;
453 		req->bytecnt  = rt->mfc_byte_cnt;
454 		req->wrong_if = rt->mfc_wrong_if;
455 		mutex_exit(&rt->mfc_mutex);
456 	} else
457 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
458 
459 	MFCB_REFRELE(mfcbp);
460 	return (0);
461 }
462 
463 /*
464  * Returns the packet, byte, rpf-failure count for the source, group provided.
465  * Uses larger counters and IPv6 addresses.
466  */
467 /* ARGSUSED XXX until implemented */
468 static int
get_lsg_cnt(struct sioc_lsg_req * req,ip_stack_t * ipst)469 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
470 {
471 	/* XXX TODO SIOCGETLSGCNT */
472 	return (ENXIO);
473 }
474 
475 /*
476  * Returns the input and output packet and byte counts on the vif provided.
477  */
478 static int
get_vif_cnt(struct sioc_vif_req * req,ip_stack_t * ipst)479 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
480 {
481 	vifi_t vifi = req->vifi;
482 
483 	if (vifi >= ipst->ips_numvifs)
484 		return (EINVAL);
485 
486 	/*
487 	 * No locks here, an approximation is fine.
488 	 */
489 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
490 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
491 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
492 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
493 
494 	return (0);
495 }
496 
497 static int
get_version(uchar_t * data)498 get_version(uchar_t *data)
499 {
500 	int *v = (int *)data;
501 
502 	*v = 0x0305;	/* XXX !!!! */
503 
504 	return (0);
505 }
506 
507 /*
508  * Set PIM assert processing global.
509  */
510 static int
set_assert(int * i,ip_stack_t * ipst)511 set_assert(int *i, ip_stack_t *ipst)
512 {
513 	if ((*i != 1) && (*i != 0))
514 		return (EINVAL);
515 
516 	ipst->ips_pim_assert = *i;
517 
518 	return (0);
519 }
520 
521 /*
522  * Get PIM assert processing global.
523  */
524 static int
get_assert(uchar_t * data,ip_stack_t * ipst)525 get_assert(uchar_t *data, ip_stack_t *ipst)
526 {
527 	int *i = (int *)data;
528 
529 	*i = ipst->ips_pim_assert;
530 
531 	return (0);
532 }
533 
534 /*
535  * Enable multicast routing.
536  */
537 static int
ip_mrouter_init(conn_t * connp,uchar_t * data,int datalen,ip_stack_t * ipst)538 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
539 {
540 	int	*v;
541 
542 	if (data == NULL || (datalen != sizeof (int)))
543 		return (ENOPROTOOPT);
544 
545 	v = (int *)data;
546 	if (*v != 1)
547 		return (ENOPROTOOPT);
548 
549 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
550 	if (ipst->ips_ip_g_mrouter != NULL) {
551 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
552 		return (EADDRINUSE);
553 	}
554 
555 	/*
556 	 * MRT_INIT should only be allowed for RAW sockets, but we double
557 	 * check.
558 	 */
559 	if (!IPCL_IS_RAWIP(connp)) {
560 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
561 		return (EINVAL);
562 	}
563 
564 	ipst->ips_ip_g_mrouter = connp;
565 	connp->conn_multi_router = 1;
566 	/* In order for tunnels to work we have to turn ip_g_forward on */
567 	if (!WE_ARE_FORWARDING(ipst)) {
568 		if (ipst->ips_ip_mrtdebug > 1) {
569 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
570 			    "ip_mrouter_init: turning on forwarding");
571 		}
572 		ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
573 		ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
574 	}
575 
576 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
577 	return (0);
578 }
579 
580 void
ip_mrouter_stack_init(ip_stack_t * ipst)581 ip_mrouter_stack_init(ip_stack_t *ipst)
582 {
583 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
584 
585 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
586 	    KM_SLEEP);
587 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
588 	/*
589 	 * mfctable:
590 	 * Includes all mfcs, including waiting upcalls.
591 	 * Multiple mfcs per bucket.
592 	 */
593 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
594 	    KM_SLEEP);
595 	/*
596 	 * Define the token bucket filter structures.
597 	 * tbftable -> each vif has one of these for storing info.
598 	 */
599 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
600 
601 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
602 
603 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
604 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
605 }
606 
607 /*
608  * Disable multicast routing.
609  * Didn't use global timeout_val (BSD version), instead check the mfctable.
610  */
611 int
ip_mrouter_done(ip_stack_t * ipst)612 ip_mrouter_done(ip_stack_t *ipst)
613 {
614 	conn_t		*mrouter;
615 	vifi_t 		vifi;
616 	struct mfc	*mfc_rt;
617 	int		i;
618 
619 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
620 	if (ipst->ips_ip_g_mrouter == NULL) {
621 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
622 		return (EINVAL);
623 	}
624 
625 	mrouter = ipst->ips_ip_g_mrouter;
626 
627 	if (ipst->ips_saved_ip_forwarding != -1) {
628 		if (ipst->ips_ip_mrtdebug > 1) {
629 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
630 			    "ip_mrouter_done: turning off forwarding");
631 		}
632 		ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
633 		ipst->ips_saved_ip_forwarding = -1;
634 	}
635 
636 	/*
637 	 * Always clear cache when vifs change.
638 	 * No need to get ipst->ips_last_encap_lock since we are running as
639 	 * a writer.
640 	 */
641 	mutex_enter(&ipst->ips_last_encap_lock);
642 	ipst->ips_last_encap_src = 0;
643 	ipst->ips_last_encap_vif = NULL;
644 	mutex_exit(&ipst->ips_last_encap_lock);
645 	mrouter->conn_multi_router = 0;
646 
647 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
648 
649 	/*
650 	 * For each phyint in use,
651 	 * disable promiscuous reception of all IP multicasts.
652 	 */
653 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
654 		struct vif *vifp = ipst->ips_vifs + vifi;
655 
656 		mutex_enter(&vifp->v_lock);
657 		/*
658 		 * if the vif is active mark it condemned.
659 		 */
660 		if (vifp->v_marks & VIF_MARK_GOOD) {
661 			ASSERT(vifp->v_ipif != NULL);
662 			ipif_refhold(vifp->v_ipif);
663 			/* Phyint only */
664 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
665 				ipif_t *ipif = vifp->v_ipif;
666 				ilm_t *ilm = vifp->v_ilm;
667 
668 				vifp->v_ilm = NULL;
669 				vifp->v_marks &= ~VIF_MARK_GOOD;
670 				vifp->v_marks |= VIF_MARK_CONDEMNED;
671 
672 				mutex_exit(&(vifp)->v_lock);
673 				if (ilm != NULL) {
674 					ill_t *ill = ipif->ipif_ill;
675 
676 					(void) ip_delmulti(ilm);
677 					ASSERT(ill->ill_mrouter_cnt > 0);
678 					atomic_dec_32(&ill->ill_mrouter_cnt);
679 				}
680 				mutex_enter(&vifp->v_lock);
681 			}
682 			ipif_refrele(vifp->v_ipif);
683 			/*
684 			 * decreases the refcnt added in add_vif.
685 			 * and release v_lock.
686 			 */
687 			VIF_REFRELE_LOCKED(vifp);
688 		} else {
689 			mutex_exit(&vifp->v_lock);
690 			continue;
691 		}
692 	}
693 
694 	mutex_enter(&ipst->ips_numvifs_mutex);
695 	ipst->ips_numvifs = 0;
696 	ipst->ips_pim_assert = 0;
697 	ipst->ips_reg_vif_num = ALL_VIFS;
698 	mutex_exit(&ipst->ips_numvifs_mutex);
699 
700 	/*
701 	 * Free upcall msgs.
702 	 * Go through mfctable and stop any outstanding upcall
703 	 * timeouts remaining on mfcs.
704 	 */
705 	for (i = 0; i < MFCTBLSIZ; i++) {
706 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
707 		ipst->ips_mfcs[i].mfcb_refcnt++;
708 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
709 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
710 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
711 		while (mfc_rt) {
712 			/* Free upcalls */
713 			mutex_enter(&mfc_rt->mfc_mutex);
714 			if (mfc_rt->mfc_rte != NULL) {
715 				if (mfc_rt->mfc_timeout_id != 0) {
716 					/*
717 					 * OK to drop the lock as we have
718 					 * a refcnt on the bucket. timeout
719 					 * can fire but it will see that
720 					 * mfc_timeout_id == 0 and not do
721 					 * anything. see expire_upcalls().
722 					 */
723 					mfc_rt->mfc_timeout_id = 0;
724 					mutex_exit(&mfc_rt->mfc_mutex);
725 					(void) untimeout(
726 					    mfc_rt->mfc_timeout_id);
727 					mfc_rt->mfc_timeout_id = 0;
728 					mutex_enter(&mfc_rt->mfc_mutex);
729 
730 					/*
731 					 * all queued upcall packets
732 					 * and mblk will be freed in
733 					 * release_mfc().
734 					 */
735 				}
736 			}
737 
738 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
739 
740 			mutex_exit(&mfc_rt->mfc_mutex);
741 			mfc_rt = mfc_rt->mfc_next;
742 		}
743 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
744 	}
745 
746 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
747 	ipst->ips_ip_g_mrouter = NULL;
748 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
749 	return (0);
750 }
751 
752 void
ip_mrouter_stack_destroy(ip_stack_t * ipst)753 ip_mrouter_stack_destroy(ip_stack_t *ipst)
754 {
755 	struct mfcb *mfcbp;
756 	struct mfc  *rt;
757 	int i;
758 
759 	for (i = 0; i < MFCTBLSIZ; i++) {
760 		mfcbp = &ipst->ips_mfcs[i];
761 
762 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
763 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
764 			    i);
765 
766 			mfcbp->mfcb_mfc = rt->mfc_next;
767 			free_queue(rt);
768 			mi_free(rt);
769 		}
770 	}
771 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
772 	ipst->ips_vifs = NULL;
773 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
774 	ipst->ips_mrtstat = NULL;
775 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
776 	ipst->ips_mfcs = NULL;
777 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
778 	ipst->ips_tbfs = NULL;
779 
780 	mutex_destroy(&ipst->ips_last_encap_lock);
781 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
782 }
783 
784 static boolean_t
is_mrouter_off(ip_stack_t * ipst)785 is_mrouter_off(ip_stack_t *ipst)
786 {
787 	conn_t	*mrouter;
788 
789 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
790 	if (ipst->ips_ip_g_mrouter == NULL) {
791 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
792 		return (B_TRUE);
793 	}
794 
795 	mrouter = ipst->ips_ip_g_mrouter;
796 	if (mrouter->conn_multi_router == 0) {
797 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
798 		return (B_TRUE);
799 	}
800 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
801 	return (B_FALSE);
802 }
803 
804 static void
unlock_good_vif(struct vif * vifp)805 unlock_good_vif(struct vif *vifp)
806 {
807 	ASSERT(vifp->v_ipif != NULL);
808 	ipif_refrele(vifp->v_ipif);
809 	VIF_REFRELE(vifp);
810 }
811 
812 static boolean_t
lock_good_vif(struct vif * vifp)813 lock_good_vif(struct vif *vifp)
814 {
815 	mutex_enter(&vifp->v_lock);
816 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
817 		mutex_exit(&vifp->v_lock);
818 		return (B_FALSE);
819 	}
820 
821 	ASSERT(vifp->v_ipif != NULL);
822 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
823 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
824 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
825 		mutex_exit(&vifp->v_lock);
826 		return (B_FALSE);
827 	}
828 	ipif_refhold_locked(vifp->v_ipif);
829 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
830 	vifp->v_refcnt++;
831 	mutex_exit(&vifp->v_lock);
832 	return (B_TRUE);
833 }
834 
835 /*
836  * Add a vif to the vif table.
837  */
838 static int
add_vif(struct vifctl * vifcp,conn_t * connp,ip_stack_t * ipst)839 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
840 {
841 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
842 	ipif_t		*ipif;
843 	int		error = 0;
844 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
845 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
846 	ilm_t		*ilm;
847 	ill_t		*ill;
848 
849 	ASSERT(connp != NULL);
850 
851 	if (vifcp->vifc_vifi >= MAXVIFS)
852 		return (EINVAL);
853 
854 	if (is_mrouter_off(ipst))
855 		return (EINVAL);
856 
857 	mutex_enter(&vifp->v_lock);
858 	/*
859 	 * Viftable entry should be 0.
860 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
861 	 * initialized.
862 	 *
863 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
864 	 * request while the delete is in progress, mrouted only sends add
865 	 * requests when a new interface is added and the new interface cannot
866 	 * have the same vifi as an existing interface. We make sure that
867 	 * ill_delete will block till the vif is deleted by adding a refcnt
868 	 * to ipif in del_vif().
869 	 */
870 	if (vifp->v_lcl_addr.s_addr != 0 ||
871 	    vifp->v_marks != 0 ||
872 	    vifp->v_refcnt != 0) {
873 		mutex_exit(&vifp->v_lock);
874 		return (EADDRINUSE);
875 	}
876 
877 	/* Incoming vif should not be 0 */
878 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
879 		mutex_exit(&vifp->v_lock);
880 		return (EINVAL);
881 	}
882 
883 	vifp->v_refcnt++;
884 	mutex_exit(&vifp->v_lock);
885 	/* Find the interface with the local address */
886 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
887 	    IPCL_ZONEID(connp), ipst);
888 	if (ipif == NULL) {
889 		VIF_REFRELE(vifp);
890 		return (EADDRNOTAVAIL);
891 	}
892 
893 	if (ipst->ips_ip_mrtdebug > 1) {
894 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
895 		    "add_vif: src 0x%x enter",
896 		    vifcp->vifc_lcl_addr.s_addr);
897 	}
898 
899 	mutex_enter(&vifp->v_lock);
900 	/*
901 	 * Always clear cache when vifs change.
902 	 * Needed to ensure that src isn't left over from before vif was added.
903 	 * No need to get last_encap_lock, since we are running as a writer.
904 	 */
905 
906 	mutex_enter(&ipst->ips_last_encap_lock);
907 	ipst->ips_last_encap_src = 0;
908 	ipst->ips_last_encap_vif = NULL;
909 	mutex_exit(&ipst->ips_last_encap_lock);
910 
911 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
912 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
913 			cmn_err(CE_WARN,
914 			    "add_vif: source route tunnels not supported\n");
915 			VIF_REFRELE_LOCKED(vifp);
916 			ipif_refrele(ipif);
917 			return (EOPNOTSUPP);
918 		}
919 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
920 
921 	} else {
922 		/* Phyint or Register vif */
923 		if (vifcp->vifc_flags & VIFF_REGISTER) {
924 			/*
925 			 * Note: Since all IPPROTO_IP level options (including
926 			 * MRT_ADD_VIF) are done exclusively via
927 			 * ip_optmgmt_writer(), a lock is not necessary to
928 			 * protect reg_vif_num.
929 			 */
930 			mutex_enter(&ipst->ips_numvifs_mutex);
931 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
932 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
933 				mutex_exit(&ipst->ips_numvifs_mutex);
934 			} else {
935 				mutex_exit(&ipst->ips_numvifs_mutex);
936 				VIF_REFRELE_LOCKED(vifp);
937 				ipif_refrele(ipif);
938 				return (EADDRINUSE);
939 			}
940 		}
941 
942 		/* Make sure the interface supports multicast */
943 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
944 			VIF_REFRELE_LOCKED(vifp);
945 			ipif_refrele(ipif);
946 			if (vifcp->vifc_flags & VIFF_REGISTER) {
947 				mutex_enter(&ipst->ips_numvifs_mutex);
948 				ipst->ips_reg_vif_num = ALL_VIFS;
949 				mutex_exit(&ipst->ips_numvifs_mutex);
950 			}
951 			return (EOPNOTSUPP);
952 		}
953 		/* Enable promiscuous reception of all IP mcasts from the if */
954 		mutex_exit(&vifp->v_lock);
955 
956 		ill = ipif->ipif_ill;
957 		if (IS_UNDER_IPMP(ill))
958 			ill = ipmp_ill_hold_ipmp_ill(ill);
959 
960 		if (ill == NULL) {
961 			ilm = NULL;
962 		} else {
963 			ilm = ip_addmulti(&ipv6_all_zeros, ill,
964 			    ipif->ipif_zoneid, &error);
965 			if (ilm != NULL)
966 				atomic_inc_32(&ill->ill_mrouter_cnt);
967 			if (IS_UNDER_IPMP(ipif->ipif_ill)) {
968 				ill_refrele(ill);
969 				ill = ipif->ipif_ill;
970 			}
971 		}
972 
973 		mutex_enter(&vifp->v_lock);
974 		/*
975 		 * since we released the lock lets make sure that
976 		 * ip_mrouter_done() has not been called.
977 		 */
978 		if (ilm == NULL || is_mrouter_off(ipst)) {
979 			if (ilm != NULL) {
980 				(void) ip_delmulti(ilm);
981 				ASSERT(ill->ill_mrouter_cnt > 0);
982 				atomic_dec_32(&ill->ill_mrouter_cnt);
983 			}
984 			if (vifcp->vifc_flags & VIFF_REGISTER) {
985 				mutex_enter(&ipst->ips_numvifs_mutex);
986 				ipst->ips_reg_vif_num = ALL_VIFS;
987 				mutex_exit(&ipst->ips_numvifs_mutex);
988 			}
989 			VIF_REFRELE_LOCKED(vifp);
990 			ipif_refrele(ipif);
991 			return (error?error:EINVAL);
992 		}
993 		vifp->v_ilm = ilm;
994 	}
995 	/* Define parameters for the tbf structure */
996 	vifp->v_tbf = v_tbf;
997 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
998 	vifp->v_tbf->tbf_n_tok = 0;
999 	vifp->v_tbf->tbf_q_len = 0;
1000 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1001 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1002 
1003 	vifp->v_flags = vifcp->vifc_flags;
1004 	vifp->v_threshold = vifcp->vifc_threshold;
1005 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1006 	vifp->v_ipif = ipif;
1007 	ipif_refrele(ipif);
1008 	/* Scaling up here, allows division by 1024 in critical code.	*/
1009 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1010 	vifp->v_timeout_id = 0;
1011 	/* initialize per vif pkt counters */
1012 	vifp->v_pkt_in = 0;
1013 	vifp->v_pkt_out = 0;
1014 	vifp->v_bytes_in = 0;
1015 	vifp->v_bytes_out = 0;
1016 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1017 
1018 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1019 	mutex_enter(&ipst->ips_numvifs_mutex);
1020 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1021 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1022 	mutex_exit(&ipst->ips_numvifs_mutex);
1023 
1024 	if (ipst->ips_ip_mrtdebug > 1) {
1025 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1026 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1027 		    vifcp->vifc_vifi,
1028 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1029 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1030 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1031 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1032 	}
1033 
1034 	vifp->v_marks = VIF_MARK_GOOD;
1035 	mutex_exit(&vifp->v_lock);
1036 	return (0);
1037 }
1038 
1039 
1040 /* Delete a vif from the vif table. */
1041 static void
del_vifp(struct vif * vifp)1042 del_vifp(struct vif *vifp)
1043 {
1044 	struct tbf	*t = vifp->v_tbf;
1045 	mblk_t  *mp0;
1046 	vifi_t  vifi;
1047 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1048 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1049 
1050 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1051 	ASSERT(t != NULL);
1052 
1053 	if (ipst->ips_ip_mrtdebug > 1) {
1054 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1055 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1056 	}
1057 
1058 	if (vifp->v_timeout_id != 0) {
1059 		(void) untimeout(vifp->v_timeout_id);
1060 		vifp->v_timeout_id = 0;
1061 	}
1062 
1063 	/*
1064 	 * Free packets queued at the interface.
1065 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1066 	 */
1067 	mutex_enter(&t->tbf_lock);
1068 	while (t->tbf_q != NULL) {
1069 		mp0 = t->tbf_q;
1070 		t->tbf_q = t->tbf_q->b_next;
1071 		mp0->b_prev = mp0->b_next = NULL;
1072 		freemsg(mp0);
1073 	}
1074 	mutex_exit(&t->tbf_lock);
1075 
1076 	/*
1077 	 * Always clear cache when vifs change.
1078 	 * No need to get last_encap_lock since we are running as a writer.
1079 	 */
1080 	mutex_enter(&ipst->ips_last_encap_lock);
1081 	if (vifp == ipst->ips_last_encap_vif) {
1082 		ipst->ips_last_encap_vif = NULL;
1083 		ipst->ips_last_encap_src = 0;
1084 	}
1085 	mutex_exit(&ipst->ips_last_encap_lock);
1086 
1087 	mutex_destroy(&t->tbf_lock);
1088 
1089 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1090 
1091 	/* Adjust numvifs down */
1092 	mutex_enter(&ipst->ips_numvifs_mutex);
1093 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1094 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1095 			break;
1096 	ipst->ips_numvifs = vifi;
1097 	mutex_exit(&ipst->ips_numvifs_mutex);
1098 
1099 	bzero(vifp, sizeof (*vifp));
1100 }
1101 
1102 static int
del_vif(vifi_t * vifip,ip_stack_t * ipst)1103 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1104 {
1105 	struct vif	*vifp = ipst->ips_vifs + *vifip;
1106 
1107 	if (*vifip >= ipst->ips_numvifs)
1108 		return (EINVAL);
1109 
1110 	mutex_enter(&vifp->v_lock);
1111 	/*
1112 	 * Not initialized
1113 	 * Here we are not looking at the vif that is being initialized
1114 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1115 	 */
1116 	if (vifp->v_lcl_addr.s_addr == 0 ||
1117 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1118 		mutex_exit(&vifp->v_lock);
1119 		return (EADDRNOTAVAIL);
1120 	}
1121 
1122 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1123 	vifp->v_marks &= ~VIF_MARK_GOOD;
1124 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1125 
1126 	/* Phyint only */
1127 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1128 		ipif_t *ipif = vifp->v_ipif;
1129 		ilm_t *ilm = vifp->v_ilm;
1130 
1131 		vifp->v_ilm = NULL;
1132 
1133 		ASSERT(ipif != NULL);
1134 		/*
1135 		 * should be OK to drop the lock as we
1136 		 * have marked this as CONDEMNED.
1137 		 */
1138 		mutex_exit(&(vifp)->v_lock);
1139 		if (ilm != NULL) {
1140 			(void) ip_delmulti(ilm);
1141 			ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1142 			atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1143 		}
1144 		mutex_enter(&(vifp)->v_lock);
1145 	}
1146 
1147 	if (vifp->v_flags & VIFF_REGISTER) {
1148 		mutex_enter(&ipst->ips_numvifs_mutex);
1149 		ipst->ips_reg_vif_num = ALL_VIFS;
1150 		mutex_exit(&ipst->ips_numvifs_mutex);
1151 	}
1152 
1153 	/*
1154 	 * decreases the refcnt added in add_vif.
1155 	 */
1156 	VIF_REFRELE_LOCKED(vifp);
1157 	return (0);
1158 }
1159 
1160 /*
1161  * Add an mfc entry.
1162  */
1163 static int
add_mfc(struct mfcctl * mfccp,ip_stack_t * ipst)1164 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1165 {
1166 	struct mfc *rt;
1167 	struct rtdetq *rte;
1168 	ushort_t nstl;
1169 	int i;
1170 	struct mfcb *mfcbp;
1171 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1172 
1173 	/*
1174 	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
1175 	 * did not have a real route for pkt.
1176 	 * We want this pkt without rt installed in the mfctable to prevent
1177 	 * multiiple tries, so go ahead and put it in mfctable, it will
1178 	 * be discarded later in ip_mdq() because the child is NULL.
1179 	 */
1180 
1181 	/* Error checking, out of bounds? */
1182 	if (mfccp->mfcc_parent > MAXVIFS) {
1183 		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
1184 		    (int)mfccp->mfcc_parent));
1185 		return (EINVAL);
1186 	}
1187 
1188 	if ((mfccp->mfcc_parent != NO_VIF) &&
1189 	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
1190 		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
1191 		    (int)mfccp->mfcc_parent));
1192 		return (EINVAL);
1193 	}
1194 
1195 	if (is_mrouter_off(ipst)) {
1196 		return (EINVAL);
1197 	}
1198 
1199 	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
1200 	    mfccp->mfcc_mcastgrp.s_addr)];
1201 	MFCB_REFHOLD(mfcbp);
1202 	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
1203 	    mfccp->mfcc_mcastgrp.s_addr, rt);
1204 
1205 	/* If an entry already exists, just update the fields */
1206 	if (rt) {
1207 		if (ipst->ips_ip_mrtdebug > 1) {
1208 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1209 			    "add_mfc: update o %x grp %x parent %x",
1210 			    ntohl(mfccp->mfcc_origin.s_addr),
1211 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1212 			    mfccp->mfcc_parent);
1213 		}
1214 		mutex_enter(&rt->mfc_mutex);
1215 		rt->mfc_parent = mfccp->mfcc_parent;
1216 
1217 		mutex_enter(&ipst->ips_numvifs_mutex);
1218 		for (i = 0; i < (int)ipst->ips_numvifs; i++)
1219 			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1220 		mutex_exit(&ipst->ips_numvifs_mutex);
1221 		mutex_exit(&rt->mfc_mutex);
1222 
1223 		MFCB_REFRELE(mfcbp);
1224 		return (0);
1225 	}
1226 
1227 	/*
1228 	 * Find the entry for which the upcall was made and update.
1229 	 */
1230 	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
1231 		mutex_enter(&rt->mfc_mutex);
1232 		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
1233 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
1234 		    (rt->mfc_rte != NULL) &&
1235 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1236 			if (nstl++ != 0)
1237 				cmn_err(CE_WARN,
1238 				    "add_mfc: %s o %x g %x p %x",
1239 				    "multiple kernel entries",
1240 				    ntohl(mfccp->mfcc_origin.s_addr),
1241 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1242 				    mfccp->mfcc_parent);
1243 
1244 			if (ipst->ips_ip_mrtdebug > 1) {
1245 				(void) mi_strlog(mrouter->conn_rq, 1,
1246 				    SL_TRACE,
1247 				    "add_mfc: o %x g %x p %x",
1248 				    ntohl(mfccp->mfcc_origin.s_addr),
1249 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1250 				    mfccp->mfcc_parent);
1251 			}
1252 			fill_route(rt, mfccp, ipst);
1253 
1254 			/*
1255 			 * Prevent cleanup of cache entry.
1256 			 * Timer starts in ip_mforward.
1257 			 */
1258 			if (rt->mfc_timeout_id != 0) {
1259 				timeout_id_t id;
1260 				id = rt->mfc_timeout_id;
1261 				/*
1262 				 * setting id to zero will avoid this
1263 				 * entry from being cleaned up in
1264 				 * expire_up_calls().
1265 				 */
1266 				rt->mfc_timeout_id = 0;
1267 				/*
1268 				 * dropping the lock is fine as we
1269 				 * have a refhold on the bucket.
1270 				 * so mfc cannot be freed.
1271 				 * The timeout can fire but it will see
1272 				 * that mfc_timeout_id == 0 and not cleanup.
1273 				 */
1274 				mutex_exit(&rt->mfc_mutex);
1275 				(void) untimeout(id);
1276 				mutex_enter(&rt->mfc_mutex);
1277 			}
1278 
1279 			/*
1280 			 * Send all pkts that are queued waiting for the upcall.
1281 			 * ip_mdq param tun set to 0 -
1282 			 * the return value of ip_mdq() isn't used here,
1283 			 * so value we send doesn't matter.
1284 			 */
1285 			while (rt->mfc_rte != NULL) {
1286 				rte = rt->mfc_rte;
1287 				rt->mfc_rte = rte->rte_next;
1288 				mutex_exit(&rt->mfc_mutex);
1289 				(void) ip_mdq(rte->mp, (ipha_t *)
1290 				    rte->mp->b_rptr, rte->ill, 0, rt);
1291 				freemsg(rte->mp);
1292 				mi_free((char *)rte);
1293 				mutex_enter(&rt->mfc_mutex);
1294 			}
1295 		}
1296 		mutex_exit(&rt->mfc_mutex);
1297 	}
1298 
1299 
1300 	/*
1301 	 * It is possible that an entry is being inserted without an upcall
1302 	 */
1303 	if (nstl == 0) {
1304 		mutex_enter(&(mfcbp->mfcb_lock));
1305 		if (ipst->ips_ip_mrtdebug > 1) {
1306 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1307 			    "add_mfc: no upcall o %x g %x p %x",
1308 			    ntohl(mfccp->mfcc_origin.s_addr),
1309 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1310 			    mfccp->mfcc_parent);
1311 		}
1312 		if (is_mrouter_off(ipst)) {
1313 			mutex_exit(&mfcbp->mfcb_lock);
1314 			MFCB_REFRELE(mfcbp);
1315 			return (EINVAL);
1316 		}
1317 
1318 		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
1319 
1320 			mutex_enter(&rt->mfc_mutex);
1321 			if ((rt->mfc_origin.s_addr ==
1322 			    mfccp->mfcc_origin.s_addr) &&
1323 			    (rt->mfc_mcastgrp.s_addr ==
1324 			    mfccp->mfcc_mcastgrp.s_addr) &&
1325 			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
1326 				fill_route(rt, mfccp, ipst);
1327 				mutex_exit(&rt->mfc_mutex);
1328 				break;
1329 			}
1330 			mutex_exit(&rt->mfc_mutex);
1331 		}
1332 
1333 		/* No upcall, so make a new entry into mfctable */
1334 		if (rt == NULL) {
1335 			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1336 			if (rt == NULL) {
1337 				ip1dbg(("add_mfc: out of memory\n"));
1338 				mutex_exit(&mfcbp->mfcb_lock);
1339 				MFCB_REFRELE(mfcbp);
1340 				return (ENOBUFS);
1341 			}
1342 
1343 			/* Insert new entry at head of hash chain */
1344 			mutex_enter(&rt->mfc_mutex);
1345 			fill_route(rt, mfccp, ipst);
1346 
1347 			/* Link into table */
1348 			rt->mfc_next   = mfcbp->mfcb_mfc;
1349 			mfcbp->mfcb_mfc = rt;
1350 			mutex_exit(&rt->mfc_mutex);
1351 		}
1352 		mutex_exit(&mfcbp->mfcb_lock);
1353 	}
1354 
1355 	MFCB_REFRELE(mfcbp);
1356 	return (0);
1357 }
1358 
1359 /*
1360  * Fills in mfc structure from mrouted mfcctl.
1361  */
1362 static void
fill_route(struct mfc * rt,struct mfcctl * mfccp,ip_stack_t * ipst)1363 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1364 {
1365 	int i;
1366 
1367 	rt->mfc_origin		= mfccp->mfcc_origin;
1368 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1369 	rt->mfc_parent		= mfccp->mfcc_parent;
1370 	mutex_enter(&ipst->ips_numvifs_mutex);
1371 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1372 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1373 	}
1374 	mutex_exit(&ipst->ips_numvifs_mutex);
1375 	/* Initialize pkt counters per src-grp */
1376 	rt->mfc_pkt_cnt	= 0;
1377 	rt->mfc_byte_cnt	= 0;
1378 	rt->mfc_wrong_if	= 0;
1379 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1380 
1381 }
1382 
1383 static void
free_queue(struct mfc * mfcp)1384 free_queue(struct mfc *mfcp)
1385 {
1386 	struct rtdetq *rte0;
1387 
1388 	/*
1389 	 * Drop all queued upcall packets.
1390 	 * Free the mbuf with the pkt.
1391 	 */
1392 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1393 		mfcp->mfc_rte = rte0->rte_next;
1394 		freemsg(rte0->mp);
1395 		mi_free((char *)rte0);
1396 	}
1397 }
1398 /*
1399  * go thorugh the hash bucket and free all the entries marked condemned.
1400  */
1401 void
release_mfc(struct mfcb * mfcbp)1402 release_mfc(struct mfcb *mfcbp)
1403 {
1404 	struct mfc *current_mfcp;
1405 	struct mfc *prev_mfcp;
1406 
1407 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1408 
1409 	while (current_mfcp != NULL) {
1410 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1411 			if (current_mfcp == mfcbp->mfcb_mfc) {
1412 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1413 				free_queue(current_mfcp);
1414 				mi_free(current_mfcp);
1415 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1416 				continue;
1417 			}
1418 			ASSERT(prev_mfcp != NULL);
1419 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1420 			free_queue(current_mfcp);
1421 			mi_free(current_mfcp);
1422 			current_mfcp = NULL;
1423 		} else {
1424 			prev_mfcp = current_mfcp;
1425 		}
1426 
1427 		current_mfcp = prev_mfcp->mfc_next;
1428 
1429 	}
1430 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1431 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1432 }
1433 
1434 /*
1435  * Delete an mfc entry.
1436  */
1437 static int
del_mfc(struct mfcctl * mfccp,ip_stack_t * ipst)1438 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1439 {
1440 	struct in_addr	origin;
1441 	struct in_addr	mcastgrp;
1442 	struct mfc 	*rt;
1443 	uint_t		hash;
1444 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1445 
1446 	origin = mfccp->mfcc_origin;
1447 	mcastgrp = mfccp->mfcc_mcastgrp;
1448 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1449 
1450 	if (ipst->ips_ip_mrtdebug > 1) {
1451 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1452 		    "del_mfc: o %x g %x",
1453 		    ntohl(origin.s_addr),
1454 		    ntohl(mcastgrp.s_addr));
1455 	}
1456 
1457 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1458 
1459 	/* Find mfc in mfctable, finds only entries without upcalls */
1460 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1461 		mutex_enter(&rt->mfc_mutex);
1462 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1463 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1464 		    rt->mfc_rte == NULL &&
1465 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1466 			break;
1467 		mutex_exit(&rt->mfc_mutex);
1468 	}
1469 
1470 	/*
1471 	 * Return if there was an upcall (mfc_rte != NULL,
1472 	 * or rt not in mfctable.
1473 	 */
1474 	if (rt == NULL) {
1475 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1476 		return (EADDRNOTAVAIL);
1477 	}
1478 
1479 
1480 	/*
1481 	 * no need to hold lock as we have a reference.
1482 	 */
1483 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1484 	/* error checking */
1485 	if (rt->mfc_timeout_id != 0) {
1486 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1487 		/*
1488 		 * Its ok to drop the lock,  the struct cannot be freed
1489 		 * since we have a ref on the hash bucket.
1490 		 */
1491 		rt->mfc_timeout_id = 0;
1492 		mutex_exit(&rt->mfc_mutex);
1493 		(void) untimeout(rt->mfc_timeout_id);
1494 		mutex_enter(&rt->mfc_mutex);
1495 	}
1496 
1497 	ASSERT(rt->mfc_rte == NULL);
1498 
1499 
1500 	/*
1501 	 * Delete the entry from the cache
1502 	 */
1503 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1504 	mutex_exit(&rt->mfc_mutex);
1505 
1506 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1507 
1508 	return (0);
1509 }
1510 
1511 #define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
1512 
1513 /*
1514  * IP multicast forwarding function. This function assumes that the packet
1515  * pointed to by ipha has arrived on (or is about to be sent to) the interface
1516  * pointed to by "ill", and the packet is to be relayed to other networks
1517  * that have members of the packet's destination IP multicast group.
1518  *
1519  * The packet is returned unscathed to the caller, unless it is
1520  * erroneous, in which case a -1 value tells the caller (IP)
1521  * to discard it.
1522  *
1523  * Unlike BSD, SunOS 5.x needs to return to IP info about
1524  * whether pkt came in thru a tunnel, so it can be discarded, unless
1525  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
1526  * to be delivered.
1527  * Return values are 0 - pkt is okay and phyint
1528  *		    -1 - pkt is malformed and to be tossed
1529  *                   1 - pkt came in on tunnel
1530  */
/*
 * Parameters as used below:
 *   mp  - the packet; mp->b_rptr is read as the IP header.
 *   ira - receive attributes: ira_ill/ira_rill give the receiving ill,
 *         ira_flags is tested for IRAF_PIM_REGISTER and
 *         IRAF_MROUTE_TUNNEL_SET, and ira_mroute_tunnel supplies the
 *         tunnel source when the latter flag is set.
 *
 * -1 is also returned when the mrouter is off, when a source-routed
 * tunnel packet is seen, and when resources for queueing an upcall
 * cannot be obtained or the upcall queue is full.
 */
1531 int
ip_mforward(mblk_t * mp,ip_recv_attr_t * ira)1532 ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
1533 {
1534 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
1535 	ill_t		*ill = ira->ira_ill;
1536 	struct mfc 	*rt;
1537 	ipaddr_t	src, dst, tunnel_src = 0;
	/*
	 * Running count of source-routed packets seen; used only to limit
	 * how often the warning below is printed.  It is a function-static
	 * updated without any lock.
	 */
1538 	static int	srctun = 0;
1539 	vifi_t		vifi;
1540 	boolean_t	pim_reg_packet = B_FALSE;
1541 	struct mfcb	*mfcbp;
1542 	ip_stack_t	*ipst = ill->ill_ipst;
1543 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	/* Saved so ira_rill can be restored after the upcall to mrouted. */
1544 	ill_t		*rill = ira->ira_rill;
1545 
1546 	ASSERT(ira->ira_pktlen == msgdsize(mp));
1547 
1548 	if (ipst->ips_ip_mrtdebug > 1) {
1549 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1550 		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
1551 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1552 		    ill->ill_name);
1553 	}
1554 
1555 	dst = ipha->ipha_dst;
	/*
	 * Classify the arrival path.  A PIM register packet takes
	 * precedence; tunnel_src stays 0 unless the tunnel flag is set.
	 */
1556 	if (ira->ira_flags & IRAF_PIM_REGISTER)
1557 		pim_reg_packet = B_TRUE;
1558 	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
1559 		tunnel_src = ira->ira_mroute_tunnel;
1560 
1561 	/*
1562 	 * Don't forward a packet with time-to-live of zero or one,
1563 	 * or a packet destined to a local-only group.
1564 	 */
1565 	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
1566 	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
1567 		if (ipst->ips_ip_mrtdebug > 1) {
1568 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1569 			    "ip_mforward: not forwarded ttl %d,"
1570 			    " dst 0x%x ill %s",
1571 			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
1572 		}
1573 		if (tunnel_src != 0)
1574 			return (1);
1575 		else
1576 			return (0);
1577 	}
1578 
1579 	if ((tunnel_src != 0) || pim_reg_packet) {
1580 		/*
1581 		 * Packet arrived over an encapsulated tunnel or via a PIM
1582 		 * register message.
1583 		 */
1584 		if (ipst->ips_ip_mrtdebug > 1) {
1585 			if (tunnel_src != 0) {
1586 				(void) mi_strlog(mrouter->conn_rq, 1,
1587 				    SL_TRACE,
1588 				    "ip_mforward: ill %s arrived via ENCAP TUN",
1589 				    ill->ill_name);
1590 			} else if (pim_reg_packet) {
1591 				(void) mi_strlog(mrouter->conn_rq, 1,
1592 				    SL_TRACE,
1593 				    "ip_mforward: ill %s arrived via"
1594 				    "  REGISTER VIF",
1595 				    ill->ill_name);
1596 			}
1597 		}
	/*
	 * The test below treats the packet as a plain phyint arrival when
	 * the header length (in 32-bit words) is too short to hold
	 * TUNNEL_LEN bytes of options, or when the byte at offset 1 past
	 * the simple header is not IPOPT_LSRR.
	 */
1598 	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
1599 	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
1600 	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
1601 		/* Packet arrived via a physical interface. */
1602 		if (ipst->ips_ip_mrtdebug > 1) {
1603 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1604 			    "ip_mforward: ill %s arrived via PHYINT",
1605 			    ill->ill_name);
1606 		}
1607 
1608 	} else {
1609 		/*
1610 		 * Packet arrived through a SRCRT tunnel.
1611 		 * Source-route tunnels are no longer supported.
1612 		 * Error message printed every 1000 times.
1613 		 */
1614 		if ((srctun++ % 1000) == 0) {
1615 			cmn_err(CE_WARN,
1616 			    "ip_mforward: received source-routed pkt from %x",
1617 			    ntohl(ipha->ipha_src));
1618 		}
1619 		return (-1);
1620 	}
1621 
1622 	ipst->ips_mrtstat->mrts_fwd_in++;
1623 	src = ipha->ipha_src;
1624 
1625 	/* Find route in cache, return NULL if not there or upcalls q'ed. */
1626 
1627 	/*
1628 	 * Lock the mfctable against changes made by ip_mforward.
1629 	 * Note that only add_mfc and del_mfc can remove entries and
1630 	 * they run with exclusive access to IP. So we do not need to
1631 	 * guard against the rt being deleted, so release lock after reading.
1632 	 */
	/*
	 * NOTE(review): the lookup below takes a bucket reference via
	 * MFCB_REFHOLD rather than a lock, so the comment above may
	 * describe an older scheme - confirm.
	 */
1633 
1634 	if (is_mrouter_off(ipst))
1635 		return (-1);
1636 
1637 	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
1638 	MFCB_REFHOLD(mfcbp);
1639 	MFCFIND(mfcbp, src, dst, rt);
1640 
1641 	/* Entry exists, so forward if necessary */
1642 	if (rt != NULL) {
1643 		int ret = 0;
1644 		ipst->ips_mrtstat->mrts_mfc_hits++;
		/*
		 * A PIM register packet is forwarded as if it had arrived
		 * on the register vif's ill, with no tunnel source.
		 */
1645 		if (pim_reg_packet) {
1646 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1647 			ret = ip_mdq(mp, ipha,
1648 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
1649 			    v_ipif->ipif_ill,
1650 			    0, rt);
1651 		} else {
1652 			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
1653 		}
1654 
1655 		MFCB_REFRELE(mfcbp);
1656 		return (ret);
1657 
1658 		/*
1659 		 * Don't forward if we don't have a cache entry.  Mrouted will
1660 		 * always provide a cache entry in response to an upcall.
1661 		 */
1662 	} else {
1663 		/*
1664 		 * If we don't have a route for packet's origin, make a copy
1665 		 * of the packet and send message to routing daemon.
1666 		 */
1667 		struct mfc	*mfc_rt	 = NULL;
1668 		mblk_t		*mp0	 = NULL;
1669 		mblk_t		*mp_copy = NULL;
1670 		struct rtdetq	*rte	 = NULL;
1671 		struct rtdetq	*rte_m, *rte1, *prev_rte;
1672 		uint_t		hash;
1673 		int		npkts;
1674 		boolean_t	new_mfc = B_FALSE;
1675 		ipst->ips_mrtstat->mrts_mfc_misses++;
1676 		/* BSD uses mrts_no_route++ */
1677 		if (ipst->ips_ip_mrtdebug > 1) {
1678 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1679 			    "ip_mforward: no rte ill %s src %x g %x misses %d",
1680 			    ill->ill_name, ntohl(src), ntohl(dst),
1681 			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
1682 		}
1683 		/*
1684 		 * The order of the following code differs from the BSD code.
1685 		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
1686 		 * code works, so SunOS 5.x wasn't changed to conform to the
1687 		 * BSD version.
1688 		 */
1689 
1690 		/* Lock mfctable. */
		/*
		 * hash is computed from the same (src, dst) as mfcbp above,
		 * so ipst->ips_mfcs[hash] and *mfcbp are the same bucket;
		 * the code below uses the two spellings interchangeably.
		 */
1691 		hash = MFCHASH(src, dst);
1692 		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));
1693 
1694 		/*
1695 		 * If we are turning off mrouted return an error
1696 		 */
1697 		if (is_mrouter_off(ipst)) {
1698 			mutex_exit(&mfcbp->mfcb_lock);
1699 			MFCB_REFRELE(mfcbp);
1700 			return (-1);
1701 		}
1702 
1703 		/* Is there an upcall waiting for this packet? */
		/*
		 * On a match the loop is left with mfc_rt->mfc_mutex still
		 * held; it is released on every other iteration.
		 */
1704 		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
1705 		    mfc_rt = mfc_rt->mfc_next) {
1706 			mutex_enter(&mfc_rt->mfc_mutex);
1707 			if (ipst->ips_ip_mrtdebug > 1) {
1708 				(void) mi_strlog(mrouter->conn_rq, 1,
1709 				    SL_TRACE,
1710 				    "ip_mforward: MFCTAB hash %d o 0x%x"
1711 				    " g 0x%x\n",
1712 				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1713 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1714 			}
1715 			/* There is an upcall */
1716 			if ((src == mfc_rt->mfc_origin.s_addr) &&
1717 			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
1718 			    (mfc_rt->mfc_rte != NULL) &&
1719 			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1720 				break;
1721 			}
1722 			mutex_exit(&mfc_rt->mfc_mutex);
1723 		}
1724 		/* No upcall, so make a new entry into mfctable */
1725 		if (mfc_rt == NULL) {
1726 			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1727 			if (mfc_rt == NULL) {
1728 				ipst->ips_mrtstat->mrts_fwd_drop++;
1729 				ip1dbg(("ip_mforward: out of memory "
1730 				    "for mfc, mfc_rt\n"));
1731 				goto error_return;
1732 			} else
1733 				new_mfc = B_TRUE;
1734 			/* Get resources */
1735 			/* TODO could copy header and dup rest */
			/* mp_copy is the message later handed to mrouted. */
1736 			mp_copy = copymsg(mp);
1737 			if (mp_copy == NULL) {
1738 				ipst->ips_mrtstat->mrts_fwd_drop++;
1739 				ip1dbg(("ip_mforward: out of memory for "
1740 				    "mblk, mp_copy\n"));
1741 				goto error_return;
1742 			}
			/*
			 * Take the new entry's mutex so that, from here on,
			 * mfc_rt->mfc_mutex is held whether the entry was
			 * found above or freshly allocated.
			 */
1743 			mutex_enter(&mfc_rt->mfc_mutex);
1744 		}
1745 		/* Get resources for rte, whether first rte or not first. */
1746 		/* Add this packet into rtdetq */
1747 		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
1748 		if (rte == NULL) {
1749 			ipst->ips_mrtstat->mrts_fwd_drop++;
1750 			mutex_exit(&mfc_rt->mfc_mutex);
1751 			ip1dbg(("ip_mforward: out of memory for"
1752 			    " rtdetq, rte\n"));
1753 			goto error_return;
1754 		}
1755 
		/* mp0 is the copy parked on the upcall queue. */
1756 		mp0 = copymsg(mp);
1757 		if (mp0 == NULL) {
1758 			ipst->ips_mrtstat->mrts_fwd_drop++;
1759 			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
1760 			mutex_exit(&mfc_rt->mfc_mutex);
1761 			goto error_return;
1762 		}
1763 		rte->mp		= mp0;
1764 		if (pim_reg_packet) {
1765 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1766 			rte->ill =
1767 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
1768 			    v_ipif->ipif_ill;
1769 		} else {
1770 			rte->ill = ill;
1771 		}
1772 		rte->rte_next	= NULL;
1773 
1774 		/*
1775 		 * Determine if upcall q (rtdetq) has overflowed.
1776 		 * mfc_rt->mfc_rte is null by mi_zalloc
1777 		 * if it is the first message.
1778 		 */
1779 		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
1780 		    rte_m = rte_m->rte_next)
1781 			npkts++;
1782 		if (ipst->ips_ip_mrtdebug > 1) {
1783 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1784 			    "ip_mforward: upcalls %d\n", npkts);
1785 		}
		/*
		 * Queue full: rte and mp0 have not been linked anywhere
		 * yet, so error_return frees both.
		 */
1786 		if (npkts > MAX_UPQ) {
1787 			ipst->ips_mrtstat->mrts_upq_ovflw++;
1788 			mutex_exit(&mfc_rt->mfc_mutex);
1789 			goto error_return;
1790 		}
1791 
1792 		if (npkts == 0) {	/* first upcall */
1793 			int i = 0;
1794 			/*
1795 			 * Now finish installing the new mfc! Now that we have
1796 			 * resources!  Insert new entry at head of hash chain.
1797 			 * Use src and dst which are ipaddr_t's.
1798 			 */
1799 			mfc_rt->mfc_origin.s_addr = src;
1800 			mfc_rt->mfc_mcastgrp.s_addr = dst;
1801 
1802 			mutex_enter(&ipst->ips_numvifs_mutex);
1803 			for (i = 0; i < (int)ipst->ips_numvifs; i++)
1804 				mfc_rt->mfc_ttls[i] = 0;
1805 			mutex_exit(&ipst->ips_numvifs_mutex);
1806 			mfc_rt->mfc_parent = ALL_VIFS;
1807 
1808 			/* Link into table */
1809 			if (ipst->ips_ip_mrtdebug > 1) {
1810 				(void) mi_strlog(mrouter->conn_rq, 1,
1811 				    SL_TRACE,
1812 				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
1813 				    "g 0x%x\n", hash,
1814 				    ntohl(mfc_rt->mfc_origin.s_addr),
1815 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1816 			}
1817 			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
1818 			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
1819 			mfc_rt->mfc_rte = NULL;
1820 		}
1821 
1822 		/* Link in the upcall */
1823 		/* First upcall */
1824 		if (mfc_rt->mfc_rte == NULL)
1825 			mfc_rt->mfc_rte = rte;
1826 		else {
1827 			/* not the first upcall */
			/* Walk to the tail and append rte there. */
1828 			prev_rte = mfc_rt->mfc_rte;
1829 			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
1830 			    prev_rte = rte1, rte1 = rte1->rte_next)
1831 				;
1832 			prev_rte->rte_next = rte;
1833 		}
1834 
1835 		/*
1836 		 * No upcalls waiting, this is first one, so send a message to
1837 		 * routing daemon to install a route into kernel table.
1838 		 */
1839 		if (npkts == 0) {
1840 			struct igmpmsg	*im;
1841 			/* ipha_protocol is 0, for upcall */
1842 			ASSERT(mp_copy != NULL);
			/*
			 * The start of the copied packet is overwritten in
			 * place with the igmpmsg fields set below.
			 */
1843 			im = (struct igmpmsg *)mp_copy->b_rptr;
1844 			im->im_msgtype	= IGMPMSG_NOCACHE;
1845 			im->im_mbz = 0;
1846 			mutex_enter(&ipst->ips_numvifs_mutex);
1847 			if (pim_reg_packet) {
1848 				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
1849 				mutex_exit(&ipst->ips_numvifs_mutex);
1850 			} else {
1851 				/*
1852 				 * XXX do we need to hold locks here ?
1853 				 */
				/*
				 * Report the first vif whose ipif sits on the
				 * receiving ill.  If none matches, im_vif is
				 * not assigned here; only the ASSERT below
				 * checks for that.
				 */
1854 				for (vifi = 0;
1855 				    vifi < ipst->ips_numvifs;
1856 				    vifi++) {
1857 					if (ipst->ips_vifs[vifi].v_ipif == NULL)
1858 						continue;
1859 					if (ipst->ips_vifs[vifi].
1860 					    v_ipif->ipif_ill == ill) {
1861 						im->im_vif = (uchar_t)vifi;
1862 						break;
1863 					}
1864 				}
1865 				mutex_exit(&ipst->ips_numvifs_mutex);
1866 				ASSERT(vifi < ipst->ips_numvifs);
1867 			}
1868 
1869 			ipst->ips_mrtstat->mrts_upcalls++;
1870 			/* Timer to discard upcalls if mrouted is too slow */
1871 			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
1872 			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
1873 			mutex_exit(&mfc_rt->mfc_mutex);
1874 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1875 			/* Pass to RAWIP */
			/*
			 * ira_ill and ira_rill are cleared only for the
			 * duration of the upcall and restored right after.
			 */
1876 			ira->ira_ill = ira->ira_rill = NULL;
1877 			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
1878 			ira->ira_ill = ill;
1879 			ira->ira_rill = rill;
1880 		} else {
			/*
			 * An upcall is already outstanding; the packet has
			 * been queued via rte above.  mp_copy is only
			 * allocated when a new mfc is created, so it appears
			 * to be NULL on this path - TODO confirm that
			 * ip_drop_input() accepts a NULL mblk.
			 */
1881 			mutex_exit(&mfc_rt->mfc_mutex);
1882 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1883 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1884 			ip_drop_input("ip_mforward - upcall already waiting",
1885 			    mp_copy, ill);
1886 			freemsg(mp_copy);
1887 		}
1888 
1889 		MFCB_REFRELE(mfcbp);
1890 		if (tunnel_src != 0)
1891 			return (1);
1892 		else
1893 			return (0);
		/*
		 * Common failure exit.  Every jump here is made with the
		 * bucket lock held and with mfc_rt->mfc_mutex either never
		 * taken or already released.  mfc_rt is freed only when it
		 * was allocated by this call (new_mfc); on these paths it
		 * has not been linked into the table yet.
		 */
1894 	error_return:
1895 		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1896 		MFCB_REFRELE(mfcbp);
1897 		if (mfc_rt != NULL && (new_mfc == B_TRUE))
1898 			mi_free((char *)mfc_rt);
1899 		if (rte != NULL)
1900 			mi_free((char *)rte);
1901 		if (mp_copy != NULL) {
1902 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1903 			ip_drop_input("ip_mforward error", mp_copy, ill);
1904 			freemsg(mp_copy);
1905 		}
1906 		if (mp0 != NULL)
1907 			freemsg(mp0);
1908 		return (-1);
1909 	}
1910 }
1911 
1912 /*
1913  * Clean up the mfctable cache entry if upcall is not serviced.
1914  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1915  */
1916 static void
expire_upcalls(void * arg)1917 expire_upcalls(void *arg)
1918 {
1919 	struct mfc *mfc_rt = arg;
1920 	uint_t hash;
1921 	struct mfc *prev_mfc, *mfc0;
1922 	ip_stack_t	*ipst;
1923 	conn_t		*mrouter;
1924 
1925 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1926 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1927 		return;
1928 	}
1929 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1930 	mrouter = ipst->ips_ip_g_mrouter;
1931 
1932 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1933 	if (ipst->ips_ip_mrtdebug > 1) {
1934 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1935 		    "expire_upcalls: hash %d s %x g %x",
1936 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1937 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1938 	}
1939 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1940 	mutex_enter(&mfc_rt->mfc_mutex);
1941 	/*
1942 	 * if timeout has been set to zero, than the
1943 	 * entry has been filled, no need to delete it.
1944 	 */
1945 	if (mfc_rt->mfc_timeout_id == 0)
1946 		goto done;
1947 	ipst->ips_mrtstat->mrts_cache_cleanups++;
1948 	mfc_rt->mfc_timeout_id = 0;
1949 
1950 	/* Determine entry to be cleaned up in cache table. */
1951 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1952 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1953 		if (mfc0 == mfc_rt)
1954 			break;
1955 
1956 	/* del_mfc takes care of gone mfcs */
1957 	ASSERT(prev_mfc != NULL);
1958 	ASSERT(mfc0 != NULL);
1959 
1960 	/*
1961 	 * Delete the entry from the cache
1962 	 */
1963 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1964 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1965 
1966 	/*
1967 	 * release_mfc will drop all queued upcall packets.
1968 	 * and will free the mbuf with the pkt, if, timing info.
1969 	 */
1970 done:
1971 	mutex_exit(&mfc_rt->mfc_mutex);
1972 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1973 }
1974 
1975 /*
1976  * Packet forwarding routine once entry in the cache is made.
1977  */
1978 static int
ip_mdq(mblk_t * mp,ipha_t * ipha,ill_t * ill,ipaddr_t tunnel_src,struct mfc * rt)1979 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
1980     struct mfc *rt)
1981 {
1982 	vifi_t vifi;
1983 	struct vif *vifp;
1984 	ipaddr_t dst = ipha->ipha_dst;
1985 	size_t  plen = msgdsize(mp);
1986 	vifi_t num_of_vifs;
1987 	ip_stack_t	*ipst = ill->ill_ipst;
1988 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1989 	ip_recv_attr_t	iras;
1990 
1991 	if (ipst->ips_ip_mrtdebug > 1) {
1992 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1993 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
1994 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1995 		    ill->ill_name);
1996 	}
1997 
1998 	/* Macro to send packet on vif */
1999 #define	MC_SEND(ipha, mp, vifp, dst) { \
2000 	if ((vifp)->v_flags & VIFF_TUNNEL) \
2001 		encap_send((ipha), (mp), (vifp), (dst)); \
2002 	else if ((vifp)->v_flags & VIFF_REGISTER) \
2003 		register_send((ipha), (mp), (vifp), (dst)); \
2004 	else \
2005 		phyint_send((ipha), (mp), (vifp), (dst)); \
2006 }
2007 
2008 	vifi = rt->mfc_parent;
2009 
2010 	/*
2011 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2012 	 * Mrouted had no route.
2013 	 * We wanted the route installed in the mfctable to prevent multiple
2014 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2015 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
2016 	 * 3.6.
2017 	 */
2018 	if (vifi == NO_VIF) {
2019 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2020 		    ill->ill_name));
2021 		if (ipst->ips_ip_mrtdebug > 1) {
2022 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2023 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2024 		}
2025 		return (-1);	/* drop pkt */
2026 	}
2027 
2028 	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
2029 		return (-1);
2030 	/*
2031 	 * The MFC entries are not cleaned up when an ipif goes
2032 	 * away thus this code has to guard against an MFC referencing
2033 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
2034 	 * sets the v_ipif to NULL when the ipif disappears.
2035 	 */
2036 	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
2037 
2038 	if (vifi >= ipst->ips_numvifs) {
2039 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2040 		    "%d ill %s viftable ill %s\n",
2041 		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2042 		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2043 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2044 		return (-1);
2045 	}
2046 	/*
2047 	 * Don't forward if it didn't arrive from the parent vif for its
2048 	 * origin.
2049 	 */
2050 	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
2051 	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2052 		/* Came in the wrong interface */
2053 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2054 			"numvifs %d ill %s viftable ill %s\n",
2055 			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2056 			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
2057 		if (ipst->ips_ip_mrtdebug > 1) {
2058 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2059 			    "ip_mdq: arrived wrong if, vifi %d ill "
2060 			    "%s viftable ill %s\n",
2061 			    (int)vifi, ill->ill_name,
2062 			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2063 		}
2064 		ipst->ips_mrtstat->mrts_wrong_if++;
2065 		rt->mfc_wrong_if++;
2066 
2067 		/*
2068 		 * If we are doing PIM assert processing and we are forwarding
2069 		 * packets on this interface, and it is a broadcast medium
2070 		 * interface (and not a tunnel), send a message to the routing.
2071 		 *
2072 		 * We use the first ipif on the list, since it's all we have.
2073 		 * Chances are the ipif_flags are the same for ipifs on the ill.
2074 		 */
2075 		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
2076 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2077 		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
2078 			mblk_t		*mp_copy;
2079 			struct igmpmsg	*im;
2080 
2081 			/* TODO could copy header and dup rest */
2082 			mp_copy = copymsg(mp);
2083 			if (mp_copy == NULL) {
2084 				ipst->ips_mrtstat->mrts_fwd_drop++;
2085 				ip1dbg(("ip_mdq: out of memory "
2086 				    "for mblk, mp_copy\n"));
2087 				unlock_good_vif(&ipst->ips_vifs[vifi]);
2088 				return (-1);
2089 			}
2090 
2091 			im = (struct igmpmsg *)mp_copy->b_rptr;
2092 			im->im_msgtype = IGMPMSG_WRONGVIF;
2093 			im->im_mbz = 0;
2094 			im->im_vif = (ushort_t)vifi;
2095 			/* Pass to RAWIP */
2096 
2097 			bzero(&iras, sizeof (iras));
2098 			iras.ira_flags = IRAF_IS_IPV4;
2099 			iras.ira_ip_hdr_length =
2100 			    IPH_HDR_LENGTH(mp_copy->b_rptr);
2101 			iras.ira_pktlen = msgdsize(mp_copy);
2102 			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2103 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2104 		}
2105 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2106 		if (tunnel_src != 0)
2107 			return (1);
2108 		else
2109 			return (0);
2110 	}
2111 	/*
2112 	 * If I sourced this packet, it counts as output, else it was input.
2113 	 */
2114 	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
2115 		ipst->ips_vifs[vifi].v_pkt_out++;
2116 		ipst->ips_vifs[vifi].v_bytes_out += plen;
2117 	} else {
2118 		ipst->ips_vifs[vifi].v_pkt_in++;
2119 		ipst->ips_vifs[vifi].v_bytes_in += plen;
2120 	}
2121 	mutex_enter(&rt->mfc_mutex);
2122 	rt->mfc_pkt_cnt++;
2123 	rt->mfc_byte_cnt += plen;
2124 	mutex_exit(&rt->mfc_mutex);
2125 	unlock_good_vif(&ipst->ips_vifs[vifi]);
2126 	/*
2127 	 * For each vif, decide if a copy of the packet should be forwarded.
2128 	 * Forward if:
2129 	 *		- the vif threshold ttl is non-zero AND
2130 	 *		- the pkt ttl exceeds the vif's threshold
2131 	 * A non-zero mfc_ttl indicates that the vif is part of
2132 	 * the output set for the mfc entry.
2133 	 */
2134 	mutex_enter(&ipst->ips_numvifs_mutex);
2135 	num_of_vifs = ipst->ips_numvifs;
2136 	mutex_exit(&ipst->ips_numvifs_mutex);
2137 	for (vifp = ipst->ips_vifs, vifi = 0;
2138 	    vifi < num_of_vifs;
2139 	    vifp++, vifi++) {
2140 		if (!lock_good_vif(vifp))
2141 			continue;
2142 		if ((rt->mfc_ttls[vifi] > 0) &&
2143 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2144 			/*
2145 			 * lock_good_vif should not have succedded if
2146 			 * v_ipif is null.
2147 			 */
2148 			ASSERT(vifp->v_ipif != NULL);
2149 			vifp->v_pkt_out++;
2150 			vifp->v_bytes_out += plen;
2151 			MC_SEND(ipha, mp, vifp, dst);
2152 			ipst->ips_mrtstat->mrts_fwd_out++;
2153 		}
2154 		unlock_good_vif(vifp);
2155 	}
2156 	if (tunnel_src != 0)
2157 		return (1);
2158 	else
2159 		return (0);
2160 }
2161 
2162 /*
2163  * Send the packet on physical interface.
2164  * Caller assumes can continue to use mp on return.
2165  */
2166 /* ARGSUSED */
2167 static void
phyint_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2168 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2169 {
2170 	mblk_t 	*mp_copy;
2171 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2172 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2173 
2174 	/* Make a new reference to the packet */
2175 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2176 	if (mp_copy == NULL) {
2177 		ipst->ips_mrtstat->mrts_fwd_drop++;
2178 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2179 		return;
2180 	}
2181 	if (vifp->v_rate_limit <= 0)
2182 		tbf_send_packet(vifp, mp_copy);
2183 	else  {
2184 		if (ipst->ips_ip_mrtdebug > 1) {
2185 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2186 			    "phyint_send: tbf_contr rate %d "
2187 			    "vifp 0x%p mp 0x%p dst 0x%x",
2188 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2189 		}
2190 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2191 	}
2192 }
2193 
2194 /*
2195  * Send the whole packet for REGISTER encapsulation to PIM daemon
2196  * Caller assumes it can continue to use mp on return.
2197  */
2198 /* ARGSUSED */
2199 static void
register_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2200 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2201 {
2202 	struct igmpmsg	*im;
2203 	mblk_t		*mp_copy;
2204 	ipha_t		*ipha_copy;
2205 	ill_t		*ill = vifp->v_ipif->ipif_ill;
2206 	ip_stack_t	*ipst = ill->ill_ipst;
2207 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2208 	ip_recv_attr_t	iras;
2209 
2210 	if (ipst->ips_ip_mrtdebug > 1) {
2211 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2212 		    "register_send: src %x, dst %x\n",
2213 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2214 	}
2215 
2216 	/*
2217 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2218 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2219 	 * ethernet driver will.
2220 	 */
2221 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2222 	if (mp_copy == NULL) {
2223 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2224 		if (ipst->ips_ip_mrtdebug > 3) {
2225 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2226 			    "register_send: allocb failure.");
2227 		}
2228 		return;
2229 	}
2230 
2231 	/*
2232 	 * Bump write pointer to account for igmpmsg being added.
2233 	 */
2234 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2235 
2236 	/*
2237 	 * Chain packet to new mblk_t.
2238 	 */
2239 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2240 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2241 		if (ipst->ips_ip_mrtdebug > 3) {
2242 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2243 			    "register_send: copymsg failure.");
2244 		}
2245 		freeb(mp_copy);
2246 		return;
2247 	}
2248 
2249 	/*
2250 	 * icmp_input() asserts that IP version field is set to an
2251 	 * appropriate version. Hence, the struct igmpmsg that this really
2252 	 * becomes, needs to have the correct IP version field.
2253 	 */
2254 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2255 	*ipha_copy = multicast_encap_iphdr;
2256 
2257 	/*
2258 	 * The kernel uses the struct igmpmsg header to encode the messages to
2259 	 * the multicast routing daemon. Fill in the fields in the header
2260 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2261 	 */
2262 	im = (struct igmpmsg *)mp_copy->b_rptr;
2263 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2264 	im->im_src.s_addr = ipha->ipha_src;
2265 	im->im_dst.s_addr = ipha->ipha_dst;
2266 
2267 	/*
2268 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2269 	 * header with renamed fields and the multicast routing daemon uses
2270 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2271 	 */
2272 	im->im_mbz = 0;
2273 
2274 	++ipst->ips_mrtstat->mrts_upcalls;
2275 	if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2276 	    !canputnext(mrouter->conn_rq)) {
2277 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2278 		if (ipst->ips_ip_mrtdebug > 3) {
2279 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2280 			    "register_send: register upcall failure.");
2281 		}
2282 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2283 		ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2284 		freemsg(mp_copy);
2285 	} else {
2286 		/* Pass to RAWIP */
2287 		bzero(&iras, sizeof (iras));
2288 		iras.ira_flags = IRAF_IS_IPV4;
2289 		iras.ira_ip_hdr_length = sizeof (ipha_t);
2290 		iras.ira_pktlen = msgdsize(mp_copy);
2291 		(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2292 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2293 	}
2294 }
2295 
2296 /*
2297  * pim_validate_cksum handles verification of the checksum in the
2298  * pim header.  For PIM Register packets, the checksum is calculated
2299  * across the PIM header only.  For all other packets, the checksum
2300  * is for the PIM header and remainder of the packet.
2301  *
2302  * returns: B_TRUE, if checksum is okay.
2303  *          B_FALSE, if checksum is not valid.
2304  */
2305 static boolean_t
pim_validate_cksum(mblk_t * mp,ipha_t * ip,struct pim * pimp)2306 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2307 {
2308 	mblk_t *mp_dup;
2309 
2310 	if ((mp_dup = dupmsg(mp)) == NULL)
2311 		return (B_FALSE);
2312 
2313 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2314 	if (pimp->pim_type == PIM_REGISTER)
2315 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2316 	if (IP_CSUM(mp_dup, 0, 0)) {
2317 		freemsg(mp_dup);
2318 		return (B_FALSE);
2319 	}
2320 	freemsg(mp_dup);
2321 	return (B_TRUE);
2322 }
2323 
2324 /*
2325  * Process PIM protocol packets i.e. IP Protocol 103.
2326  * Register messages are decapsulated and sent onto multicast forwarding.
2327  *
2328  * Return NULL for a bad packet that is discarded here.
2329  * Return mp if the message is OK and should be handed to "raw" receivers.
2330  * Callers of pim_input() may need to reinitialize variables that were copied
2331  * from the mblk as this calls pullupmsg().
2332  */
2333 mblk_t *
pim_input(mblk_t * mp,ip_recv_attr_t * ira)2334 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2335 {
2336 	ipha_t		*eip, *ip;
2337 	int		iplen, pimlen, iphlen;
2338 	struct pim	*pimp;	/* pointer to a pim struct */
2339 	uint32_t	*reghdr;
2340 	ill_t		*ill = ira->ira_ill;
2341 	ip_stack_t	*ipst = ill->ill_ipst;
2342 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2343 
2344 	/*
2345 	 * Pullup the msg for PIM protocol processing.
2346 	 */
2347 	if (pullupmsg(mp, -1) == 0) {
2348 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2349 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2350 		ip_drop_input("mrts_pim_nomemory", mp, ill);
2351 		freemsg(mp);
2352 		return (NULL);
2353 	}
2354 
2355 	ip = (ipha_t *)mp->b_rptr;
2356 	iplen = ip->ipha_length;
2357 	iphlen = IPH_HDR_LENGTH(ip);
2358 	pimlen = ntohs(iplen) - iphlen;
2359 
2360 	/*
2361 	 * Validate lengths
2362 	 */
2363 	if (pimlen < PIM_MINLEN) {
2364 		++ipst->ips_mrtstat->mrts_pim_malformed;
2365 		if (ipst->ips_ip_mrtdebug > 1) {
2366 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2367 			    "pim_input: length not at least minlen");
2368 		}
2369 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2370 		ip_drop_input("mrts_pim_malformed", mp, ill);
2371 		freemsg(mp);
2372 		return (NULL);
2373 	}
2374 
2375 	/*
2376 	 * Point to the PIM header.
2377 	 */
2378 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2379 
2380 	/*
2381 	 * Check the version number.
2382 	 */
2383 	if (pimp->pim_vers != PIM_VERSION) {
2384 		++ipst->ips_mrtstat->mrts_pim_badversion;
2385 		if (ipst->ips_ip_mrtdebug > 1) {
2386 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2387 			    "pim_input: unknown version of PIM");
2388 		}
2389 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2390 		ip_drop_input("mrts_pim_badversion", mp, ill);
2391 		freemsg(mp);
2392 		return (NULL);
2393 	}
2394 
2395 	/*
2396 	 * Validate the checksum
2397 	 */
2398 	if (!pim_validate_cksum(mp, ip, pimp)) {
2399 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2400 		if (ipst->ips_ip_mrtdebug > 1) {
2401 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2402 			    "pim_input: invalid checksum");
2403 		}
2404 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2405 		ip_drop_input("pim_rcv_badcsum", mp, ill);
2406 		freemsg(mp);
2407 		return (NULL);
2408 	}
2409 
2410 	if (pimp->pim_type != PIM_REGISTER)
2411 		return (mp);
2412 
2413 	reghdr = (uint32_t *)(pimp + 1);
2414 	eip = (ipha_t *)(reghdr + 1);
2415 
2416 	/*
2417 	 * check if the inner packet is destined to mcast group
2418 	 */
2419 	if (!CLASSD(eip->ipha_dst)) {
2420 		++ipst->ips_mrtstat->mrts_pim_badregisters;
2421 		if (ipst->ips_ip_mrtdebug > 1) {
2422 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2423 			    "pim_input: Inner pkt not mcast .. !");
2424 		}
2425 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2426 		ip_drop_input("mrts_pim_badregisters", mp, ill);
2427 		freemsg(mp);
2428 		return (NULL);
2429 	}
2430 	if (ipst->ips_ip_mrtdebug > 1) {
2431 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2432 		    "register from %x, to %x, len %d",
2433 		    ntohl(eip->ipha_src),
2434 		    ntohl(eip->ipha_dst),
2435 		    ntohs(eip->ipha_length));
2436 	}
2437 	/*
2438 	 * If the null register bit is not set, decapsulate
2439 	 * the packet before forwarding it.
2440 	 * Avoid this in no register vif
2441 	 */
2442 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2443 	    ipst->ips_reg_vif_num != ALL_VIFS) {
2444 		mblk_t *mp_copy;
2445 		uint_t saved_pktlen;
2446 
2447 		/* Copy the message */
2448 		if ((mp_copy = copymsg(mp)) == NULL) {
2449 			++ipst->ips_mrtstat->mrts_pim_nomemory;
2450 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2451 			ip_drop_input("mrts_pim_nomemory", mp, ill);
2452 			freemsg(mp);
2453 			return (NULL);
2454 		}
2455 
2456 		/*
2457 		 * Decapsulate the packet and give it to
2458 		 * register_mforward.
2459 		 */
2460 		mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2461 		saved_pktlen = ira->ira_pktlen;
2462 		ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2463 		if (register_mforward(mp_copy, ira) != 0) {
2464 			/* register_mforward already called ip_drop_input */
2465 			freemsg(mp);
2466 			ira->ira_pktlen = saved_pktlen;
2467 			return (NULL);
2468 		}
2469 		ira->ira_pktlen = saved_pktlen;
2470 	}
2471 
2472 	/*
2473 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2474 	 * PIM socket. For Solaris it is done right after pim_input() is
2475 	 * called.
2476 	 */
2477 	return (mp);
2478 }
2479 
2480 /*
2481  * PIM sparse mode hook.  Called by pim_input after decapsulating
2482  * the packet. Loop back the packet, as if we have received it.
2483  * In pim_input() we have to check if the destination is a multicast address.
2484  */
2485 static int
register_mforward(mblk_t * mp,ip_recv_attr_t * ira)2486 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2487 {
2488 	ire_t		*ire;
2489 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2490 	ill_t		*ill = ira->ira_ill;
2491 	ip_stack_t	*ipst = ill->ill_ipst;
2492 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2493 
2494 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2495 
2496 	if (ipst->ips_ip_mrtdebug > 3) {
2497 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2498 		    "register_mforward: src %x, dst %x\n",
2499 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2500 	}
2501 	/*
2502 	 * Need to pass in to ip_mforward() the information that the
2503 	 * packet has arrived on the register_vif. We mark it with
2504 	 * the IRAF_PIM_REGISTER attribute.
2505 	 * pim_input verified that the (inner) destination is multicast,
2506 	 * hence we skip the generic code in ip_input.
2507 	 */
2508 	ira->ira_flags |= IRAF_PIM_REGISTER;
2509 	++ipst->ips_mrtstat->mrts_pim_regforwards;
2510 
2511 	if (!CLASSD(ipha->ipha_dst)) {
2512 		ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2513 		    ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2514 		    NULL, NULL, NULL);
2515 	} else {
2516 		ire = ire_multicast(ill);
2517 	}
2518 	ASSERT(ire != NULL);
2519 	/* Normally this will return the IRE_MULTICAST */
2520 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2521 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2522 		ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2523 		freemsg(mp);
2524 		ire_refrele(ire);
2525 		return (-1);
2526 	}
2527 	ASSERT(ire->ire_type & IRE_MULTICAST);
2528 	(*ire->ire_recvfn)(ire, mp, ipha, ira);
2529 	ire_refrele(ire);
2530 
2531 	return (0);
2532 }
2533 
2534 /*
2535  * Send an encapsulated packet.
2536  * Caller assumes can continue to use mp when routine returns.
2537  */
2538 /* ARGSUSED */
2539 static void
encap_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2540 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2541 {
2542 	mblk_t 	*mp_copy;
2543 	ipha_t 	*ipha_copy;
2544 	size_t	len;
2545 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2546 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2547 
2548 	if (ipst->ips_ip_mrtdebug > 1) {
2549 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2550 		    "encap_send: vif %ld enter",
2551 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
2552 	}
2553 	len = ntohs(ipha->ipha_length);
2554 
2555 	/*
2556 	 * Copy the old packet & pullup it's IP header into the
2557 	 * new mbuf so we can modify it.  Try to fill the new
2558 	 * mbuf since if we don't the ethernet driver will.
2559 	 */
2560 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2561 	if (mp_copy == NULL)
2562 		return;
2563 	mp_copy->b_rptr += 32;
2564 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2565 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2566 		freeb(mp_copy);
2567 		return;
2568 	}
2569 
2570 	/*
2571 	 * Fill in the encapsulating IP header.
2572 	 * Remote tunnel dst in rmt_addr, from add_vif().
2573 	 */
2574 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2575 	*ipha_copy = multicast_encap_iphdr;
2576 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2577 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2578 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2579 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2580 	ASSERT(ipha_copy->ipha_ident == 0);
2581 
2582 	/* Turn the encapsulated IP header back into a valid one. */
2583 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2584 	ipha->ipha_ttl--;
2585 	ipha->ipha_hdr_checksum = 0;
2586 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2587 
2588 	ipha_copy->ipha_ttl = ipha->ipha_ttl;
2589 
2590 	if (ipst->ips_ip_mrtdebug > 1) {
2591 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2592 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2593 	}
2594 	if (vifp->v_rate_limit <= 0)
2595 		tbf_send_packet(vifp, mp_copy);
2596 	else
2597 		/* ipha is from the original header */
2598 		tbf_control(vifp, mp_copy, ipha);
2599 }
2600 
2601 /*
2602  * De-encapsulate a packet and feed it back through IP input if it
2603  * matches one of our multicast tunnels.
2604  *
2605  * This routine is called whenever IP gets a packet with prototype
2606  * IPPROTO_ENCAP and a local destination address and the packet didn't
2607  * match one of our configured IP-in-IP tunnels.
2608  */
2609 void
ip_mroute_decap(mblk_t * mp,ip_recv_attr_t * ira)2610 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2611 {
2612 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2613 	ipha_t		*ipha_encap;
2614 	int		hlen = IPH_HDR_LENGTH(ipha);
2615 	int		hlen_encap;
2616 	ipaddr_t	src;
2617 	struct vif	*vifp;
2618 	ire_t		*ire;
2619 	ill_t		*ill = ira->ira_ill;
2620 	ip_stack_t	*ipst = ill->ill_ipst;
2621 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2622 
2623 	/* Make sure we have all of the inner header */
2624 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2625 	if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2626 		ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2627 		if (ipha == NULL) {
2628 			ipst->ips_mrtstat->mrts_bad_tunnel++;
2629 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2630 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
2631 			freemsg(mp);
2632 			return;
2633 		}
2634 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
2635 	}
2636 	hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2637 	if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2638 		ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2639 		if (ipha == NULL) {
2640 			ipst->ips_mrtstat->mrts_bad_tunnel++;
2641 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2642 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
2643 			freemsg(mp);
2644 			return;
2645 		}
2646 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
2647 	}
2648 
2649 	/*
2650 	 * Dump the packet if it's not to a multicast destination or if
2651 	 * we don't have an encapsulating tunnel with the source.
2652 	 * Note:  This code assumes that the remote site IP address
2653 	 * uniquely identifies the tunnel (i.e., that this site has
2654 	 * at most one tunnel with the remote site).
2655 	 */
2656 	if (!CLASSD(ipha_encap->ipha_dst)) {
2657 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2658 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2659 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2660 		ip_drop_input("mrts_bad_tunnel", mp, ill);
2661 		freemsg(mp);
2662 		return;
2663 	}
2664 	src = (ipaddr_t)ipha->ipha_src;
2665 	mutex_enter(&ipst->ips_last_encap_lock);
2666 	if (src != ipst->ips_last_encap_src) {
2667 		struct vif *vife;
2668 
2669 		vifp = ipst->ips_vifs;
2670 		vife = vifp + ipst->ips_numvifs;
2671 		ipst->ips_last_encap_src = src;
2672 		ipst->ips_last_encap_vif = 0;
2673 		for (; vifp < vife; ++vifp) {
2674 			if (!lock_good_vif(vifp))
2675 				continue;
2676 			if (vifp->v_rmt_addr.s_addr == src) {
2677 				if (vifp->v_flags & VIFF_TUNNEL)
2678 					ipst->ips_last_encap_vif = vifp;
2679 				if (ipst->ips_ip_mrtdebug > 1) {
2680 					(void) mi_strlog(mrouter->conn_rq,
2681 					    1, SL_TRACE,
2682 					    "ip_mroute_decap: good tun "
2683 					    "vif %ld with %x",
2684 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
2685 					    ntohl(src));
2686 				}
2687 				unlock_good_vif(vifp);
2688 				break;
2689 			}
2690 			unlock_good_vif(vifp);
2691 		}
2692 	}
2693 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
2694 		mutex_exit(&ipst->ips_last_encap_lock);
2695 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2696 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2697 		ip_drop_input("mrts_bad_tunnel", mp, ill);
2698 		freemsg(mp);
2699 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2700 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2701 		return;
2702 	}
2703 	mutex_exit(&ipst->ips_last_encap_lock);
2704 
2705 	/*
2706 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2707 	 * verify that the packet arrived over the correct vif.)
2708 	 */
2709 	ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2710 	ira->ira_mroute_tunnel = src;
2711 	mp->b_rptr += hlen;
2712 	ira->ira_pktlen -= hlen;
2713 	ira->ira_ip_hdr_length = hlen_encap;
2714 
2715 	/*
2716 	 * We don't redo any of the filtering in ill_input_full_v4 and we
2717 	 * have checked that all of ipha_encap and any IP options are
2718 	 * pulled up. Hence we call ire_recv_multicast_v4 directly.
2719 	 * However, we have to check for RSVP as in ip_input_full_v4
2720 	 * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2721 	 * to the rsvpd.
2722 	 */
2723 	if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2724 	    ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2725 		ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2726 		    ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2727 		    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2728 	} else {
2729 		ire = ire_multicast(ill);
2730 	}
2731 	ASSERT(ire != NULL);
2732 	/* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2733 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2734 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2735 		ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2736 		freemsg(mp);
2737 		ire_refrele(ire);
2738 		return;
2739 	}
2740 	ire->ire_ib_pkt_count++;
2741 	ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2742 	(*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2743 	ire_refrele(ire);
2744 }
2745 
2746 /*
2747  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2748  * (stream closed).  Called as writer.
2749  */
2750 void
reset_mrt_vif_ipif(ipif_t * ipif)2751 reset_mrt_vif_ipif(ipif_t *ipif)
2752 {
2753 	vifi_t vifi, tmp_vifi;
2754 	vifi_t num_of_vifs;
2755 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2756 
2757 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2758 
2759 	mutex_enter(&ipst->ips_numvifs_mutex);
2760 	num_of_vifs = ipst->ips_numvifs;
2761 	mutex_exit(&ipst->ips_numvifs_mutex);
2762 
2763 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2764 		tmp_vifi = vifi - 1;
2765 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2766 			(void) del_vif(&tmp_vifi, ipst);
2767 		}
2768 	}
2769 }
2770 
2771 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2772 void
reset_mrt_ill(ill_t * ill)2773 reset_mrt_ill(ill_t *ill)
2774 {
2775 	struct mfc	*rt;
2776 	struct rtdetq	*rte;
2777 	int		i;
2778 	ip_stack_t	*ipst = ill->ill_ipst;
2779 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2780 	timeout_id_t	id;
2781 
2782 	for (i = 0; i < MFCTBLSIZ; i++) {
2783 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2784 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2785 			if (ipst->ips_ip_mrtdebug > 1) {
2786 				(void) mi_strlog(mrouter->conn_rq, 1,
2787 				    SL_TRACE,
2788 				    "reset_mrt_ill: mfctable [%d]", i);
2789 			}
2790 			while (rt != NULL) {
2791 				mutex_enter(&rt->mfc_mutex);
2792 				while ((rte = rt->mfc_rte) != NULL) {
2793 					if (rte->ill == ill &&
2794 					    (id = rt->mfc_timeout_id) != 0) {
2795 						/*
2796 						 * Its ok to drop the lock,  the
2797 						 * struct cannot be freed since
2798 						 * we have a ref on the hash
2799 						 * bucket.
2800 						 */
2801 						mutex_exit(&rt->mfc_mutex);
2802 						(void) untimeout(id);
2803 						mutex_enter(&rt->mfc_mutex);
2804 					}
2805 					if (rte->ill == ill) {
2806 						if (ipst->ips_ip_mrtdebug > 1) {
2807 						(void) mi_strlog(
2808 						    mrouter->conn_rq,
2809 						    1, SL_TRACE,
2810 						    "reset_mrt_ill: "
2811 						    "ill 0x%p", (void *)ill);
2812 						}
2813 						rt->mfc_rte = rte->rte_next;
2814 						freemsg(rte->mp);
2815 						mi_free((char *)rte);
2816 					}
2817 				}
2818 				mutex_exit(&rt->mfc_mutex);
2819 				rt = rt->mfc_next;
2820 			}
2821 		}
2822 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
2823 	}
2824 }
2825 
2826 /*
2827  * Token bucket filter module.
2828  * The ipha is for mcastgrp destination for phyint and encap.
2829  */
2830 static void
tbf_control(struct vif * vifp,mblk_t * mp,ipha_t * ipha)2831 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2832 {
2833 	size_t 	p_len =  msgdsize(mp);
2834 	struct tbf	*t    = vifp->v_tbf;
2835 	timeout_id_t id = 0;
2836 	ill_t		*ill = vifp->v_ipif->ipif_ill;
2837 	ip_stack_t	*ipst = ill->ill_ipst;
2838 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2839 
2840 	/* Drop if packet is too large */
2841 	if (p_len > MAX_BKT_SIZE) {
2842 		ipst->ips_mrtstat->mrts_pkt2large++;
2843 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2844 		ip_drop_output("tbf_control - too large", mp, ill);
2845 		freemsg(mp);
2846 		return;
2847 	}
2848 	if (ipst->ips_ip_mrtdebug > 1) {
2849 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2850 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2851 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2852 		    ntohl(ipha->ipha_dst));
2853 	}
2854 
2855 	mutex_enter(&t->tbf_lock);
2856 
2857 	tbf_update_tokens(vifp);
2858 
2859 	/*
2860 	 * If there are enough tokens,
2861 	 * and the queue is empty, send this packet out.
2862 	 */
2863 	if (ipst->ips_ip_mrtdebug > 1) {
2864 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2865 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2866 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2867 		    t->tbf_q_len);
2868 	}
2869 	/* No packets are queued */
2870 	if (t->tbf_q_len == 0) {
2871 		/* queue empty, send packet if enough tokens */
2872 		if (p_len <= t->tbf_n_tok) {
2873 			t->tbf_n_tok -= p_len;
2874 			mutex_exit(&t->tbf_lock);
2875 			tbf_send_packet(vifp, mp);
2876 			return;
2877 		} else {
2878 			/* Queue packet and timeout till later */
2879 			tbf_queue(vifp, mp);
2880 			ASSERT(vifp->v_timeout_id == 0);
2881 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2882 			    TBF_REPROCESS);
2883 		}
2884 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2885 		/* Finite queue length, so queue pkts and process queue */
2886 		tbf_queue(vifp, mp);
2887 		tbf_process_q(vifp);
2888 	} else {
2889 		/* Check that we have UDP header with IP header */
2890 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2891 		    sizeof (struct udphdr);
2892 
2893 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2894 			if (!pullupmsg(mp, hdr_length)) {
2895 				BUMP_MIB(ill->ill_ip_mib,
2896 				    ipIfStatsOutDiscards);
2897 				ip_drop_output("tbf_control - pullup", mp, ill);
2898 				freemsg(mp);
2899 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2900 				    "vif %ld src 0x%x dst 0x%x\n",
2901 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
2902 				    ntohl(ipha->ipha_src),
2903 				    ntohl(ipha->ipha_dst)));
2904 				mutex_exit(&vifp->v_tbf->tbf_lock);
2905 				return;
2906 			} else
2907 				/* Have to reassign ipha after pullupmsg */
2908 				ipha = (ipha_t *)mp->b_rptr;
2909 		}
2910 		/*
2911 		 * Queue length too much,
2912 		 * try to selectively dq, or queue and process
2913 		 */
2914 		if (!tbf_dq_sel(vifp, ipha)) {
2915 			ipst->ips_mrtstat->mrts_q_overflow++;
2916 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2917 			ip_drop_output("mrts_q_overflow", mp, ill);
2918 			freemsg(mp);
2919 		} else {
2920 			tbf_queue(vifp, mp);
2921 			tbf_process_q(vifp);
2922 		}
2923 	}
2924 	if (t->tbf_q_len == 0) {
2925 		id = vifp->v_timeout_id;
2926 		vifp->v_timeout_id = 0;
2927 	}
2928 	mutex_exit(&vifp->v_tbf->tbf_lock);
2929 	if (id != 0)
2930 		(void) untimeout(id);
2931 }
2932 
2933 /*
2934  * Adds a packet to the tbf queue at the interface.
2935  * The ipha is for mcastgrp destination for phyint and encap.
2936  */
2937 static void
tbf_queue(struct vif * vifp,mblk_t * mp)2938 tbf_queue(struct vif *vifp, mblk_t *mp)
2939 {
2940 	struct tbf	*t = vifp->v_tbf;
2941 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2942 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2943 
2944 	if (ipst->ips_ip_mrtdebug > 1) {
2945 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2946 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2947 	}
2948 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2949 
2950 	if (t->tbf_t == NULL) {
2951 		/* Queue was empty */
2952 		t->tbf_q = mp;
2953 	} else {
2954 		/* Insert at tail */
2955 		t->tbf_t->b_next = mp;
2956 	}
2957 	/* set new tail pointer */
2958 	t->tbf_t = mp;
2959 
2960 	mp->b_next = mp->b_prev = NULL;
2961 
2962 	t->tbf_q_len++;
2963 }
2964 
2965 /*
2966  * Process the queue at the vif interface.
2967  * Drops the tbf_lock when sending packets.
2968  *
2969  * NOTE : The caller should quntimeout if the queue length is 0.
2970  */
2971 static void
tbf_process_q(struct vif * vifp)2972 tbf_process_q(struct vif *vifp)
2973 {
2974 	mblk_t	*mp;
2975 	struct tbf	*t = vifp->v_tbf;
2976 	size_t	len;
2977 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2978 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2979 
2980 	if (ipst->ips_ip_mrtdebug > 1) {
2981 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2982 		    "tbf_process_q 1: vif %ld qlen = %d",
2983 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2984 	}
2985 
2986 	/*
2987 	 * Loop through the queue at the interface and send
2988 	 * as many packets as possible.
2989 	 */
2990 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2991 
2992 	while (t->tbf_q_len > 0) {
2993 		mp = t->tbf_q;
2994 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2995 
2996 		/* Determine if the packet can be sent */
2997 		if (len <= t->tbf_n_tok) {
2998 			/*
2999 			 * If so, reduce no. of tokens, dequeue the packet,
3000 			 * send the packet.
3001 			 */
3002 			t->tbf_n_tok -= len;
3003 
3004 			t->tbf_q = mp->b_next;
3005 			if (--t->tbf_q_len == 0) {
3006 				t->tbf_t = NULL;
3007 			}
3008 			mp->b_next = NULL;
3009 			/* Exit mutex before sending packet, then re-enter */
3010 			mutex_exit(&t->tbf_lock);
3011 			tbf_send_packet(vifp, mp);
3012 			mutex_enter(&t->tbf_lock);
3013 		} else
3014 			break;
3015 	}
3016 }
3017 
3018 /* Called at tbf timeout to update tokens, process q and reset timer.  */
3019 static void
tbf_reprocess_q(void * arg)3020 tbf_reprocess_q(void *arg)
3021 {
3022 	struct vif *vifp = arg;
3023 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3024 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3025 
3026 	mutex_enter(&vifp->v_tbf->tbf_lock);
3027 	vifp->v_timeout_id = 0;
3028 	tbf_update_tokens(vifp);
3029 
3030 	tbf_process_q(vifp);
3031 
3032 	if (vifp->v_tbf->tbf_q_len > 0) {
3033 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3034 		    TBF_REPROCESS);
3035 	}
3036 	mutex_exit(&vifp->v_tbf->tbf_lock);
3037 
3038 	if (ipst->ips_ip_mrtdebug > 1) {
3039 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3040 		    "tbf_reprcess_q: vif %ld timeout id = %p",
3041 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3042 	}
3043 }
3044 
3045 /*
3046  * Function that will selectively discard a member of the tbf queue,
3047  * based on the precedence value and the priority.
3048  *
3049  * NOTE : The caller should quntimeout if the queue length is 0.
3050  */
3051 static int
tbf_dq_sel(struct vif * vifp,ipha_t * ipha)3052 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3053 {
3054 	uint_t		p;
3055 	struct tbf		*t = vifp->v_tbf;
3056 	mblk_t		**np;
3057 	mblk_t		*last, *mp;
3058 	ill_t		*ill = vifp->v_ipif->ipif_ill;
3059 	ip_stack_t	*ipst = ill->ill_ipst;
3060 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3061 
3062 	if (ipst->ips_ip_mrtdebug > 1) {
3063 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3064 		    "dq_sel: vif %ld dst 0x%x",
3065 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3066 	}
3067 
3068 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3069 	p = priority(vifp, ipha);
3070 
3071 	np = &t->tbf_q;
3072 	last = NULL;
3073 	while ((mp = *np) != NULL) {
3074 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3075 			*np = mp->b_next;
3076 			/* If removing the last packet, fix the tail pointer */
3077 			if (mp == t->tbf_t)
3078 				t->tbf_t = last;
3079 			mp->b_prev = mp->b_next = NULL;
3080 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3081 			ip_drop_output("tbf_dq_send", mp, ill);
3082 			freemsg(mp);
3083 			/*
3084 			 * It's impossible for the queue to be empty, but
3085 			 * we check anyway.
3086 			 */
3087 			if (--t->tbf_q_len == 0) {
3088 				t->tbf_t = NULL;
3089 			}
3090 			ipst->ips_mrtstat->mrts_drop_sel++;
3091 			return (1);
3092 		}
3093 		np = &mp->b_next;
3094 		last = mp;
3095 	}
3096 	return (0);
3097 }
3098 
3099 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3100 static void
tbf_send_packet(struct vif * vifp,mblk_t * mp)3101 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3102 {
3103 	ipif_t		*ipif = vifp->v_ipif;
3104 	ill_t		*ill = ipif->ipif_ill;
3105 	ip_stack_t	*ipst = ill->ill_ipst;
3106 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3107 	ipha_t		*ipha;
3108 
3109 	ipha = (ipha_t *)mp->b_rptr;
3110 	/* If encap tunnel options */
3111 	if (vifp->v_flags & VIFF_TUNNEL)  {
3112 		ip_xmit_attr_t	ixas;
3113 
3114 		if (ipst->ips_ip_mrtdebug > 1) {
3115 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3116 			    "tbf_send_packet: ENCAP tunnel vif %ld",
3117 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
3118 		}
3119 		bzero(&ixas, sizeof (ixas));
3120 		ixas.ixa_flags =
3121 		    IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3122 		ixas.ixa_ipst = ipst;
3123 		ixas.ixa_ifindex = 0;
3124 		ixas.ixa_cred = kcred;
3125 		ixas.ixa_cpid = NOPID;
3126 		ixas.ixa_tsl = NULL;
3127 		ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3128 		ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3129 		ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3130 
3131 		/*
3132 		 * Feed into ip_output_simple which will set the ident field
3133 		 * and checksum the encapsulating header.
3134 		 * BSD gets the cached route vifp->v_route from ip_output()
3135 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
3136 		 * One could make multicast forwarding faster by putting an
3137 		 * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3138 		 */
3139 		(void) ip_output_simple(mp, &ixas);
3140 		ixa_cleanup(&ixas);
3141 		return;
3142 
3143 		/* phyint */
3144 	} else {
3145 		/* Need to loop back to members on the outgoing interface. */
3146 		ipaddr_t	dst;
3147 		ip_recv_attr_t	iras;
3148 		nce_t		*nce;
3149 
3150 		bzero(&iras, sizeof (iras));
3151 		iras.ira_flags = IRAF_IS_IPV4;
3152 		iras.ira_ill = iras.ira_rill = ill;
3153 		iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3154 		iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3155 		iras.ira_pktlen = ntohs(ipha->ipha_length);
3156 		iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3157 
3158 		dst = ipha->ipha_dst;
3159 		if (ill_hasmembers_v4(ill, dst)) {
3160 			iras.ira_flags |= IRAF_LOOPBACK_COPY;
3161 		}
3162 		if (ipst->ips_ip_mrtdebug > 1) {
3163 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3164 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3165 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3166 		}
3167 		/*
3168 		 * Find an NCE which matches the nexthop.
3169 		 * For a pt-pt interface we use the other end of the pt-pt
3170 		 * link.
3171 		 */
3172 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3173 			dst = ipif->ipif_pp_dst_addr;
3174 			nce = arp_nce_init(ill, dst, ill->ill_net_type);
3175 		} else {
3176 			nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3177 		}
3178 		if (nce == NULL) {
3179 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3180 			ip_drop_output("tbf_send_packet - no nce", mp, ill);
3181 			freemsg(mp);
3182 			return;
3183 		}
3184 
3185 		/*
3186 		 * We don't remeber the incoming ill. Thus we
3187 		 * pretend the  packet arrived on the outbound ill. This means
3188 		 * statistics for input errors will be increased on the wrong
3189 		 * ill but that isn't a big deal.
3190 		 */
3191 		ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
3192 		    0);
3193 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3194 
3195 		nce_refrele(nce);
3196 	}
3197 }
3198 
3199 /*
3200  * Determine the current time and then the elapsed time (between the last time
3201  * and time now).  Update the no. of tokens in the bucket.
3202  */
3203 static void
tbf_update_tokens(struct vif * vifp)3204 tbf_update_tokens(struct vif *vifp)
3205 {
3206 	timespec_t	tp;
3207 	hrtime_t	tm;
3208 	struct tbf	*t = vifp->v_tbf;
3209 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3210 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3211 
3212 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3213 
3214 	/* Time in secs and nsecs, rate limit in kbits/sec */
3215 	gethrestime(&tp);
3216 
3217 	/*LINTED*/
3218 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3219 
3220 	/*
3221 	 * This formula is actually
3222 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3223 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3224 	 *
3225 	 * The (1000/1024) was introduced in add_vif to optimize
3226 	 * this divide into a shift.
3227 	 */
3228 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3229 	t->tbf_last_pkt_t = tp;
3230 
3231 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3232 		t->tbf_n_tok = MAX_BKT_SIZE;
3233 	if (ipst->ips_ip_mrtdebug > 1) {
3234 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3235 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3236 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3237 	}
3238 }
3239 
3240 /*
3241  * Priority currently is based on port nos.
3242  * Different forwarding mechanisms have different ways
3243  * of obtaining the port no. Hence, the vif must be
3244  * given along with the packet itself.
3245  *
3246  */
3247 static int
priority(struct vif * vifp,ipha_t * ipha)3248 priority(struct vif *vifp, ipha_t *ipha)
3249 {
3250 	int prio;
3251 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3252 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3253 
3254 	/* Temporary hack; may add general packet classifier some day */
3255 
3256 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3257 
3258 	/*
3259 	 * The UDP port space is divided up into four priority ranges:
3260 	 * [0, 16384)	: unclassified - lowest priority
3261 	 * [16384, 32768)	: audio - highest priority
3262 	 * [32768, 49152)	: whiteboard - medium priority
3263 	 * [49152, 65536)	: video - low priority
3264 	 */
3265 
3266 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3267 		struct udphdr *udp =
3268 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3269 		switch (ntohs(udp->uh_dport) & 0xc000) {
3270 		case 0x4000:
3271 			prio = 70;
3272 			break;
3273 		case 0x8000:
3274 			prio = 60;
3275 			break;
3276 		case 0xc000:
3277 			prio = 55;
3278 			break;
3279 		default:
3280 			prio = 50;
3281 			break;
3282 		}
3283 		if (ipst->ips_ip_mrtdebug > 1) {
3284 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3285 			    "priority: port %x prio %d\n",
3286 			    ntohs(udp->uh_dport), prio);
3287 		}
3288 	} else
3289 		prio = 50;  /* default priority */
3290 	return (prio);
3291 }
3292 
3293 /*
3294  * End of token bucket filter modifications
3295  */
3296 
3297 
3298 
3299 /*
3300  * Produces data for netstat -M.
3301  */
3302 int
ip_mroute_stats(mblk_t * mp,ip_stack_t * ipst)3303 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3304 {
3305 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3306 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3307 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3308 		sizeof (struct mrtstat))) {
3309 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3310 		    (size_t)sizeof (struct mrtstat)));
3311 		return (0);
3312 	}
3313 	return (1);
3314 }
3315 
3316 /*
3317  * Sends info for SNMP's MIB.
3318  */
3319 int
ip_mroute_vif(mblk_t * mp,ip_stack_t * ipst)3320 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3321 {
3322 	struct vifctl 	vi;
3323 	vifi_t		vifi;
3324 
3325 	mutex_enter(&ipst->ips_numvifs_mutex);
3326 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3327 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3328 			continue;
3329 		/*
3330 		 * No locks here, an approximation is fine.
3331 		 */
3332 		vi.vifc_vifi = vifi;
3333 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3334 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3335 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
3336 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
3337 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
3338 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
3339 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
3340 
3341 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3342 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3343 			    (size_t)sizeof (vi)));
3344 			mutex_exit(&ipst->ips_numvifs_mutex);
3345 			return (0);
3346 		}
3347 	}
3348 	mutex_exit(&ipst->ips_numvifs_mutex);
3349 	return (1);
3350 }
3351 
3352 /*
3353  * Called by ip_snmp_get to send up multicast routing table.
3354  */
3355 int
ip_mroute_mrt(mblk_t * mp,ip_stack_t * ipst)3356 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3357 {
3358 	int			i, j;
3359 	struct mfc		*rt;
3360 	struct mfcctl	mfcc;
3361 
3362 	/*
3363 	 * Make sure multicast has not been turned off.
3364 	 */
3365 	if (is_mrouter_off(ipst))
3366 		return (1);
3367 
3368 	/* Loop over all hash buckets and their chains */
3369 	for (i = 0; i < MFCTBLSIZ; i++) {
3370 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3371 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3372 			mutex_enter(&rt->mfc_mutex);
3373 			if (rt->mfc_rte != NULL ||
3374 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3375 				mutex_exit(&rt->mfc_mutex);
3376 				continue;
3377 			}
3378 			mfcc.mfcc_origin = rt->mfc_origin;
3379 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3380 			mfcc.mfcc_parent = rt->mfc_parent;
3381 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3382 			mutex_enter(&ipst->ips_numvifs_mutex);
3383 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
3384 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3385 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3386 				mfcc.mfcc_ttls[j] = 0;
3387 			mutex_exit(&ipst->ips_numvifs_mutex);
3388 
3389 			mutex_exit(&rt->mfc_mutex);
3390 			if (!snmp_append_data(mp, (char *)&mfcc,
3391 			    sizeof (mfcc))) {
3392 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
3393 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3394 				    (size_t)sizeof (mfcc)));
3395 				return (0);
3396 			}
3397 		}
3398 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
3399 	}
3400 	return (1);
3401 }
3402