xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_mroute.c (revision e8249070)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /* Copyright (c) 1990 Mentat Inc. */
25 
26 /*
27  * Copyright (c) 2018, Joyent, Inc.
28  * Copyright 2024 Oxide Computer Company
29  */
30 
31 /*
32  * Procedures for the kernel part of DVMRP,
33  * a Distance-Vector Multicast Routing Protocol.
34  * (See RFC-1075)
35  * Written by David Waitzman, BBN Labs, August 1988.
36  * Modified by Steve Deering, Stanford, February 1989.
37  * Modified by Mark J. Steiglitz, Stanford, May, 1991
38  * Modified by Van Jacobson, LBL, January 1993
39  * Modified by Ajit Thyagarajan, PARC, August 1993
40  * Modified by Bill Fenner, PARC, April 1995
41  *
42  * MROUTING 3.5
43  */
44 
45 /*
46  * TODO
47  * - function pointer field in vif, void *vif_sendit()
48  */
49 
50 #include <sys/types.h>
51 #include <sys/stream.h>
52 #include <sys/stropts.h>
53 #include <sys/strlog.h>
54 #include <sys/systm.h>
55 #include <sys/ddi.h>
56 #include <sys/cmn_err.h>
57 #include <sys/zone.h>
58 
59 #include <sys/param.h>
60 #include <sys/socket.h>
61 #include <sys/vtrace.h>
62 #include <sys/debug.h>
63 #include <net/if.h>
64 #include <sys/sockio.h>
65 #include <netinet/in.h>
66 #include <net/if_dl.h>
67 
68 #include <inet/ipsec_impl.h>
69 #include <inet/common.h>
70 #include <inet/mi.h>
71 #include <inet/nd.h>
72 #include <inet/tunables.h>
73 #include <inet/mib2.h>
74 #include <netinet/ip6.h>
75 #include <inet/ip.h>
76 #include <inet/snmpcom.h>
77 
78 #include <netinet/igmp.h>
79 #include <netinet/igmp_var.h>
80 #include <netinet/udp.h>
81 #include <netinet/ip_mroute.h>
82 #include <inet/ip_multi.h>
83 #include <inet/ip_ire.h>
84 #include <inet/ip_ndp.h>
85 #include <inet/ip_if.h>
86 #include <inet/ipclassifier.h>
87 
88 #include <netinet/pim.h>
89 
90 
91 /*
92  * MT Design:
93  *
94  * There are three main data structures viftable, mfctable and tbftable that
95  * need to be protected against MT races.
96  *
97  * viftable is a fixed length array of vif structs. There is no lock to protect
98  * the whole array, instead each struct is protected by its own individual lock.
99  * The value of v_marks in conjunction with the value of v_refcnt determines the
100  * current state of a vif structure. One special state that needs mention
101  * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
102  * that vif is being initialized.
103  * Each structure is freed when the refcnt goes down to zero. If a delete comes
104  * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
105  * which prevents the struct from further use.  When the refcnt goes to zero
106  * the struct is freed and is marked VIF_MARK_NOTINUSE.
107  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
108  * from going away a refhold is put on the ipif before using it. See
109  * lock_good_vif() and unlock_good_vif().
110  *
111  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
112  * of the vif struct.
113  *
114  * tbftable is also a fixed length array of tbf structs and is only accessed
115  * via v_tbf.  It is protected by its own lock tbf_lock.
116  *
117  * Lock Ordering is
118  * v_lock --> tbf_lock
119  * v_lock --> ill_lock
120  *
121  * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
122  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
123  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
124  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
125  * protect the struct elements.
126  *
127  * mfc structs are dynamically allocated and are singly linked
128  * at the head of the chain. When an mfc structure is to be deleted
129  * it is marked condemned and so is the state in the bucket struct.
130  * When the last walker of the hash bucket exits, all the mfc structs
131  * marked condemned are freed.
132  *
133  * Locking Hierarchy:
134  * The bucket lock should be acquired before the mfc struct lock.
135  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
136  * operations on the bucket struct.
137  *
138  * last_encap_lock and numvifs_mutex should be acquired after
139  * acquiring vif or mfc locks. These locks protect some global variables.
140  *
141  * The statistics are not currently protected by a lock,
142  * causing the stats to be approximate, not exact.
143  */
144 
145 #define	NO_VIF	MAXVIFS		/* from mrouted, no route for src */
146 
147 /*
148  * Timeouts:
149  *	Upcall timeouts - BSD uses boolean_t mfc->expire and
150  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
151  *	SunOS 5.x uses mfc->timeout for each mfc.
152  *	Some Unixes are limited in the number of simultaneous timeouts
153  *	that can be run, SunOS 5.x does not have this restriction.
154  */
155 
156 /*
157  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
158  * UPCALL_EXPIRE is the number of timeouts before a particular upcall
159  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
160  */
161 #define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
162 #define		UPCALL_EXPIRE	6	/* number of timeouts	*/
163 
164 /*
165  * Hash function for a source, group entry
166  */
167 #define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
168 	((g) >> 20) ^ ((g) >> 10) ^ (g))
169 
170 #define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/
171 
172 /* Identify PIM packet that came on a Register interface */
173 #define	PIM_REGISTER_MARKER	0xffffffff
174 
175 /* Function declarations */
176 static int	add_mfc(struct mfcctl *, ip_stack_t *);
177 static int	add_vif(struct vifctl *, conn_t *, ip_stack_t *);
178 static int	del_mfc(struct mfcctl *, ip_stack_t *);
179 static int	del_vif(vifi_t *, ip_stack_t *);
180 static void	del_vifp(struct vif *);
181 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
182 static void	expire_upcalls(void *);
183 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
184 static void	free_queue(struct mfc *);
185 static int	get_assert(uchar_t *, ip_stack_t *);
186 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
187 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
188 static int	get_version(uchar_t *);
189 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
190 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
191 		    ipaddr_t, struct mfc *);
192 static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
193 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
194 static int	register_mforward(mblk_t *, ip_recv_attr_t *);
195 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
196 static int	set_assert(int *, ip_stack_t *);
197 
198 /*
199  * Token Bucket Filter functions
200  */
201 static int  priority(struct vif *, ipha_t *);
202 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
203 static int  tbf_dq_sel(struct vif *, ipha_t *);
204 static void tbf_process_q(struct vif *);
205 static void tbf_queue(struct vif *, mblk_t *);
206 static void tbf_reprocess_q(void *);
207 static void tbf_send_packet(struct vif *, mblk_t *);
208 static void tbf_update_tokens(struct vif *);
209 static void release_mfc(struct mfcb *);
210 
211 static boolean_t is_mrouter_off(ip_stack_t *);
212 /*
213  * Encapsulation packets
214  */
215 
216 #define	ENCAP_TTL	64
217 
218 /* prototype IP hdr for encapsulated packets */
219 static ipha_t multicast_encap_iphdr = {
220 	IP_SIMPLE_HDR_VERSION,
221 	0,				/* tos */
222 	sizeof (ipha_t),		/* total length */
223 	0,				/* id */
224 	0,				/* frag offset */
225 	ENCAP_TTL, IPPROTO_ENCAP,
226 	0,				/* checksum */
227 };
228 
229 /*
230  * Rate limit for assert notification messages, in nsec.
231  */
232 #define	ASSERT_MSG_TIME		3000000000
233 
234 
/* Take a reference on a vif; v_lock is acquired and dropped around the bump. */
#define	VIF_REFHOLD(vifp) {			\
	mutex_enter(&(vifp)->v_lock);		\
	++(vifp)->v_refcnt;			\
	mutex_exit(&(vifp)->v_lock);		\
}
240 
/*
 * Drop a vif reference; the caller already holds v_lock.
 * When the count reaches zero on a condemned vif, del_vifp() is invoked
 * with v_lock still held; on every other path v_lock is released here.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt == 0 &&				\
	    ((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		del_vifp(vifp);					\
	} else {						\
		mutex_exit(&(vifp)->v_lock);			\
	}							\
}
250 
/*
 * Drop a vif reference, taking v_lock first.
 * When the count reaches zero on a condemned vif, del_vifp() is invoked
 * with v_lock still held; on every other path v_lock is released here.
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	if (--(vifp)->v_refcnt == 0 &&				\
	    ((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		del_vifp(vifp);					\
	} else {						\
		mutex_exit(&(vifp)->v_lock);			\
	}							\
}
261 
/* Register a walker on an mfc hash bucket, under the bucket's mfcb_lock. */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	++(mfcb)->mfcb_refcnt;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}
268 
/*
 * Unregister a walker from an mfc hash bucket.  When the last walker leaves
 * a bucket marked condemned, release_mfc() is called with mfcb_lock held.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
		release_mfc(mfcb);				\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
278 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Entries with pending upcalls (mfc_rte != NULL) and entries marked
 * condemned are skipped.  On return rt points at the first matching entry
 * in the bucket's chain, or is NULL when there is none.
 * Every macro argument is parenthesized so that expression arguments
 * (e.g. "&table[i]" or "a | b") expand correctly.  o and g are evaluated
 * once per chain entry examined.
 * Type of service parameter to be added in the future!
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt; \
	(rt) = NULL; \
	for (_mb_rt = (mfcbp)->mfcb_mfc; _mb_rt != NULL; \
	    _mb_rt = _mb_rt->mfc_next) { \
		if ((_mb_rt->mfc_origin.s_addr == (o)) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \
			(rt) = _mb_rt; \
			break; \
		} \
	} \
}
300 
301 /*
302  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
303  * are inefficient. We use gethrestime() which returns a timespec_t with
304  * sec and nsec, the resolution is machine dependent.
305  * The following 2 macros have been changed to use nsec instead of usec.
306  */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 * Sets delta to (a - b) in nanoseconds; a and b must have tv_sec and
 * tv_nsec members.  Delta should be a hrtime_t.
 * Gaps of one or two seconds are handled by plain additions; any other
 * gap is multiplied out in 64-bit arithmetic, because 1000000000 * xxs
 * does not fit in an int once the gap reaches three seconds.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
		    (delta) += 1000000000; \
		    /*FALLTHROUGH*/ \
		case 1: \
		    (delta) += 1000000000; \
		    break; \
		default: \
		    (delta) += (1000000000LL * xxs); \
		} \
	} \
}
329 
/* True when time a is strictly earlier than time b (tv_sec, then tv_nsec). */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
332 
333 /*
334  * Handle MRT setsockopt commands to modify the multicast routing tables.
335  */
336 int
ip_mrouter_set(int cmd,conn_t * connp,int checkonly,uchar_t * data,int datalen)337 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
338     int datalen)
339 {
340 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
341 
342 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
343 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
344 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
345 		return (EACCES);
346 	}
347 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
348 
349 	if (checkonly) {
350 		/*
351 		 * do not do operation, just pretend to - new T_CHECK
352 		 * Note: Even routines further on can probably fail but
353 		 * this T_CHECK stuff is only to please XTI so it not
354 		 * necessary to be perfect.
355 		 */
356 		switch (cmd) {
357 		case MRT_INIT:
358 		case MRT_DONE:
359 		case MRT_ADD_VIF:
360 		case MRT_DEL_VIF:
361 		case MRT_ADD_MFC:
362 		case MRT_DEL_MFC:
363 		case MRT_ASSERT:
364 			return (0);
365 		default:
366 			return (EOPNOTSUPP);
367 		}
368 	}
369 
370 	/*
371 	 * make sure no command is issued after multicast routing has been
372 	 * turned off.
373 	 */
374 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
375 		if (is_mrouter_off(ipst))
376 			return (EINVAL);
377 	}
378 
379 	switch (cmd) {
380 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
381 	case MRT_DONE:	return (ip_mrouter_done(ipst));
382 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp, ipst));
383 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, ipst));
384 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
385 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
386 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
387 	default:	   return (EOPNOTSUPP);
388 	}
389 }
390 
391 /*
392  * Handle MRT getsockopt commands
393  */
394 int
ip_mrouter_get(int cmd,conn_t * connp,uchar_t * data)395 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
396 {
397 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
398 
399 	if (connp != ipst->ips_ip_g_mrouter)
400 		return (EACCES);
401 
402 	switch (cmd) {
403 	case MRT_VERSION:	return (get_version((uchar_t *)data));
404 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
405 	default:		return (EOPNOTSUPP);
406 	}
407 }
408 
409 /*
410  * Handle ioctl commands to obtain information from the cache.
411  * Called with shared access to IP. These are read_only ioctls.
412  */
413 /* ARGSUSED */
414 int
mrt_ioctl(ipif_t * ipif,sin_t * sin,queue_t * q,mblk_t * mp,ip_ioctl_cmd_t * ipip,void * if_req)415 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
416     ip_ioctl_cmd_t *ipip, void *if_req)
417 {
418 	mblk_t	*mp1;
419 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
420 	conn_t		*connp = Q_TO_CONN(q);
421 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
422 
423 	/* Existence verified in ip_wput_nondata */
424 	mp1 = mp->b_cont->b_cont;
425 
426 	switch (iocp->ioc_cmd) {
427 	case (SIOCGETVIFCNT):
428 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
429 	case (SIOCGETSGCNT):
430 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
431 	case (SIOCGETLSGCNT):
432 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
433 	default:
434 		return (EINVAL);
435 	}
436 }
437 
438 /*
439  * Returns the packet, byte, rpf-failure count for the source, group provided.
440  */
441 static int
get_sg_cnt(struct sioc_sg_req * req,ip_stack_t * ipst)442 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
443 {
444 	struct mfc *rt;
445 	struct mfcb *mfcbp;
446 
447 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
448 	MFCB_REFHOLD(mfcbp);
449 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
450 
451 	if (rt != NULL) {
452 		mutex_enter(&rt->mfc_mutex);
453 		req->pktcnt   = rt->mfc_pkt_cnt;
454 		req->bytecnt  = rt->mfc_byte_cnt;
455 		req->wrong_if = rt->mfc_wrong_if;
456 		mutex_exit(&rt->mfc_mutex);
457 	} else
458 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
459 
460 	MFCB_REFRELE(mfcbp);
461 	return (0);
462 }
463 
464 /*
465  * Returns the packet, byte, rpf-failure count for the source, group provided.
466  * Uses larger counters and IPv6 addresses.
467  */
468 /* ARGSUSED XXX until implemented */
469 static int
get_lsg_cnt(struct sioc_lsg_req * req,ip_stack_t * ipst)470 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
471 {
472 	/* XXX TODO SIOCGETLSGCNT */
473 	return (ENXIO);
474 }
475 
476 /*
477  * Returns the input and output packet and byte counts on the vif provided.
478  */
479 static int
get_vif_cnt(struct sioc_vif_req * req,ip_stack_t * ipst)480 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
481 {
482 	vifi_t vifi = req->vifi;
483 
484 	if (vifi >= ipst->ips_numvifs)
485 		return (EINVAL);
486 
487 	/*
488 	 * No locks here, an approximation is fine.
489 	 */
490 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
491 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
492 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
493 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
494 
495 	return (0);
496 }
497 
498 static int
get_version(uchar_t * data)499 get_version(uchar_t *data)
500 {
501 	int *v = (int *)data;
502 
503 	*v = 0x0305;	/* XXX !!!! */
504 
505 	return (0);
506 }
507 
508 /*
509  * Set PIM assert processing global.
510  */
511 static int
set_assert(int * i,ip_stack_t * ipst)512 set_assert(int *i, ip_stack_t *ipst)
513 {
514 	if ((*i != 1) && (*i != 0))
515 		return (EINVAL);
516 
517 	ipst->ips_pim_assert = *i;
518 
519 	return (0);
520 }
521 
522 /*
523  * Get PIM assert processing global.
524  */
525 static int
get_assert(uchar_t * data,ip_stack_t * ipst)526 get_assert(uchar_t *data, ip_stack_t *ipst)
527 {
528 	int *i = (int *)data;
529 
530 	*i = ipst->ips_pim_assert;
531 
532 	return (0);
533 }
534 
535 /*
536  * Enable multicast routing.
537  */
538 static int
ip_mrouter_init(conn_t * connp,uchar_t * data,int datalen,ip_stack_t * ipst)539 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
540 {
541 	int	*v;
542 
543 	if (data == NULL || (datalen != sizeof (int)))
544 		return (ENOPROTOOPT);
545 
546 	v = (int *)data;
547 	if (*v != 1)
548 		return (ENOPROTOOPT);
549 
550 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
551 	if (ipst->ips_ip_g_mrouter != NULL) {
552 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
553 		return (EADDRINUSE);
554 	}
555 
556 	/*
557 	 * MRT_INIT should only be allowed for RAW sockets, but we double
558 	 * check.
559 	 */
560 	if (!IPCL_IS_RAWIP(connp)) {
561 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
562 		return (EINVAL);
563 	}
564 
565 	ipst->ips_ip_g_mrouter = connp;
566 	connp->conn_multi_router = 1;
567 	/* In order for tunnels to work we have to turn ip_g_forward on */
568 	if (!WE_ARE_FORWARDING(ipst)) {
569 		if (ipst->ips_ip_mrtdebug > 1) {
570 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
571 			    "ip_mrouter_init: turning on forwarding");
572 		}
573 		ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
574 		ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
575 	}
576 
577 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
578 	return (0);
579 }
580 
581 void
ip_mrouter_stack_init(ip_stack_t * ipst)582 ip_mrouter_stack_init(ip_stack_t *ipst)
583 {
584 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
585 
586 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
587 	    KM_SLEEP);
588 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
589 	/*
590 	 * mfctable:
591 	 * Includes all mfcs, including waiting upcalls.
592 	 * Multiple mfcs per bucket.
593 	 */
594 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
595 	    KM_SLEEP);
596 	/*
597 	 * Define the token bucket filter structures.
598 	 * tbftable -> each vif has one of these for storing info.
599 	 */
600 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
601 
602 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
603 
604 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
605 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
606 }
607 
608 /*
609  * Disable multicast routing.
610  * Didn't use global timeout_val (BSD version), instead check the mfctable.
611  */
612 int
ip_mrouter_done(ip_stack_t * ipst)613 ip_mrouter_done(ip_stack_t *ipst)
614 {
615 	conn_t		*mrouter;
616 	vifi_t		vifi;
617 	struct mfc	*mfc_rt;
618 	int		i;
619 
620 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
621 	if (ipst->ips_ip_g_mrouter == NULL) {
622 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
623 		return (EINVAL);
624 	}
625 
626 	mrouter = ipst->ips_ip_g_mrouter;
627 
628 	if (ipst->ips_saved_ip_forwarding != -1) {
629 		if (ipst->ips_ip_mrtdebug > 1) {
630 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
631 			    "ip_mrouter_done: turning off forwarding");
632 		}
633 		ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
634 		ipst->ips_saved_ip_forwarding = -1;
635 	}
636 
637 	/*
638 	 * Always clear cache when vifs change.
639 	 * No need to get ipst->ips_last_encap_lock since we are running as
640 	 * a writer.
641 	 */
642 	mutex_enter(&ipst->ips_last_encap_lock);
643 	ipst->ips_last_encap_src = 0;
644 	ipst->ips_last_encap_vif = NULL;
645 	mutex_exit(&ipst->ips_last_encap_lock);
646 	mrouter->conn_multi_router = 0;
647 
648 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
649 
650 	/*
651 	 * For each phyint in use,
652 	 * disable promiscuous reception of all IP multicasts.
653 	 */
654 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
655 		struct vif *vifp = ipst->ips_vifs + vifi;
656 
657 		mutex_enter(&vifp->v_lock);
658 		/*
659 		 * if the vif is active mark it condemned.
660 		 */
661 		if (vifp->v_marks & VIF_MARK_GOOD) {
662 			ASSERT(vifp->v_ipif != NULL);
663 			ipif_refhold(vifp->v_ipif);
664 			/* Phyint only */
665 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
666 				ipif_t *ipif = vifp->v_ipif;
667 				ilm_t *ilm = vifp->v_ilm;
668 
669 				vifp->v_ilm = NULL;
670 				vifp->v_marks &= ~VIF_MARK_GOOD;
671 				vifp->v_marks |= VIF_MARK_CONDEMNED;
672 
673 				mutex_exit(&(vifp)->v_lock);
674 				if (ilm != NULL) {
675 					ill_t *ill = ipif->ipif_ill;
676 
677 					(void) ip_delmulti(ilm);
678 					ASSERT(ill->ill_mrouter_cnt > 0);
679 					atomic_dec_32(&ill->ill_mrouter_cnt);
680 				}
681 				mutex_enter(&vifp->v_lock);
682 			}
683 			ipif_refrele(vifp->v_ipif);
684 			/*
685 			 * decreases the refcnt added in add_vif.
686 			 * and release v_lock.
687 			 */
688 			VIF_REFRELE_LOCKED(vifp);
689 		} else {
690 			mutex_exit(&vifp->v_lock);
691 			continue;
692 		}
693 	}
694 
695 	mutex_enter(&ipst->ips_numvifs_mutex);
696 	ipst->ips_numvifs = 0;
697 	ipst->ips_pim_assert = 0;
698 	ipst->ips_reg_vif_num = ALL_VIFS;
699 	mutex_exit(&ipst->ips_numvifs_mutex);
700 
701 	/*
702 	 * Free upcall msgs.
703 	 * Go through mfctable and stop any outstanding upcall
704 	 * timeouts remaining on mfcs.
705 	 */
706 	for (i = 0; i < MFCTBLSIZ; i++) {
707 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
708 		ipst->ips_mfcs[i].mfcb_refcnt++;
709 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
710 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
711 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
712 		while (mfc_rt) {
713 			/* Free upcalls */
714 			mutex_enter(&mfc_rt->mfc_mutex);
715 			if (mfc_rt->mfc_rte != NULL) {
716 				if (mfc_rt->mfc_timeout_id != 0) {
717 					/*
718 					 * OK to drop the lock as we have
719 					 * a refcnt on the bucket. timeout
720 					 * can fire but it will see that
721 					 * mfc_timeout_id == 0 and not do
722 					 * anything. see expire_upcalls().
723 					 */
724 					mfc_rt->mfc_timeout_id = 0;
725 					mutex_exit(&mfc_rt->mfc_mutex);
726 					(void) untimeout(
727 					    mfc_rt->mfc_timeout_id);
728 					mfc_rt->mfc_timeout_id = 0;
729 					mutex_enter(&mfc_rt->mfc_mutex);
730 
731 					/*
732 					 * all queued upcall packets
733 					 * and mblk will be freed in
734 					 * release_mfc().
735 					 */
736 				}
737 			}
738 
739 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
740 
741 			mutex_exit(&mfc_rt->mfc_mutex);
742 			mfc_rt = mfc_rt->mfc_next;
743 		}
744 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
745 	}
746 
747 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
748 	ipst->ips_ip_g_mrouter = NULL;
749 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
750 	return (0);
751 }
752 
753 void
ip_mrouter_stack_destroy(ip_stack_t * ipst)754 ip_mrouter_stack_destroy(ip_stack_t *ipst)
755 {
756 	struct mfcb *mfcbp;
757 	struct mfc  *rt;
758 	int i;
759 
760 	for (i = 0; i < MFCTBLSIZ; i++) {
761 		mfcbp = &ipst->ips_mfcs[i];
762 
763 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
764 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
765 			    i);
766 
767 			mfcbp->mfcb_mfc = rt->mfc_next;
768 			free_queue(rt);
769 			mi_free(rt);
770 		}
771 	}
772 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
773 	ipst->ips_vifs = NULL;
774 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
775 	ipst->ips_mrtstat = NULL;
776 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
777 	ipst->ips_mfcs = NULL;
778 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
779 	ipst->ips_tbfs = NULL;
780 
781 	mutex_destroy(&ipst->ips_last_encap_lock);
782 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
783 }
784 
785 static boolean_t
is_mrouter_off(ip_stack_t * ipst)786 is_mrouter_off(ip_stack_t *ipst)
787 {
788 	conn_t	*mrouter;
789 
790 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
791 	if (ipst->ips_ip_g_mrouter == NULL) {
792 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
793 		return (B_TRUE);
794 	}
795 
796 	mrouter = ipst->ips_ip_g_mrouter;
797 	if (mrouter->conn_multi_router == 0) {
798 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
799 		return (B_TRUE);
800 	}
801 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
802 	return (B_FALSE);
803 }
804 
805 static void
unlock_good_vif(struct vif * vifp)806 unlock_good_vif(struct vif *vifp)
807 {
808 	ASSERT(vifp->v_ipif != NULL);
809 	ipif_refrele(vifp->v_ipif);
810 	VIF_REFRELE(vifp);
811 }
812 
813 static boolean_t
lock_good_vif(struct vif * vifp)814 lock_good_vif(struct vif *vifp)
815 {
816 	mutex_enter(&vifp->v_lock);
817 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
818 		mutex_exit(&vifp->v_lock);
819 		return (B_FALSE);
820 	}
821 
822 	ASSERT(vifp->v_ipif != NULL);
823 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
824 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
825 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
826 		mutex_exit(&vifp->v_lock);
827 		return (B_FALSE);
828 	}
829 	ipif_refhold_locked(vifp->v_ipif);
830 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
831 	vifp->v_refcnt++;
832 	mutex_exit(&vifp->v_lock);
833 	return (B_TRUE);
834 }
835 
836 /*
837  * Add a vif to the vif table.
838  */
839 static int
add_vif(struct vifctl * vifcp,conn_t * connp,ip_stack_t * ipst)840 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
841 {
842 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
843 	ipif_t		*ipif;
844 	int		error = 0;
845 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
846 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
847 	ilm_t		*ilm;
848 	ill_t		*ill;
849 
850 	ASSERT(connp != NULL);
851 
852 	if (vifcp->vifc_vifi >= MAXVIFS)
853 		return (EINVAL);
854 
855 	if (is_mrouter_off(ipst))
856 		return (EINVAL);
857 
858 	mutex_enter(&vifp->v_lock);
859 	/*
860 	 * Viftable entry should be 0.
861 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
862 	 * initialized.
863 	 *
864 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
865 	 * request while the delete is in progress, mrouted only sends add
866 	 * requests when a new interface is added and the new interface cannot
867 	 * have the same vifi as an existing interface. We make sure that
868 	 * ill_delete will block till the vif is deleted by adding a refcnt
869 	 * to ipif in del_vif().
870 	 */
871 	if (vifp->v_lcl_addr.s_addr != 0 ||
872 	    vifp->v_marks != 0 ||
873 	    vifp->v_refcnt != 0) {
874 		mutex_exit(&vifp->v_lock);
875 		return (EADDRINUSE);
876 	}
877 
878 	/* Incoming vif should not be 0 */
879 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
880 		mutex_exit(&vifp->v_lock);
881 		return (EINVAL);
882 	}
883 
884 	vifp->v_refcnt++;
885 	mutex_exit(&vifp->v_lock);
886 	/* Find the interface with the local address */
887 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
888 	    IPCL_ZONEID(connp), ipst);
889 	if (ipif == NULL) {
890 		VIF_REFRELE(vifp);
891 		return (EADDRNOTAVAIL);
892 	}
893 
894 	if (ipst->ips_ip_mrtdebug > 1) {
895 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
896 		    "add_vif: src 0x%x enter",
897 		    vifcp->vifc_lcl_addr.s_addr);
898 	}
899 
900 	mutex_enter(&vifp->v_lock);
901 	/*
902 	 * Always clear cache when vifs change.
903 	 * Needed to ensure that src isn't left over from before vif was added.
904 	 * No need to get last_encap_lock, since we are running as a writer.
905 	 */
906 
907 	mutex_enter(&ipst->ips_last_encap_lock);
908 	ipst->ips_last_encap_src = 0;
909 	ipst->ips_last_encap_vif = NULL;
910 	mutex_exit(&ipst->ips_last_encap_lock);
911 
912 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
913 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
914 			cmn_err(CE_WARN,
915 			    "add_vif: source route tunnels not supported\n");
916 			VIF_REFRELE_LOCKED(vifp);
917 			ipif_refrele(ipif);
918 			return (EOPNOTSUPP);
919 		}
920 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
921 
922 	} else {
923 		/* Phyint or Register vif */
924 		if (vifcp->vifc_flags & VIFF_REGISTER) {
925 			/*
926 			 * Note: Since all IPPROTO_IP level options (including
927 			 * MRT_ADD_VIF) are done exclusively via
928 			 * ip_optmgmt_writer(), a lock is not necessary to
929 			 * protect reg_vif_num.
930 			 */
931 			mutex_enter(&ipst->ips_numvifs_mutex);
932 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
933 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
934 				mutex_exit(&ipst->ips_numvifs_mutex);
935 			} else {
936 				mutex_exit(&ipst->ips_numvifs_mutex);
937 				VIF_REFRELE_LOCKED(vifp);
938 				ipif_refrele(ipif);
939 				return (EADDRINUSE);
940 			}
941 		}
942 
943 		/* Make sure the interface supports multicast */
944 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
945 			VIF_REFRELE_LOCKED(vifp);
946 			ipif_refrele(ipif);
947 			if (vifcp->vifc_flags & VIFF_REGISTER) {
948 				mutex_enter(&ipst->ips_numvifs_mutex);
949 				ipst->ips_reg_vif_num = ALL_VIFS;
950 				mutex_exit(&ipst->ips_numvifs_mutex);
951 			}
952 			return (EOPNOTSUPP);
953 		}
954 		/* Enable promiscuous reception of all IP mcasts from the if */
955 		mutex_exit(&vifp->v_lock);
956 
957 		ill = ipif->ipif_ill;
958 		if (IS_UNDER_IPMP(ill))
959 			ill = ipmp_ill_hold_ipmp_ill(ill);
960 
961 		if (ill == NULL) {
962 			ilm = NULL;
963 		} else {
964 			ilm = ip_addmulti(&ipv6_all_zeros, ill,
965 			    ipif->ipif_zoneid, &error);
966 			if (ilm != NULL)
967 				atomic_inc_32(&ill->ill_mrouter_cnt);
968 			if (IS_UNDER_IPMP(ipif->ipif_ill)) {
969 				ill_refrele(ill);
970 				ill = ipif->ipif_ill;
971 			}
972 		}
973 
974 		mutex_enter(&vifp->v_lock);
975 		/*
976 		 * since we released the lock lets make sure that
977 		 * ip_mrouter_done() has not been called.
978 		 */
979 		if (ilm == NULL || is_mrouter_off(ipst)) {
980 			if (ilm != NULL) {
981 				(void) ip_delmulti(ilm);
982 				ASSERT(ill->ill_mrouter_cnt > 0);
983 				atomic_dec_32(&ill->ill_mrouter_cnt);
984 			}
985 			if (vifcp->vifc_flags & VIFF_REGISTER) {
986 				mutex_enter(&ipst->ips_numvifs_mutex);
987 				ipst->ips_reg_vif_num = ALL_VIFS;
988 				mutex_exit(&ipst->ips_numvifs_mutex);
989 			}
990 			VIF_REFRELE_LOCKED(vifp);
991 			ipif_refrele(ipif);
992 			return (error?error:EINVAL);
993 		}
994 		vifp->v_ilm = ilm;
995 	}
996 	/* Define parameters for the tbf structure */
997 	vifp->v_tbf = v_tbf;
998 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
999 	vifp->v_tbf->tbf_n_tok = 0;
1000 	vifp->v_tbf->tbf_q_len = 0;
1001 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1002 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1003 
1004 	vifp->v_flags = vifcp->vifc_flags;
1005 	vifp->v_threshold = vifcp->vifc_threshold;
1006 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1007 	vifp->v_ipif = ipif;
1008 	ipif_refrele(ipif);
1009 	/* Scaling up here, allows division by 1024 in critical code.	*/
1010 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1011 	vifp->v_timeout_id = 0;
1012 	/* initialize per vif pkt counters */
1013 	vifp->v_pkt_in = 0;
1014 	vifp->v_pkt_out = 0;
1015 	vifp->v_bytes_in = 0;
1016 	vifp->v_bytes_out = 0;
1017 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1018 
1019 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1020 	mutex_enter(&ipst->ips_numvifs_mutex);
1021 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1022 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1023 	mutex_exit(&ipst->ips_numvifs_mutex);
1024 
1025 	if (ipst->ips_ip_mrtdebug > 1) {
1026 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1027 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1028 		    vifcp->vifc_vifi,
1029 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1030 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1031 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1032 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1033 	}
1034 
1035 	vifp->v_marks = VIF_MARK_GOOD;
1036 	mutex_exit(&vifp->v_lock);
1037 	return (0);
1038 }
1039 
1040 
1041 /* Delete a vif from the vif table. */
1042 static void
del_vifp(struct vif * vifp)1043 del_vifp(struct vif *vifp)
1044 {
1045 	struct tbf	*t = vifp->v_tbf;
1046 	mblk_t  *mp0;
1047 	vifi_t  vifi;
1048 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1049 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1050 
1051 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1052 	ASSERT(t != NULL);
1053 
1054 	if (ipst->ips_ip_mrtdebug > 1) {
1055 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1056 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1057 	}
1058 
1059 	if (vifp->v_timeout_id != 0) {
1060 		(void) untimeout(vifp->v_timeout_id);
1061 		vifp->v_timeout_id = 0;
1062 	}
1063 
1064 	/*
1065 	 * Free packets queued at the interface.
1066 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1067 	 */
1068 	mutex_enter(&t->tbf_lock);
1069 	while (t->tbf_q != NULL) {
1070 		mp0 = t->tbf_q;
1071 		t->tbf_q = t->tbf_q->b_next;
1072 		mp0->b_prev = mp0->b_next = NULL;
1073 		freemsg(mp0);
1074 	}
1075 	mutex_exit(&t->tbf_lock);
1076 
1077 	/*
1078 	 * Always clear cache when vifs change.
1079 	 * No need to get last_encap_lock since we are running as a writer.
1080 	 */
1081 	mutex_enter(&ipst->ips_last_encap_lock);
1082 	if (vifp == ipst->ips_last_encap_vif) {
1083 		ipst->ips_last_encap_vif = NULL;
1084 		ipst->ips_last_encap_src = 0;
1085 	}
1086 	mutex_exit(&ipst->ips_last_encap_lock);
1087 
1088 	mutex_destroy(&t->tbf_lock);
1089 
1090 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1091 
1092 	/* Adjust numvifs down */
1093 	mutex_enter(&ipst->ips_numvifs_mutex);
1094 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1095 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1096 			break;
1097 	ipst->ips_numvifs = vifi;
1098 	mutex_exit(&ipst->ips_numvifs_mutex);
1099 
1100 	bzero(vifp, sizeof (*vifp));
1101 }
1102 
1103 static int
del_vif(vifi_t * vifip,ip_stack_t * ipst)1104 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1105 {
1106 	struct vif	*vifp = ipst->ips_vifs + *vifip;
1107 
1108 	if (*vifip >= ipst->ips_numvifs)
1109 		return (EINVAL);
1110 
1111 	mutex_enter(&vifp->v_lock);
1112 	/*
1113 	 * Not initialized
1114 	 * Here we are not looking at the vif that is being initialized
1115 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1116 	 */
1117 	if (vifp->v_lcl_addr.s_addr == 0 ||
1118 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1119 		mutex_exit(&vifp->v_lock);
1120 		return (EADDRNOTAVAIL);
1121 	}
1122 
1123 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1124 	vifp->v_marks &= ~VIF_MARK_GOOD;
1125 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1126 
1127 	/* Phyint only */
1128 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1129 		ipif_t *ipif = vifp->v_ipif;
1130 		ilm_t *ilm = vifp->v_ilm;
1131 
1132 		vifp->v_ilm = NULL;
1133 
1134 		ASSERT(ipif != NULL);
1135 		/*
1136 		 * should be OK to drop the lock as we
1137 		 * have marked this as CONDEMNED.
1138 		 */
1139 		mutex_exit(&(vifp)->v_lock);
1140 		if (ilm != NULL) {
1141 			(void) ip_delmulti(ilm);
1142 			ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1143 			atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1144 		}
1145 		mutex_enter(&(vifp)->v_lock);
1146 	}
1147 
1148 	if (vifp->v_flags & VIFF_REGISTER) {
1149 		mutex_enter(&ipst->ips_numvifs_mutex);
1150 		ipst->ips_reg_vif_num = ALL_VIFS;
1151 		mutex_exit(&ipst->ips_numvifs_mutex);
1152 	}
1153 
1154 	/*
1155 	 * decreases the refcnt added in add_vif.
1156 	 */
1157 	VIF_REFRELE_LOCKED(vifp);
1158 	return (0);
1159 }
1160 
/*
 * Add an mfc entry.
 *
 * Installs or updates the multicast forwarding cache entry for the
 * (origin, group) pair described by mfccp:
 *  - if MFCFIND() locates an entry, only its parent and ttls are updated;
 *  - otherwise every non-condemned entry for the pair that has queued
 *    upcall packets is filled in and its queued packets are handed to
 *    ip_mdq();
 *  - if no such entry was found, an existing non-condemned entry for the
 *    pair is refilled, or a new entry is allocated and linked at the head
 *    of the hash chain.
 *
 * Returns 0 on success, EINVAL when the parent vif is out of range or has
 * no ipif or when the multicast router is off, and ENOBUFS when a new entry
 * cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;		/* number of entries found with an upcall */
	int i;
	struct mfcb *mfcbp;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	/* A real parent vif must have an ipif attached. */
	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the hash bucket for the duration of the lookup/update. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		/* Copy the ttl threshold for each vif currently in range. */
		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			/* More than one match is unexpected; warn about it. */
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 * Each packet is unlinked under mfc_mutex, and the
			 * mutex is dropped around the ip_mdq() call.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check now that the bucket lock is held. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Refill an existing non-condemned entry for this pair. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next   = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}
1359 
1360 /*
1361  * Fills in mfc structure from mrouted mfcctl.
1362  */
1363 static void
fill_route(struct mfc * rt,struct mfcctl * mfccp,ip_stack_t * ipst)1364 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1365 {
1366 	int i;
1367 
1368 	rt->mfc_origin		= mfccp->mfcc_origin;
1369 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1370 	rt->mfc_parent		= mfccp->mfcc_parent;
1371 	mutex_enter(&ipst->ips_numvifs_mutex);
1372 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1373 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1374 	}
1375 	mutex_exit(&ipst->ips_numvifs_mutex);
1376 	/* Initialize pkt counters per src-grp */
1377 	rt->mfc_pkt_cnt	= 0;
1378 	rt->mfc_byte_cnt	= 0;
1379 	rt->mfc_wrong_if	= 0;
1380 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1381 
1382 }
1383 
1384 static void
free_queue(struct mfc * mfcp)1385 free_queue(struct mfc *mfcp)
1386 {
1387 	struct rtdetq *rte0;
1388 
1389 	/*
1390 	 * Drop all queued upcall packets.
1391 	 * Free the mbuf with the pkt.
1392 	 */
1393 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1394 		mfcp->mfc_rte = rte0->rte_next;
1395 		freemsg(rte0->mp);
1396 		mi_free((char *)rte0);
1397 	}
1398 }
1399 /*
1400  * go thorugh the hash bucket and free all the entries marked condemned.
1401  */
1402 void
release_mfc(struct mfcb * mfcbp)1403 release_mfc(struct mfcb *mfcbp)
1404 {
1405 	struct mfc *current_mfcp;
1406 	struct mfc *prev_mfcp;
1407 
1408 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1409 
1410 	while (current_mfcp != NULL) {
1411 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1412 			if (current_mfcp == mfcbp->mfcb_mfc) {
1413 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1414 				free_queue(current_mfcp);
1415 				mi_free(current_mfcp);
1416 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1417 				continue;
1418 			}
1419 			ASSERT(prev_mfcp != NULL);
1420 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1421 			free_queue(current_mfcp);
1422 			mi_free(current_mfcp);
1423 			current_mfcp = NULL;
1424 		} else {
1425 			prev_mfcp = current_mfcp;
1426 		}
1427 
1428 		current_mfcp = prev_mfcp->mfc_next;
1429 
1430 	}
1431 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1432 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1433 }
1434 
1435 /*
1436  * Delete an mfc entry.
1437  */
1438 static int
del_mfc(struct mfcctl * mfccp,ip_stack_t * ipst)1439 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1440 {
1441 	struct in_addr	origin;
1442 	struct in_addr	mcastgrp;
1443 	struct mfc	*rt;
1444 	uint_t		hash;
1445 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1446 
1447 	origin = mfccp->mfcc_origin;
1448 	mcastgrp = mfccp->mfcc_mcastgrp;
1449 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1450 
1451 	if (ipst->ips_ip_mrtdebug > 1) {
1452 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1453 		    "del_mfc: o %x g %x",
1454 		    ntohl(origin.s_addr),
1455 		    ntohl(mcastgrp.s_addr));
1456 	}
1457 
1458 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1459 
1460 	/* Find mfc in mfctable, finds only entries without upcalls */
1461 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1462 		mutex_enter(&rt->mfc_mutex);
1463 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1464 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1465 		    rt->mfc_rte == NULL &&
1466 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1467 			break;
1468 		mutex_exit(&rt->mfc_mutex);
1469 	}
1470 
1471 	/*
1472 	 * Return if there was an upcall (mfc_rte != NULL,
1473 	 * or rt not in mfctable.
1474 	 */
1475 	if (rt == NULL) {
1476 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1477 		return (EADDRNOTAVAIL);
1478 	}
1479 
1480 
1481 	/*
1482 	 * no need to hold lock as we have a reference.
1483 	 */
1484 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1485 	/* error checking */
1486 	if (rt->mfc_timeout_id != 0) {
1487 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1488 		/*
1489 		 * Its ok to drop the lock,  the struct cannot be freed
1490 		 * since we have a ref on the hash bucket.
1491 		 */
1492 		rt->mfc_timeout_id = 0;
1493 		mutex_exit(&rt->mfc_mutex);
1494 		(void) untimeout(rt->mfc_timeout_id);
1495 		mutex_enter(&rt->mfc_mutex);
1496 	}
1497 
1498 	ASSERT(rt->mfc_rte == NULL);
1499 
1500 
1501 	/*
1502 	 * Delete the entry from the cache
1503 	 */
1504 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1505 	mutex_exit(&rt->mfc_mutex);
1506 
1507 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1508 
1509 	return (0);
1510 }
1511 
#define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *                   1 - pkt came in on tunnel
 *
 * When a forwarding cache entry exists the packet is handed to ip_mdq() and
 * its return value is returned.  Otherwise a copy of the packet is queued on
 * an mfc entry awaiting the routing daemon, and for the first such packet an
 * IGMPMSG_NOCACHE upcall is delivered to the mrouter conn.  -1 is also
 * returned when the multicast router is off, when memory cannot be
 * allocated, or when the upcall queue has overflowed.
 */
int
ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
	ill_t		*ill = ira->ira_ill;
	struct mfc	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;	/* count of source-routed pkts seen */
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb	*mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ill_t		*rill = ira->ira_rill;	/* restored after the upcall */

	ASSERT(ira->ira_pktlen == msgdsize(mp));

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Classify the packet: PIM register, encapsulated tunnel, or other. */
	dst = ipha->ipha_dst;
	if (ira->ira_flags & IRAF_PIM_REGISTER)
		pim_reg_packet = B_TRUE;
	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
		tunnel_src = ira->ira_mroute_tunnel;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    "  REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/*
		 * Header too short to carry the tunnel option, or the option
		 * is not LSRR.
		 */
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	/* The bucket reference is held until every return path below. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Forward as if received on the register vif's ill. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;	/* copy queued on rte */
		mblk_t		*mp_copy = NULL;	/* copy sent as upcall */
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop is left with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			/* From here on both paths hold mfc_rt->mfc_mutex. */
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The upcall header is written over the copy's data. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				/* Report the first vif whose ill is "ill". */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/*
			 * Pass to RAWIP.  The ill pointers in ira are cleared
			 * for the duration of the call and restored after.
			 */
			ira->ira_ill = ira->ira_rill = NULL;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
			ira->ira_ill = ill;
			ira->ira_rill = rill;
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/*
			 * NOTE(review): mp_copy is only allocated when a new
			 * mfc is created, which appears to imply npkts == 0;
			 * so mp_copy looks to be NULL here - confirm.
			 */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward - upcall already waiting",
			    mp_copy, ill);
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
		/*
		 * Common failure exit: callers have already dropped
		 * mfc_rt->mfc_mutex if they held it.  Release the bucket and
		 * free whatever was allocated for this packet.
		 */
	error_return:
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		/* Only free the mfc if it was allocated by this call. */
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward error", mp_copy, ill);
			freemsg(mp_copy);
		}
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1912 
1913 /*
1914  * Clean up the mfctable cache entry if upcall is not serviced.
1915  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1916  */
1917 static void
expire_upcalls(void * arg)1918 expire_upcalls(void *arg)
1919 {
1920 	struct mfc *mfc_rt = arg;
1921 	uint_t hash;
1922 	struct mfc *prev_mfc, *mfc0;
1923 	ip_stack_t	*ipst;
1924 	conn_t		*mrouter;
1925 
1926 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1927 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1928 		return;
1929 	}
1930 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1931 	mrouter = ipst->ips_ip_g_mrouter;
1932 
1933 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1934 	if (ipst->ips_ip_mrtdebug > 1) {
1935 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1936 		    "expire_upcalls: hash %d s %x g %x",
1937 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1938 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1939 	}
1940 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1941 	mutex_enter(&mfc_rt->mfc_mutex);
1942 	/*
1943 	 * if timeout has been set to zero, than the
1944 	 * entry has been filled, no need to delete it.
1945 	 */
1946 	if (mfc_rt->mfc_timeout_id == 0)
1947 		goto done;
1948 	ipst->ips_mrtstat->mrts_cache_cleanups++;
1949 	mfc_rt->mfc_timeout_id = 0;
1950 
1951 	/* Determine entry to be cleaned up in cache table. */
1952 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1953 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1954 		if (mfc0 == mfc_rt)
1955 			break;
1956 
1957 	/* del_mfc takes care of gone mfcs */
1958 	ASSERT(prev_mfc != NULL);
1959 	ASSERT(mfc0 != NULL);
1960 
1961 	/*
1962 	 * Delete the entry from the cache
1963 	 */
1964 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1965 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1966 
1967 	/*
1968 	 * release_mfc will drop all queued upcall packets.
1969 	 * and will free the mbuf with the pkt, if, timing info.
1970 	 */
1971 done:
1972 	mutex_exit(&mfc_rt->mfc_mutex);
1973 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1974 }
1975 
1976 /*
1977  * Packet forwarding routine once entry in the cache is made.
1978  */
1979 static int
ip_mdq(mblk_t * mp,ipha_t * ipha,ill_t * ill,ipaddr_t tunnel_src,struct mfc * rt)1980 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
1981     struct mfc *rt)
1982 {
1983 	vifi_t vifi;
1984 	struct vif *vifp;
1985 	ipaddr_t dst = ipha->ipha_dst;
1986 	size_t  plen = msgdsize(mp);
1987 	vifi_t num_of_vifs;
1988 	ip_stack_t	*ipst = ill->ill_ipst;
1989 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1990 	ip_recv_attr_t	iras;
1991 
1992 	if (ipst->ips_ip_mrtdebug > 1) {
1993 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1994 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
1995 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1996 		    ill->ill_name);
1997 	}
1998 
1999 	/* Macro to send packet on vif */
2000 #define	MC_SEND(ipha, mp, vifp, dst) { \
2001 	if ((vifp)->v_flags & VIFF_TUNNEL) \
2002 		encap_send((ipha), (mp), (vifp), (dst)); \
2003 	else if ((vifp)->v_flags & VIFF_REGISTER) \
2004 		register_send((ipha), (mp), (vifp), (dst)); \
2005 	else \
2006 		phyint_send((ipha), (mp), (vifp), (dst)); \
2007 }
2008 
2009 	vifi = rt->mfc_parent;
2010 
2011 	/*
2012 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2013 	 * Mrouted had no route.
2014 	 * We wanted the route installed in the mfctable to prevent multiple
2015 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2016 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
2017 	 * 3.6.
2018 	 */
2019 	if (vifi == NO_VIF) {
2020 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2021 		    ill->ill_name));
2022 		if (ipst->ips_ip_mrtdebug > 1) {
2023 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2024 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2025 		}
2026 		return (-1);	/* drop pkt */
2027 	}
2028 
2029 	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
2030 		return (-1);
2031 	/*
2032 	 * The MFC entries are not cleaned up when an ipif goes
2033 	 * away thus this code has to guard against an MFC referencing
2034 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
2035 	 * sets the v_ipif to NULL when the ipif disappears.
2036 	 */
2037 	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
2038 
2039 	if (vifi >= ipst->ips_numvifs) {
2040 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2041 		    "%d ill %s viftable ill %s\n",
2042 		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2043 		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2044 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2045 		return (-1);
2046 	}
2047 	/*
2048 	 * Don't forward if it didn't arrive from the parent vif for its
2049 	 * origin.
2050 	 */
2051 	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
2052 	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2053 		/* Came in the wrong interface */
2054 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2055 			"numvifs %d ill %s viftable ill %s\n",
2056 			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2057 			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
2058 		if (ipst->ips_ip_mrtdebug > 1) {
2059 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2060 			    "ip_mdq: arrived wrong if, vifi %d ill "
2061 			    "%s viftable ill %s\n",
2062 			    (int)vifi, ill->ill_name,
2063 			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2064 		}
2065 		ipst->ips_mrtstat->mrts_wrong_if++;
2066 		rt->mfc_wrong_if++;
2067 
2068 		/*
2069 		 * If we are doing PIM assert processing and we are forwarding
2070 		 * packets on this interface, and it is a broadcast medium
2071 		 * interface (and not a tunnel), send a message to the routing.
2072 		 *
2073 		 * We use the first ipif on the list, since it's all we have.
2074 		 * Chances are the ipif_flags are the same for ipifs on the ill.
2075 		 */
2076 		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
2077 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2078 		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
2079 			mblk_t		*mp_copy;
2080 			struct igmpmsg	*im;
2081 
2082 			/* TODO could copy header and dup rest */
2083 			mp_copy = copymsg(mp);
2084 			if (mp_copy == NULL) {
2085 				ipst->ips_mrtstat->mrts_fwd_drop++;
2086 				ip1dbg(("ip_mdq: out of memory "
2087 				    "for mblk, mp_copy\n"));
2088 				unlock_good_vif(&ipst->ips_vifs[vifi]);
2089 				return (-1);
2090 			}
2091 
2092 			im = (struct igmpmsg *)mp_copy->b_rptr;
2093 			im->im_msgtype = IGMPMSG_WRONGVIF;
2094 			im->im_mbz = 0;
2095 			im->im_vif = (ushort_t)vifi;
2096 			/* Pass to RAWIP */
2097 
2098 			bzero(&iras, sizeof (iras));
2099 			iras.ira_flags = IRAF_IS_IPV4;
2100 			iras.ira_ip_hdr_length =
2101 			    IPH_HDR_LENGTH(mp_copy->b_rptr);
2102 			iras.ira_pktlen = msgdsize(mp_copy);
2103 			iras.ira_ttl = ipha->ipha_ttl;
2104 			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2105 			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2106 		}
2107 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2108 		if (tunnel_src != 0)
2109 			return (1);
2110 		else
2111 			return (0);
2112 	}
2113 	/*
2114 	 * If I sourced this packet, it counts as output, else it was input.
2115 	 */
2116 	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
2117 		ipst->ips_vifs[vifi].v_pkt_out++;
2118 		ipst->ips_vifs[vifi].v_bytes_out += plen;
2119 	} else {
2120 		ipst->ips_vifs[vifi].v_pkt_in++;
2121 		ipst->ips_vifs[vifi].v_bytes_in += plen;
2122 	}
2123 	mutex_enter(&rt->mfc_mutex);
2124 	rt->mfc_pkt_cnt++;
2125 	rt->mfc_byte_cnt += plen;
2126 	mutex_exit(&rt->mfc_mutex);
2127 	unlock_good_vif(&ipst->ips_vifs[vifi]);
2128 	/*
2129 	 * For each vif, decide if a copy of the packet should be forwarded.
2130 	 * Forward if:
2131 	 *		- the vif threshold ttl is non-zero AND
2132 	 *		- the pkt ttl exceeds the vif's threshold
2133 	 * A non-zero mfc_ttl indicates that the vif is part of
2134 	 * the output set for the mfc entry.
2135 	 */
2136 	mutex_enter(&ipst->ips_numvifs_mutex);
2137 	num_of_vifs = ipst->ips_numvifs;
2138 	mutex_exit(&ipst->ips_numvifs_mutex);
2139 	for (vifp = ipst->ips_vifs, vifi = 0;
2140 	    vifi < num_of_vifs;
2141 	    vifp++, vifi++) {
2142 		if (!lock_good_vif(vifp))
2143 			continue;
2144 		if ((rt->mfc_ttls[vifi] > 0) &&
2145 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2146 			/*
2147 			 * lock_good_vif should not have succedded if
2148 			 * v_ipif is null.
2149 			 */
2150 			ASSERT(vifp->v_ipif != NULL);
2151 			vifp->v_pkt_out++;
2152 			vifp->v_bytes_out += plen;
2153 			MC_SEND(ipha, mp, vifp, dst);
2154 			ipst->ips_mrtstat->mrts_fwd_out++;
2155 		}
2156 		unlock_good_vif(vifp);
2157 	}
2158 	if (tunnel_src != 0)
2159 		return (1);
2160 	else
2161 		return (0);
2162 }
2163 
2164 /*
2165  * Send the packet on physical interface.
2166  * Caller assumes can continue to use mp on return.
2167  */
2168 /* ARGSUSED */
2169 static void
phyint_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2170 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2171 {
2172 	mblk_t	*mp_copy;
2173 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2174 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2175 
2176 	/* Make a new reference to the packet */
2177 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2178 	if (mp_copy == NULL) {
2179 		ipst->ips_mrtstat->mrts_fwd_drop++;
2180 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2181 		return;
2182 	}
2183 	if (vifp->v_rate_limit <= 0)
2184 		tbf_send_packet(vifp, mp_copy);
2185 	else  {
2186 		if (ipst->ips_ip_mrtdebug > 1) {
2187 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2188 			    "phyint_send: tbf_contr rate %d "
2189 			    "vifp 0x%p mp 0x%p dst 0x%x",
2190 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2191 		}
2192 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2193 	}
2194 }
2195 
2196 /*
2197  * Send the whole packet for REGISTER encapsulation to PIM daemon
2198  * Caller assumes it can continue to use mp on return.
2199  */
2200 /* ARGSUSED */
2201 static void
register_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2202 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2203 {
2204 	struct igmpmsg	*im;
2205 	mblk_t		*mp_copy;
2206 	ipha_t		*ipha_copy;
2207 	ill_t		*ill = vifp->v_ipif->ipif_ill;
2208 	ip_stack_t	*ipst = ill->ill_ipst;
2209 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2210 	ip_recv_attr_t	iras;
2211 
2212 	if (ipst->ips_ip_mrtdebug > 1) {
2213 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2214 		    "register_send: src %x, dst %x\n",
2215 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2216 	}
2217 
2218 	/*
2219 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2220 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2221 	 * ethernet driver will.
2222 	 */
2223 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2224 	if (mp_copy == NULL) {
2225 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2226 		if (ipst->ips_ip_mrtdebug > 3) {
2227 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2228 			    "register_send: allocb failure.");
2229 		}
2230 		return;
2231 	}
2232 
2233 	/*
2234 	 * Bump write pointer to account for igmpmsg being added.
2235 	 */
2236 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2237 
2238 	/*
2239 	 * Chain packet to new mblk_t.
2240 	 */
2241 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2242 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2243 		if (ipst->ips_ip_mrtdebug > 3) {
2244 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2245 			    "register_send: copymsg failure.");
2246 		}
2247 		freeb(mp_copy);
2248 		return;
2249 	}
2250 
2251 	/*
2252 	 * icmp_input() asserts that IP version field is set to an
2253 	 * appropriate version. Hence, the struct igmpmsg that this really
2254 	 * becomes, needs to have the correct IP version field.
2255 	 */
2256 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2257 	*ipha_copy = multicast_encap_iphdr;
2258 
2259 	/*
2260 	 * The kernel uses the struct igmpmsg header to encode the messages to
2261 	 * the multicast routing daemon. Fill in the fields in the header
2262 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2263 	 */
2264 	im = (struct igmpmsg *)mp_copy->b_rptr;
2265 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2266 	im->im_src.s_addr = ipha->ipha_src;
2267 	im->im_dst.s_addr = ipha->ipha_dst;
2268 
2269 	/*
2270 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2271 	 * header with renamed fields and the multicast routing daemon uses
2272 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2273 	 */
2274 	im->im_mbz = 0;
2275 
2276 	++ipst->ips_mrtstat->mrts_upcalls;
2277 	if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2278 	    !canputnext(mrouter->conn_rq)) {
2279 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2280 		if (ipst->ips_ip_mrtdebug > 3) {
2281 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2282 			    "register_send: register upcall failure.");
2283 		}
2284 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2285 		ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2286 		freemsg(mp_copy);
2287 	} else {
2288 		/* Pass to RAWIP */
2289 		bzero(&iras, sizeof (iras));
2290 		iras.ira_flags = IRAF_IS_IPV4;
2291 		iras.ira_ip_hdr_length = sizeof (ipha_t);
2292 		iras.ira_pktlen = msgdsize(mp_copy);
2293 		iras.ira_ttl = ipha->ipha_ttl;
2294 		(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2295 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2296 	}
2297 }
2298 
2299 /*
2300  * pim_validate_cksum handles verification of the checksum in the
2301  * pim header.  For PIM Register packets, the checksum is calculated
2302  * across the PIM header only.  For all other packets, the checksum
2303  * is for the PIM header and remainder of the packet.
2304  *
2305  * returns: B_TRUE, if checksum is okay.
2306  *          B_FALSE, if checksum is not valid.
2307  */
2308 static boolean_t
pim_validate_cksum(mblk_t * mp,ipha_t * ip,struct pim * pimp)2309 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2310 {
2311 	mblk_t *mp_dup;
2312 
2313 	if ((mp_dup = dupmsg(mp)) == NULL)
2314 		return (B_FALSE);
2315 
2316 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2317 	if (pimp->pim_type == PIM_REGISTER)
2318 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2319 	if (IP_CSUM(mp_dup, 0, 0)) {
2320 		freemsg(mp_dup);
2321 		return (B_FALSE);
2322 	}
2323 	freemsg(mp_dup);
2324 	return (B_TRUE);
2325 }
2326 
2327 /*
2328  * Process PIM protocol packets i.e. IP Protocol 103.
2329  * Register messages are decapsulated and sent onto multicast forwarding.
2330  *
2331  * Return NULL for a bad packet that is discarded here.
2332  * Return mp if the message is OK and should be handed to "raw" receivers.
2333  * Callers of pim_input() may need to reinitialize variables that were copied
2334  * from the mblk as this calls pullupmsg().
2335  */
2336 mblk_t *
pim_input(mblk_t * mp,ip_recv_attr_t * ira)2337 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2338 {
2339 	ipha_t		*eip, *ip;
2340 	int		iplen, pimlen, iphlen;
2341 	struct pim	*pimp;	/* pointer to a pim struct */
2342 	uint32_t	*reghdr;
2343 	ill_t		*ill = ira->ira_ill;
2344 	ip_stack_t	*ipst = ill->ill_ipst;
2345 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2346 
2347 	/*
2348 	 * Pullup the msg for PIM protocol processing.
2349 	 */
2350 	if (pullupmsg(mp, -1) == 0) {
2351 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2352 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2353 		ip_drop_input("mrts_pim_nomemory", mp, ill);
2354 		freemsg(mp);
2355 		return (NULL);
2356 	}
2357 
2358 	ip = (ipha_t *)mp->b_rptr;
2359 	iplen = ip->ipha_length;
2360 	iphlen = IPH_HDR_LENGTH(ip);
2361 	pimlen = ntohs(iplen) - iphlen;
2362 
2363 	/*
2364 	 * Validate lengths
2365 	 */
2366 	if (pimlen < PIM_MINLEN) {
2367 		++ipst->ips_mrtstat->mrts_pim_malformed;
2368 		if (ipst->ips_ip_mrtdebug > 1) {
2369 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2370 			    "pim_input: length not at least minlen");
2371 		}
2372 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2373 		ip_drop_input("mrts_pim_malformed", mp, ill);
2374 		freemsg(mp);
2375 		return (NULL);
2376 	}
2377 
2378 	/*
2379 	 * Point to the PIM header.
2380 	 */
2381 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2382 
2383 	/*
2384 	 * Check the version number.
2385 	 */
2386 	if (pimp->pim_vers != PIM_VERSION) {
2387 		++ipst->ips_mrtstat->mrts_pim_badversion;
2388 		if (ipst->ips_ip_mrtdebug > 1) {
2389 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2390 			    "pim_input: unknown version of PIM");
2391 		}
2392 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2393 		ip_drop_input("mrts_pim_badversion", mp, ill);
2394 		freemsg(mp);
2395 		return (NULL);
2396 	}
2397 
2398 	/*
2399 	 * Validate the checksum
2400 	 */
2401 	if (!pim_validate_cksum(mp, ip, pimp)) {
2402 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2403 		if (ipst->ips_ip_mrtdebug > 1) {
2404 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2405 			    "pim_input: invalid checksum");
2406 		}
2407 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2408 		ip_drop_input("pim_rcv_badcsum", mp, ill);
2409 		freemsg(mp);
2410 		return (NULL);
2411 	}
2412 
2413 	if (pimp->pim_type != PIM_REGISTER)
2414 		return (mp);
2415 
2416 	reghdr = (uint32_t *)(pimp + 1);
2417 	eip = (ipha_t *)(reghdr + 1);
2418 
2419 	/*
2420 	 * check if the inner packet is destined to mcast group
2421 	 */
2422 	if (!CLASSD(eip->ipha_dst)) {
2423 		++ipst->ips_mrtstat->mrts_pim_badregisters;
2424 		if (ipst->ips_ip_mrtdebug > 1) {
2425 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2426 			    "pim_input: Inner pkt not mcast .. !");
2427 		}
2428 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2429 		ip_drop_input("mrts_pim_badregisters", mp, ill);
2430 		freemsg(mp);
2431 		return (NULL);
2432 	}
2433 	if (ipst->ips_ip_mrtdebug > 1) {
2434 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2435 		    "register from %x, to %x, len %d",
2436 		    ntohl(eip->ipha_src),
2437 		    ntohl(eip->ipha_dst),
2438 		    ntohs(eip->ipha_length));
2439 	}
2440 	/*
2441 	 * If the null register bit is not set, decapsulate
2442 	 * the packet before forwarding it.
2443 	 * Avoid this in no register vif
2444 	 */
2445 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2446 	    ipst->ips_reg_vif_num != ALL_VIFS) {
2447 		mblk_t *mp_copy;
2448 		uint_t saved_pktlen;
2449 
2450 		/* Copy the message */
2451 		if ((mp_copy = copymsg(mp)) == NULL) {
2452 			++ipst->ips_mrtstat->mrts_pim_nomemory;
2453 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2454 			ip_drop_input("mrts_pim_nomemory", mp, ill);
2455 			freemsg(mp);
2456 			return (NULL);
2457 		}
2458 
2459 		/*
2460 		 * Decapsulate the packet and give it to
2461 		 * register_mforward.
2462 		 */
2463 		mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2464 		saved_pktlen = ira->ira_pktlen;
2465 		ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2466 		if (register_mforward(mp_copy, ira) != 0) {
2467 			/* register_mforward already called ip_drop_input */
2468 			freemsg(mp);
2469 			ira->ira_pktlen = saved_pktlen;
2470 			return (NULL);
2471 		}
2472 		ira->ira_pktlen = saved_pktlen;
2473 	}
2474 
2475 	/*
2476 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2477 	 * PIM socket. For Solaris it is done right after pim_input() is
2478 	 * called.
2479 	 */
2480 	return (mp);
2481 }
2482 
2483 /*
2484  * PIM sparse mode hook.  Called by pim_input after decapsulating
2485  * the packet. Loop back the packet, as if we have received it.
2486  * In pim_input() we have to check if the destination is a multicast address.
2487  */
2488 static int
register_mforward(mblk_t * mp,ip_recv_attr_t * ira)2489 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2490 {
2491 	ire_t		*ire;
2492 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2493 	ill_t		*ill = ira->ira_ill;
2494 	ip_stack_t	*ipst = ill->ill_ipst;
2495 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2496 
2497 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2498 
2499 	if (ipst->ips_ip_mrtdebug > 3) {
2500 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2501 		    "register_mforward: src %x, dst %x\n",
2502 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2503 	}
2504 	/*
2505 	 * Need to pass in to ip_mforward() the information that the
2506 	 * packet has arrived on the register_vif. We mark it with
2507 	 * the IRAF_PIM_REGISTER attribute.
2508 	 * pim_input verified that the (inner) destination is multicast,
2509 	 * hence we skip the generic code in ip_input.
2510 	 */
2511 	ira->ira_flags |= IRAF_PIM_REGISTER;
2512 	++ipst->ips_mrtstat->mrts_pim_regforwards;
2513 
2514 	if (!CLASSD(ipha->ipha_dst)) {
2515 		ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2516 		    ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2517 		    NULL, NULL, NULL);
2518 	} else {
2519 		ire = ire_multicast(ill);
2520 	}
2521 	ASSERT(ire != NULL);
2522 	/* Normally this will return the IRE_MULTICAST */
2523 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2524 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2525 		ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2526 		freemsg(mp);
2527 		ire_refrele(ire);
2528 		return (-1);
2529 	}
2530 	ASSERT(ire->ire_type & IRE_MULTICAST);
2531 	(*ire->ire_recvfn)(ire, mp, ipha, ira);
2532 	ire_refrele(ire);
2533 
2534 	return (0);
2535 }
2536 
2537 /*
2538  * Send an encapsulated packet.
2539  * Caller assumes can continue to use mp when routine returns.
2540  */
2541 /* ARGSUSED */
2542 static void
encap_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2543 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2544 {
2545 	mblk_t	*mp_copy;
2546 	ipha_t	*ipha_copy;
2547 	size_t	len;
2548 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2549 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2550 
2551 	if (ipst->ips_ip_mrtdebug > 1) {
2552 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2553 		    "encap_send: vif %ld enter",
2554 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
2555 	}
2556 	len = ntohs(ipha->ipha_length);
2557 
2558 	/*
2559 	 * Copy the old packet & pullup it's IP header into the
2560 	 * new mbuf so we can modify it.  Try to fill the new
2561 	 * mbuf since if we don't the ethernet driver will.
2562 	 */
2563 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2564 	if (mp_copy == NULL)
2565 		return;
2566 	mp_copy->b_rptr += 32;
2567 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2568 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2569 		freeb(mp_copy);
2570 		return;
2571 	}
2572 
2573 	/*
2574 	 * Fill in the encapsulating IP header.
2575 	 * Remote tunnel dst in rmt_addr, from add_vif().
2576 	 */
2577 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2578 	*ipha_copy = multicast_encap_iphdr;
2579 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2580 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2581 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2582 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2583 	ASSERT(ipha_copy->ipha_ident == 0);
2584 
2585 	/* Turn the encapsulated IP header back into a valid one. */
2586 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2587 	ipha->ipha_ttl--;
2588 	ipha->ipha_hdr_checksum = 0;
2589 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2590 
2591 	ipha_copy->ipha_ttl = ipha->ipha_ttl;
2592 
2593 	if (ipst->ips_ip_mrtdebug > 1) {
2594 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2595 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2596 	}
2597 	if (vifp->v_rate_limit <= 0)
2598 		tbf_send_packet(vifp, mp_copy);
2599 	else
2600 		/* ipha is from the original header */
2601 		tbf_control(vifp, mp_copy, ipha);
2602 }
2603 
2604 /*
2605  * De-encapsulate a packet and feed it back through IP input if it
2606  * matches one of our multicast tunnels.
2607  *
2608  * This routine is called whenever IP gets a packet with prototype
2609  * IPPROTO_ENCAP and a local destination address and the packet didn't
2610  * match one of our configured IP-in-IP tunnels.
2611  */
2612 void
ip_mroute_decap(mblk_t * mp,ip_recv_attr_t * ira)2613 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2614 {
2615 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2616 	ipha_t		*ipha_encap;
2617 	int		hlen = IPH_HDR_LENGTH(ipha);
2618 	int		hlen_encap;
2619 	ipaddr_t	src;
2620 	struct vif	*vifp;
2621 	ire_t		*ire;
2622 	ill_t		*ill = ira->ira_ill;
2623 	ip_stack_t	*ipst = ill->ill_ipst;
2624 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2625 
2626 	/* Make sure we have all of the inner header */
2627 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2628 	if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2629 		ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2630 		if (ipha == NULL) {
2631 			ipst->ips_mrtstat->mrts_bad_tunnel++;
2632 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2633 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
2634 			freemsg(mp);
2635 			return;
2636 		}
2637 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
2638 	}
2639 	hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2640 	if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2641 		ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2642 		if (ipha == NULL) {
2643 			ipst->ips_mrtstat->mrts_bad_tunnel++;
2644 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2645 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
2646 			freemsg(mp);
2647 			return;
2648 		}
2649 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
2650 	}
2651 
2652 	/*
2653 	 * Dump the packet if it's not to a multicast destination or if
2654 	 * we don't have an encapsulating tunnel with the source.
2655 	 * Note:  This code assumes that the remote site IP address
2656 	 * uniquely identifies the tunnel (i.e., that this site has
2657 	 * at most one tunnel with the remote site).
2658 	 */
2659 	if (!CLASSD(ipha_encap->ipha_dst)) {
2660 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2661 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2662 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2663 		ip_drop_input("mrts_bad_tunnel", mp, ill);
2664 		freemsg(mp);
2665 		return;
2666 	}
2667 	src = (ipaddr_t)ipha->ipha_src;
2668 	mutex_enter(&ipst->ips_last_encap_lock);
2669 	if (src != ipst->ips_last_encap_src) {
2670 		struct vif *vife;
2671 
2672 		vifp = ipst->ips_vifs;
2673 		vife = vifp + ipst->ips_numvifs;
2674 		ipst->ips_last_encap_src = src;
2675 		ipst->ips_last_encap_vif = 0;
2676 		for (; vifp < vife; ++vifp) {
2677 			if (!lock_good_vif(vifp))
2678 				continue;
2679 			if (vifp->v_rmt_addr.s_addr == src) {
2680 				if (vifp->v_flags & VIFF_TUNNEL)
2681 					ipst->ips_last_encap_vif = vifp;
2682 				if (ipst->ips_ip_mrtdebug > 1) {
2683 					(void) mi_strlog(mrouter->conn_rq,
2684 					    1, SL_TRACE,
2685 					    "ip_mroute_decap: good tun "
2686 					    "vif %ld with %x",
2687 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
2688 					    ntohl(src));
2689 				}
2690 				unlock_good_vif(vifp);
2691 				break;
2692 			}
2693 			unlock_good_vif(vifp);
2694 		}
2695 	}
2696 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
2697 		mutex_exit(&ipst->ips_last_encap_lock);
2698 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2699 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2700 		ip_drop_input("mrts_bad_tunnel", mp, ill);
2701 		freemsg(mp);
2702 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2703 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2704 		return;
2705 	}
2706 	mutex_exit(&ipst->ips_last_encap_lock);
2707 
2708 	/*
2709 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2710 	 * verify that the packet arrived over the correct vif.)
2711 	 */
2712 	ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2713 	ira->ira_mroute_tunnel = src;
2714 	mp->b_rptr += hlen;
2715 	ira->ira_pktlen -= hlen;
2716 	ira->ira_ip_hdr_length = hlen_encap;
2717 
2718 	/*
2719 	 * We don't redo any of the filtering in ill_input_full_v4 and we
2720 	 * have checked that all of ipha_encap and any IP options are
2721 	 * pulled up. Hence we call ire_recv_multicast_v4 directly.
2722 	 * However, we have to check for RSVP as in ip_input_full_v4
2723 	 * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2724 	 * to the rsvpd.
2725 	 */
2726 	if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2727 	    ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2728 		ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2729 		    ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2730 		    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2731 	} else {
2732 		ire = ire_multicast(ill);
2733 	}
2734 	ASSERT(ire != NULL);
2735 	/* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2736 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2737 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2738 		ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2739 		freemsg(mp);
2740 		ire_refrele(ire);
2741 		return;
2742 	}
2743 	ire->ire_ib_pkt_count++;
2744 	ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2745 	(*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2746 	ire_refrele(ire);
2747 }
2748 
2749 /*
2750  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2751  * (stream closed).  Called as writer.
2752  */
2753 void
reset_mrt_vif_ipif(ipif_t * ipif)2754 reset_mrt_vif_ipif(ipif_t *ipif)
2755 {
2756 	vifi_t vifi, tmp_vifi;
2757 	vifi_t num_of_vifs;
2758 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2759 
2760 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2761 
2762 	mutex_enter(&ipst->ips_numvifs_mutex);
2763 	num_of_vifs = ipst->ips_numvifs;
2764 	mutex_exit(&ipst->ips_numvifs_mutex);
2765 
2766 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2767 		tmp_vifi = vifi - 1;
2768 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2769 			(void) del_vif(&tmp_vifi, ipst);
2770 		}
2771 	}
2772 }
2773 
2774 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2775 void
reset_mrt_ill(ill_t * ill)2776 reset_mrt_ill(ill_t *ill)
2777 {
2778 	struct mfc	*rt;
2779 	struct rtdetq	*rte;
2780 	int		i;
2781 	ip_stack_t	*ipst = ill->ill_ipst;
2782 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2783 	timeout_id_t	id;
2784 
2785 	for (i = 0; i < MFCTBLSIZ; i++) {
2786 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2787 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2788 			if (ipst->ips_ip_mrtdebug > 1) {
2789 				(void) mi_strlog(mrouter->conn_rq, 1,
2790 				    SL_TRACE,
2791 				    "reset_mrt_ill: mfctable [%d]", i);
2792 			}
2793 			while (rt != NULL) {
2794 				mutex_enter(&rt->mfc_mutex);
2795 				while ((rte = rt->mfc_rte) != NULL) {
2796 					if (rte->ill == ill &&
2797 					    (id = rt->mfc_timeout_id) != 0) {
2798 						/*
2799 						 * Its ok to drop the lock,  the
2800 						 * struct cannot be freed since
2801 						 * we have a ref on the hash
2802 						 * bucket.
2803 						 */
2804 						mutex_exit(&rt->mfc_mutex);
2805 						(void) untimeout(id);
2806 						mutex_enter(&rt->mfc_mutex);
2807 					}
2808 					if (rte->ill == ill) {
2809 						if (ipst->ips_ip_mrtdebug > 1) {
2810 						(void) mi_strlog(
2811 						    mrouter->conn_rq,
2812 						    1, SL_TRACE,
2813 						    "reset_mrt_ill: "
2814 						    "ill 0x%p", (void *)ill);
2815 						}
2816 						rt->mfc_rte = rte->rte_next;
2817 						freemsg(rte->mp);
2818 						mi_free((char *)rte);
2819 					}
2820 				}
2821 				mutex_exit(&rt->mfc_mutex);
2822 				rt = rt->mfc_next;
2823 			}
2824 		}
2825 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
2826 	}
2827 }
2828 
2829 /*
2830  * Token bucket filter module.
2831  * The ipha is for mcastgrp destination for phyint and encap.
2832  */
2833 static void
tbf_control(struct vif * vifp,mblk_t * mp,ipha_t * ipha)2834 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2835 {
2836 	size_t	p_len =  msgdsize(mp);
2837 	struct tbf	*t    = vifp->v_tbf;
2838 	timeout_id_t id = 0;
2839 	ill_t		*ill = vifp->v_ipif->ipif_ill;
2840 	ip_stack_t	*ipst = ill->ill_ipst;
2841 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2842 
2843 	/* Drop if packet is too large */
2844 	if (p_len > MAX_BKT_SIZE) {
2845 		ipst->ips_mrtstat->mrts_pkt2large++;
2846 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2847 		ip_drop_output("tbf_control - too large", mp, ill);
2848 		freemsg(mp);
2849 		return;
2850 	}
2851 	if (ipst->ips_ip_mrtdebug > 1) {
2852 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2853 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2854 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2855 		    ntohl(ipha->ipha_dst));
2856 	}
2857 
2858 	mutex_enter(&t->tbf_lock);
2859 
2860 	tbf_update_tokens(vifp);
2861 
2862 	/*
2863 	 * If there are enough tokens,
2864 	 * and the queue is empty, send this packet out.
2865 	 */
2866 	if (ipst->ips_ip_mrtdebug > 1) {
2867 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2868 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2869 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2870 		    t->tbf_q_len);
2871 	}
2872 	/* No packets are queued */
2873 	if (t->tbf_q_len == 0) {
2874 		/* queue empty, send packet if enough tokens */
2875 		if (p_len <= t->tbf_n_tok) {
2876 			t->tbf_n_tok -= p_len;
2877 			mutex_exit(&t->tbf_lock);
2878 			tbf_send_packet(vifp, mp);
2879 			return;
2880 		} else {
2881 			/* Queue packet and timeout till later */
2882 			tbf_queue(vifp, mp);
2883 			ASSERT(vifp->v_timeout_id == 0);
2884 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2885 			    TBF_REPROCESS);
2886 		}
2887 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2888 		/* Finite queue length, so queue pkts and process queue */
2889 		tbf_queue(vifp, mp);
2890 		tbf_process_q(vifp);
2891 	} else {
2892 		/* Check that we have UDP header with IP header */
2893 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2894 		    sizeof (struct udphdr);
2895 
2896 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2897 			if (!pullupmsg(mp, hdr_length)) {
2898 				BUMP_MIB(ill->ill_ip_mib,
2899 				    ipIfStatsOutDiscards);
2900 				ip_drop_output("tbf_control - pullup", mp, ill);
2901 				freemsg(mp);
2902 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2903 				    "vif %ld src 0x%x dst 0x%x\n",
2904 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
2905 				    ntohl(ipha->ipha_src),
2906 				    ntohl(ipha->ipha_dst)));
2907 				mutex_exit(&vifp->v_tbf->tbf_lock);
2908 				return;
2909 			} else
2910 				/* Have to reassign ipha after pullupmsg */
2911 				ipha = (ipha_t *)mp->b_rptr;
2912 		}
2913 		/*
2914 		 * Queue length too much,
2915 		 * try to selectively dq, or queue and process
2916 		 */
2917 		if (!tbf_dq_sel(vifp, ipha)) {
2918 			ipst->ips_mrtstat->mrts_q_overflow++;
2919 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2920 			ip_drop_output("mrts_q_overflow", mp, ill);
2921 			freemsg(mp);
2922 		} else {
2923 			tbf_queue(vifp, mp);
2924 			tbf_process_q(vifp);
2925 		}
2926 	}
2927 	if (t->tbf_q_len == 0) {
2928 		id = vifp->v_timeout_id;
2929 		vifp->v_timeout_id = 0;
2930 	}
2931 	mutex_exit(&vifp->v_tbf->tbf_lock);
2932 	if (id != 0)
2933 		(void) untimeout(id);
2934 }
2935 
2936 /*
2937  * Adds a packet to the tbf queue at the interface.
2938  * The ipha is for mcastgrp destination for phyint and encap.
2939  */
2940 static void
tbf_queue(struct vif * vifp,mblk_t * mp)2941 tbf_queue(struct vif *vifp, mblk_t *mp)
2942 {
2943 	struct tbf	*t = vifp->v_tbf;
2944 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2945 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2946 
2947 	if (ipst->ips_ip_mrtdebug > 1) {
2948 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2949 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2950 	}
2951 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2952 
2953 	if (t->tbf_t == NULL) {
2954 		/* Queue was empty */
2955 		t->tbf_q = mp;
2956 	} else {
2957 		/* Insert at tail */
2958 		t->tbf_t->b_next = mp;
2959 	}
2960 	/* set new tail pointer */
2961 	t->tbf_t = mp;
2962 
2963 	mp->b_next = mp->b_prev = NULL;
2964 
2965 	t->tbf_q_len++;
2966 }
2967 
2968 /*
2969  * Process the queue at the vif interface.
2970  * Drops the tbf_lock when sending packets.
2971  *
2972  * NOTE : The caller should quntimeout if the queue length is 0.
2973  */
2974 static void
tbf_process_q(struct vif * vifp)2975 tbf_process_q(struct vif *vifp)
2976 {
2977 	mblk_t	*mp;
2978 	struct tbf	*t = vifp->v_tbf;
2979 	size_t	len;
2980 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2981 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2982 
2983 	if (ipst->ips_ip_mrtdebug > 1) {
2984 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2985 		    "tbf_process_q 1: vif %ld qlen = %d",
2986 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2987 	}
2988 
2989 	/*
2990 	 * Loop through the queue at the interface and send
2991 	 * as many packets as possible.
2992 	 */
2993 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2994 
2995 	while (t->tbf_q_len > 0) {
2996 		mp = t->tbf_q;
2997 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2998 
2999 		/* Determine if the packet can be sent */
3000 		if (len <= t->tbf_n_tok) {
3001 			/*
3002 			 * If so, reduce no. of tokens, dequeue the packet,
3003 			 * send the packet.
3004 			 */
3005 			t->tbf_n_tok -= len;
3006 
3007 			t->tbf_q = mp->b_next;
3008 			if (--t->tbf_q_len == 0) {
3009 				t->tbf_t = NULL;
3010 			}
3011 			mp->b_next = NULL;
3012 			/* Exit mutex before sending packet, then re-enter */
3013 			mutex_exit(&t->tbf_lock);
3014 			tbf_send_packet(vifp, mp);
3015 			mutex_enter(&t->tbf_lock);
3016 		} else
3017 			break;
3018 	}
3019 }
3020 
3021 /* Called at tbf timeout to update tokens, process q and reset timer.  */
3022 static void
tbf_reprocess_q(void * arg)3023 tbf_reprocess_q(void *arg)
3024 {
3025 	struct vif *vifp = arg;
3026 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3027 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3028 
3029 	mutex_enter(&vifp->v_tbf->tbf_lock);
3030 	vifp->v_timeout_id = 0;
3031 	tbf_update_tokens(vifp);
3032 
3033 	tbf_process_q(vifp);
3034 
3035 	if (vifp->v_tbf->tbf_q_len > 0) {
3036 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3037 		    TBF_REPROCESS);
3038 	}
3039 	mutex_exit(&vifp->v_tbf->tbf_lock);
3040 
3041 	if (ipst->ips_ip_mrtdebug > 1) {
3042 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3043 		    "tbf_reprcess_q: vif %ld timeout id = %p",
3044 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3045 	}
3046 }
3047 
3048 /*
3049  * Function that will selectively discard a member of the tbf queue,
3050  * based on the precedence value and the priority.
3051  *
3052  * NOTE : The caller should quntimeout if the queue length is 0.
3053  */
3054 static int
tbf_dq_sel(struct vif * vifp,ipha_t * ipha)3055 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3056 {
3057 	uint_t		p;
3058 	struct tbf		*t = vifp->v_tbf;
3059 	mblk_t		**np;
3060 	mblk_t		*last, *mp;
3061 	ill_t		*ill = vifp->v_ipif->ipif_ill;
3062 	ip_stack_t	*ipst = ill->ill_ipst;
3063 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3064 
3065 	if (ipst->ips_ip_mrtdebug > 1) {
3066 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3067 		    "dq_sel: vif %ld dst 0x%x",
3068 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3069 	}
3070 
3071 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3072 	p = priority(vifp, ipha);
3073 
3074 	np = &t->tbf_q;
3075 	last = NULL;
3076 	while ((mp = *np) != NULL) {
3077 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3078 			*np = mp->b_next;
3079 			/* If removing the last packet, fix the tail pointer */
3080 			if (mp == t->tbf_t)
3081 				t->tbf_t = last;
3082 			mp->b_prev = mp->b_next = NULL;
3083 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3084 			ip_drop_output("tbf_dq_send", mp, ill);
3085 			freemsg(mp);
3086 			/*
3087 			 * It's impossible for the queue to be empty, but
3088 			 * we check anyway.
3089 			 */
3090 			if (--t->tbf_q_len == 0) {
3091 				t->tbf_t = NULL;
3092 			}
3093 			ipst->ips_mrtstat->mrts_drop_sel++;
3094 			return (1);
3095 		}
3096 		np = &mp->b_next;
3097 		last = mp;
3098 	}
3099 	return (0);
3100 }
3101 
3102 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3103 static void
tbf_send_packet(struct vif * vifp,mblk_t * mp)3104 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3105 {
3106 	ipif_t		*ipif = vifp->v_ipif;
3107 	ill_t		*ill = ipif->ipif_ill;
3108 	ip_stack_t	*ipst = ill->ill_ipst;
3109 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3110 	ipha_t		*ipha;
3111 
3112 	ipha = (ipha_t *)mp->b_rptr;
3113 	/* If encap tunnel options */
3114 	if (vifp->v_flags & VIFF_TUNNEL)  {
3115 		ip_xmit_attr_t	ixas;
3116 
3117 		if (ipst->ips_ip_mrtdebug > 1) {
3118 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3119 			    "tbf_send_packet: ENCAP tunnel vif %ld",
3120 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
3121 		}
3122 		bzero(&ixas, sizeof (ixas));
3123 		ixas.ixa_flags =
3124 		    IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3125 		ixas.ixa_ipst = ipst;
3126 		ixas.ixa_ifindex = 0;
3127 		ixas.ixa_cred = kcred;
3128 		ixas.ixa_cpid = NOPID;
3129 		ixas.ixa_tsl = NULL;
3130 		ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3131 		ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3132 		ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3133 
3134 		/*
3135 		 * Feed into ip_output_simple which will set the ident field
3136 		 * and checksum the encapsulating header.
3137 		 * BSD gets the cached route vifp->v_route from ip_output()
3138 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
3139 		 * One could make multicast forwarding faster by putting an
3140 		 * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3141 		 */
3142 		(void) ip_output_simple(mp, &ixas);
3143 		ixa_cleanup(&ixas);
3144 		return;
3145 
3146 		/* phyint */
3147 	} else {
3148 		/* Need to loop back to members on the outgoing interface. */
3149 		ipaddr_t	dst;
3150 		ip_recv_attr_t	iras;
3151 		nce_t		*nce;
3152 
3153 		bzero(&iras, sizeof (iras));
3154 		iras.ira_flags = IRAF_IS_IPV4;
3155 		iras.ira_ill = iras.ira_rill = ill;
3156 		iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3157 		iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3158 		iras.ira_pktlen = ntohs(ipha->ipha_length);
3159 		iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3160 
3161 		dst = ipha->ipha_dst;
3162 		if (ill_hasmembers_v4(ill, dst)) {
3163 			iras.ira_flags |= IRAF_LOOPBACK_COPY;
3164 		}
3165 		if (ipst->ips_ip_mrtdebug > 1) {
3166 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3167 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3168 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3169 		}
3170 		/*
3171 		 * Find an NCE which matches the nexthop.
3172 		 * For a pt-pt interface we use the other end of the pt-pt
3173 		 * link.
3174 		 */
3175 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3176 			dst = ipif->ipif_pp_dst_addr;
3177 			nce = arp_nce_init(ill, dst, ill->ill_net_type);
3178 		} else {
3179 			nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3180 		}
3181 		if (nce == NULL) {
3182 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3183 			ip_drop_output("tbf_send_packet - no nce", mp, ill);
3184 			freemsg(mp);
3185 			return;
3186 		}
3187 
3188 		/*
3189 		 * We don't remeber the incoming ill. Thus we
3190 		 * pretend the  packet arrived on the outbound ill. This means
3191 		 * statistics for input errors will be increased on the wrong
3192 		 * ill but that isn't a big deal.
3193 		 */
3194 		ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
3195 		    0);
3196 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3197 
3198 		nce_refrele(nce);
3199 	}
3200 }
3201 
3202 /*
3203  * Determine the current time and then the elapsed time (between the last time
3204  * and time now).  Update the no. of tokens in the bucket.
3205  */
3206 static void
tbf_update_tokens(struct vif * vifp)3207 tbf_update_tokens(struct vif *vifp)
3208 {
3209 	timespec_t	tp;
3210 	hrtime_t	tm;
3211 	struct tbf	*t = vifp->v_tbf;
3212 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3213 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3214 
3215 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3216 
3217 	/* Time in secs and nsecs, rate limit in kbits/sec */
3218 	gethrestime(&tp);
3219 
3220 	/*LINTED*/
3221 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3222 
3223 	/*
3224 	 * This formula is actually
3225 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3226 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3227 	 *
3228 	 * The (1000/1024) was introduced in add_vif to optimize
3229 	 * this divide into a shift.
3230 	 */
3231 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3232 	t->tbf_last_pkt_t = tp;
3233 
3234 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3235 		t->tbf_n_tok = MAX_BKT_SIZE;
3236 	if (ipst->ips_ip_mrtdebug > 1) {
3237 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3238 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3239 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3240 	}
3241 }
3242 
3243 /*
3244  * Priority currently is based on port nos.
3245  * Different forwarding mechanisms have different ways
3246  * of obtaining the port no. Hence, the vif must be
3247  * given along with the packet itself.
3248  *
3249  */
3250 static int
priority(struct vif * vifp,ipha_t * ipha)3251 priority(struct vif *vifp, ipha_t *ipha)
3252 {
3253 	int prio;
3254 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3255 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3256 
3257 	/* Temporary hack; may add general packet classifier some day */
3258 
3259 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3260 
3261 	/*
3262 	 * The UDP port space is divided up into four priority ranges:
3263 	 * [0, 16384)	: unclassified - lowest priority
3264 	 * [16384, 32768)	: audio - highest priority
3265 	 * [32768, 49152)	: whiteboard - medium priority
3266 	 * [49152, 65536)	: video - low priority
3267 	 */
3268 
3269 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3270 		struct udphdr *udp =
3271 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3272 		switch (ntohs(udp->uh_dport) & 0xc000) {
3273 		case 0x4000:
3274 			prio = 70;
3275 			break;
3276 		case 0x8000:
3277 			prio = 60;
3278 			break;
3279 		case 0xc000:
3280 			prio = 55;
3281 			break;
3282 		default:
3283 			prio = 50;
3284 			break;
3285 		}
3286 		if (ipst->ips_ip_mrtdebug > 1) {
3287 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3288 			    "priority: port %x prio %d\n",
3289 			    ntohs(udp->uh_dport), prio);
3290 		}
3291 	} else
3292 		prio = 50;  /* default priority */
3293 	return (prio);
3294 }
3295 
3296 /*
3297  * End of token bucket filter modifications
3298  */
3299 
3300 
3301 
3302 /*
3303  * Produces data for netstat -M.
3304  */
3305 int
ip_mroute_stats(mblk_t * mp,ip_stack_t * ipst)3306 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3307 {
3308 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3309 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3310 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3311 		sizeof (struct mrtstat))) {
3312 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3313 		    (size_t)sizeof (struct mrtstat)));
3314 		return (0);
3315 	}
3316 	return (1);
3317 }
3318 
3319 /*
3320  * Sends info for SNMP's MIB.
3321  */
3322 int
ip_mroute_vif(mblk_t * mp,ip_stack_t * ipst)3323 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3324 {
3325 	struct vifctl	vi;
3326 	vifi_t		vifi;
3327 
3328 	mutex_enter(&ipst->ips_numvifs_mutex);
3329 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3330 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3331 			continue;
3332 		/*
3333 		 * No locks here, an approximation is fine.
3334 		 */
3335 		vi.vifc_vifi = vifi;
3336 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3337 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3338 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
3339 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
3340 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
3341 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
3342 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
3343 
3344 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3345 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3346 			    (size_t)sizeof (vi)));
3347 			mutex_exit(&ipst->ips_numvifs_mutex);
3348 			return (0);
3349 		}
3350 	}
3351 	mutex_exit(&ipst->ips_numvifs_mutex);
3352 	return (1);
3353 }
3354 
3355 /*
3356  * Called by ip_snmp_get to send up multicast routing table.
3357  */
3358 int
ip_mroute_mrt(mblk_t * mp,ip_stack_t * ipst)3359 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3360 {
3361 	int			i, j;
3362 	struct mfc		*rt;
3363 	struct mfcctl	mfcc;
3364 
3365 	/*
3366 	 * Make sure multicast has not been turned off.
3367 	 */
3368 	if (is_mrouter_off(ipst))
3369 		return (1);
3370 
3371 	/* Loop over all hash buckets and their chains */
3372 	for (i = 0; i < MFCTBLSIZ; i++) {
3373 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3374 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3375 			mutex_enter(&rt->mfc_mutex);
3376 			if (rt->mfc_rte != NULL ||
3377 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3378 				mutex_exit(&rt->mfc_mutex);
3379 				continue;
3380 			}
3381 			mfcc.mfcc_origin = rt->mfc_origin;
3382 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3383 			mfcc.mfcc_parent = rt->mfc_parent;
3384 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3385 			mutex_enter(&ipst->ips_numvifs_mutex);
3386 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
3387 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3388 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3389 				mfcc.mfcc_ttls[j] = 0;
3390 			mutex_exit(&ipst->ips_numvifs_mutex);
3391 
3392 			mutex_exit(&rt->mfc_mutex);
3393 			if (!snmp_append_data(mp, (char *)&mfcc,
3394 			    sizeof (mfcc))) {
3395 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
3396 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3397 				    (size_t)sizeof (mfcc)));
3398 				return (0);
3399 			}
3400 		}
3401 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
3402 	}
3403 	return (1);
3404 }
3405