1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24 /* Copyright (c) 1990 Mentat Inc. */
25
26 /*
27 * Copyright (c) 2018, Joyent, Inc.
28 */
29
30 /*
31 * Procedures for the kernel part of DVMRP,
32 * a Distance-Vector Multicast Routing Protocol.
33 * (See RFC-1075)
34 * Written by David Waitzman, BBN Labs, August 1988.
35 * Modified by Steve Deering, Stanford, February 1989.
36 * Modified by Mark J. Steiglitz, Stanford, May, 1991
37 * Modified by Van Jacobson, LBL, January 1993
38 * Modified by Ajit Thyagarajan, PARC, August 1993
39 * Modified by Bill Fenner, PARC, April 1995
40 *
41 * MROUTING 3.5
42 */
43
44 /*
45 * TODO
46 * - function pointer field in vif, void *vif_sendit()
47 */
48
49 #include <sys/types.h>
50 #include <sys/stream.h>
51 #include <sys/stropts.h>
52 #include <sys/strlog.h>
53 #include <sys/systm.h>
54 #include <sys/ddi.h>
55 #include <sys/cmn_err.h>
56 #include <sys/zone.h>
57
58 #include <sys/param.h>
59 #include <sys/socket.h>
60 #include <sys/vtrace.h>
61 #include <sys/debug.h>
62 #include <net/if.h>
63 #include <sys/sockio.h>
64 #include <netinet/in.h>
65 #include <net/if_dl.h>
66
67 #include <inet/ipsec_impl.h>
68 #include <inet/common.h>
69 #include <inet/mi.h>
70 #include <inet/nd.h>
71 #include <inet/tunables.h>
72 #include <inet/mib2.h>
73 #include <netinet/ip6.h>
74 #include <inet/ip.h>
75 #include <inet/snmpcom.h>
76
77 #include <netinet/igmp.h>
78 #include <netinet/igmp_var.h>
79 #include <netinet/udp.h>
80 #include <netinet/ip_mroute.h>
81 #include <inet/ip_multi.h>
82 #include <inet/ip_ire.h>
83 #include <inet/ip_ndp.h>
84 #include <inet/ip_if.h>
85 #include <inet/ipclassifier.h>
86
87 #include <netinet/pim.h>
88
89
90 /*
91 * MT Design:
92 *
93 * There are three main data structures viftable, mfctable and tbftable that
94 * need to be protected against MT races.
95 *
 * viftable is a fixed length array of vif structs. There is no lock to
 * protect the whole array; instead each struct is protected by its own
 * individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines
 * the current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that the vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use. When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * The vif struct stores a pointer to the ipif in v_ipif; to prevent the
 * ipif/ill from going away a refhold is put on the ipif before using it. See
 * lock_good_vif() and unlock_good_vif().
109 *
110 * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
111 * of the vif struct.
112 *
113 * tbftable is also a fixed length array of tbf structs and is only accessed
114 * via v_tbf. It is protected by its own lock tbf_lock.
115 *
 * Lock Ordering is
 * v_lock --> tbf_lock
 * v_lock --> ill_lock
 *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
125 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits, all the mfc structs
 * marked condemned are freed.
131 *
132 * Locking Hierarchy:
133 * The bucket lock should be acquired before the mfc struct lock.
134 * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
135 * operations on the bucket struct.
136 *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock,
 * causing the stats to be approximate, not exact.
142 */
143
144 #define NO_VIF MAXVIFS /* from mrouted, no route for src */
145
146 /*
147 * Timeouts:
148 * Upcall timeouts - BSD uses boolean_t mfc->expire and
149 * nexpire[MFCTBLSIZE], the number of times expire has been called.
150 * SunOS 5.x uses mfc->timeout for each mfc.
151 * Some Unixes are limited in the number of simultaneous timeouts
152 * that can be run, SunOS 5.x does not have this restriction.
153 */
154
/*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
 */
160 #define EXPIRE_TIMEOUT (hz/4) /* 4x / second */
161 #define UPCALL_EXPIRE 6 /* number of timeouts */
162
163 /*
164 * Hash function for a source, group entry
165 */
166 #define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
167 ((g) >> 20) ^ ((g) >> 10) ^ (g))
168
169 #define TBF_REPROCESS (hz / 100) /* 100x /second */
170
171 /* Identify PIM packet that came on a Register interface */
172 #define PIM_REGISTER_MARKER 0xffffffff
173
174 /* Function declarations */
175 static int add_mfc(struct mfcctl *, ip_stack_t *);
176 static int add_vif(struct vifctl *, conn_t *, ip_stack_t *);
177 static int del_mfc(struct mfcctl *, ip_stack_t *);
178 static int del_vif(vifi_t *, ip_stack_t *);
179 static void del_vifp(struct vif *);
180 static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
181 static void expire_upcalls(void *);
182 static void fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
183 static void free_queue(struct mfc *);
184 static int get_assert(uchar_t *, ip_stack_t *);
185 static int get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
186 static int get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
187 static int get_version(uchar_t *);
188 static int get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
189 static int ip_mdq(mblk_t *, ipha_t *, ill_t *,
190 ipaddr_t, struct mfc *);
191 static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
192 static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
193 static int register_mforward(mblk_t *, ip_recv_attr_t *);
194 static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
195 static int set_assert(int *, ip_stack_t *);
196
197 /*
198 * Token Bucket Filter functions
199 */
200 static int priority(struct vif *, ipha_t *);
201 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
202 static int tbf_dq_sel(struct vif *, ipha_t *);
203 static void tbf_process_q(struct vif *);
204 static void tbf_queue(struct vif *, mblk_t *);
205 static void tbf_reprocess_q(void *);
206 static void tbf_send_packet(struct vif *, mblk_t *);
207 static void tbf_update_tokens(struct vif *);
208 static void release_mfc(struct mfcb *);
209
210 static boolean_t is_mrouter_off(ip_stack_t *);
211 /*
212 * Encapsulation packets
213 */
214
215 #define ENCAP_TTL 64
216
/*
 * Prototype IP header for encapsulated packets.
 *
 * The initializer is positional, so each value lands in ipha_t's fields in
 * declaration order; the trailing comments name the field each value is meant
 * for.  Members not listed are zero-initialized.
 * NOTE(review): the first entry is presumably the combined version/header
 * length byte and ENCAP_TTL/IPPROTO_ENCAP the TTL and protocol fields --
 * confirm against the ipha_t declaration.
 */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,
	0,				/* checksum */
};
227
228 /*
229 * Rate limit for assert notification messages, in nsec.
230 */
231 #define ASSERT_MSG_TIME 3000000000
232
233
/*
 * Reference counting on a vif.
 *
 * VIF_REFHOLD		takes v_lock, bumps v_refcnt and drops v_lock.
 * VIF_REFRELE_LOCKED	must be entered with v_lock held.  It drops one
 *			reference; when that was the last reference on a vif
 *			marked VIF_MARK_CONDEMNED it calls del_vifp() (and does
 *			not call mutex_exit() itself), otherwise it releases
 *			v_lock.
 * VIF_REFRELE		takes v_lock and then behaves like VIF_REFRELE_LOCKED.
 */
#define	VIF_REFHOLD(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	++(vifp)->v_refcnt;					\
	mutex_exit(&(vifp)->v_lock);				\
}

#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt == 0 &&				\
	    ((vifp)->v_marks & VIF_MARK_CONDEMNED) != 0) {	\
		del_vifp(vifp);					\
	} else {						\
		mutex_exit(&(vifp)->v_lock);			\
	}							\
}

#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	VIF_REFRELE_LOCKED(vifp);				\
}
260
/*
 * Walker reference counting on an mfc hash bucket, done under mfcb_lock.
 *
 * MFCB_REFHOLD registers a walker.  MFCB_REFRELE removes one; when the last
 * walker leaves a bucket marked MFCB_MARK_CONDEMNED, release_mfc() is called
 * with mfcb_lock still held.
 */
#define	MFCB_REFHOLD(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	++(mfcb)->mfcb_refcnt;					\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}

#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED) != 0)	\
		release_mfc(mfcb);				\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
277
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries that are
 * marked condemned.
 * Type of service parameter to be added in the future!
 *
 * mfcbp is the hash bucket to search, o and g the origin and group addresses
 * to match, and rt an lvalue that receives the matching mfc or NULL.
 * Every argument is parenthesized in the expansion so that callers may pass
 * arbitrary expressions (e.g. &bucket, or an address built with | or &).
 * The macro takes no locks itself.
 */
#define	MFCFIND(mfcbp, o, g, rt) {					\
	struct mfc *_mb_rt;						\
									\
	(rt) = NULL;							\
	for (_mb_rt = (mfcbp)->mfcb_mfc; _mb_rt != NULL;		\
	    _mb_rt = _mb_rt->mfc_next) {				\
		if ((_mb_rt->mfc_origin.s_addr == (o)) &&		\
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) &&		\
		    (_mb_rt->mfc_rte == NULL) &&			\
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {	\
			(rt) = _mb_rt;					\
			break;						\
		}							\
	}								\
}
299
300 /*
301 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
302 * are inefficient. We use gethrestime() which returns a timespec_t with
303 * sec and nsec, the resolution is machine dependent.
304 * The following 2 macros have been changed to use nsec instead of usec.
305 */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 * TV_DELTA(a, b, delta) stores (a - b) in nanoseconds into delta, where a and
 * b have tv_sec/tv_nsec members.  Delta should be a hrtime_t (64-bit): the
 * general case multiplies in 64-bit arithmetic so that gaps of three seconds
 * or more do not overflow an int.
 */
#define	TV_DELTA(a, b, delta) {					\
	int xxs;						\
								\
	(delta) = (a).tv_nsec - (b).tv_nsec;			\
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) {		\
		switch (xxs) {					\
		case 2:						\
			(delta) += 1000000000;			\
			/*FALLTHROUGH*/				\
		case 1:						\
			(delta) += 1000000000;			\
			break;					\
		default:					\
			(delta) += (1000000000LL * xxs);	\
		}						\
	}							\
}
328
/* True when time a is strictly earlier than time b (tv_sec, then tv_nsec). */
#define	TV_LT(a, b)						\
	((a).tv_sec < (b).tv_sec ||				\
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
331
332 /*
333 * Handle MRT setsockopt commands to modify the multicast routing tables.
334 */
335 int
ip_mrouter_set(int cmd,conn_t * connp,int checkonly,uchar_t * data,int datalen)336 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
337 int datalen)
338 {
339 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
340
341 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
342 if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
343 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
344 return (EACCES);
345 }
346 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
347
348 if (checkonly) {
349 /*
350 * do not do operation, just pretend to - new T_CHECK
351 * Note: Even routines further on can probably fail but
352 * this T_CHECK stuff is only to please XTI so it not
353 * necessary to be perfect.
354 */
355 switch (cmd) {
356 case MRT_INIT:
357 case MRT_DONE:
358 case MRT_ADD_VIF:
359 case MRT_DEL_VIF:
360 case MRT_ADD_MFC:
361 case MRT_DEL_MFC:
362 case MRT_ASSERT:
363 return (0);
364 default:
365 return (EOPNOTSUPP);
366 }
367 }
368
369 /*
370 * make sure no command is issued after multicast routing has been
371 * turned off.
372 */
373 if (cmd != MRT_INIT && cmd != MRT_DONE) {
374 if (is_mrouter_off(ipst))
375 return (EINVAL);
376 }
377
378 switch (cmd) {
379 case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst));
380 case MRT_DONE: return (ip_mrouter_done(ipst));
381 case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst));
382 case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst));
383 case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst));
384 case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst));
385 case MRT_ASSERT: return (set_assert((int *)data, ipst));
386 default: return (EOPNOTSUPP);
387 }
388 }
389
390 /*
391 * Handle MRT getsockopt commands
392 */
393 int
ip_mrouter_get(int cmd,conn_t * connp,uchar_t * data)394 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
395 {
396 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
397
398 if (connp != ipst->ips_ip_g_mrouter)
399 return (EACCES);
400
401 switch (cmd) {
402 case MRT_VERSION: return (get_version((uchar_t *)data));
403 case MRT_ASSERT: return (get_assert((uchar_t *)data, ipst));
404 default: return (EOPNOTSUPP);
405 }
406 }
407
408 /*
409 * Handle ioctl commands to obtain information from the cache.
410 * Called with shared access to IP. These are read_only ioctls.
411 */
412 /* ARGSUSED */
413 int
mrt_ioctl(ipif_t * ipif,sin_t * sin,queue_t * q,mblk_t * mp,ip_ioctl_cmd_t * ipip,void * if_req)414 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
415 ip_ioctl_cmd_t *ipip, void *if_req)
416 {
417 mblk_t *mp1;
418 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
419 conn_t *connp = Q_TO_CONN(q);
420 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
421
422 /* Existence verified in ip_wput_nondata */
423 mp1 = mp->b_cont->b_cont;
424
425 switch (iocp->ioc_cmd) {
426 case (SIOCGETVIFCNT):
427 return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
428 case (SIOCGETSGCNT):
429 return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
430 case (SIOCGETLSGCNT):
431 return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
432 default:
433 return (EINVAL);
434 }
435 }
436
437 /*
438 * Returns the packet, byte, rpf-failure count for the source, group provided.
439 */
440 static int
get_sg_cnt(struct sioc_sg_req * req,ip_stack_t * ipst)441 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
442 {
443 struct mfc *rt;
444 struct mfcb *mfcbp;
445
446 mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
447 MFCB_REFHOLD(mfcbp);
448 MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
449
450 if (rt != NULL) {
451 mutex_enter(&rt->mfc_mutex);
452 req->pktcnt = rt->mfc_pkt_cnt;
453 req->bytecnt = rt->mfc_byte_cnt;
454 req->wrong_if = rt->mfc_wrong_if;
455 mutex_exit(&rt->mfc_mutex);
456 } else
457 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
458
459 MFCB_REFRELE(mfcbp);
460 return (0);
461 }
462
463 /*
464 * Returns the packet, byte, rpf-failure count for the source, group provided.
465 * Uses larger counters and IPv6 addresses.
466 */
467 /* ARGSUSED XXX until implemented */
468 static int
get_lsg_cnt(struct sioc_lsg_req * req,ip_stack_t * ipst)469 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
470 {
471 /* XXX TODO SIOCGETLSGCNT */
472 return (ENXIO);
473 }
474
475 /*
476 * Returns the input and output packet and byte counts on the vif provided.
477 */
478 static int
get_vif_cnt(struct sioc_vif_req * req,ip_stack_t * ipst)479 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
480 {
481 vifi_t vifi = req->vifi;
482
483 if (vifi >= ipst->ips_numvifs)
484 return (EINVAL);
485
486 /*
487 * No locks here, an approximation is fine.
488 */
489 req->icount = ipst->ips_vifs[vifi].v_pkt_in;
490 req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
491 req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
492 req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
493
494 return (0);
495 }
496
497 static int
get_version(uchar_t * data)498 get_version(uchar_t *data)
499 {
500 int *v = (int *)data;
501
502 *v = 0x0305; /* XXX !!!! */
503
504 return (0);
505 }
506
507 /*
508 * Set PIM assert processing global.
509 */
510 static int
set_assert(int * i,ip_stack_t * ipst)511 set_assert(int *i, ip_stack_t *ipst)
512 {
513 if ((*i != 1) && (*i != 0))
514 return (EINVAL);
515
516 ipst->ips_pim_assert = *i;
517
518 return (0);
519 }
520
521 /*
522 * Get PIM assert processing global.
523 */
524 static int
get_assert(uchar_t * data,ip_stack_t * ipst)525 get_assert(uchar_t *data, ip_stack_t *ipst)
526 {
527 int *i = (int *)data;
528
529 *i = ipst->ips_pim_assert;
530
531 return (0);
532 }
533
534 /*
535 * Enable multicast routing.
536 */
537 static int
ip_mrouter_init(conn_t * connp,uchar_t * data,int datalen,ip_stack_t * ipst)538 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
539 {
540 int *v;
541
542 if (data == NULL || (datalen != sizeof (int)))
543 return (ENOPROTOOPT);
544
545 v = (int *)data;
546 if (*v != 1)
547 return (ENOPROTOOPT);
548
549 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
550 if (ipst->ips_ip_g_mrouter != NULL) {
551 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
552 return (EADDRINUSE);
553 }
554
555 /*
556 * MRT_INIT should only be allowed for RAW sockets, but we double
557 * check.
558 */
559 if (!IPCL_IS_RAWIP(connp)) {
560 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
561 return (EINVAL);
562 }
563
564 ipst->ips_ip_g_mrouter = connp;
565 connp->conn_multi_router = 1;
566 /* In order for tunnels to work we have to turn ip_g_forward on */
567 if (!WE_ARE_FORWARDING(ipst)) {
568 if (ipst->ips_ip_mrtdebug > 1) {
569 (void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
570 "ip_mrouter_init: turning on forwarding");
571 }
572 ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
573 ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
574 }
575
576 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
577 return (0);
578 }
579
580 void
ip_mrouter_stack_init(ip_stack_t * ipst)581 ip_mrouter_stack_init(ip_stack_t *ipst)
582 {
583 mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
584
585 ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
586 KM_SLEEP);
587 ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
588 /*
589 * mfctable:
590 * Includes all mfcs, including waiting upcalls.
591 * Multiple mfcs per bucket.
592 */
593 ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
594 KM_SLEEP);
595 /*
596 * Define the token bucket filter structures.
597 * tbftable -> each vif has one of these for storing info.
598 */
599 ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
600
601 mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
602
603 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
604 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
605 }
606
607 /*
608 * Disable multicast routing.
609 * Didn't use global timeout_val (BSD version), instead check the mfctable.
610 */
611 int
ip_mrouter_done(ip_stack_t * ipst)612 ip_mrouter_done(ip_stack_t *ipst)
613 {
614 conn_t *mrouter;
615 vifi_t vifi;
616 struct mfc *mfc_rt;
617 int i;
618
619 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
620 if (ipst->ips_ip_g_mrouter == NULL) {
621 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
622 return (EINVAL);
623 }
624
625 mrouter = ipst->ips_ip_g_mrouter;
626
627 if (ipst->ips_saved_ip_forwarding != -1) {
628 if (ipst->ips_ip_mrtdebug > 1) {
629 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
630 "ip_mrouter_done: turning off forwarding");
631 }
632 ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
633 ipst->ips_saved_ip_forwarding = -1;
634 }
635
636 /*
637 * Always clear cache when vifs change.
638 * No need to get ipst->ips_last_encap_lock since we are running as
639 * a writer.
640 */
641 mutex_enter(&ipst->ips_last_encap_lock);
642 ipst->ips_last_encap_src = 0;
643 ipst->ips_last_encap_vif = NULL;
644 mutex_exit(&ipst->ips_last_encap_lock);
645 mrouter->conn_multi_router = 0;
646
647 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
648
649 /*
650 * For each phyint in use,
651 * disable promiscuous reception of all IP multicasts.
652 */
653 for (vifi = 0; vifi < MAXVIFS; vifi++) {
654 struct vif *vifp = ipst->ips_vifs + vifi;
655
656 mutex_enter(&vifp->v_lock);
657 /*
658 * if the vif is active mark it condemned.
659 */
660 if (vifp->v_marks & VIF_MARK_GOOD) {
661 ASSERT(vifp->v_ipif != NULL);
662 ipif_refhold(vifp->v_ipif);
663 /* Phyint only */
664 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
665 ipif_t *ipif = vifp->v_ipif;
666 ilm_t *ilm = vifp->v_ilm;
667
668 vifp->v_ilm = NULL;
669 vifp->v_marks &= ~VIF_MARK_GOOD;
670 vifp->v_marks |= VIF_MARK_CONDEMNED;
671
672 mutex_exit(&(vifp)->v_lock);
673 if (ilm != NULL) {
674 ill_t *ill = ipif->ipif_ill;
675
676 (void) ip_delmulti(ilm);
677 ASSERT(ill->ill_mrouter_cnt > 0);
678 atomic_dec_32(&ill->ill_mrouter_cnt);
679 }
680 mutex_enter(&vifp->v_lock);
681 }
682 ipif_refrele(vifp->v_ipif);
683 /*
684 * decreases the refcnt added in add_vif.
685 * and release v_lock.
686 */
687 VIF_REFRELE_LOCKED(vifp);
688 } else {
689 mutex_exit(&vifp->v_lock);
690 continue;
691 }
692 }
693
694 mutex_enter(&ipst->ips_numvifs_mutex);
695 ipst->ips_numvifs = 0;
696 ipst->ips_pim_assert = 0;
697 ipst->ips_reg_vif_num = ALL_VIFS;
698 mutex_exit(&ipst->ips_numvifs_mutex);
699
700 /*
701 * Free upcall msgs.
702 * Go through mfctable and stop any outstanding upcall
703 * timeouts remaining on mfcs.
704 */
705 for (i = 0; i < MFCTBLSIZ; i++) {
706 mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
707 ipst->ips_mfcs[i].mfcb_refcnt++;
708 ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
709 mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
710 mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
711 while (mfc_rt) {
712 /* Free upcalls */
713 mutex_enter(&mfc_rt->mfc_mutex);
714 if (mfc_rt->mfc_rte != NULL) {
715 if (mfc_rt->mfc_timeout_id != 0) {
716 /*
717 * OK to drop the lock as we have
718 * a refcnt on the bucket. timeout
719 * can fire but it will see that
720 * mfc_timeout_id == 0 and not do
721 * anything. see expire_upcalls().
722 */
723 mfc_rt->mfc_timeout_id = 0;
724 mutex_exit(&mfc_rt->mfc_mutex);
725 (void) untimeout(
726 mfc_rt->mfc_timeout_id);
727 mfc_rt->mfc_timeout_id = 0;
728 mutex_enter(&mfc_rt->mfc_mutex);
729
730 /*
731 * all queued upcall packets
732 * and mblk will be freed in
733 * release_mfc().
734 */
735 }
736 }
737
738 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
739
740 mutex_exit(&mfc_rt->mfc_mutex);
741 mfc_rt = mfc_rt->mfc_next;
742 }
743 MFCB_REFRELE(&ipst->ips_mfcs[i]);
744 }
745
746 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
747 ipst->ips_ip_g_mrouter = NULL;
748 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
749 return (0);
750 }
751
752 void
ip_mrouter_stack_destroy(ip_stack_t * ipst)753 ip_mrouter_stack_destroy(ip_stack_t *ipst)
754 {
755 struct mfcb *mfcbp;
756 struct mfc *rt;
757 int i;
758
759 for (i = 0; i < MFCTBLSIZ; i++) {
760 mfcbp = &ipst->ips_mfcs[i];
761
762 while ((rt = mfcbp->mfcb_mfc) != NULL) {
763 (void) printf("ip_mrouter_stack_destroy: free for %d\n",
764 i);
765
766 mfcbp->mfcb_mfc = rt->mfc_next;
767 free_queue(rt);
768 mi_free(rt);
769 }
770 }
771 kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
772 ipst->ips_vifs = NULL;
773 kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
774 ipst->ips_mrtstat = NULL;
775 kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
776 ipst->ips_mfcs = NULL;
777 kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
778 ipst->ips_tbfs = NULL;
779
780 mutex_destroy(&ipst->ips_last_encap_lock);
781 mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
782 }
783
784 static boolean_t
is_mrouter_off(ip_stack_t * ipst)785 is_mrouter_off(ip_stack_t *ipst)
786 {
787 conn_t *mrouter;
788
789 mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
790 if (ipst->ips_ip_g_mrouter == NULL) {
791 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
792 return (B_TRUE);
793 }
794
795 mrouter = ipst->ips_ip_g_mrouter;
796 if (mrouter->conn_multi_router == 0) {
797 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
798 return (B_TRUE);
799 }
800 mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
801 return (B_FALSE);
802 }
803
804 static void
unlock_good_vif(struct vif * vifp)805 unlock_good_vif(struct vif *vifp)
806 {
807 ASSERT(vifp->v_ipif != NULL);
808 ipif_refrele(vifp->v_ipif);
809 VIF_REFRELE(vifp);
810 }
811
812 static boolean_t
lock_good_vif(struct vif * vifp)813 lock_good_vif(struct vif *vifp)
814 {
815 mutex_enter(&vifp->v_lock);
816 if (!(vifp->v_marks & VIF_MARK_GOOD)) {
817 mutex_exit(&vifp->v_lock);
818 return (B_FALSE);
819 }
820
821 ASSERT(vifp->v_ipif != NULL);
822 mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
823 if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
824 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
825 mutex_exit(&vifp->v_lock);
826 return (B_FALSE);
827 }
828 ipif_refhold_locked(vifp->v_ipif);
829 mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
830 vifp->v_refcnt++;
831 mutex_exit(&vifp->v_lock);
832 return (B_TRUE);
833 }
834
835 /*
836 * Add a vif to the vif table.
837 */
838 static int
add_vif(struct vifctl * vifcp,conn_t * connp,ip_stack_t * ipst)839 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
840 {
841 struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi;
842 ipif_t *ipif;
843 int error = 0;
844 struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
845 conn_t *mrouter = ipst->ips_ip_g_mrouter;
846 ilm_t *ilm;
847 ill_t *ill;
848
849 ASSERT(connp != NULL);
850
851 if (vifcp->vifc_vifi >= MAXVIFS)
852 return (EINVAL);
853
854 if (is_mrouter_off(ipst))
855 return (EINVAL);
856
857 mutex_enter(&vifp->v_lock);
858 /*
859 * Viftable entry should be 0.
860 * if v_marks == 0 but v_refcnt != 0 means struct is being
861 * initialized.
862 *
863 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
864 * request while the delete is in progress, mrouted only sends add
865 * requests when a new interface is added and the new interface cannot
866 * have the same vifi as an existing interface. We make sure that
867 * ill_delete will block till the vif is deleted by adding a refcnt
868 * to ipif in del_vif().
869 */
870 if (vifp->v_lcl_addr.s_addr != 0 ||
871 vifp->v_marks != 0 ||
872 vifp->v_refcnt != 0) {
873 mutex_exit(&vifp->v_lock);
874 return (EADDRINUSE);
875 }
876
877 /* Incoming vif should not be 0 */
878 if (vifcp->vifc_lcl_addr.s_addr == 0) {
879 mutex_exit(&vifp->v_lock);
880 return (EINVAL);
881 }
882
883 vifp->v_refcnt++;
884 mutex_exit(&vifp->v_lock);
885 /* Find the interface with the local address */
886 ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
887 IPCL_ZONEID(connp), ipst);
888 if (ipif == NULL) {
889 VIF_REFRELE(vifp);
890 return (EADDRNOTAVAIL);
891 }
892
893 if (ipst->ips_ip_mrtdebug > 1) {
894 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
895 "add_vif: src 0x%x enter",
896 vifcp->vifc_lcl_addr.s_addr);
897 }
898
899 mutex_enter(&vifp->v_lock);
900 /*
901 * Always clear cache when vifs change.
902 * Needed to ensure that src isn't left over from before vif was added.
903 * No need to get last_encap_lock, since we are running as a writer.
904 */
905
906 mutex_enter(&ipst->ips_last_encap_lock);
907 ipst->ips_last_encap_src = 0;
908 ipst->ips_last_encap_vif = NULL;
909 mutex_exit(&ipst->ips_last_encap_lock);
910
911 if (vifcp->vifc_flags & VIFF_TUNNEL) {
912 if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
913 cmn_err(CE_WARN,
914 "add_vif: source route tunnels not supported\n");
915 VIF_REFRELE_LOCKED(vifp);
916 ipif_refrele(ipif);
917 return (EOPNOTSUPP);
918 }
919 vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
920
921 } else {
922 /* Phyint or Register vif */
923 if (vifcp->vifc_flags & VIFF_REGISTER) {
924 /*
925 * Note: Since all IPPROTO_IP level options (including
926 * MRT_ADD_VIF) are done exclusively via
927 * ip_optmgmt_writer(), a lock is not necessary to
928 * protect reg_vif_num.
929 */
930 mutex_enter(&ipst->ips_numvifs_mutex);
931 if (ipst->ips_reg_vif_num == ALL_VIFS) {
932 ipst->ips_reg_vif_num = vifcp->vifc_vifi;
933 mutex_exit(&ipst->ips_numvifs_mutex);
934 } else {
935 mutex_exit(&ipst->ips_numvifs_mutex);
936 VIF_REFRELE_LOCKED(vifp);
937 ipif_refrele(ipif);
938 return (EADDRINUSE);
939 }
940 }
941
942 /* Make sure the interface supports multicast */
943 if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
944 VIF_REFRELE_LOCKED(vifp);
945 ipif_refrele(ipif);
946 if (vifcp->vifc_flags & VIFF_REGISTER) {
947 mutex_enter(&ipst->ips_numvifs_mutex);
948 ipst->ips_reg_vif_num = ALL_VIFS;
949 mutex_exit(&ipst->ips_numvifs_mutex);
950 }
951 return (EOPNOTSUPP);
952 }
953 /* Enable promiscuous reception of all IP mcasts from the if */
954 mutex_exit(&vifp->v_lock);
955
956 ill = ipif->ipif_ill;
957 if (IS_UNDER_IPMP(ill))
958 ill = ipmp_ill_hold_ipmp_ill(ill);
959
960 if (ill == NULL) {
961 ilm = NULL;
962 } else {
963 ilm = ip_addmulti(&ipv6_all_zeros, ill,
964 ipif->ipif_zoneid, &error);
965 if (ilm != NULL)
966 atomic_inc_32(&ill->ill_mrouter_cnt);
967 if (IS_UNDER_IPMP(ipif->ipif_ill)) {
968 ill_refrele(ill);
969 ill = ipif->ipif_ill;
970 }
971 }
972
973 mutex_enter(&vifp->v_lock);
974 /*
975 * since we released the lock lets make sure that
976 * ip_mrouter_done() has not been called.
977 */
978 if (ilm == NULL || is_mrouter_off(ipst)) {
979 if (ilm != NULL) {
980 (void) ip_delmulti(ilm);
981 ASSERT(ill->ill_mrouter_cnt > 0);
982 atomic_dec_32(&ill->ill_mrouter_cnt);
983 }
984 if (vifcp->vifc_flags & VIFF_REGISTER) {
985 mutex_enter(&ipst->ips_numvifs_mutex);
986 ipst->ips_reg_vif_num = ALL_VIFS;
987 mutex_exit(&ipst->ips_numvifs_mutex);
988 }
989 VIF_REFRELE_LOCKED(vifp);
990 ipif_refrele(ipif);
991 return (error?error:EINVAL);
992 }
993 vifp->v_ilm = ilm;
994 }
995 /* Define parameters for the tbf structure */
996 vifp->v_tbf = v_tbf;
997 gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
998 vifp->v_tbf->tbf_n_tok = 0;
999 vifp->v_tbf->tbf_q_len = 0;
1000 vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1001 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1002
1003 vifp->v_flags = vifcp->vifc_flags;
1004 vifp->v_threshold = vifcp->vifc_threshold;
1005 vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1006 vifp->v_ipif = ipif;
1007 ipif_refrele(ipif);
1008 /* Scaling up here, allows division by 1024 in critical code. */
1009 vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1010 vifp->v_timeout_id = 0;
1011 /* initialize per vif pkt counters */
1012 vifp->v_pkt_in = 0;
1013 vifp->v_pkt_out = 0;
1014 vifp->v_bytes_in = 0;
1015 vifp->v_bytes_out = 0;
1016 mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1017
1018 /* Adjust numvifs up, if the vifi is higher than numvifs */
1019 mutex_enter(&ipst->ips_numvifs_mutex);
1020 if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1021 ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1022 mutex_exit(&ipst->ips_numvifs_mutex);
1023
1024 if (ipst->ips_ip_mrtdebug > 1) {
1025 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1026 "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1027 vifcp->vifc_vifi,
1028 ntohl(vifcp->vifc_lcl_addr.s_addr),
1029 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1030 ntohl(vifcp->vifc_rmt_addr.s_addr),
1031 vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1032 }
1033
1034 vifp->v_marks = VIF_MARK_GOOD;
1035 mutex_exit(&vifp->v_lock);
1036 return (0);
1037 }
1038
1039
1040 /* Delete a vif from the vif table. */
1041 static void
del_vifp(struct vif * vifp)1042 del_vifp(struct vif *vifp)
1043 {
1044 struct tbf *t = vifp->v_tbf;
1045 mblk_t *mp0;
1046 vifi_t vifi;
1047 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1048 conn_t *mrouter = ipst->ips_ip_g_mrouter;
1049
1050 ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1051 ASSERT(t != NULL);
1052
1053 if (ipst->ips_ip_mrtdebug > 1) {
1054 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1055 "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1056 }
1057
1058 if (vifp->v_timeout_id != 0) {
1059 (void) untimeout(vifp->v_timeout_id);
1060 vifp->v_timeout_id = 0;
1061 }
1062
1063 /*
1064 * Free packets queued at the interface.
1065 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1066 */
1067 mutex_enter(&t->tbf_lock);
1068 while (t->tbf_q != NULL) {
1069 mp0 = t->tbf_q;
1070 t->tbf_q = t->tbf_q->b_next;
1071 mp0->b_prev = mp0->b_next = NULL;
1072 freemsg(mp0);
1073 }
1074 mutex_exit(&t->tbf_lock);
1075
1076 /*
1077 * Always clear cache when vifs change.
1078 * No need to get last_encap_lock since we are running as a writer.
1079 */
1080 mutex_enter(&ipst->ips_last_encap_lock);
1081 if (vifp == ipst->ips_last_encap_vif) {
1082 ipst->ips_last_encap_vif = NULL;
1083 ipst->ips_last_encap_src = 0;
1084 }
1085 mutex_exit(&ipst->ips_last_encap_lock);
1086
1087 mutex_destroy(&t->tbf_lock);
1088
1089 bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1090
1091 /* Adjust numvifs down */
1092 mutex_enter(&ipst->ips_numvifs_mutex);
1093 for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1094 if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1095 break;
1096 ipst->ips_numvifs = vifi;
1097 mutex_exit(&ipst->ips_numvifs_mutex);
1098
1099 bzero(vifp, sizeof (*vifp));
1100 }
1101
1102 static int
del_vif(vifi_t * vifip,ip_stack_t * ipst)1103 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1104 {
1105 struct vif *vifp = ipst->ips_vifs + *vifip;
1106
1107 if (*vifip >= ipst->ips_numvifs)
1108 return (EINVAL);
1109
1110 mutex_enter(&vifp->v_lock);
1111 /*
1112 * Not initialized
1113 * Here we are not looking at the vif that is being initialized
1114 * i.e vifp->v_marks == 0 and refcnt > 0.
1115 */
1116 if (vifp->v_lcl_addr.s_addr == 0 ||
1117 !(vifp->v_marks & VIF_MARK_GOOD)) {
1118 mutex_exit(&vifp->v_lock);
1119 return (EADDRNOTAVAIL);
1120 }
1121
1122 /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1123 vifp->v_marks &= ~VIF_MARK_GOOD;
1124 vifp->v_marks |= VIF_MARK_CONDEMNED;
1125
1126 /* Phyint only */
1127 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1128 ipif_t *ipif = vifp->v_ipif;
1129 ilm_t *ilm = vifp->v_ilm;
1130
1131 vifp->v_ilm = NULL;
1132
1133 ASSERT(ipif != NULL);
1134 /*
1135 * should be OK to drop the lock as we
1136 * have marked this as CONDEMNED.
1137 */
1138 mutex_exit(&(vifp)->v_lock);
1139 if (ilm != NULL) {
1140 (void) ip_delmulti(ilm);
1141 ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1142 atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1143 }
1144 mutex_enter(&(vifp)->v_lock);
1145 }
1146
1147 if (vifp->v_flags & VIFF_REGISTER) {
1148 mutex_enter(&ipst->ips_numvifs_mutex);
1149 ipst->ips_reg_vif_num = ALL_VIFS;
1150 mutex_exit(&ipst->ips_numvifs_mutex);
1151 }
1152
1153 /*
1154 * decreases the refcnt added in add_vif.
1155 */
1156 VIF_REFRELE_LOCKED(vifp);
1157 return (0);
1158 }
1159
/*
 * Add an mfc entry.
 *
 * Installs or refreshes the multicast forwarding cache entry for the
 * (mfcc_origin, mfcc_mcastgrp) pair described by mfccp.  Three cases are
 * handled, in this order:
 *   1. MFCFIND returns an entry: only its parent and ttls are updated.
 *   2. One or more entries with queued upcall packets (mfc_rte != NULL)
 *	match: each is filled in, its expiry timer is cancelled and its
 *	queued packets are forwarded through ip_mdq().
 *   3. Neither of the above: an existing non-condemned entry is filled in
 *	or a new one is allocated and linked at the head of the bucket.
 *
 * Returns 0 on success, EINVAL if the parent vif is out of range, has a
 * NULL v_ipif, or the multicast router is off, and ENOBUFS if a new
 * entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;
	int i;
	struct mfcb *mfcbp;
	conn_t *mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	/* NO_VIF (== MAXVIFS) is allowed through, hence ">" and not ">=". */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the hash bucket for the remainder of the function. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		/* Only the first ips_numvifs ttl slots are copied. */
		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 * nstl counts how many matching entries with queued packets exist;
	 * more than one is reported as a warning.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 *
			 * Each rte is unlinked under mfc_mutex, but the lock
			 * is dropped around the ip_mdq() call itself.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check now that the bucket lock is held. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Reuse the first non-condemned entry for this pair, if any. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			/*
			 * NOTE(review): mfc_mutex is entered straight after
			 * mi_zalloc with no mutex_init visible here; confirm
			 * a zero-filled mutex is valid for this usage.
			 */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}
1358
1359 /*
1360 * Fills in mfc structure from mrouted mfcctl.
1361 */
1362 static void
fill_route(struct mfc * rt,struct mfcctl * mfccp,ip_stack_t * ipst)1363 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1364 {
1365 int i;
1366
1367 rt->mfc_origin = mfccp->mfcc_origin;
1368 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
1369 rt->mfc_parent = mfccp->mfcc_parent;
1370 mutex_enter(&ipst->ips_numvifs_mutex);
1371 for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1372 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1373 }
1374 mutex_exit(&ipst->ips_numvifs_mutex);
1375 /* Initialize pkt counters per src-grp */
1376 rt->mfc_pkt_cnt = 0;
1377 rt->mfc_byte_cnt = 0;
1378 rt->mfc_wrong_if = 0;
1379 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1380
1381 }
1382
1383 static void
free_queue(struct mfc * mfcp)1384 free_queue(struct mfc *mfcp)
1385 {
1386 struct rtdetq *rte0;
1387
1388 /*
1389 * Drop all queued upcall packets.
1390 * Free the mbuf with the pkt.
1391 */
1392 while ((rte0 = mfcp->mfc_rte) != NULL) {
1393 mfcp->mfc_rte = rte0->rte_next;
1394 freemsg(rte0->mp);
1395 mi_free((char *)rte0);
1396 }
1397 }
1398 /*
1399 * go thorugh the hash bucket and free all the entries marked condemned.
1400 */
1401 void
release_mfc(struct mfcb * mfcbp)1402 release_mfc(struct mfcb *mfcbp)
1403 {
1404 struct mfc *current_mfcp;
1405 struct mfc *prev_mfcp;
1406
1407 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1408
1409 while (current_mfcp != NULL) {
1410 if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1411 if (current_mfcp == mfcbp->mfcb_mfc) {
1412 mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1413 free_queue(current_mfcp);
1414 mi_free(current_mfcp);
1415 prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1416 continue;
1417 }
1418 ASSERT(prev_mfcp != NULL);
1419 prev_mfcp->mfc_next = current_mfcp->mfc_next;
1420 free_queue(current_mfcp);
1421 mi_free(current_mfcp);
1422 current_mfcp = NULL;
1423 } else {
1424 prev_mfcp = current_mfcp;
1425 }
1426
1427 current_mfcp = prev_mfcp->mfc_next;
1428
1429 }
1430 mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1431 ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1432 }
1433
1434 /*
1435 * Delete an mfc entry.
1436 */
1437 static int
del_mfc(struct mfcctl * mfccp,ip_stack_t * ipst)1438 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1439 {
1440 struct in_addr origin;
1441 struct in_addr mcastgrp;
1442 struct mfc *rt;
1443 uint_t hash;
1444 conn_t *mrouter = ipst->ips_ip_g_mrouter;
1445
1446 origin = mfccp->mfcc_origin;
1447 mcastgrp = mfccp->mfcc_mcastgrp;
1448 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1449
1450 if (ipst->ips_ip_mrtdebug > 1) {
1451 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1452 "del_mfc: o %x g %x",
1453 ntohl(origin.s_addr),
1454 ntohl(mcastgrp.s_addr));
1455 }
1456
1457 MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1458
1459 /* Find mfc in mfctable, finds only entries without upcalls */
1460 for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1461 mutex_enter(&rt->mfc_mutex);
1462 if (origin.s_addr == rt->mfc_origin.s_addr &&
1463 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1464 rt->mfc_rte == NULL &&
1465 !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1466 break;
1467 mutex_exit(&rt->mfc_mutex);
1468 }
1469
1470 /*
1471 * Return if there was an upcall (mfc_rte != NULL,
1472 * or rt not in mfctable.
1473 */
1474 if (rt == NULL) {
1475 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1476 return (EADDRNOTAVAIL);
1477 }
1478
1479
1480 /*
1481 * no need to hold lock as we have a reference.
1482 */
1483 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1484 /* error checking */
1485 if (rt->mfc_timeout_id != 0) {
1486 ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1487 /*
1488 * Its ok to drop the lock, the struct cannot be freed
1489 * since we have a ref on the hash bucket.
1490 */
1491 rt->mfc_timeout_id = 0;
1492 mutex_exit(&rt->mfc_mutex);
1493 (void) untimeout(rt->mfc_timeout_id);
1494 mutex_enter(&rt->mfc_mutex);
1495 }
1496
1497 ASSERT(rt->mfc_rte == NULL);
1498
1499
1500 /*
1501 * Delete the entry from the cache
1502 */
1503 rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1504 mutex_exit(&rt->mfc_mutex);
1505
1506 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1507
1508 return (0);
1509 }
1510
#define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *                     1 - pkt came in on tunnel
 *
 * When MFCFIND yields a cache entry the packet is handed to ip_mdq() and
 * its return value is passed back.  Otherwise a copy of the packet is
 * queued on an mfc entry (a new one if no upcall is pending for this
 * source/group) and, for the first queued packet only, a second copy is
 * turned into an IGMPMSG_NOCACHE upcall and delivered through
 * mrouter->conn_recv.
 */
int
ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
	ill_t		*ill = ira->ira_ill;
	struct mfc	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;	/* rate-limits the SRCRT warning */
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb	*mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ill_t		*rill = ira->ira_rill;

	ASSERT(ira->ira_pktlen == msgdsize(mp));

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/* A PIM register packet takes precedence over a tunnel source. */
	if (ira->ira_flags & IRAF_PIM_REGISTER)
		pim_reg_packet = B_TRUE;
	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
		tunnel_src = ira->ira_mroute_tunnel;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message.
		 */
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    " REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Forward as if received on the register vif's ill. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;	/* copy queued on rte */
		mblk_t		*mp_copy = NULL;	/* copy sent as upcall */
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		/* Same bucket as mfcbp: both are indexed by MFCHASH(src, dst). */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			/*
			 * From here on mfc_mutex is held on both the
			 * new-entry and the found-entry paths.
			 */
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		/*
		 * npkts == 0 can only happen for an entry allocated above:
		 * an entry found by the search loop has mfc_rte != NULL.
		 */
		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall */
			/* Walk to the tail and append there. */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The copied packet's header is overlaid in place. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 */
				/*
				 * Report the first vif whose ill is the
				 * arrival ill.  im_vif is left untouched if
				 * none matches (see the ASSERT below).
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Pass to RAWIP */
			/*
			 * ira_ill/ira_rill are cleared for the duration of
			 * the upcall delivery and restored afterwards.
			 */
			ira->ira_ill = ira->ira_rill = NULL;
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
			ira->ira_ill = ill;
			ira->ira_rill = rill;
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			/*
			 * NOTE(review): mp_copy is only allocated on the
			 * new-entry path, so it appears to be NULL whenever
			 * this branch runs -- confirm ip_drop_input() and
			 * freemsg() tolerate a NULL mblk.
			 */
			ip_drop_input("ip_mforward - upcall already waiting",
			    mp_copy, ill);
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	/*
	 * Common failure exit: entered with the bucket lock held and any
	 * mfc_mutex already dropped.  Frees whatever was allocated so far;
	 * an mfc found in the table (new_mfc == B_FALSE) is left alone.
	 */
	error_return:
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ip_mforward error", mp_copy, ill);
			freemsg(mp_copy);
		}
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1911
1912 /*
1913 * Clean up the mfctable cache entry if upcall is not serviced.
1914 * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1915 */
1916 static void
expire_upcalls(void * arg)1917 expire_upcalls(void *arg)
1918 {
1919 struct mfc *mfc_rt = arg;
1920 uint_t hash;
1921 struct mfc *prev_mfc, *mfc0;
1922 ip_stack_t *ipst;
1923 conn_t *mrouter;
1924
1925 if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1926 cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1927 return;
1928 }
1929 ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1930 mrouter = ipst->ips_ip_g_mrouter;
1931
1932 hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1933 if (ipst->ips_ip_mrtdebug > 1) {
1934 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1935 "expire_upcalls: hash %d s %x g %x",
1936 hash, ntohl(mfc_rt->mfc_origin.s_addr),
1937 ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1938 }
1939 MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1940 mutex_enter(&mfc_rt->mfc_mutex);
1941 /*
1942 * if timeout has been set to zero, than the
1943 * entry has been filled, no need to delete it.
1944 */
1945 if (mfc_rt->mfc_timeout_id == 0)
1946 goto done;
1947 ipst->ips_mrtstat->mrts_cache_cleanups++;
1948 mfc_rt->mfc_timeout_id = 0;
1949
1950 /* Determine entry to be cleaned up in cache table. */
1951 for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1952 prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1953 if (mfc0 == mfc_rt)
1954 break;
1955
1956 /* del_mfc takes care of gone mfcs */
1957 ASSERT(prev_mfc != NULL);
1958 ASSERT(mfc0 != NULL);
1959
1960 /*
1961 * Delete the entry from the cache
1962 */
1963 ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1964 mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1965
1966 /*
1967 * release_mfc will drop all queued upcall packets.
1968 * and will free the mbuf with the pkt, if, timing info.
1969 */
1970 done:
1971 mutex_exit(&mfc_rt->mfc_mutex);
1972 MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1973 }
1974
/*
 * Packet forwarding routine once entry in the cache is made.
 *
 * Checks that the packet arrived on the parent vif recorded in rt
 * (matching both the ill and the tunnel source), updates the per-vif and
 * per-route counters, and sends the packet out via MC_SEND on every good
 * vif whose mfc_ttls entry is non-zero and smaller than the packet's ttl.
 *
 * Returns -1 when the packet is to be dropped (no parent vif, parent vif
 * not lockable, parent index out of range, or copymsg failure while
 * building a wrong-interface upcall), 1 when tunnel_src is non-zero, and
 * 0 otherwise.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ip_recv_attr_t	iras;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
	/* Dispatches on the vif's flags: tunnel, register, else phyint. */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/*
	 * From here until the matching unlock_good_vif() every return path
	 * must release the parent vif.
	 */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
		    "numvifs %d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/* The copied packet's header is overlaid in place. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */

			/* Build minimal receive attributes for the upcall. */
			bzero(&iras, sizeof (iras));
			iras.ira_flags = IRAF_IS_IPV4;
			iras.ira_ip_hdr_length =
			    IPH_HDR_LENGTH(mp_copy->b_rptr);
			iras.ira_pktlen = msgdsize(mp_copy);
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot numvifs so the loop bound is read under its mutex. */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		/* Vifs that cannot be locked as good are skipped. */
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}
2161
2162 /*
2163 * Send the packet on physical interface.
2164 * Caller assumes can continue to use mp on return.
2165 */
2166 /* ARGSUSED */
2167 static void
phyint_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2168 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2169 {
2170 mblk_t *mp_copy;
2171 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2172 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2173
2174 /* Make a new reference to the packet */
2175 mp_copy = copymsg(mp); /* TODO could copy header and dup rest */
2176 if (mp_copy == NULL) {
2177 ipst->ips_mrtstat->mrts_fwd_drop++;
2178 ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2179 return;
2180 }
2181 if (vifp->v_rate_limit <= 0)
2182 tbf_send_packet(vifp, mp_copy);
2183 else {
2184 if (ipst->ips_ip_mrtdebug > 1) {
2185 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2186 "phyint_send: tbf_contr rate %d "
2187 "vifp 0x%p mp 0x%p dst 0x%x",
2188 vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2189 }
2190 tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2191 }
2192 }
2193
2194 /*
2195 * Send the whole packet for REGISTER encapsulation to PIM daemon
2196 * Caller assumes it can continue to use mp on return.
2197 */
2198 /* ARGSUSED */
2199 static void
register_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2200 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2201 {
2202 struct igmpmsg *im;
2203 mblk_t *mp_copy;
2204 ipha_t *ipha_copy;
2205 ill_t *ill = vifp->v_ipif->ipif_ill;
2206 ip_stack_t *ipst = ill->ill_ipst;
2207 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2208 ip_recv_attr_t iras;
2209
2210 if (ipst->ips_ip_mrtdebug > 1) {
2211 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2212 "register_send: src %x, dst %x\n",
2213 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2214 }
2215
2216 /*
2217 * Copy the old packet & pullup its IP header into the new mblk_t so we
2218 * can modify it. Try to fill the new mblk_t since if we don't the
2219 * ethernet driver will.
2220 */
2221 mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2222 if (mp_copy == NULL) {
2223 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2224 if (ipst->ips_ip_mrtdebug > 3) {
2225 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2226 "register_send: allocb failure.");
2227 }
2228 return;
2229 }
2230
2231 /*
2232 * Bump write pointer to account for igmpmsg being added.
2233 */
2234 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2235
2236 /*
2237 * Chain packet to new mblk_t.
2238 */
2239 if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2240 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2241 if (ipst->ips_ip_mrtdebug > 3) {
2242 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2243 "register_send: copymsg failure.");
2244 }
2245 freeb(mp_copy);
2246 return;
2247 }
2248
2249 /*
2250 * icmp_input() asserts that IP version field is set to an
2251 * appropriate version. Hence, the struct igmpmsg that this really
2252 * becomes, needs to have the correct IP version field.
2253 */
2254 ipha_copy = (ipha_t *)mp_copy->b_rptr;
2255 *ipha_copy = multicast_encap_iphdr;
2256
2257 /*
2258 * The kernel uses the struct igmpmsg header to encode the messages to
2259 * the multicast routing daemon. Fill in the fields in the header
2260 * starting with the message type which is IGMPMSG_WHOLEPKT
2261 */
2262 im = (struct igmpmsg *)mp_copy->b_rptr;
2263 im->im_msgtype = IGMPMSG_WHOLEPKT;
2264 im->im_src.s_addr = ipha->ipha_src;
2265 im->im_dst.s_addr = ipha->ipha_dst;
2266
2267 /*
2268 * Must Be Zero. This is because the struct igmpmsg is really an IP
2269 * header with renamed fields and the multicast routing daemon uses
2270 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2271 */
2272 im->im_mbz = 0;
2273
2274 ++ipst->ips_mrtstat->mrts_upcalls;
2275 if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2276 !canputnext(mrouter->conn_rq)) {
2277 ++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2278 if (ipst->ips_ip_mrtdebug > 3) {
2279 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2280 "register_send: register upcall failure.");
2281 }
2282 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2283 ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2284 freemsg(mp_copy);
2285 } else {
2286 /* Pass to RAWIP */
2287 bzero(&iras, sizeof (iras));
2288 iras.ira_flags = IRAF_IS_IPV4;
2289 iras.ira_ip_hdr_length = sizeof (ipha_t);
2290 iras.ira_pktlen = msgdsize(mp_copy);
2291 (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2292 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2293 }
2294 }
2295
2296 /*
2297 * pim_validate_cksum handles verification of the checksum in the
2298 * pim header. For PIM Register packets, the checksum is calculated
2299 * across the PIM header only. For all other packets, the checksum
2300 * is for the PIM header and remainder of the packet.
2301 *
2302 * returns: B_TRUE, if checksum is okay.
2303 * B_FALSE, if checksum is not valid.
2304 */
2305 static boolean_t
pim_validate_cksum(mblk_t * mp,ipha_t * ip,struct pim * pimp)2306 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2307 {
2308 mblk_t *mp_dup;
2309
2310 if ((mp_dup = dupmsg(mp)) == NULL)
2311 return (B_FALSE);
2312
2313 mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2314 if (pimp->pim_type == PIM_REGISTER)
2315 mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2316 if (IP_CSUM(mp_dup, 0, 0)) {
2317 freemsg(mp_dup);
2318 return (B_FALSE);
2319 }
2320 freemsg(mp_dup);
2321 return (B_TRUE);
2322 }
2323
2324 /*
2325 * Process PIM protocol packets i.e. IP Protocol 103.
2326 * Register messages are decapsulated and sent onto multicast forwarding.
2327 *
2328 * Return NULL for a bad packet that is discarded here.
2329 * Return mp if the message is OK and should be handed to "raw" receivers.
2330 * Callers of pim_input() may need to reinitialize variables that were copied
2331 * from the mblk as this calls pullupmsg().
2332 */
2333 mblk_t *
pim_input(mblk_t * mp,ip_recv_attr_t * ira)2334 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2335 {
2336 ipha_t *eip, *ip;
2337 int iplen, pimlen, iphlen;
2338 struct pim *pimp; /* pointer to a pim struct */
2339 uint32_t *reghdr;
2340 ill_t *ill = ira->ira_ill;
2341 ip_stack_t *ipst = ill->ill_ipst;
2342 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2343
2344 /*
2345 * Pullup the msg for PIM protocol processing.
2346 */
2347 if (pullupmsg(mp, -1) == 0) {
2348 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2349 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2350 ip_drop_input("mrts_pim_nomemory", mp, ill);
2351 freemsg(mp);
2352 return (NULL);
2353 }
2354
2355 ip = (ipha_t *)mp->b_rptr;
2356 iplen = ip->ipha_length;
2357 iphlen = IPH_HDR_LENGTH(ip);
2358 pimlen = ntohs(iplen) - iphlen;
2359
2360 /*
2361 * Validate lengths
2362 */
2363 if (pimlen < PIM_MINLEN) {
2364 ++ipst->ips_mrtstat->mrts_pim_malformed;
2365 if (ipst->ips_ip_mrtdebug > 1) {
2366 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2367 "pim_input: length not at least minlen");
2368 }
2369 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2370 ip_drop_input("mrts_pim_malformed", mp, ill);
2371 freemsg(mp);
2372 return (NULL);
2373 }
2374
2375 /*
2376 * Point to the PIM header.
2377 */
2378 pimp = (struct pim *)((caddr_t)ip + iphlen);
2379
2380 /*
2381 * Check the version number.
2382 */
2383 if (pimp->pim_vers != PIM_VERSION) {
2384 ++ipst->ips_mrtstat->mrts_pim_badversion;
2385 if (ipst->ips_ip_mrtdebug > 1) {
2386 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2387 "pim_input: unknown version of PIM");
2388 }
2389 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2390 ip_drop_input("mrts_pim_badversion", mp, ill);
2391 freemsg(mp);
2392 return (NULL);
2393 }
2394
2395 /*
2396 * Validate the checksum
2397 */
2398 if (!pim_validate_cksum(mp, ip, pimp)) {
2399 ++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2400 if (ipst->ips_ip_mrtdebug > 1) {
2401 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2402 "pim_input: invalid checksum");
2403 }
2404 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2405 ip_drop_input("pim_rcv_badcsum", mp, ill);
2406 freemsg(mp);
2407 return (NULL);
2408 }
2409
2410 if (pimp->pim_type != PIM_REGISTER)
2411 return (mp);
2412
2413 reghdr = (uint32_t *)(pimp + 1);
2414 eip = (ipha_t *)(reghdr + 1);
2415
2416 /*
2417 * check if the inner packet is destined to mcast group
2418 */
2419 if (!CLASSD(eip->ipha_dst)) {
2420 ++ipst->ips_mrtstat->mrts_pim_badregisters;
2421 if (ipst->ips_ip_mrtdebug > 1) {
2422 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2423 "pim_input: Inner pkt not mcast .. !");
2424 }
2425 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2426 ip_drop_input("mrts_pim_badregisters", mp, ill);
2427 freemsg(mp);
2428 return (NULL);
2429 }
2430 if (ipst->ips_ip_mrtdebug > 1) {
2431 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2432 "register from %x, to %x, len %d",
2433 ntohl(eip->ipha_src),
2434 ntohl(eip->ipha_dst),
2435 ntohs(eip->ipha_length));
2436 }
2437 /*
2438 * If the null register bit is not set, decapsulate
2439 * the packet before forwarding it.
2440 * Avoid this in no register vif
2441 */
2442 if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2443 ipst->ips_reg_vif_num != ALL_VIFS) {
2444 mblk_t *mp_copy;
2445 uint_t saved_pktlen;
2446
2447 /* Copy the message */
2448 if ((mp_copy = copymsg(mp)) == NULL) {
2449 ++ipst->ips_mrtstat->mrts_pim_nomemory;
2450 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2451 ip_drop_input("mrts_pim_nomemory", mp, ill);
2452 freemsg(mp);
2453 return (NULL);
2454 }
2455
2456 /*
2457 * Decapsulate the packet and give it to
2458 * register_mforward.
2459 */
2460 mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2461 saved_pktlen = ira->ira_pktlen;
2462 ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2463 if (register_mforward(mp_copy, ira) != 0) {
2464 /* register_mforward already called ip_drop_input */
2465 freemsg(mp);
2466 ira->ira_pktlen = saved_pktlen;
2467 return (NULL);
2468 }
2469 ira->ira_pktlen = saved_pktlen;
2470 }
2471
2472 /*
2473 * Pass all valid PIM packets up to any process(es) listening on a raw
2474 * PIM socket. For Solaris it is done right after pim_input() is
2475 * called.
2476 */
2477 return (mp);
2478 }
2479
2480 /*
2481 * PIM sparse mode hook. Called by pim_input after decapsulating
2482 * the packet. Loop back the packet, as if we have received it.
2483 * In pim_input() we have to check if the destination is a multicast address.
2484 */
2485 static int
register_mforward(mblk_t * mp,ip_recv_attr_t * ira)2486 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2487 {
2488 ire_t *ire;
2489 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2490 ill_t *ill = ira->ira_ill;
2491 ip_stack_t *ipst = ill->ill_ipst;
2492 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2493
2494 ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2495
2496 if (ipst->ips_ip_mrtdebug > 3) {
2497 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2498 "register_mforward: src %x, dst %x\n",
2499 ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2500 }
2501 /*
2502 * Need to pass in to ip_mforward() the information that the
2503 * packet has arrived on the register_vif. We mark it with
2504 * the IRAF_PIM_REGISTER attribute.
2505 * pim_input verified that the (inner) destination is multicast,
2506 * hence we skip the generic code in ip_input.
2507 */
2508 ira->ira_flags |= IRAF_PIM_REGISTER;
2509 ++ipst->ips_mrtstat->mrts_pim_regforwards;
2510
2511 if (!CLASSD(ipha->ipha_dst)) {
2512 ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2513 ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2514 NULL, NULL, NULL);
2515 } else {
2516 ire = ire_multicast(ill);
2517 }
2518 ASSERT(ire != NULL);
2519 /* Normally this will return the IRE_MULTICAST */
2520 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2521 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2522 ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2523 freemsg(mp);
2524 ire_refrele(ire);
2525 return (-1);
2526 }
2527 ASSERT(ire->ire_type & IRE_MULTICAST);
2528 (*ire->ire_recvfn)(ire, mp, ipha, ira);
2529 ire_refrele(ire);
2530
2531 return (0);
2532 }
2533
2534 /*
2535 * Send an encapsulated packet.
2536 * Caller assumes can continue to use mp when routine returns.
2537 */
2538 /* ARGSUSED */
2539 static void
encap_send(ipha_t * ipha,mblk_t * mp,struct vif * vifp,ipaddr_t dst)2540 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2541 {
2542 mblk_t *mp_copy;
2543 ipha_t *ipha_copy;
2544 size_t len;
2545 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2546 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2547
2548 if (ipst->ips_ip_mrtdebug > 1) {
2549 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2550 "encap_send: vif %ld enter",
2551 (ptrdiff_t)(vifp - ipst->ips_vifs));
2552 }
2553 len = ntohs(ipha->ipha_length);
2554
2555 /*
2556 * Copy the old packet & pullup it's IP header into the
2557 * new mbuf so we can modify it. Try to fill the new
2558 * mbuf since if we don't the ethernet driver will.
2559 */
2560 mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2561 if (mp_copy == NULL)
2562 return;
2563 mp_copy->b_rptr += 32;
2564 mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2565 if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2566 freeb(mp_copy);
2567 return;
2568 }
2569
2570 /*
2571 * Fill in the encapsulating IP header.
2572 * Remote tunnel dst in rmt_addr, from add_vif().
2573 */
2574 ipha_copy = (ipha_t *)mp_copy->b_rptr;
2575 *ipha_copy = multicast_encap_iphdr;
2576 ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2577 ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2578 ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2579 ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2580 ASSERT(ipha_copy->ipha_ident == 0);
2581
2582 /* Turn the encapsulated IP header back into a valid one. */
2583 ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2584 ipha->ipha_ttl--;
2585 ipha->ipha_hdr_checksum = 0;
2586 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2587
2588 ipha_copy->ipha_ttl = ipha->ipha_ttl;
2589
2590 if (ipst->ips_ip_mrtdebug > 1) {
2591 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2592 "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2593 }
2594 if (vifp->v_rate_limit <= 0)
2595 tbf_send_packet(vifp, mp_copy);
2596 else
2597 /* ipha is from the original header */
2598 tbf_control(vifp, mp_copy, ipha);
2599 }
2600
2601 /*
2602 * De-encapsulate a packet and feed it back through IP input if it
2603 * matches one of our multicast tunnels.
2604 *
2605 * This routine is called whenever IP gets a packet with prototype
2606 * IPPROTO_ENCAP and a local destination address and the packet didn't
2607 * match one of our configured IP-in-IP tunnels.
2608 */
2609 void
ip_mroute_decap(mblk_t * mp,ip_recv_attr_t * ira)2610 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2611 {
2612 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2613 ipha_t *ipha_encap;
2614 int hlen = IPH_HDR_LENGTH(ipha);
2615 int hlen_encap;
2616 ipaddr_t src;
2617 struct vif *vifp;
2618 ire_t *ire;
2619 ill_t *ill = ira->ira_ill;
2620 ip_stack_t *ipst = ill->ill_ipst;
2621 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2622
2623 /* Make sure we have all of the inner header */
2624 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2625 if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2626 ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2627 if (ipha == NULL) {
2628 ipst->ips_mrtstat->mrts_bad_tunnel++;
2629 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2630 ip_drop_input("ip_mroute_decap: too short", mp, ill);
2631 freemsg(mp);
2632 return;
2633 }
2634 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2635 }
2636 hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2637 if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2638 ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2639 if (ipha == NULL) {
2640 ipst->ips_mrtstat->mrts_bad_tunnel++;
2641 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2642 ip_drop_input("ip_mroute_decap: too short", mp, ill);
2643 freemsg(mp);
2644 return;
2645 }
2646 ipha_encap = (ipha_t *)((char *)ipha + hlen);
2647 }
2648
2649 /*
2650 * Dump the packet if it's not to a multicast destination or if
2651 * we don't have an encapsulating tunnel with the source.
2652 * Note: This code assumes that the remote site IP address
2653 * uniquely identifies the tunnel (i.e., that this site has
2654 * at most one tunnel with the remote site).
2655 */
2656 if (!CLASSD(ipha_encap->ipha_dst)) {
2657 ipst->ips_mrtstat->mrts_bad_tunnel++;
2658 ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2659 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2660 ip_drop_input("mrts_bad_tunnel", mp, ill);
2661 freemsg(mp);
2662 return;
2663 }
2664 src = (ipaddr_t)ipha->ipha_src;
2665 mutex_enter(&ipst->ips_last_encap_lock);
2666 if (src != ipst->ips_last_encap_src) {
2667 struct vif *vife;
2668
2669 vifp = ipst->ips_vifs;
2670 vife = vifp + ipst->ips_numvifs;
2671 ipst->ips_last_encap_src = src;
2672 ipst->ips_last_encap_vif = 0;
2673 for (; vifp < vife; ++vifp) {
2674 if (!lock_good_vif(vifp))
2675 continue;
2676 if (vifp->v_rmt_addr.s_addr == src) {
2677 if (vifp->v_flags & VIFF_TUNNEL)
2678 ipst->ips_last_encap_vif = vifp;
2679 if (ipst->ips_ip_mrtdebug > 1) {
2680 (void) mi_strlog(mrouter->conn_rq,
2681 1, SL_TRACE,
2682 "ip_mroute_decap: good tun "
2683 "vif %ld with %x",
2684 (ptrdiff_t)(vifp - ipst->ips_vifs),
2685 ntohl(src));
2686 }
2687 unlock_good_vif(vifp);
2688 break;
2689 }
2690 unlock_good_vif(vifp);
2691 }
2692 }
2693 if ((vifp = ipst->ips_last_encap_vif) == 0) {
2694 mutex_exit(&ipst->ips_last_encap_lock);
2695 ipst->ips_mrtstat->mrts_bad_tunnel++;
2696 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2697 ip_drop_input("mrts_bad_tunnel", mp, ill);
2698 freemsg(mp);
2699 ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2700 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2701 return;
2702 }
2703 mutex_exit(&ipst->ips_last_encap_lock);
2704
2705 /*
2706 * Need to pass in the tunnel source to ip_mforward (so that it can
2707 * verify that the packet arrived over the correct vif.)
2708 */
2709 ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2710 ira->ira_mroute_tunnel = src;
2711 mp->b_rptr += hlen;
2712 ira->ira_pktlen -= hlen;
2713 ira->ira_ip_hdr_length = hlen_encap;
2714
2715 /*
2716 * We don't redo any of the filtering in ill_input_full_v4 and we
2717 * have checked that all of ipha_encap and any IP options are
2718 * pulled up. Hence we call ire_recv_multicast_v4 directly.
2719 * However, we have to check for RSVP as in ip_input_full_v4
2720 * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2721 * to the rsvpd.
2722 */
2723 if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2724 ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2725 ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2726 ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2727 IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2728 } else {
2729 ire = ire_multicast(ill);
2730 }
2731 ASSERT(ire != NULL);
2732 /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2733 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2734 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2735 ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2736 freemsg(mp);
2737 ire_refrele(ire);
2738 return;
2739 }
2740 ire->ire_ib_pkt_count++;
2741 ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2742 (*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2743 ire_refrele(ire);
2744 }
2745
2746 /*
2747 * Remove all records with v_ipif == ipif. Called when an interface goes away
2748 * (stream closed). Called as writer.
2749 */
2750 void
reset_mrt_vif_ipif(ipif_t * ipif)2751 reset_mrt_vif_ipif(ipif_t *ipif)
2752 {
2753 vifi_t vifi, tmp_vifi;
2754 vifi_t num_of_vifs;
2755 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2756
2757 /* Can't check vifi >= 0 since vifi_t is unsigned! */
2758
2759 mutex_enter(&ipst->ips_numvifs_mutex);
2760 num_of_vifs = ipst->ips_numvifs;
2761 mutex_exit(&ipst->ips_numvifs_mutex);
2762
2763 for (vifi = num_of_vifs; vifi != 0; vifi--) {
2764 tmp_vifi = vifi - 1;
2765 if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2766 (void) del_vif(&tmp_vifi, ipst);
2767 }
2768 }
2769 }
2770
2771 /* Remove pending upcall msgs when ill goes away. Called by ill_delete. */
2772 void
reset_mrt_ill(ill_t * ill)2773 reset_mrt_ill(ill_t *ill)
2774 {
2775 struct mfc *rt;
2776 struct rtdetq *rte;
2777 int i;
2778 ip_stack_t *ipst = ill->ill_ipst;
2779 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2780 timeout_id_t id;
2781
2782 for (i = 0; i < MFCTBLSIZ; i++) {
2783 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2784 if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2785 if (ipst->ips_ip_mrtdebug > 1) {
2786 (void) mi_strlog(mrouter->conn_rq, 1,
2787 SL_TRACE,
2788 "reset_mrt_ill: mfctable [%d]", i);
2789 }
2790 while (rt != NULL) {
2791 mutex_enter(&rt->mfc_mutex);
2792 while ((rte = rt->mfc_rte) != NULL) {
2793 if (rte->ill == ill &&
2794 (id = rt->mfc_timeout_id) != 0) {
2795 /*
2796 * Its ok to drop the lock, the
2797 * struct cannot be freed since
2798 * we have a ref on the hash
2799 * bucket.
2800 */
2801 mutex_exit(&rt->mfc_mutex);
2802 (void) untimeout(id);
2803 mutex_enter(&rt->mfc_mutex);
2804 }
2805 if (rte->ill == ill) {
2806 if (ipst->ips_ip_mrtdebug > 1) {
2807 (void) mi_strlog(
2808 mrouter->conn_rq,
2809 1, SL_TRACE,
2810 "reset_mrt_ill: "
2811 "ill 0x%p", (void *)ill);
2812 }
2813 rt->mfc_rte = rte->rte_next;
2814 freemsg(rte->mp);
2815 mi_free((char *)rte);
2816 }
2817 }
2818 mutex_exit(&rt->mfc_mutex);
2819 rt = rt->mfc_next;
2820 }
2821 }
2822 MFCB_REFRELE(&ipst->ips_mfcs[i]);
2823 }
2824 }
2825
2826 /*
2827 * Token bucket filter module.
2828 * The ipha is for mcastgrp destination for phyint and encap.
2829 */
2830 static void
tbf_control(struct vif * vifp,mblk_t * mp,ipha_t * ipha)2831 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2832 {
2833 size_t p_len = msgdsize(mp);
2834 struct tbf *t = vifp->v_tbf;
2835 timeout_id_t id = 0;
2836 ill_t *ill = vifp->v_ipif->ipif_ill;
2837 ip_stack_t *ipst = ill->ill_ipst;
2838 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2839
2840 /* Drop if packet is too large */
2841 if (p_len > MAX_BKT_SIZE) {
2842 ipst->ips_mrtstat->mrts_pkt2large++;
2843 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2844 ip_drop_output("tbf_control - too large", mp, ill);
2845 freemsg(mp);
2846 return;
2847 }
2848 if (ipst->ips_ip_mrtdebug > 1) {
2849 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2850 "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2851 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2852 ntohl(ipha->ipha_dst));
2853 }
2854
2855 mutex_enter(&t->tbf_lock);
2856
2857 tbf_update_tokens(vifp);
2858
2859 /*
2860 * If there are enough tokens,
2861 * and the queue is empty, send this packet out.
2862 */
2863 if (ipst->ips_ip_mrtdebug > 1) {
2864 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2865 "tbf_control: vif %ld, TOKENS %d, pkt len %lu, qlen %d",
2866 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2867 t->tbf_q_len);
2868 }
2869 /* No packets are queued */
2870 if (t->tbf_q_len == 0) {
2871 /* queue empty, send packet if enough tokens */
2872 if (p_len <= t->tbf_n_tok) {
2873 t->tbf_n_tok -= p_len;
2874 mutex_exit(&t->tbf_lock);
2875 tbf_send_packet(vifp, mp);
2876 return;
2877 } else {
2878 /* Queue packet and timeout till later */
2879 tbf_queue(vifp, mp);
2880 ASSERT(vifp->v_timeout_id == 0);
2881 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2882 TBF_REPROCESS);
2883 }
2884 } else if (t->tbf_q_len < t->tbf_max_q_len) {
2885 /* Finite queue length, so queue pkts and process queue */
2886 tbf_queue(vifp, mp);
2887 tbf_process_q(vifp);
2888 } else {
2889 /* Check that we have UDP header with IP header */
2890 size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2891 sizeof (struct udphdr);
2892
2893 if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2894 if (!pullupmsg(mp, hdr_length)) {
2895 BUMP_MIB(ill->ill_ip_mib,
2896 ipIfStatsOutDiscards);
2897 ip_drop_output("tbf_control - pullup", mp, ill);
2898 freemsg(mp);
2899 ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2900 "vif %ld src 0x%x dst 0x%x\n",
2901 (ptrdiff_t)(vifp - ipst->ips_vifs),
2902 ntohl(ipha->ipha_src),
2903 ntohl(ipha->ipha_dst)));
2904 mutex_exit(&vifp->v_tbf->tbf_lock);
2905 return;
2906 } else
2907 /* Have to reassign ipha after pullupmsg */
2908 ipha = (ipha_t *)mp->b_rptr;
2909 }
2910 /*
2911 * Queue length too much,
2912 * try to selectively dq, or queue and process
2913 */
2914 if (!tbf_dq_sel(vifp, ipha)) {
2915 ipst->ips_mrtstat->mrts_q_overflow++;
2916 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2917 ip_drop_output("mrts_q_overflow", mp, ill);
2918 freemsg(mp);
2919 } else {
2920 tbf_queue(vifp, mp);
2921 tbf_process_q(vifp);
2922 }
2923 }
2924 if (t->tbf_q_len == 0) {
2925 id = vifp->v_timeout_id;
2926 vifp->v_timeout_id = 0;
2927 }
2928 mutex_exit(&vifp->v_tbf->tbf_lock);
2929 if (id != 0)
2930 (void) untimeout(id);
2931 }
2932
2933 /*
2934 * Adds a packet to the tbf queue at the interface.
2935 * The ipha is for mcastgrp destination for phyint and encap.
2936 */
2937 static void
tbf_queue(struct vif * vifp,mblk_t * mp)2938 tbf_queue(struct vif *vifp, mblk_t *mp)
2939 {
2940 struct tbf *t = vifp->v_tbf;
2941 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2942 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2943
2944 if (ipst->ips_ip_mrtdebug > 1) {
2945 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2946 "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2947 }
2948 ASSERT(MUTEX_HELD(&t->tbf_lock));
2949
2950 if (t->tbf_t == NULL) {
2951 /* Queue was empty */
2952 t->tbf_q = mp;
2953 } else {
2954 /* Insert at tail */
2955 t->tbf_t->b_next = mp;
2956 }
2957 /* set new tail pointer */
2958 t->tbf_t = mp;
2959
2960 mp->b_next = mp->b_prev = NULL;
2961
2962 t->tbf_q_len++;
2963 }
2964
2965 /*
2966 * Process the queue at the vif interface.
2967 * Drops the tbf_lock when sending packets.
2968 *
2969 * NOTE : The caller should quntimeout if the queue length is 0.
2970 */
2971 static void
tbf_process_q(struct vif * vifp)2972 tbf_process_q(struct vif *vifp)
2973 {
2974 mblk_t *mp;
2975 struct tbf *t = vifp->v_tbf;
2976 size_t len;
2977 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2978 conn_t *mrouter = ipst->ips_ip_g_mrouter;
2979
2980 if (ipst->ips_ip_mrtdebug > 1) {
2981 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2982 "tbf_process_q 1: vif %ld qlen = %d",
2983 (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2984 }
2985
2986 /*
2987 * Loop through the queue at the interface and send
2988 * as many packets as possible.
2989 */
2990 ASSERT(MUTEX_HELD(&t->tbf_lock));
2991
2992 while (t->tbf_q_len > 0) {
2993 mp = t->tbf_q;
2994 len = (size_t)msgdsize(mp); /* length of ip pkt */
2995
2996 /* Determine if the packet can be sent */
2997 if (len <= t->tbf_n_tok) {
2998 /*
2999 * If so, reduce no. of tokens, dequeue the packet,
3000 * send the packet.
3001 */
3002 t->tbf_n_tok -= len;
3003
3004 t->tbf_q = mp->b_next;
3005 if (--t->tbf_q_len == 0) {
3006 t->tbf_t = NULL;
3007 }
3008 mp->b_next = NULL;
3009 /* Exit mutex before sending packet, then re-enter */
3010 mutex_exit(&t->tbf_lock);
3011 tbf_send_packet(vifp, mp);
3012 mutex_enter(&t->tbf_lock);
3013 } else
3014 break;
3015 }
3016 }
3017
3018 /* Called at tbf timeout to update tokens, process q and reset timer. */
3019 static void
tbf_reprocess_q(void * arg)3020 tbf_reprocess_q(void *arg)
3021 {
3022 struct vif *vifp = arg;
3023 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3024 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3025
3026 mutex_enter(&vifp->v_tbf->tbf_lock);
3027 vifp->v_timeout_id = 0;
3028 tbf_update_tokens(vifp);
3029
3030 tbf_process_q(vifp);
3031
3032 if (vifp->v_tbf->tbf_q_len > 0) {
3033 vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3034 TBF_REPROCESS);
3035 }
3036 mutex_exit(&vifp->v_tbf->tbf_lock);
3037
3038 if (ipst->ips_ip_mrtdebug > 1) {
3039 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3040 "tbf_reprcess_q: vif %ld timeout id = %p",
3041 (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3042 }
3043 }
3044
3045 /*
3046 * Function that will selectively discard a member of the tbf queue,
3047 * based on the precedence value and the priority.
3048 *
3049 * NOTE : The caller should quntimeout if the queue length is 0.
3050 */
3051 static int
tbf_dq_sel(struct vif * vifp,ipha_t * ipha)3052 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3053 {
3054 uint_t p;
3055 struct tbf *t = vifp->v_tbf;
3056 mblk_t **np;
3057 mblk_t *last, *mp;
3058 ill_t *ill = vifp->v_ipif->ipif_ill;
3059 ip_stack_t *ipst = ill->ill_ipst;
3060 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3061
3062 if (ipst->ips_ip_mrtdebug > 1) {
3063 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3064 "dq_sel: vif %ld dst 0x%x",
3065 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3066 }
3067
3068 ASSERT(MUTEX_HELD(&t->tbf_lock));
3069 p = priority(vifp, ipha);
3070
3071 np = &t->tbf_q;
3072 last = NULL;
3073 while ((mp = *np) != NULL) {
3074 if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3075 *np = mp->b_next;
3076 /* If removing the last packet, fix the tail pointer */
3077 if (mp == t->tbf_t)
3078 t->tbf_t = last;
3079 mp->b_prev = mp->b_next = NULL;
3080 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3081 ip_drop_output("tbf_dq_send", mp, ill);
3082 freemsg(mp);
3083 /*
3084 * It's impossible for the queue to be empty, but
3085 * we check anyway.
3086 */
3087 if (--t->tbf_q_len == 0) {
3088 t->tbf_t = NULL;
3089 }
3090 ipst->ips_mrtstat->mrts_drop_sel++;
3091 return (1);
3092 }
3093 np = &mp->b_next;
3094 last = mp;
3095 }
3096 return (0);
3097 }
3098
3099 /* Sends packet, 2 cases - encap tunnel, phyint. */
3100 static void
tbf_send_packet(struct vif * vifp,mblk_t * mp)3101 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3102 {
3103 ipif_t *ipif = vifp->v_ipif;
3104 ill_t *ill = ipif->ipif_ill;
3105 ip_stack_t *ipst = ill->ill_ipst;
3106 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3107 ipha_t *ipha;
3108
3109 ipha = (ipha_t *)mp->b_rptr;
3110 /* If encap tunnel options */
3111 if (vifp->v_flags & VIFF_TUNNEL) {
3112 ip_xmit_attr_t ixas;
3113
3114 if (ipst->ips_ip_mrtdebug > 1) {
3115 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3116 "tbf_send_packet: ENCAP tunnel vif %ld",
3117 (ptrdiff_t)(vifp - ipst->ips_vifs));
3118 }
3119 bzero(&ixas, sizeof (ixas));
3120 ixas.ixa_flags =
3121 IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3122 ixas.ixa_ipst = ipst;
3123 ixas.ixa_ifindex = 0;
3124 ixas.ixa_cred = kcred;
3125 ixas.ixa_cpid = NOPID;
3126 ixas.ixa_tsl = NULL;
3127 ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3128 ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3129 ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3130
3131 /*
3132 * Feed into ip_output_simple which will set the ident field
3133 * and checksum the encapsulating header.
3134 * BSD gets the cached route vifp->v_route from ip_output()
3135 * to speed up route table lookups. Not necessary in SunOS 5.x.
3136 * One could make multicast forwarding faster by putting an
3137 * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3138 */
3139 (void) ip_output_simple(mp, &ixas);
3140 ixa_cleanup(&ixas);
3141 return;
3142
3143 /* phyint */
3144 } else {
3145 /* Need to loop back to members on the outgoing interface. */
3146 ipaddr_t dst;
3147 ip_recv_attr_t iras;
3148 nce_t *nce;
3149
3150 bzero(&iras, sizeof (iras));
3151 iras.ira_flags = IRAF_IS_IPV4;
3152 iras.ira_ill = iras.ira_rill = ill;
3153 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3154 iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3155 iras.ira_pktlen = ntohs(ipha->ipha_length);
3156 iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3157
3158 dst = ipha->ipha_dst;
3159 if (ill_hasmembers_v4(ill, dst)) {
3160 iras.ira_flags |= IRAF_LOOPBACK_COPY;
3161 }
3162 if (ipst->ips_ip_mrtdebug > 1) {
3163 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3164 "tbf_send_pkt: phyint forward vif %ld dst = 0x%x",
3165 (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3166 }
3167 /*
3168 * Find an NCE which matches the nexthop.
3169 * For a pt-pt interface we use the other end of the pt-pt
3170 * link.
3171 */
3172 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3173 dst = ipif->ipif_pp_dst_addr;
3174 nce = arp_nce_init(ill, dst, ill->ill_net_type);
3175 } else {
3176 nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3177 }
3178 if (nce == NULL) {
3179 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3180 ip_drop_output("tbf_send_packet - no nce", mp, ill);
3181 freemsg(mp);
3182 return;
3183 }
3184
3185 /*
3186 * We don't remeber the incoming ill. Thus we
3187 * pretend the packet arrived on the outbound ill. This means
3188 * statistics for input errors will be increased on the wrong
3189 * ill but that isn't a big deal.
3190 */
3191 ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
3192 0);
3193 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3194
3195 nce_refrele(nce);
3196 }
3197 }
3198
3199 /*
3200 * Determine the current time and then the elapsed time (between the last time
3201 * and time now). Update the no. of tokens in the bucket.
3202 */
3203 static void
tbf_update_tokens(struct vif * vifp)3204 tbf_update_tokens(struct vif *vifp)
3205 {
3206 timespec_t tp;
3207 hrtime_t tm;
3208 struct tbf *t = vifp->v_tbf;
3209 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3210 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3211
3212 ASSERT(MUTEX_HELD(&t->tbf_lock));
3213
3214 /* Time in secs and nsecs, rate limit in kbits/sec */
3215 gethrestime(&tp);
3216
3217 /*LINTED*/
3218 TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3219
3220 /*
3221 * This formula is actually
3222 * "time in seconds" * "bytes/second". Scaled for nsec.
3223 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3224 *
3225 * The (1000/1024) was introduced in add_vif to optimize
3226 * this divide into a shift.
3227 */
3228 t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3229 t->tbf_last_pkt_t = tp;
3230
3231 if (t->tbf_n_tok > MAX_BKT_SIZE)
3232 t->tbf_n_tok = MAX_BKT_SIZE;
3233 if (ipst->ips_ip_mrtdebug > 1) {
3234 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3235 "tbf_update_tok: tm %lld tok %d vif %ld",
3236 tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3237 }
3238 }
3239
3240 /*
3241 * Priority currently is based on port nos.
3242 * Different forwarding mechanisms have different ways
3243 * of obtaining the port no. Hence, the vif must be
3244 * given along with the packet itself.
3245 *
3246 */
3247 static int
priority(struct vif * vifp,ipha_t * ipha)3248 priority(struct vif *vifp, ipha_t *ipha)
3249 {
3250 int prio;
3251 ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3252 conn_t *mrouter = ipst->ips_ip_g_mrouter;
3253
3254 /* Temporary hack; may add general packet classifier some day */
3255
3256 ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3257
3258 /*
3259 * The UDP port space is divided up into four priority ranges:
3260 * [0, 16384) : unclassified - lowest priority
3261 * [16384, 32768) : audio - highest priority
3262 * [32768, 49152) : whiteboard - medium priority
3263 * [49152, 65536) : video - low priority
3264 */
3265
3266 if (ipha->ipha_protocol == IPPROTO_UDP) {
3267 struct udphdr *udp =
3268 (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3269 switch (ntohs(udp->uh_dport) & 0xc000) {
3270 case 0x4000:
3271 prio = 70;
3272 break;
3273 case 0x8000:
3274 prio = 60;
3275 break;
3276 case 0xc000:
3277 prio = 55;
3278 break;
3279 default:
3280 prio = 50;
3281 break;
3282 }
3283 if (ipst->ips_ip_mrtdebug > 1) {
3284 (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3285 "priority: port %x prio %d\n",
3286 ntohs(udp->uh_dport), prio);
3287 }
3288 } else
3289 prio = 50; /* default priority */
3290 return (prio);
3291 }
3292
3293 /*
3294 * End of token bucket filter modifications
3295 */
3296
3297
3298
3299 /*
3300 * Produces data for netstat -M.
3301 */
3302 int
ip_mroute_stats(mblk_t * mp,ip_stack_t * ipst)3303 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3304 {
3305 ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3306 ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3307 if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3308 sizeof (struct mrtstat))) {
3309 ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3310 (size_t)sizeof (struct mrtstat)));
3311 return (0);
3312 }
3313 return (1);
3314 }
3315
3316 /*
3317 * Sends info for SNMP's MIB.
3318 */
3319 int
ip_mroute_vif(mblk_t * mp,ip_stack_t * ipst)3320 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3321 {
3322 struct vifctl vi;
3323 vifi_t vifi;
3324
3325 mutex_enter(&ipst->ips_numvifs_mutex);
3326 for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3327 if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3328 continue;
3329 /*
3330 * No locks here, an approximation is fine.
3331 */
3332 vi.vifc_vifi = vifi;
3333 vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3334 vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3335 vi.vifc_rate_limit = ipst->ips_vifs[vifi].v_rate_limit;
3336 vi.vifc_lcl_addr = ipst->ips_vifs[vifi].v_lcl_addr;
3337 vi.vifc_rmt_addr = ipst->ips_vifs[vifi].v_rmt_addr;
3338 vi.vifc_pkt_in = ipst->ips_vifs[vifi].v_pkt_in;
3339 vi.vifc_pkt_out = ipst->ips_vifs[vifi].v_pkt_out;
3340
3341 if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3342 ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3343 (size_t)sizeof (vi)));
3344 mutex_exit(&ipst->ips_numvifs_mutex);
3345 return (0);
3346 }
3347 }
3348 mutex_exit(&ipst->ips_numvifs_mutex);
3349 return (1);
3350 }
3351
3352 /*
3353 * Called by ip_snmp_get to send up multicast routing table.
3354 */
3355 int
ip_mroute_mrt(mblk_t * mp,ip_stack_t * ipst)3356 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3357 {
3358 int i, j;
3359 struct mfc *rt;
3360 struct mfcctl mfcc;
3361
3362 /*
3363 * Make sure multicast has not been turned off.
3364 */
3365 if (is_mrouter_off(ipst))
3366 return (1);
3367
3368 /* Loop over all hash buckets and their chains */
3369 for (i = 0; i < MFCTBLSIZ; i++) {
3370 MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3371 for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3372 mutex_enter(&rt->mfc_mutex);
3373 if (rt->mfc_rte != NULL ||
3374 (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3375 mutex_exit(&rt->mfc_mutex);
3376 continue;
3377 }
3378 mfcc.mfcc_origin = rt->mfc_origin;
3379 mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3380 mfcc.mfcc_parent = rt->mfc_parent;
3381 mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3382 mutex_enter(&ipst->ips_numvifs_mutex);
3383 for (j = 0; j < (int)ipst->ips_numvifs; j++)
3384 mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3385 for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3386 mfcc.mfcc_ttls[j] = 0;
3387 mutex_exit(&ipst->ips_numvifs_mutex);
3388
3389 mutex_exit(&rt->mfc_mutex);
3390 if (!snmp_append_data(mp, (char *)&mfcc,
3391 sizeof (mfcc))) {
3392 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3393 ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3394 (size_t)sizeof (mfcc)));
3395 return (0);
3396 }
3397 }
3398 MFCB_REFRELE(&ipst->ips_mfcs[i]);
3399 }
3400 return (1);
3401 }
3402