xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ndp.c (revision ab82c29b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2019, Joyent, Inc.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/errno.h>
35 #include <sys/dlpi.h>
36 #include <sys/socket.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/vtrace.h>
42 #include <sys/kmem.h>
43 #include <sys/zone.h>
44 #include <sys/ethernet.h>
45 #include <sys/sdt.h>
46 #include <sys/mac.h>
47 
48 #include <net/if.h>
49 #include <net/if_types.h>
50 #include <net/if_dl.h>
51 #include <net/route.h>
52 #include <netinet/in.h>
53 #include <netinet/ip6.h>
54 #include <netinet/icmp6.h>
55 
56 #include <inet/common.h>
57 #include <inet/mi.h>
58 #include <inet/mib2.h>
59 #include <inet/nd.h>
60 #include <inet/ip.h>
61 #include <inet/ip_impl.h>
62 #include <inet/ipclassifier.h>
63 #include <inet/ip_if.h>
64 #include <inet/ip_ire.h>
65 #include <inet/ip_rts.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_ndp.h>
68 #include <inet/sctp_ip.h>
69 #include <inet/ip_arp.h>
70 #include <inet/ip2mac_impl.h>
71 
/*
 * Interval between announcements of an address: selects
 * ips_ip_ndp_unsolicit_interval when isv6 is true and
 * ips_ip_arp_publish_interval otherwise.  A local variable named `ipst'
 * must be in scope wherever this macro is expanded.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/*
 * Defense interval for an address: selects ips_ndp_defend_interval when
 * isv6 is true and ips_arp_defend_interval otherwise.  A local variable
 * named `ipst' must be in scope wherever this macro is expanded.
 */
#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * True when the first two bytes at ptr are 169 and 254.  The argument is
 * parenthesized before the cast so that an expression such as `p + 1' is
 * converted as a whole rather than having the cast bind to `p' alone.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
				((uchar_t *)(ptr))[1] == 254)
89 
/*
 * NCE_EXTERNAL_FLAGS_MASK is the set of ncec_flags that callers are allowed
 * to hand to the ncec*add* functions.
 *
 * Two of the flags deserve a note:
 *   NCE_F_AUTHORITY - incoming adverts for the mapping are ignored (DAD is
 *	still performed for the mapping).
 *   NCE_F_PUBLISH - we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK		\
	(NCE_F_MYADDR |			\
	NCE_F_ISROUTER |		\
	NCE_F_NONUD |			\
	NCE_F_ANYCAST |			\
	NCE_F_UNSOL_ADV |		\
	NCE_F_BCAST |			\
	NCE_F_MCAST |			\
	NCE_F_AUTHORITY |		\
	NCE_F_PUBLISH |			\
	NCE_F_STATIC)
102 
103 /*
104  * Lock ordering:
105  *
106  *	ndp_g_lock -> ill_lock -> ncec_lock
107  *
108  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
109  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
110  * ncec_refcnt).
111  */
112 
113 static	void	nce_cleanup_list(ncec_t *ncec);
114 static	void	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116     ncec_t *);
117 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
118 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119     uint16_t ncec_flags, nce_t **newnce);
120 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121     uint16_t ncec_flags, nce_t **newnce);
122 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
123     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124     const in6_addr_t *target, int flag);
125 static void	ncec_refhold_locked(ncec_t *);
126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129     uint16_t, uint16_t, nce_t **);
130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
131 static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
132 static void nce_inactive(nce_t *);
133 extern nce_t	*nce_lookup(ill_t *, const in6_addr_t *);
134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136     uint16_t, uint16_t, nce_t **);
137 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138     uint16_t, uint16_t, nce_t **);
139 static int  nce_add_v6_postprocess(nce_t *);
140 static int  nce_add_v4_postprocess(nce_t *);
141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 static void nce_resolv_ipmp_ok(ncec_t *);
144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 static void nce_start_timer(ncec_t *, uint_t);
146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 static void nce_fastpath_trigger(nce_t *);
148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149 
150 #ifdef DEBUG
151 static void	ncec_trace_cleanup(const ncec_t *);
152 #endif
153 
/*
 * Address of the hash bucket (within the nce_hash_tbl array of the IPv4 or
 * IPv6 table hanging off ipst) that addr hashes to.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	((ipst)->ips_ndp4->nce_hash_tbl +				\
	    (IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)))

#define	NCE_HASH_PTR_V6(ipst, addr)					\
	((ipst)->ips_ndp6->nce_hash_tbl +				\
	    (NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)))
160 
161 extern kmem_cache_t *ncec_cache;
162 extern kmem_cache_t *nce_cache;
163 
164 /*
165  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
166  * If src_ill is not null, the ncec_addr is bound to src_ill. The
167  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
168  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
169  * IPMP cast_ill (in the IPMP case).
170  *
171  * Note that the probe interval is based on the src_ill for IPv6, and
172  * the ncec_xmit_interval for IPv4.
173  */
174 static void
nce_dad(ncec_t * ncec,ill_t * src_ill,boolean_t send_probe)175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
176 {
177 	boolean_t dropped;
178 	uint32_t probe_interval;
179 
180 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
181 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
182 	if (ncec->ncec_ipversion == IPV6_VERSION) {
183 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
184 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
185 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
186 		probe_interval = ILL_PROBE_INTERVAL(src_ill);
187 	} else {
188 		/* IPv4 DAD delay the initial probe. */
189 		if (send_probe)
190 			dropped = arp_probe(ncec);
191 		else
192 			dropped = B_TRUE;
193 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
194 		    !send_probe);
195 	}
196 	if (!dropped) {
197 		mutex_enter(&ncec->ncec_lock);
198 		ncec->ncec_pcnt--;
199 		mutex_exit(&ncec->ncec_lock);
200 	}
201 	nce_restart_timer(ncec, probe_interval);
202 }
203 
204 /*
205  * Compute default flags to use for an advertisement of this ncec's address.
206  */
207 static int
nce_advert_flags(const ncec_t * ncec)208 nce_advert_flags(const ncec_t *ncec)
209 {
210 	int flag = 0;
211 
212 	if (ncec->ncec_flags & NCE_F_ISROUTER)
213 		flag |= NDP_ISROUTER;
214 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
215 		flag |= NDP_ORIDE;
216 
217 	return (flag);
218 }
219 
220 /*
221  * NDP Cache Entry creation routine.
222  * This routine must always be called with ndp6->ndp_g_lock held.
223  */
224 int
nce_add_v6(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
226     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
227 {
228 	int		err;
229 	nce_t		*nce;
230 
231 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
232 	ASSERT(ill != NULL && ill->ill_isv6);
233 
234 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
235 	    &nce);
236 	if (err != 0)
237 		return (err);
238 	ASSERT(newnce != NULL);
239 	*newnce = nce;
240 	return (err);
241 }
242 
243 /*
244  * Post-processing routine to be executed after nce_add_v6(). This function
245  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
246  * and must be called without any locks held.
247  */
248 int
nce_add_v6_postprocess(nce_t * nce)249 nce_add_v6_postprocess(nce_t *nce)
250 {
251 	ncec_t		*ncec = nce->nce_common;
252 	boolean_t	dropped = B_FALSE;
253 	uchar_t		*hw_addr = ncec->ncec_lladdr;
254 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
255 	ill_t		*ill = ncec->ncec_ill;
256 	int		err = 0;
257 	uint16_t	flags = ncec->ncec_flags;
258 	ip_stack_t	*ipst = ill->ill_ipst;
259 	boolean_t	trigger_fastpath = B_TRUE;
260 
261 	/*
262 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
263 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
264 	 * We call nce_fastpath from nce_update if the link layer address of
265 	 * the peer changes from nce_update
266 	 */
267 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
268 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
269 		trigger_fastpath = B_FALSE;
270 
271 	if (trigger_fastpath)
272 		nce_fastpath_trigger(nce);
273 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
274 		ill_t *hwaddr_ill;
275 		/*
276 		 * Unicast entry that needs DAD.
277 		 */
278 		if (IS_IPMP(ill)) {
279 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
280 			    hw_addr, hw_addr_len);
281 		} else {
282 			hwaddr_ill = ill;
283 		}
284 		nce_dad(ncec, hwaddr_ill, B_TRUE);
285 		err = EINPROGRESS;
286 	} else if (flags & NCE_F_UNSOL_ADV) {
287 		/*
288 		 * We account for the transmit below by assigning one
289 		 * less than the ndd variable. Subsequent decrements
290 		 * are done in nce_timer.
291 		 */
292 		mutex_enter(&ncec->ncec_lock);
293 		ncec->ncec_unsolicit_count =
294 		    ipst->ips_ip_ndp_unsolicit_count - 1;
295 		mutex_exit(&ncec->ncec_lock);
296 		dropped = ndp_xmit(ill,
297 		    ND_NEIGHBOR_ADVERT,
298 		    hw_addr,
299 		    hw_addr_len,
300 		    &ncec->ncec_addr,	/* Source and target of the adv */
301 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
302 		    nce_advert_flags(ncec));
303 		mutex_enter(&ncec->ncec_lock);
304 		if (dropped)
305 			ncec->ncec_unsolicit_count++;
306 		else
307 			ncec->ncec_last_time_defended = ddi_get_lbolt();
308 		if (ncec->ncec_unsolicit_count != 0) {
309 			nce_start_timer(ncec,
310 			    ipst->ips_ip_ndp_unsolicit_interval);
311 		}
312 		mutex_exit(&ncec->ncec_lock);
313 	}
314 	return (err);
315 }
316 
317 /*
318  * Atomically lookup and add (if needed) Neighbor Cache information for
319  * an address.
320  *
321  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
322  * are always added pointing at the ipmp_ill. Thus, when the ill passed
323  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
324  * entries will be created, both pointing at the same ncec_t. The nce_t
325  * entries will have their nce_ill set to the ipmp_ill and the under_ill
326  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
327  * Local addresses are always created on the ill passed to nce_add_v6.
328  */
329 int
nce_lookup_then_add_v6(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)330 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
331     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
332 {
333 	int		err = 0;
334 	ip_stack_t	*ipst = ill->ill_ipst;
335 	nce_t		*nce, *upper_nce = NULL;
336 	ill_t		*in_ill = ill;
337 	boolean_t	need_ill_refrele = B_FALSE;
338 
339 	if (flags & NCE_F_MCAST) {
340 		/*
341 		 * hw_addr will be figured out in nce_set_multicast_v6;
342 		 * caller has to select the cast_ill
343 		 */
344 		ASSERT(hw_addr == NULL);
345 		ASSERT(!IS_IPMP(ill));
346 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
347 		return (err);
348 	}
349 	ASSERT(ill->ill_isv6);
350 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
351 		ill = ipmp_ill_hold_ipmp_ill(ill);
352 		if (ill == NULL)
353 			return (ENXIO);
354 		need_ill_refrele = B_TRUE;
355 	}
356 
357 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
358 	nce = nce_lookup_addr(ill, addr);
359 	if (nce == NULL) {
360 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
361 		    &nce);
362 	} else {
363 		err = EEXIST;
364 	}
365 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
366 	if (err == 0)
367 		err = nce_add_v6_postprocess(nce);
368 	if (in_ill != ill && nce != NULL) {
369 		nce_t *under_nce = NULL;
370 
371 		/*
372 		 * in_ill was the under_ill. Try to create the under_nce.
373 		 * Hold the ill_g_lock to prevent changes to group membership
374 		 * until we are done.
375 		 */
376 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
377 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
378 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
379 			    ill_t *, ill);
380 			rw_exit(&ipst->ips_ill_g_lock);
381 			err = ENXIO;
382 			nce_refrele(nce);
383 			nce = NULL;
384 			goto bail;
385 		}
386 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
387 		if (under_nce == NULL) {
388 			rw_exit(&ipst->ips_ill_g_lock);
389 			err = EINVAL;
390 			nce_refrele(nce);
391 			nce = NULL;
392 			goto bail;
393 		}
394 		rw_exit(&ipst->ips_ill_g_lock);
395 		upper_nce = nce;
396 		nce = under_nce; /* will be returned to caller */
397 		if (NCE_ISREACHABLE(nce->nce_common))
398 			nce_fastpath_trigger(under_nce);
399 	}
400 	/* nce_refrele is deferred until the lock is dropped  */
401 	if (nce != NULL) {
402 		if (newnce != NULL)
403 			*newnce = nce;
404 		else
405 			nce_refrele(nce);
406 	}
407 bail:
408 	if (upper_nce != NULL)
409 		nce_refrele(upper_nce);
410 	if (need_ill_refrele)
411 		ill_refrele(ill);
412 	return (err);
413 }
414 
415 /*
416  * Remove all the CONDEMNED nces from the appropriate hash table.
417  * We create a private list of NCEs, these may have ires pointing
418  * to them, so the list will be passed through to clean up dependent
419  * ires and only then we can do ncec_refrele() which can make NCE inactive.
420  */
421 static void
nce_remove(ndp_g_t * ndp,ncec_t * ncec,ncec_t ** free_nce_list)422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
423 {
424 	ncec_t *ncec1;
425 	ncec_t **ptpn;
426 
427 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
428 	ASSERT(ndp->ndp_g_walker == 0);
429 	for (; ncec; ncec = ncec1) {
430 		ncec1 = ncec->ncec_next;
431 		mutex_enter(&ncec->ncec_lock);
432 		if (NCE_ISCONDEMNED(ncec)) {
433 			ptpn = ncec->ncec_ptpn;
434 			ncec1 = ncec->ncec_next;
435 			if (ncec1 != NULL)
436 				ncec1->ncec_ptpn = ptpn;
437 			*ptpn = ncec1;
438 			ncec->ncec_ptpn = NULL;
439 			ncec->ncec_next = NULL;
440 			ncec->ncec_next = *free_nce_list;
441 			*free_nce_list = ncec;
442 		}
443 		mutex_exit(&ncec->ncec_lock);
444 	}
445 }
446 
447 /*
448  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
449  *    will return this NCE. Also no new timeouts will
450  *    be started (See nce_restart_timer).
451  * 2. Cancel any currently running timeouts.
452  * 3. If there is an ndp walker, return. The walker will do the cleanup.
453  *    This ensures that walkers see a consistent list of NCEs while walking.
454  * 4. Otherwise remove the NCE from the list of NCEs
455  */
456 void
ncec_delete(ncec_t * ncec)457 ncec_delete(ncec_t *ncec)
458 {
459 	ncec_t	**ptpn;
460 	ncec_t	*ncec1;
461 	int	ipversion = ncec->ncec_ipversion;
462 	ndp_g_t *ndp;
463 	ip_stack_t	*ipst = ncec->ncec_ipst;
464 
465 	if (ipversion == IPV4_VERSION)
466 		ndp = ipst->ips_ndp4;
467 	else
468 		ndp = ipst->ips_ndp6;
469 
470 	/* Serialize deletes */
471 	mutex_enter(&ncec->ncec_lock);
472 	if (NCE_ISCONDEMNED(ncec)) {
473 		/* Some other thread is doing the delete */
474 		mutex_exit(&ncec->ncec_lock);
475 		return;
476 	}
477 	/*
478 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
479 	 * refcnt has to be >= 2
480 	 */
481 	ASSERT(ncec->ncec_refcnt >= 2);
482 	ncec->ncec_flags |= NCE_F_CONDEMNED;
483 	mutex_exit(&ncec->ncec_lock);
484 
485 	/* Count how many condemned ires for kmem_cache callback */
486 	atomic_inc_32(&ipst->ips_num_nce_condemned);
487 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
488 
489 	/* Complete any waiting callbacks */
490 	ncec_cb_dispatch(ncec);
491 
492 	/*
493 	 * Cancel any running timer. Timeout can't be restarted
494 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
495 	 * Passing invalid timeout id is fine.
496 	 */
497 	if (ncec->ncec_timeout_id != 0) {
498 		(void) untimeout(ncec->ncec_timeout_id);
499 		ncec->ncec_timeout_id = 0;
500 	}
501 
502 	mutex_enter(&ndp->ndp_g_lock);
503 	if (ncec->ncec_ptpn == NULL) {
504 		/*
505 		 * The last ndp walker has already removed this ncec from
506 		 * the list after we marked the ncec CONDEMNED and before
507 		 * we grabbed the global lock.
508 		 */
509 		mutex_exit(&ndp->ndp_g_lock);
510 		return;
511 	}
512 	if (ndp->ndp_g_walker > 0) {
513 		/*
514 		 * Can't unlink. The walker will clean up
515 		 */
516 		ndp->ndp_g_walker_cleanup = B_TRUE;
517 		mutex_exit(&ndp->ndp_g_lock);
518 		return;
519 	}
520 
521 	/*
522 	 * Now remove the ncec from the list. nce_restart_timer won't restart
523 	 * the timer since it is marked CONDEMNED.
524 	 */
525 	ptpn = ncec->ncec_ptpn;
526 	ncec1 = ncec->ncec_next;
527 	if (ncec1 != NULL)
528 		ncec1->ncec_ptpn = ptpn;
529 	*ptpn = ncec1;
530 	ncec->ncec_ptpn = NULL;
531 	ncec->ncec_next = NULL;
532 	mutex_exit(&ndp->ndp_g_lock);
533 
534 	/* Removed from ncec_ptpn/ncec_next list */
535 	ncec_refrele_notr(ncec);
536 }
537 
538 void
ncec_inactive(ncec_t * ncec)539 ncec_inactive(ncec_t *ncec)
540 {
541 	mblk_t		**mpp;
542 	ill_t		*ill = ncec->ncec_ill;
543 	ip_stack_t	*ipst = ncec->ncec_ipst;
544 
545 	ASSERT(ncec->ncec_refcnt == 0);
546 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
547 
548 	/* Count how many condemned nces for kmem_cache callback */
549 	if (NCE_ISCONDEMNED(ncec))
550 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
551 
552 	/* Free all allocated messages */
553 	mpp = &ncec->ncec_qd_mp;
554 	while (*mpp != NULL) {
555 		mblk_t  *mp;
556 
557 		mp = *mpp;
558 		*mpp = mp->b_next;
559 
560 		inet_freemsg(mp);
561 	}
562 	/*
563 	 * must have been cleaned up in ncec_delete
564 	 */
565 	ASSERT(list_is_empty(&ncec->ncec_cb));
566 	list_destroy(&ncec->ncec_cb);
567 	/*
568 	 * free the ncec_lladdr if one was allocated in nce_add_common()
569 	 */
570 	if (ncec->ncec_lladdr_length > 0)
571 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
572 
573 #ifdef DEBUG
574 	ncec_trace_cleanup(ncec);
575 #endif
576 
577 	mutex_enter(&ill->ill_lock);
578 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
579 	    (char *), "ncec", (void *), ncec);
580 	ill->ill_ncec_cnt--;
581 	ncec->ncec_ill = NULL;
582 	/*
583 	 * If the number of ncec's associated with this ill have dropped
584 	 * to zero, check whether we need to restart any operation that
585 	 * is waiting for this to happen.
586 	 */
587 	if (ILL_DOWN_OK(ill)) {
588 		/* ipif_ill_refrele_tail drops the ill_lock */
589 		ipif_ill_refrele_tail(ill);
590 	} else {
591 		mutex_exit(&ill->ill_lock);
592 	}
593 
594 	mutex_destroy(&ncec->ncec_lock);
595 	kmem_cache_free(ncec_cache, ncec);
596 }
597 
598 /*
599  * ncec_walk routine.  Delete the ncec if it is associated with the ill
600  * that is going away.  Always called as a writer.
601  */
602 void
ncec_delete_per_ill(ncec_t * ncec,void * arg)603 ncec_delete_per_ill(ncec_t *ncec, void *arg)
604 {
605 	if ((ncec != NULL) && ncec->ncec_ill == arg) {
606 		ncec_delete(ncec);
607 	}
608 }
609 
610 /*
611  * Neighbor Cache cleanup logic for a list of ncec_t entries.
612  */
613 static void
nce_cleanup_list(ncec_t * ncec)614 nce_cleanup_list(ncec_t *ncec)
615 {
616 	ncec_t *ncec_next;
617 
618 	ASSERT(ncec != NULL);
619 	while (ncec != NULL) {
620 		ncec_next = ncec->ncec_next;
621 		ncec->ncec_next = NULL;
622 
623 		/*
624 		 * It is possible for the last ndp walker (this thread)
625 		 * to come here after ncec_delete has marked the ncec CONDEMNED
626 		 * and before it has removed the ncec from the fastpath list
627 		 * or called untimeout. So we need to do it here. It is safe
628 		 * for both ncec_delete and this thread to do it twice or
629 		 * even simultaneously since each of the threads has a
630 		 * reference on the ncec.
631 		 */
632 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
633 		/*
634 		 * Cancel any running timer. Timeout can't be restarted
635 		 * since CONDEMNED is set. The ncec_lock can't be
636 		 * held across untimeout though passing invalid timeout
637 		 * id is fine.
638 		 */
639 		if (ncec->ncec_timeout_id != 0) {
640 			(void) untimeout(ncec->ncec_timeout_id);
641 			ncec->ncec_timeout_id = 0;
642 		}
643 		/* Removed from ncec_ptpn/ncec_next list */
644 		ncec_refrele_notr(ncec);
645 		ncec = ncec_next;
646 	}
647 }
648 
649 /*
650  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
651  */
652 boolean_t
nce_restart_dad(ncec_t * ncec)653 nce_restart_dad(ncec_t *ncec)
654 {
655 	boolean_t started;
656 	ill_t *ill, *hwaddr_ill;
657 
658 	if (ncec == NULL)
659 		return (B_FALSE);
660 	ill = ncec->ncec_ill;
661 	mutex_enter(&ncec->ncec_lock);
662 	if (ncec->ncec_state == ND_PROBE) {
663 		mutex_exit(&ncec->ncec_lock);
664 		started = B_TRUE;
665 	} else if (ncec->ncec_state == ND_REACHABLE) {
666 		ASSERT(ncec->ncec_lladdr != NULL);
667 		ncec->ncec_state = ND_PROBE;
668 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
669 		/*
670 		 * Slight cheat here: we don't use the initial probe delay
671 		 * for IPv4 in this obscure case.
672 		 */
673 		mutex_exit(&ncec->ncec_lock);
674 		if (IS_IPMP(ill)) {
675 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
676 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
677 		} else {
678 			hwaddr_ill = ill;
679 		}
680 		nce_dad(ncec, hwaddr_ill, B_TRUE);
681 		started = B_TRUE;
682 	} else {
683 		mutex_exit(&ncec->ncec_lock);
684 		started = B_FALSE;
685 	}
686 	return (started);
687 }
688 
689 /*
690  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
691  * If one is found, the refcnt on the ncec will be incremented.
692  */
693 ncec_t *
ncec_lookup_illgrp_v6(ill_t * ill,const in6_addr_t * addr)694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
695 {
696 	ncec_t		*ncec;
697 	ip_stack_t	*ipst = ill->ill_ipst;
698 
699 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
700 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
701 
702 	/* Get head of v6 hash table */
703 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
704 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
705 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
706 	rw_exit(&ipst->ips_ill_g_lock);
707 	return (ncec);
708 }
709 /*
710  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
711  * If one is found, the refcnt on the ncec will be incremented.
712  */
713 ncec_t *
ncec_lookup_illgrp_v4(ill_t * ill,const in_addr_t * addr)714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
715 {
716 	ncec_t	*ncec = NULL;
717 	in6_addr_t addr6;
718 	ip_stack_t *ipst = ill->ill_ipst;
719 
720 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
721 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
722 
723 	/* Get head of v4 hash table */
724 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
725 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
726 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
727 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
728 	rw_exit(&ipst->ips_ill_g_lock);
729 	return (ncec);
730 }
731 
732 /*
733  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
734  * If an ncec is found, increment the hold count on that ncec.
735  * The caller passes in the start of the appropriate hash table, and must
736  * be holding the appropriate global lock (ndp_g_lock). In addition, since
737  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
738  * must be held as reader.
739  *
740  * This function always matches across the ipmp group.
741  */
742 ncec_t *
ncec_lookup_illgrp(ill_t * ill,const in6_addr_t * addr,ncec_t * ncec)743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
744 {
745 	ndp_g_t		*ndp;
746 	ip_stack_t	*ipst = ill->ill_ipst;
747 
748 	if (ill->ill_isv6)
749 		ndp = ipst->ips_ndp6;
750 	else
751 		ndp = ipst->ips_ndp4;
752 
753 	ASSERT(ill != NULL);
754 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
755 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
756 		return (NULL);
757 	for (; ncec != NULL; ncec = ncec->ncec_next) {
758 		if (ncec->ncec_ill == ill ||
759 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
760 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
761 				mutex_enter(&ncec->ncec_lock);
762 				if (!NCE_ISCONDEMNED(ncec)) {
763 					ncec_refhold_locked(ncec);
764 					mutex_exit(&ncec->ncec_lock);
765 					break;
766 				}
767 				mutex_exit(&ncec->ncec_lock);
768 			}
769 		}
770 	}
771 	return (ncec);
772 }
773 
774 /*
775  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
776  * entries for ill only, i.e., when ill is part of an ipmp group,
777  * nce_lookup_v4 will never try to match across the group.
778  */
779 nce_t *
nce_lookup_v4(ill_t * ill,const in_addr_t * addr)780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
781 {
782 	nce_t *nce;
783 	in6_addr_t addr6;
784 	ip_stack_t *ipst = ill->ill_ipst;
785 
786 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
787 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
788 	nce = nce_lookup_addr(ill, &addr6);
789 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
790 	return (nce);
791 }
792 
793 /*
794  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
795  * entries for ill only, i.e., when ill is part of an ipmp group,
796  * nce_lookup_v6 will never try to match across the group.
797  */
798 nce_t *
nce_lookup_v6(ill_t * ill,const in6_addr_t * addr6)799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
800 {
801 	nce_t *nce;
802 	ip_stack_t *ipst = ill->ill_ipst;
803 
804 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
805 	nce = nce_lookup_addr(ill, addr6);
806 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
807 	return (nce);
808 }
809 
810 static nce_t *
nce_lookup_addr(ill_t * ill,const in6_addr_t * addr)811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
812 {
813 	nce_t *nce;
814 
815 	ASSERT(ill != NULL);
816 #ifdef DEBUG
817 	if (ill->ill_isv6)
818 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
819 	else
820 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
821 #endif
822 	mutex_enter(&ill->ill_lock);
823 	nce = nce_lookup(ill, addr);
824 	mutex_exit(&ill->ill_lock);
825 	return (nce);
826 }
827 
828 
829 /*
830  * Router turned to host.  We need to make sure that cached copies of the ncec
831  * are not used for forwarding packets if they were derived from the default
832  * route, and that the default route itself is removed, as  required by
833  * section 7.2.5 of RFC 2461.
834  *
835  * Note that the ncec itself probably has valid link-layer information for the
836  * nexthop, so that there is no reason to delete the ncec, as long as the
837  * ISROUTER flag is turned off.
838  */
839 static void
ncec_router_to_host(ncec_t * ncec)840 ncec_router_to_host(ncec_t *ncec)
841 {
842 	ire_t		*ire;
843 	ip_stack_t	*ipst = ncec->ncec_ipst;
844 
845 	mutex_enter(&ncec->ncec_lock);
846 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
847 	mutex_exit(&ncec->ncec_lock);
848 
849 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
850 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
851 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
852 	if (ire != NULL) {
853 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
854 		ire_delete(ire);
855 		ire_refrele(ire);
856 	}
857 }
858 
859 /*
860  * Process passed in parameters either from an incoming packet or via
861  * user ioctl.
862  */
863 void
nce_process(ncec_t * ncec,uchar_t * hw_addr,uint32_t flag,boolean_t is_adv)864 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
865 {
866 	ill_t	*ill = ncec->ncec_ill;
867 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
868 	boolean_t ll_updated = B_FALSE;
869 	boolean_t ll_changed;
870 	nce_t	*nce;
871 
872 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
873 	/*
874 	 * No updates of link layer address or the neighbor state is
875 	 * allowed, when the cache is in NONUD state.  This still
876 	 * allows for responding to reachability solicitation.
877 	 */
878 	mutex_enter(&ncec->ncec_lock);
879 	if (ncec->ncec_state == ND_INCOMPLETE) {
880 		if (hw_addr == NULL) {
881 			mutex_exit(&ncec->ncec_lock);
882 			return;
883 		}
884 		nce_set_ll(ncec, hw_addr);
885 		/*
886 		 * Update ncec state and send the queued packets
887 		 * back to ip this time ire will be added.
888 		 */
889 		if (flag & ND_NA_FLAG_SOLICITED) {
890 			nce_update(ncec, ND_REACHABLE, NULL);
891 		} else {
892 			nce_update(ncec, ND_STALE, NULL);
893 		}
894 		mutex_exit(&ncec->ncec_lock);
895 		nce = nce_fastpath(ncec, B_TRUE, NULL);
896 		nce_resolv_ok(ncec);
897 		if (nce != NULL)
898 			nce_refrele(nce);
899 		return;
900 	}
901 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
902 	if (!is_adv) {
903 		/* If this is a SOLICITATION request only */
904 		if (ll_changed)
905 			nce_update(ncec, ND_STALE, hw_addr);
906 		mutex_exit(&ncec->ncec_lock);
907 		ncec_cb_dispatch(ncec);
908 		return;
909 	}
910 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
911 		/* If in any other state than REACHABLE, ignore */
912 		if (ncec->ncec_state == ND_REACHABLE) {
913 			nce_update(ncec, ND_STALE, NULL);
914 		}
915 		mutex_exit(&ncec->ncec_lock);
916 		ncec_cb_dispatch(ncec);
917 		return;
918 	} else {
919 		if (ll_changed) {
920 			nce_update(ncec, ND_UNCHANGED, hw_addr);
921 			ll_updated = B_TRUE;
922 		}
923 		if (flag & ND_NA_FLAG_SOLICITED) {
924 			nce_update(ncec, ND_REACHABLE, NULL);
925 		} else {
926 			if (ll_updated) {
927 				nce_update(ncec, ND_STALE, NULL);
928 			}
929 		}
930 		mutex_exit(&ncec->ncec_lock);
931 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
932 		    NCE_F_ISROUTER)) {
933 			ncec_router_to_host(ncec);
934 		} else {
935 			ncec_cb_dispatch(ncec);
936 		}
937 	}
938 }
939 
940 /*
941  * Pass arg1 to the cbf supplied, along with each ncec in existence.
942  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
943  * walking the hash list.
944  */
945 void
ncec_walk_common(ndp_g_t * ndp,ill_t * ill,ncec_walk_cb_t cbf,void * arg1,boolean_t trace)946 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
947     void *arg1, boolean_t trace)
948 {
949 	ncec_t	*ncec;
950 	ncec_t	*ncec1;
951 	ncec_t	**ncep;
952 	ncec_t	*free_nce_list = NULL;
953 
954 	mutex_enter(&ndp->ndp_g_lock);
955 	/* Prevent ncec_delete from unlink and free of NCE */
956 	ndp->ndp_g_walker++;
957 	mutex_exit(&ndp->ndp_g_lock);
958 	for (ncep = ndp->nce_hash_tbl;
959 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
960 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
961 			ncec1 = ncec->ncec_next;
962 			if (ill == NULL || ncec->ncec_ill == ill) {
963 				if (trace) {
964 					ncec_refhold(ncec);
965 					(*cbf)(ncec, arg1);
966 					ncec_refrele(ncec);
967 				} else {
968 					ncec_refhold_notr(ncec);
969 					(*cbf)(ncec, arg1);
970 					ncec_refrele_notr(ncec);
971 				}
972 			}
973 		}
974 	}
975 	mutex_enter(&ndp->ndp_g_lock);
976 	ndp->ndp_g_walker--;
977 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
978 		/* Time to delete condemned entries */
979 		for (ncep = ndp->nce_hash_tbl;
980 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
981 			ncec = *ncep;
982 			if (ncec != NULL) {
983 				nce_remove(ndp, ncec, &free_nce_list);
984 			}
985 		}
986 		ndp->ndp_g_walker_cleanup = B_FALSE;
987 	}
988 
989 	mutex_exit(&ndp->ndp_g_lock);
990 
991 	if (free_nce_list != NULL) {
992 		nce_cleanup_list(free_nce_list);
993 	}
994 }
995 
996 /*
997  * Walk everything.
998  * Note that ill can be NULL hence can't derive the ipst from it.
999  */
1000 void
ncec_walk(ill_t * ill,ncec_walk_cb_t cbf,void * arg1,ip_stack_t * ipst)1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003 	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004 	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
1006 
/*
 * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
 * NCEs, and the number to reclaim if we hit the limit.  Used by
 * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
 * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
 *
 * Both values are sampled by nce_too_many_mcast() each time it runs, so a
 * change takes effect on its next call.
 */

/*
 * Maximum number of multicast NCEs on an ill.  nce_too_many_mcast() compares
 * ill_mcast_nces against this value.
 */
uint_t ip_max_ill_mcast_nces = 16384;
/*
 * Number of NCEs to delete if we hit the maximum above.  0 means *don't* and
 * return an error.  Non-zero means delete so many, and if the number is >=
 * the max above, that means delete them all (the count is clamped to the
 * maximum before use).
 */
uint_t ip_ill_mcast_reclaim = 256;
1022 
1023 /*
1024  * Encapsulate multicast ill capping in a function, for easier DTrace
1025  * detections.  Return a list of refheld NCEs to destroy-via-refrele.  That
1026  * list can be NULL, but can only be non-NULL if we successfully reclaimed.
1027  *
1028  * NOTE:  This function must be called while holding the ill_lock AND
1029  * JUST PRIOR to making the insertion into the ill_nce list.
1030  *
1031  * We can't release the ones we delete ourselves because the ill_lock is held
1032  * by the caller. They are, instead, passed back in a list_t for deletion
1033  * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
1034  *
1035  * While this covers nce_t, ncec_t gets done even further down the road.  See
1036  * nce_graveyard_free() for why.
1037  */
1038 static boolean_t
nce_too_many_mcast(ill_t * ill,list_t * graveyard)1039 nce_too_many_mcast(ill_t *ill, list_t *graveyard)
1040 {
1041 	uint_t reclaim_count, max_count, reclaimed = 0;
1042 	boolean_t too_many;
1043 	nce_t *nce, *deadman;
1044 
1045 	ASSERT(graveyard != NULL);
1046 	ASSERT(list_is_empty(graveyard));
1047 	ASSERT(MUTEX_HELD(&ill->ill_lock));
1048 
1049 	/*
1050 	 * NOTE: Some grinning weirdo may have lowered the global max beyond
1051 	 * what this ill currently has.  The behavior in this case will be
1052 	 * trim-back just by the reclaim amount for any new ones.
1053 	 */
1054 	max_count = ip_max_ill_mcast_nces;
1055 	reclaim_count = min(ip_ill_mcast_reclaim, max_count);
1056 
1057 	/* All good? */
1058 	if (ill->ill_mcast_nces < max_count)
1059 		return (B_FALSE);	/* Yes, all good. */
1060 
1061 	if (reclaim_count == 0)
1062 		return (B_TRUE);	/* Don't bother - we're stuck. */
1063 
1064 	/* We need to reclaim now.  Exploit our held ill_lock. */
1065 
1066 	/*
1067 	 * Start at the tail and work backwards, new nces are head-inserted,
1068 	 * so we'll be reaping the oldest entries.
1069 	 */
1070 	nce = list_tail(&ill->ill_nce);
1071 	while (reclaimed < reclaim_count) {
1072 		/* Skip ahead to a multicast NCE. */
1073 		while (nce != NULL &&
1074 		    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
1075 			nce = list_prev(&ill->ill_nce, nce);
1076 		}
1077 		if (nce == NULL)
1078 			break;
1079 
1080 		/*
1081 		 * NOTE: For now, we just delete the first one(s) we find.
1082 		 * This is not optimal, and may require some inspection of nce
1083 		 * & its ncec to be better.
1084 		 */
1085 		deadman = nce;
1086 		nce = list_prev(&ill->ill_nce, nce);
1087 
1088 		/* nce_delete() requires caller holds... */
1089 		nce_refhold(deadman);
1090 		nce_delete(deadman);	/* Bumps down ill_mcast_nces. */
1091 
1092 		/* Link the dead ones singly, still refheld... */
1093 		list_insert_tail(graveyard, deadman);
1094 		reclaimed++;
1095 	}
1096 
1097 	if (reclaimed != reclaim_count) {
1098 		/* We didn't have enough to reach reclaim_count. Why?!? */
1099 		DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
1100 		    uint_t, reclaimed, uint_t, reclaim_count);
1101 
1102 		/* In case for some REALLY weird reason we found none! */
1103 		too_many = (reclaimed == 0);
1104 	} else {
1105 		too_many = B_FALSE;
1106 	}
1107 
1108 	return (too_many);
1109 }
1110 
1111 static void
ncec_mcast_reap_one(ncec_t * ncec,void * arg)1112 ncec_mcast_reap_one(ncec_t *ncec, void *arg)
1113 {
1114 	boolean_t reapit;
1115 	ill_t *ill = (ill_t *)arg;
1116 
1117 	/* Obvious no-lock-needed checks... */
1118 	if (ncec == NULL || ncec->ncec_ill != ill ||
1119 	    (ncec->ncec_flags & NCE_F_MCAST) == 0)
1120 		return;
1121 
1122 	mutex_enter(&ncec->ncec_lock);
1123 	/*
1124 	 * It's refheld by the walk infrastructure. It has one reference for
1125 	 * being in the ndp_g_hash, and if an nce_t exists, that's one more.
1126 	 * We want ones without an nce_t, so 2 is the magic number.  If it's
1127 	 * LESS than 2, we have much bigger problems anyway.
1128 	 */
1129 	ASSERT(ncec->ncec_refcnt >= 2);
1130 	reapit = (ncec->ncec_refcnt == 2);
1131 	mutex_exit(&ncec->ncec_lock);
1132 
1133 	if (reapit) {
1134 		IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
1135 		ncec_delete(ncec);
1136 	}
1137 }
1138 
1139 /*
1140  * Attempt to reap stray multicast ncec_t structures left in the wake of
1141  * nce_graveyard_free(). This is a taskq servicing routine, as it's well
1142  * outside any netstack-global locks being held - ndp_g_lock in this case.  We
1143  * have a reference hold on the ill, which will prevent any unplumbing races.
1144  */
1145 static void
ncec_mcast_reap(void * arg)1146 ncec_mcast_reap(void *arg)
1147 {
1148 	ill_t *ill = (ill_t *)arg;
1149 
1150 	IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
1151 	ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
1152 	mutex_enter(&ill->ill_lock);
1153 	ill->ill_mcast_ncec_cleanup = B_FALSE;
1154 	/*
1155 	 * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
1156 	 * below for why.
1157 	 */
1158 	ill->ill_refcnt--;
1159 	if (ill->ill_refcnt == 0)
1160 		ipif_ill_refrele_tail(ill);	/* Drops ill_lock. */
1161 	else
1162 		mutex_exit(&ill->ill_lock);
1163 }
1164 
1165 /*
1166  * Free a list (including handling an empty list or NULL list) of
1167  * reference-held NCEs that were reaped from a nce_too_many_mcast()
1168  * call. Separate because the caller must have dropped ndp_g_lock first.
1169  *
1170  * This also schedules a taskq task to unlink underlying NCECs from the
1171  * ndp_g_hash, which are protected by ndp_g_lock.
1172  */
1173 static void
nce_graveyard_free(list_t * graveyard)1174 nce_graveyard_free(list_t *graveyard)
1175 {
1176 	nce_t *deadman, *current;
1177 	ill_t *ill;
1178 	boolean_t doit;
1179 
1180 	if (graveyard == NULL)
1181 		return;
1182 
1183 	current = list_head(graveyard);
1184 	if (current == NULL) {
1185 		list_destroy(graveyard);
1186 		return;
1187 	}
1188 
1189 	ill = current->nce_ill;
1190 	/*
1191 	 * Normally one should ill_refhold(ill) here.  There's no _notr()
1192 	 * variant like there is for ire_t, dce_t, or even ncec_t, but this is
1193 	 * the ONLY case that'll break the mh_trace that IP debugging uses for
1194 	 * reference counts (i.e. they assume same thread releases as
1195 	 * holds). Instead, we inline ill_refhold() here.  We must do the same
1196 	 * in the release done by the ncec_mcast_reap() above.
1197 	 */
1198 	mutex_enter(&ill->ill_lock);
1199 	ill->ill_refcnt++;
1200 	mutex_exit(&ill->ill_lock);
1201 
1202 	while (current != NULL) {
1203 		ASSERT3P(ill, ==, current->nce_ill);
1204 		deadman = current;
1205 		current = list_next(graveyard, deadman);
1206 		list_remove(graveyard, deadman);
1207 		ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
1208 		    0);
1209 		nce_refrele(deadman);
1210 	}
1211 	list_destroy(graveyard);
1212 
1213 	mutex_enter(&ill->ill_lock);
1214 	if (ill->ill_mcast_ncec_cleanup)
1215 		doit = B_FALSE;
1216 	else {
1217 		ill->ill_mcast_ncec_cleanup = B_TRUE;
1218 		doit = B_TRUE;
1219 	}
1220 	mutex_exit(&ill->ill_lock);
1221 	if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
1222 	    ill, TQ_NOSLEEP) == TASKQID_INVALID) {
1223 		mutex_enter(&ill->ill_lock);
1224 		if (doit) {
1225 			IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
1226 			ill->ill_mcast_ncec_cleanup = B_FALSE;
1227 		}
1228 		/* There's no _notr() for ill_refrele(), so inline it here. */
1229 		ill->ill_refcnt--;
1230 		if (ill->ill_refcnt == 0)
1231 			ipif_ill_refrele_tail(ill);	/* Drops ill_lock */
1232 		else
1233 			mutex_exit(&ill->ill_lock);
1234 	}
1235 }
1236 
1237 /*
1238  * For each interface an entry is added for the unspecified multicast group.
1239  * Here that mapping is used to form the multicast cache entry for a particular
1240  * multicast destination.
1241  */
1242 static int
nce_set_multicast_v6(ill_t * ill,const in6_addr_t * dst,uint16_t flags,nce_t ** newnce)1243 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1244     uint16_t flags, nce_t **newnce)
1245 {
1246 	uchar_t		*hw_addr;
1247 	int		err = 0;
1248 	ip_stack_t	*ipst = ill->ill_ipst;
1249 	nce_t		*nce;
1250 
1251 	ASSERT(ill != NULL);
1252 	ASSERT(ill->ill_isv6);
1253 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1254 
1255 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1256 	nce = nce_lookup_addr(ill, dst);
1257 	if (nce != NULL) {
1258 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1259 		goto done;
1260 	}
1261 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1262 		/*
1263 		 * For IRE_IF_RESOLVER a hardware mapping can be
1264 		 * generated.
1265 		 */
1266 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1267 		if (hw_addr == NULL) {
1268 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1269 			return (ENOMEM);
1270 		}
1271 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1272 	} else {
1273 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
1274 		hw_addr = NULL;
1275 	}
1276 	ASSERT((flags & NCE_F_MCAST) != 0);
1277 	ASSERT((flags & NCE_F_NONUD) != 0);
1278 	/* nce_state will be computed by nce_add_common() */
1279 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1280 	    ND_UNCHANGED, &nce);
1281 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1282 	if (err == 0)
1283 		err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
1284 	if (hw_addr != NULL)
1285 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1286 	if (err != 0) {
1287 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1288 		return (err);
1289 	}
1290 done:
1291 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1292 	if (newnce != NULL)
1293 		*newnce = nce;
1294 	else
1295 		nce_refrele(nce);
1296 	return (0);
1297 }
1298 
1299 /*
1300  * Return the link layer address, and any flags of a ncec.
1301  */
1302 int
ndp_query(ill_t * ill,struct lif_nd_req * lnr)1303 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1304 {
1305 	ncec_t		*ncec;
1306 	in6_addr_t	*addr;
1307 	sin6_t		*sin6;
1308 
1309 	ASSERT(ill != NULL && ill->ill_isv6);
1310 	sin6 = (sin6_t *)&lnr->lnr_addr;
1311 	addr =  &sin6->sin6_addr;
1312 
1313 	/*
1314 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1315 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1316 	 * addresses for the data addresses on an IPMP interface even though
1317 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1318 	 */
1319 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1320 	if (ncec == NULL)
1321 		return (ESRCH);
1322 	/* If no link layer address is available yet, return ESRCH */
1323 	if (!NCE_ISREACHABLE(ncec)) {
1324 		ncec_refrele(ncec);
1325 		return (ESRCH);
1326 	}
1327 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1328 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1329 	    lnr->lnr_hdw_len);
1330 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1331 		lnr->lnr_flags = NDF_ISROUTER_ON;
1332 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1333 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1334 	if (ncec->ncec_flags & NCE_F_STATIC)
1335 		lnr->lnr_flags |= NDF_STATIC;
1336 	ncec_refrele(ncec);
1337 	return (0);
1338 }
1339 
1340 /*
1341  * Finish setting up the Enable/Disable multicast for the driver.
1342  */
1343 mblk_t *
ndp_mcastreq(ill_t * ill,const in6_addr_t * v6group,uint32_t hw_addr_len,uint32_t hw_addr_offset,mblk_t * mp)1344 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1345     uint32_t hw_addr_offset, mblk_t *mp)
1346 {
1347 	uchar_t		*hw_addr;
1348 	ipaddr_t	v4group;
1349 	uchar_t		*addr;
1350 
1351 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1352 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1353 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1354 
1355 		ASSERT(CLASSD(v4group));
1356 		ASSERT(!(ill->ill_isv6));
1357 
1358 		addr = (uchar_t *)&v4group;
1359 	} else {
1360 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1361 		ASSERT(ill->ill_isv6);
1362 
1363 		addr = (uchar_t *)v6group;
1364 	}
1365 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1366 	if (hw_addr == NULL) {
1367 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1368 		freemsg(mp);
1369 		return (NULL);
1370 	}
1371 
1372 	ip_mcast_mapping(ill, addr, hw_addr);
1373 	return (mp);
1374 }
1375 
1376 void
ip_ndp_resolve(ncec_t * ncec)1377 ip_ndp_resolve(ncec_t *ncec)
1378 {
1379 	in_addr_t	sender4 = INADDR_ANY;
1380 	in6_addr_t	sender6 = ipv6_all_zeros;
1381 	ill_t		*src_ill;
1382 	uint32_t	ms;
1383 
1384 	src_ill = nce_resolve_src(ncec, &sender6);
1385 	if (src_ill == NULL) {
1386 		/* Make sure we try again later */
1387 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1388 		nce_restart_timer(ncec, (clock_t)ms);
1389 		return;
1390 	}
1391 	if (ncec->ncec_ipversion == IPV4_VERSION)
1392 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1393 	mutex_enter(&ncec->ncec_lock);
1394 	if (ncec->ncec_ipversion == IPV6_VERSION)
1395 		ms = ndp_solicit(ncec, sender6, src_ill);
1396 	else
1397 		ms = arp_request(ncec, sender4, src_ill);
1398 	mutex_exit(&ncec->ncec_lock);
1399 	if (ms == 0) {
1400 		if (ncec->ncec_state != ND_REACHABLE) {
1401 			if (ncec->ncec_ipversion == IPV6_VERSION)
1402 				ndp_resolv_failed(ncec);
1403 			else
1404 				arp_resolv_failed(ncec);
1405 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1406 			nce_make_unreachable(ncec);
1407 			ncec_delete(ncec);
1408 		}
1409 	} else {
1410 		nce_restart_timer(ncec, (clock_t)ms);
1411 	}
1412 done:
1413 	ill_refrele(src_ill);
1414 }
1415 
1416 /*
1417  * Send an IPv6 neighbor solicitation.
1418  * Returns number of milliseconds after which we should either rexmit or abort.
1419  * Return of zero means we should abort.
1420  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1421  * The optional source address is used as a hint to ndp_solicit for
1422  * which source to use in the packet.
1423  *
1424  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1425  * the packet.
1426  */
1427 uint32_t
ndp_solicit(ncec_t * ncec,in6_addr_t src,ill_t * ill)1428 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1429 {
1430 	in6_addr_t	dst;
1431 	boolean_t	dropped = B_FALSE;
1432 
1433 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1434 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1435 
1436 	if (ncec->ncec_rcnt == 0)
1437 		return (0);
1438 
1439 	dst = ncec->ncec_addr;
1440 	ncec->ncec_rcnt--;
1441 	mutex_exit(&ncec->ncec_lock);
1442 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1443 	    ill->ill_phys_addr_length, &src, &dst, 0);
1444 	mutex_enter(&ncec->ncec_lock);
1445 	if (dropped)
1446 		ncec->ncec_rcnt++;
1447 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1448 }
1449 
1450 /*
1451  * Attempt to recover an address on an interface that's been marked as a
1452  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1453  * no easy way to just probe the address and have the right thing happen if
1454  * it's no longer in use.  Instead, we just bring it up normally and allow the
1455  * regular interface start-up logic to probe for a remaining duplicate and take
1456  * us back down if necessary.
1457  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1458  * ip_ndp_excl.
1459  */
1460 /* ARGSUSED */
1461 void
ip_addr_recover(ipsq_t * ipsq,queue_t * rq,mblk_t * mp,void * dummy_arg)1462 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1463 {
1464 	ill_t	*ill = rq->q_ptr;
1465 	ipif_t	*ipif;
1466 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1467 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1468 	boolean_t addr_equal;
1469 
1470 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1471 		/*
1472 		 * We do not support recovery of proxy ARP'd interfaces,
1473 		 * because the system lacks a complete proxy ARP mechanism.
1474 		 */
1475 		if (ill->ill_isv6) {
1476 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1477 			    addr6);
1478 		} else {
1479 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1480 		}
1481 
1482 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1483 			continue;
1484 
1485 		/*
1486 		 * If we have already recovered or if the interface is going
1487 		 * away, then ignore.
1488 		 */
1489 		mutex_enter(&ill->ill_lock);
1490 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1491 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1492 			mutex_exit(&ill->ill_lock);
1493 			continue;
1494 		}
1495 
1496 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1497 		ill->ill_ipif_dup_count--;
1498 		mutex_exit(&ill->ill_lock);
1499 		ipif->ipif_was_dup = B_TRUE;
1500 
1501 		if (ill->ill_isv6) {
1502 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1503 			(void) ipif_up_done_v6(ipif);
1504 		} else {
1505 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1506 			    EINPROGRESS);
1507 			(void) ipif_up_done(ipif);
1508 		}
1509 	}
1510 	freeb(mp);
1511 }
1512 
1513 /*
1514  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1515  * As long as someone else holds the address, the interface will stay down.
1516  * When that conflict goes away, the interface is brought back up.  This is
1517  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1518  * server will recover from a failure.
1519  *
1520  * For DHCP and temporary addresses, recovery is not done in the kernel.
1521  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1522  *
1523  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1524  */
1525 void
ipif_dup_recovery(void * arg)1526 ipif_dup_recovery(void *arg)
1527 {
1528 	ipif_t *ipif = arg;
1529 
1530 	ipif->ipif_recovery_id = 0;
1531 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1532 		return;
1533 
1534 	/*
1535 	 * No lock, because this is just an optimization.
1536 	 */
1537 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1538 		return;
1539 
1540 	/* If the link is down, we'll retry this later */
1541 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1542 		return;
1543 
1544 	ipif_do_recovery(ipif);
1545 }
1546 
1547 /*
1548  * Perform interface recovery by forcing the duplicate interfaces up and
1549  * allowing the system to determine which ones should stay up.
1550  *
1551  * Called both by recovery timer expiry and link-up notification.
1552  */
1553 void
ipif_do_recovery(ipif_t * ipif)1554 ipif_do_recovery(ipif_t *ipif)
1555 {
1556 	ill_t *ill = ipif->ipif_ill;
1557 	mblk_t *mp;
1558 	ip_stack_t *ipst = ill->ill_ipst;
1559 	size_t mp_size;
1560 
1561 	if (ipif->ipif_isv6)
1562 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1563 	else
1564 		mp_size = sizeof (ipif->ipif_lcl_addr);
1565 	mp = allocb(mp_size, BPRI_MED);
1566 	if (mp == NULL) {
1567 		mutex_enter(&ill->ill_lock);
1568 		if (ipst->ips_ip_dup_recovery > 0 &&
1569 		    ipif->ipif_recovery_id == 0 &&
1570 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1571 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1572 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1573 		}
1574 		mutex_exit(&ill->ill_lock);
1575 	} else {
1576 		/*
1577 		 * A recovery timer may still be running if we got here from
1578 		 * ill_restart_dad(); cancel that timer.
1579 		 */
1580 		if (ipif->ipif_recovery_id != 0)
1581 			(void) untimeout(ipif->ipif_recovery_id);
1582 		ipif->ipif_recovery_id = 0;
1583 
1584 		if (ipif->ipif_isv6) {
1585 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1586 			    sizeof (ipif->ipif_v6lcl_addr));
1587 		} else  {
1588 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1589 			    sizeof (ipif->ipif_lcl_addr));
1590 		}
1591 		ill_refhold(ill);
1592 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1593 		    B_FALSE);
1594 	}
1595 }
1596 
1597 /*
1598  * Find the MAC and IP addresses in an NA/NS message.
1599  */
1600 static void
ip_ndp_find_addresses(mblk_t * mp,ip_recv_attr_t * ira,ill_t * ill,in6_addr_t * targp,uchar_t ** haddr,uint_t * haddrlenp)1601 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1602     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1603 {
1604 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1605 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1606 	uchar_t *addr;
1607 	int alen;
1608 
1609 	/* icmp_inbound_v6 ensures this */
1610 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1611 
1612 	addr = ira->ira_l2src;
1613 	alen = ill->ill_phys_addr_length;
1614 	if (alen > 0) {
1615 		*haddr = addr;
1616 		*haddrlenp = alen;
1617 	} else {
1618 		*haddr = NULL;
1619 		*haddrlenp = 0;
1620 	}
1621 
1622 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1623 	*targp = ns->nd_ns_target;
1624 }
1625 
1626 /*
1627  * This is for exclusive changes due to NDP duplicate address detection
1628  * failure.
1629  */
1630 /* ARGSUSED */
1631 static void
ip_ndp_excl(ipsq_t * ipsq,queue_t * rq,mblk_t * mp,void * dummy_arg)1632 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1633 {
1634 	ill_t	*ill = rq->q_ptr;
1635 	ipif_t	*ipif;
1636 	uchar_t	*haddr;
1637 	uint_t	haddrlen;
1638 	ip_stack_t *ipst = ill->ill_ipst;
1639 	in6_addr_t targ;
1640 	ip_recv_attr_t iras;
1641 	mblk_t	*attrmp;
1642 
1643 	attrmp = mp;
1644 	mp = mp->b_cont;
1645 	attrmp->b_cont = NULL;
1646 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1647 		/* The ill or ip_stack_t disappeared on us */
1648 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1649 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1650 		freemsg(mp);
1651 		ira_cleanup(&iras, B_TRUE);
1652 		return;
1653 	}
1654 
1655 	ASSERT(ill == iras.ira_rill);
1656 
1657 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1658 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1659 		/*
1660 		 * Ignore conflicts generated by misbehaving switches that
1661 		 * just reflect our own messages back to us.  For IPMP, we may
1662 		 * see reflections across any ill in the illgrp.
1663 		 *
1664 		 * RFC2462 and revisions tried to detect both the case
1665 		 * when a statically configured IPv6 address is a duplicate,
1666 		 * and the case when the L2 address itself is a duplicate. The
1667 		 * later is important because, with stateles address autoconf,
1668 		 * if the L2 address is a duplicate, the resulting IPv6
1669 		 * address(es) would also be duplicates. We rely on DAD of the
1670 		 * IPv6 address itself to detect the latter case.
1671 		 */
1672 		/* For an under ill_grp can change under lock */
1673 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1674 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1675 		    IS_UNDER_IPMP(ill) &&
1676 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1677 		    haddrlen) != NULL) {
1678 			rw_exit(&ipst->ips_ill_g_lock);
1679 			goto ignore_conflict;
1680 		}
1681 		rw_exit(&ipst->ips_ill_g_lock);
1682 	}
1683 
1684 	/*
1685 	 * Look up the appropriate ipif.
1686 	 */
1687 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1688 	if (ipif == NULL)
1689 		goto ignore_conflict;
1690 
1691 	/* Reload the ill to match the ipif */
1692 	ill = ipif->ipif_ill;
1693 
1694 	/* If it's already duplicate or ineligible, then don't do anything. */
1695 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1696 		ipif_refrele(ipif);
1697 		goto ignore_conflict;
1698 	}
1699 
1700 	/*
1701 	 * If this is a failure during duplicate recovery, then don't
1702 	 * complain.  It may take a long time to recover.
1703 	 */
1704 	if (!ipif->ipif_was_dup) {
1705 		char ibuf[LIFNAMSIZ];
1706 		char hbuf[MAC_STR_LEN];
1707 		char sbuf[INET6_ADDRSTRLEN];
1708 
1709 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1710 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1711 		    " disabled", ibuf,
1712 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1713 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1714 	}
1715 	mutex_enter(&ill->ill_lock);
1716 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1717 	ipif->ipif_flags |= IPIF_DUPLICATE;
1718 	ill->ill_ipif_dup_count++;
1719 	mutex_exit(&ill->ill_lock);
1720 	(void) ipif_down(ipif, NULL, NULL);
1721 	(void) ipif_down_tail(ipif);
1722 	mutex_enter(&ill->ill_lock);
1723 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1724 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1725 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1726 	    ipst->ips_ip_dup_recovery > 0) {
1727 		ASSERT(ipif->ipif_recovery_id == 0);
1728 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1729 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1730 	}
1731 	mutex_exit(&ill->ill_lock);
1732 	ipif_refrele(ipif);
1733 
1734 ignore_conflict:
1735 	freemsg(mp);
1736 	ira_cleanup(&iras, B_TRUE);
1737 }
1738 
1739 /*
1740  * Handle failure by tearing down the ipifs with the specified address.  Note
1741  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1742  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1743  * we start a timer on the ipif.
1744  * Caller has to free mp;
1745  */
1746 static void
ndp_failure(mblk_t * mp,ip_recv_attr_t * ira)1747 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1748 {
1749 	const uchar_t	*haddr;
1750 	ill_t		*ill = ira->ira_rill;
1751 
1752 	/*
1753 	 * Ignore conflicts generated by misbehaving switches that just
1754 	 * reflect our own messages back to us.
1755 	 */
1756 
1757 	/* icmp_inbound_v6 ensures this */
1758 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1759 	haddr = ira->ira_l2src;
1760 	if (haddr != NULL &&
1761 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1762 		return;
1763 	}
1764 
1765 	if ((mp = copymsg(mp)) != NULL) {
1766 		mblk_t	*attrmp;
1767 
1768 		attrmp = ip_recv_attr_to_mblk(ira);
1769 		if (attrmp == NULL) {
1770 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1771 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1772 			freemsg(mp);
1773 		} else {
1774 			ASSERT(attrmp->b_cont == NULL);
1775 			attrmp->b_cont = mp;
1776 			mp = attrmp;
1777 			ill_refhold(ill);
1778 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1779 			    B_FALSE);
1780 		}
1781 	}
1782 }
1783 
1784 /*
1785  * Handle a discovered conflict: some other system is advertising that it owns
1786  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1787  * interface.
1788  *
1789  * Handles both IPv4 and IPv6
1790  */
1791 boolean_t
ip_nce_conflict(mblk_t * mp,ip_recv_attr_t * ira,ncec_t * ncec)1792 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1793 {
1794 	ipif_t		*ipif;
1795 	clock_t		now;
1796 	uint_t		maxdefense;
1797 	uint_t		defs;
1798 	ill_t		*ill = ira->ira_ill;
1799 	ip_stack_t	*ipst = ill->ill_ipst;
1800 	uint32_t	elapsed;
1801 	boolean_t	isv6 = ill->ill_isv6;
1802 	ipaddr_t	ncec_addr;
1803 
1804 	if (isv6) {
1805 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1806 		    ipst);
1807 	} else {
1808 		if (arp_no_defense) {
1809 			/*
1810 			 * Yes, there is a conflict, but no, we do not
1811 			 * defend ourself.
1812 			 */
1813 			return (B_TRUE);
1814 		}
1815 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1816 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1817 		    ipst);
1818 	}
1819 	if (ipif == NULL)
1820 		return (B_FALSE);
1821 
1822 	/*
1823 	 * First, figure out if this address is disposable.
1824 	 */
1825 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1826 		maxdefense = ipst->ips_ip_max_temp_defend;
1827 	else
1828 		maxdefense = ipst->ips_ip_max_defend;
1829 
1830 	/*
1831 	 * Now figure out how many times we've defended ourselves.  Ignore
1832 	 * defenses that happened long in the past.
1833 	 */
1834 	now = ddi_get_lbolt();
1835 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1836 	mutex_enter(&ncec->ncec_lock);
1837 	if ((defs = ncec->ncec_defense_count) > 0 &&
1838 	    elapsed > ipst->ips_ip_defend_interval) {
1839 		/*
1840 		 * ip_defend_interval has elapsed.
1841 		 * reset the defense count.
1842 		 */
1843 		ncec->ncec_defense_count = defs = 0;
1844 	}
1845 	ncec->ncec_defense_count++;
1846 	ncec->ncec_last_time_defended = now;
1847 	mutex_exit(&ncec->ncec_lock);
1848 	ipif_refrele(ipif);
1849 
1850 	/*
1851 	 * If we've defended ourselves too many times already, then give up and
1852 	 * tear down the interface(s) using this address.
1853 	 * Otherwise, caller has to defend by sending out an announce.
1854 	 */
1855 	if (defs >= maxdefense) {
1856 		if (isv6)
1857 			ndp_failure(mp, ira);
1858 		else
1859 			arp_failure(mp, ira);
1860 	} else {
1861 		return (B_TRUE); /* caller must defend this address */
1862 	}
1863 	return (B_FALSE);
1864 }
1865 
1866 /*
1867  * Handle reception of Neighbor Solicitation messages.
1868  */
1869 static void
ndp_input_solicit(mblk_t * mp,ip_recv_attr_t * ira)1870 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1871 {
1872 	ill_t		*ill = ira->ira_ill, *under_ill;
1873 	nd_neighbor_solicit_t *ns;
1874 	uint32_t	hlen = ill->ill_phys_addr_length;
1875 	uchar_t		*haddr = NULL;
1876 	icmp6_t		*icmp_nd;
1877 	ip6_t		*ip6h;
1878 	ncec_t		*our_ncec = NULL;
1879 	in6_addr_t	target;
1880 	in6_addr_t	src;
1881 	int		len;
1882 	int		flag = 0;
1883 	nd_opt_hdr_t	*opt = NULL;
1884 	boolean_t	bad_solicit = B_FALSE;
1885 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1886 	boolean_t	need_ill_refrele = B_FALSE;
1887 
1888 	ip6h = (ip6_t *)mp->b_rptr;
1889 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1890 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1891 	src = ip6h->ip6_src;
1892 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1893 	target = ns->nd_ns_target;
1894 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1895 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1896 		if (ip_debug > 2) {
1897 			/* ip1dbg */
1898 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1899 			    AF_INET6, &target);
1900 		}
1901 		bad_solicit = B_TRUE;
1902 		goto done;
1903 	}
1904 	if (len > sizeof (nd_neighbor_solicit_t)) {
1905 		/* Options present */
1906 		opt = (nd_opt_hdr_t *)&ns[1];
1907 		len -= sizeof (nd_neighbor_solicit_t);
1908 		if (!ndp_verify_optlen(opt, len)) {
1909 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1910 			bad_solicit = B_TRUE;
1911 			goto done;
1912 		}
1913 	}
1914 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1915 		/* Check to see if this is a valid DAD solicitation */
1916 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1917 			if (ip_debug > 2) {
1918 				/* ip1dbg */
1919 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1920 				    "Destination is not solicited node "
1921 				    "multicast %s\n", AF_INET6,
1922 				    &ip6h->ip6_dst);
1923 			}
1924 			bad_solicit = B_TRUE;
1925 			goto done;
1926 		}
1927 	}
1928 
1929 	/*
1930 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1931 	 * received this packet if it's multicast) is not the ill tied to
1932 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1933 	 * to ensure we find the associated NCE.
1934 	 */
1935 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1936 	/*
1937 	 * If this is a valid Solicitation for an address we are publishing,
1938 	 * then a PUBLISH entry should exist in the cache
1939 	 */
1940 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1941 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1942 		    "ifname=%s ", ill->ill_name));
1943 		if (ip_debug > 2) {
1944 			/* ip1dbg */
1945 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1946 		}
1947 		if (our_ncec == NULL)
1948 			bad_solicit = B_TRUE;
1949 		goto done;
1950 	}
1951 
1952 	/* At this point we should have a verified NS per spec */
1953 	if (opt != NULL) {
1954 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1955 		if (opt != NULL) {
1956 			haddr = (uchar_t *)&opt[1];
1957 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1958 			    hlen == 0) {
1959 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1960 				bad_solicit = B_TRUE;
1961 				goto done;
1962 			}
1963 		}
1964 	}
1965 
1966 	/* If sending directly to peer, set the unicast flag */
1967 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1968 		flag |= NDP_UNICAST;
1969 
1970 	/*
1971 	 * Create/update the entry for the soliciting node on the ipmp_ill.
1972 	 * or respond to outstanding queries, don't if
1973 	 * the source is unspecified address.
1974 	 */
1975 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1976 		int	err;
1977 		nce_t	*nnce;
1978 
1979 		ASSERT(ill->ill_isv6);
1980 		/*
1981 		 * Regular solicitations *must* include the Source Link-Layer
1982 		 * Address option.  Ignore messages that do not.
1983 		 */
1984 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1985 			ip1dbg(("ndp_input_solicit: source link-layer address "
1986 			    "option missing with a specified source.\n"));
1987 			bad_solicit = B_TRUE;
1988 			goto done;
1989 		}
1990 
1991 		/*
1992 		 * This is a regular solicitation.  If we're still in the
1993 		 * process of verifying the address, then don't respond at all
1994 		 * and don't keep track of the sender.
1995 		 */
1996 		if (our_ncec->ncec_state == ND_PROBE)
1997 			goto done;
1998 
1999 		/*
2000 		 * If the solicitation doesn't have sender hardware address
2001 		 * (legal for unicast solicitation), then process without
2002 		 * installing the return NCE.  Either we already know it, or
2003 		 * we'll be forced to look it up when (and if) we reply to the
2004 		 * packet.
2005 		 */
2006 		if (haddr == NULL)
2007 			goto no_source;
2008 
2009 		under_ill = ill;
2010 		if (IS_UNDER_IPMP(under_ill)) {
2011 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
2012 			if (ill == NULL)
2013 				ill = under_ill;
2014 			else
2015 				need_ill_refrele = B_TRUE;
2016 		}
2017 		err = nce_lookup_then_add_v6(ill,
2018 		    haddr, hlen,
2019 		    &src,	/* Soliciting nodes address */
2020 		    0,
2021 		    ND_STALE,
2022 		    &nnce);
2023 
2024 		if (need_ill_refrele) {
2025 			ill_refrele(ill);
2026 			ill = under_ill;
2027 			need_ill_refrele =  B_FALSE;
2028 		}
2029 		switch (err) {
2030 		case 0:
2031 			/* done with this entry */
2032 			nce_refrele(nnce);
2033 			break;
2034 		case EEXIST:
2035 			/*
2036 			 * B_FALSE indicates this is not an an advertisement.
2037 			 */
2038 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
2039 			nce_refrele(nnce);
2040 			break;
2041 		default:
2042 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2043 			    err));
2044 			goto done;
2045 		}
2046 no_source:
2047 		flag |= NDP_SOLICITED;
2048 	} else {
2049 		/*
2050 		 * No source link layer address option should be present in a
2051 		 * valid DAD request.
2052 		 */
2053 		if (haddr != NULL) {
2054 			ip1dbg(("ndp_input_solicit: source link-layer address "
2055 			    "option present with an unspecified source.\n"));
2056 			bad_solicit = B_TRUE;
2057 			goto done;
2058 		}
2059 		if (our_ncec->ncec_state == ND_PROBE) {
2060 			/*
2061 			 * Internally looped-back probes will have
2062 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
2063 			 * transmissions.
2064 			 */
2065 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
2066 				/*
2067 				 * If someone else is probing our address, then
2068 				 * we've crossed wires.  Declare failure.
2069 				 */
2070 				ndp_failure(mp, ira);
2071 			}
2072 			goto done;
2073 		}
2074 		/*
2075 		 * This is a DAD probe.  Multicast the advertisement to the
2076 		 * all-nodes address.
2077 		 */
2078 		src = ipv6_all_hosts_mcast;
2079 	}
2080 	flag |= nce_advert_flags(our_ncec);
2081 	(void) ndp_xmit(ill,
2082 	    ND_NEIGHBOR_ADVERT,
2083 	    our_ncec->ncec_lladdr,
2084 	    our_ncec->ncec_lladdr_length,
2085 	    &target,	/* Source and target of the advertisement pkt */
2086 	    &src,	/* IP Destination (source of original pkt) */
2087 	    flag);
2088 done:
2089 	if (bad_solicit)
2090 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2091 	if (our_ncec != NULL)
2092 		ncec_refrele(our_ncec);
2093 }
2094 
2095 /*
2096  * Handle reception of Neighbor Solicitation messages
2097  */
2098 void
ndp_input_advert(mblk_t * mp,ip_recv_attr_t * ira)2099 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
2100 {
2101 	ill_t		*ill = ira->ira_ill;
2102 	nd_neighbor_advert_t *na;
2103 	uint32_t	hlen = ill->ill_phys_addr_length;
2104 	uchar_t		*haddr = NULL;
2105 	icmp6_t		*icmp_nd;
2106 	ip6_t		*ip6h;
2107 	ncec_t		*dst_ncec = NULL;
2108 	in6_addr_t	target;
2109 	nd_opt_hdr_t	*opt = NULL;
2110 	int		len;
2111 	ip_stack_t	*ipst = ill->ill_ipst;
2112 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2113 
2114 	ip6h = (ip6_t *)mp->b_rptr;
2115 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2116 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2117 	na = (nd_neighbor_advert_t *)icmp_nd;
2118 
2119 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2120 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2121 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2122 		    "solicited flag is not zero\n"));
2123 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2124 		return;
2125 	}
2126 	target = na->nd_na_target;
2127 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
2128 	    IN6_IS_ADDR_LOOPBACK(&target)) {
2129 		if (ip_debug > 2) {
2130 			/* ip1dbg */
2131 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
2132 			    AF_INET6, &target);
2133 		}
2134 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2135 		return;
2136 	}
2137 	if (len > sizeof (nd_neighbor_advert_t)) {
2138 		opt = (nd_opt_hdr_t *)&na[1];
2139 		if (!ndp_verify_optlen(opt,
2140 		    len - sizeof (nd_neighbor_advert_t))) {
2141 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2142 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2143 			return;
2144 		}
2145 		/* At this point we have a verified NA per spec */
2146 		len -= sizeof (nd_neighbor_advert_t);
2147 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2148 		if (opt != NULL) {
2149 			haddr = (uchar_t *)&opt[1];
2150 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2151 			    hlen == 0) {
2152 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2153 				BUMP_MIB(mib,
2154 				    ipv6IfIcmpInBadNeighborAdvertisements);
2155 				return;
2156 			}
2157 		}
2158 	}
2159 
2160 	/*
2161 	 * NOTE: we match across the illgrp since we need to do DAD for all of
2162 	 * our local addresses, and those are spread across all the active
2163 	 * ills in the group.
2164 	 */
2165 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
2166 		return;
2167 
2168 	if (NCE_PUBLISH(dst_ncec)) {
2169 		/*
2170 		 * Someone just advertised an addresses that we publish. First,
2171 		 * check it it was us -- if so, we can safely ignore it.
2172 		 * We don't get the haddr from the ira_l2src because, in the
2173 		 * case that the packet originated from us, on an IPMP group,
2174 		 * the ira_l2src may would be the link-layer address of the
2175 		 * cast_ill used to send the packet, which may not be the same
2176 		 * as the dst_ncec->ncec_lladdr of the address.
2177 		 */
2178 		if (haddr != NULL) {
2179 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
2180 				goto out;
2181 
2182 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
2183 				goto out;   /* from us -- no conflict */
2184 
2185 			/*
2186 			 * If we're in an IPMP group, check if this is an echo
2187 			 * from another ill in the group.  Use the double-
2188 			 * checked locking pattern to avoid grabbing
2189 			 * ill_g_lock in the non-IPMP case.
2190 			 */
2191 			if (IS_UNDER_IPMP(ill)) {
2192 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2193 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2194 				    ill->ill_grp, haddr, hlen) != NULL) {
2195 					rw_exit(&ipst->ips_ill_g_lock);
2196 					goto out;
2197 				}
2198 				rw_exit(&ipst->ips_ill_g_lock);
2199 			}
2200 		}
2201 
2202 		/*
2203 		 * This appears to be a real conflict.  If we're trying to
2204 		 * configure this NCE (ND_PROBE), then shut it down.
2205 		 * Otherwise, handle the discovered conflict.
2206 		 */
2207 		if (dst_ncec->ncec_state == ND_PROBE) {
2208 			ndp_failure(mp, ira);
2209 		} else {
2210 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
2211 				char hbuf[MAC_STR_LEN];
2212 				char sbuf[INET6_ADDRSTRLEN];
2213 
2214 				cmn_err(CE_WARN,
2215 				    "node '%s' is using %s on %s",
2216 				    inet_ntop(AF_INET6, &target, sbuf,
2217 				    sizeof (sbuf)),
2218 				    haddr == NULL ? "<none>" :
2219 				    mac_colon_addr(haddr, hlen, hbuf,
2220 				    sizeof (hbuf)), ill->ill_name);
2221 				/*
2222 				 * RFC 4862, Section 5.4.4 does not mandate
2223 				 * any specific behavior when an NA matches
2224 				 * a non-tentative address assigned to the
2225 				 * receiver. We make the choice of defending
2226 				 * our address, based on the assumption that
2227 				 * the sender has not detected the Duplicate.
2228 				 *
2229 				 * ncec_last_time_defended has been adjusted
2230 				 * in ip_nce_conflict()
2231 				 */
2232 				(void) ndp_announce(dst_ncec);
2233 			}
2234 		}
2235 	} else {
2236 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2237 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2238 
2239 		/* B_TRUE indicates this an advertisement */
2240 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2241 	}
2242 out:
2243 	ncec_refrele(dst_ncec);
2244 }
2245 
2246 /*
2247  * Process NDP neighbor solicitation/advertisement messages.
2248  * The checksum has already checked o.k before reaching here.
2249  * Information about the datalink header is contained in ira_l2src, but
2250  * that should be ignored for loopback packets.
2251  */
2252 void
ndp_input(mblk_t * mp,ip_recv_attr_t * ira)2253 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2254 {
2255 	ill_t		*ill = ira->ira_rill;
2256 	icmp6_t		*icmp_nd;
2257 	ip6_t		*ip6h;
2258 	int		len;
2259 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2260 	ill_t		*orig_ill = NULL;
2261 
2262 	/*
2263 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2264 	 * and make it be the IPMP upper so avoid being confused by a packet
2265 	 * addressed to a unicast address on a different ill.
2266 	 */
2267 	if (IS_UNDER_IPMP(ill)) {
2268 		orig_ill = ill;
2269 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2270 		if (ill == NULL) {
2271 			ill = orig_ill;
2272 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2273 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2274 			    mp, ill);
2275 			freemsg(mp);
2276 			return;
2277 		}
2278 		ASSERT(ill != orig_ill);
2279 		orig_ill = ira->ira_ill;
2280 		ira->ira_ill = ill;
2281 		mib = ill->ill_icmp6_mib;
2282 	}
2283 	if (!pullupmsg(mp, -1)) {
2284 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2285 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2286 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2287 		goto done;
2288 	}
2289 	ip6h = (ip6_t *)mp->b_rptr;
2290 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2291 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2292 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2293 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2294 		goto done;
2295 	}
2296 	/*
2297 	 * NDP does not accept any extension headers between the
2298 	 * IP header and the ICMP header since e.g. a routing
2299 	 * header could be dangerous.
2300 	 * This assumes that any AH or ESP headers are removed
2301 	 * by ip prior to passing the packet to ndp_input.
2302 	 */
2303 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2304 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2305 		    ip6h->ip6_nxt));
2306 		ip_drop_input("Wrong next header", mp, ill);
2307 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2308 		goto done;
2309 	}
2310 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2311 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2312 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2313 	if (icmp_nd->icmp6_code != 0) {
2314 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2315 		ip_drop_input("code non-zero", mp, ill);
2316 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2317 		goto done;
2318 	}
2319 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2320 	/*
2321 	 * Make sure packet length is large enough for either
2322 	 * a NS or a NA icmp packet.
2323 	 */
2324 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2325 		ip1dbg(("ndp_input: packet too short\n"));
2326 		ip_drop_input("packet too short", mp, ill);
2327 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2328 		goto done;
2329 	}
2330 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2331 		ndp_input_solicit(mp, ira);
2332 	} else {
2333 		ndp_input_advert(mp, ira);
2334 	}
2335 done:
2336 	freemsg(mp);
2337 	if (orig_ill != NULL) {
2338 		ill_refrele(ill);
2339 		ira->ira_ill = orig_ill;
2340 	}
2341 }
2342 
2343 /*
2344  * ndp_xmit is called to form and transmit a ND solicitation or
2345  * advertisement ICMP packet.
2346  *
2347  * If the source address is unspecified and this isn't a probe (used for
2348  * duplicate address detection), an appropriate source address and link layer
2349  * address will be chosen here.  The link layer address option is included if
2350  * the source is specified (i.e., all non-probe packets), and omitted (per the
2351  * specification) otherwise.
2352  *
2353  * It returns B_FALSE only if it does a successful put() to the
2354  * corresponding ill's ill_wq otherwise returns B_TRUE.
2355  */
2356 static boolean_t
ndp_xmit(ill_t * ill,uint32_t operation,uint8_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * sender,const in6_addr_t * target,int flag)2357 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2358     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2359 {
2360 	uint32_t	len;
2361 	icmp6_t		*icmp6;
2362 	mblk_t		*mp;
2363 	ip6_t		*ip6h;
2364 	nd_opt_hdr_t	*opt;
2365 	uint_t		plen;
2366 	zoneid_t	zoneid = GLOBAL_ZONEID;
2367 	ill_t		*hwaddr_ill = ill;
2368 	ip_xmit_attr_t	ixas;
2369 	ip_stack_t	*ipst = ill->ill_ipst;
2370 	boolean_t	need_refrele = B_FALSE;
2371 	boolean_t	probe = B_FALSE;
2372 
2373 	if (IS_UNDER_IPMP(ill)) {
2374 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2375 		/*
2376 		 * We send non-probe packets on the upper IPMP interface.
2377 		 * ip_output_simple() will use cast_ill for sending any
2378 		 * multicast packets. Note that we can't follow the same
2379 		 * logic for probe packets because all interfaces in the ipmp
2380 		 * group may have failed, so that we really want to only try
2381 		 * to send the ND packet on the ill corresponding to the src
2382 		 * address.
2383 		 */
2384 		if (!probe) {
2385 			ill = ipmp_ill_hold_ipmp_ill(ill);
2386 			if (ill != NULL)
2387 				need_refrele = B_TRUE;
2388 			else
2389 				ill = hwaddr_ill;
2390 		}
2391 	}
2392 
2393 	/*
2394 	 * If we have a unspecified source(sender) address, select a
2395 	 * proper source address for the solicitation here itself so
2396 	 * that we can initialize the h/w address correctly.
2397 	 *
2398 	 * If the sender is specified then we use this address in order
2399 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2400 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2401 	 * by IP (we cannot guarantee that the global zone has an interface
2402 	 * route to the destination).
2403 	 *
2404 	 * Note that the NA never comes here with the unspecified source
2405 	 * address.
2406 	 */
2407 
2408 	/*
2409 	 * Probes will have unspec src at this point.
2410 	 */
2411 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2412 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2413 		/*
2414 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2415 		 * ALL_ZONES if it cannot find a matching ipif for the address
2416 		 * we are trying to use. In this case we err on the side of
2417 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2418 		 */
2419 		if (zoneid == ALL_ZONES)
2420 			zoneid = GLOBAL_ZONEID;
2421 	}
2422 
2423 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2424 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2425 	mp = allocb(len,  BPRI_LO);
2426 	if (mp == NULL) {
2427 		if (need_refrele)
2428 			ill_refrele(ill);
2429 		return (B_TRUE);
2430 	}
2431 
2432 	bzero((char *)mp->b_rptr, len);
2433 	mp->b_wptr = mp->b_rptr + len;
2434 
2435 	bzero(&ixas, sizeof (ixas));
2436 	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2437 
2438 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2439 	ixas.ixa_ipst = ipst;
2440 	ixas.ixa_cred = kcred;
2441 	ixas.ixa_cpid = NOPID;
2442 	ixas.ixa_tsl = NULL;
2443 	ixas.ixa_zoneid = zoneid;
2444 
2445 	ip6h = (ip6_t *)mp->b_rptr;
2446 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2447 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2448 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2449 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2450 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2451 	ip6h->ip6_dst = *target;
2452 	icmp6 = (icmp6_t *)&ip6h[1];
2453 
2454 	if (hw_addr_len != 0) {
2455 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2456 		    sizeof (nd_neighbor_advert_t));
2457 	} else {
2458 		opt = NULL;
2459 	}
2460 	if (operation == ND_NEIGHBOR_SOLICIT) {
2461 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2462 
2463 		if (opt != NULL && !(flag & NDP_PROBE)) {
2464 			/*
2465 			 * Note that we don't send out SLLA for ND probes
2466 			 * per RFC 4862, even though we do send out the src
2467 			 * haddr for IPv4 DAD probes, even though both IPv4
2468 			 * and IPv6 go out with the unspecified/INADDR_ANY
2469 			 * src IP addr.
2470 			 */
2471 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2472 		}
2473 		ip6h->ip6_src = *sender;
2474 		ns->nd_ns_target = *target;
2475 		if (!(flag & NDP_UNICAST)) {
2476 			/* Form multicast address of the target */
2477 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2478 			ip6h->ip6_dst.s6_addr32[3] |=
2479 			    ns->nd_ns_target.s6_addr32[3];
2480 		}
2481 	} else {
2482 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2483 
2484 		ASSERT(!(flag & NDP_PROBE));
2485 		if (opt != NULL)
2486 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2487 		ip6h->ip6_src = *sender;
2488 		na->nd_na_target = *sender;
2489 		if (flag & NDP_ISROUTER)
2490 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2491 		if (flag & NDP_SOLICITED)
2492 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2493 		if (flag & NDP_ORIDE)
2494 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2495 	}
2496 
2497 	if (!(flag & NDP_PROBE)) {
2498 		if (hw_addr != NULL && opt != NULL) {
2499 			/* Fill in link layer address and option len */
2500 			opt->nd_opt_len = (uint8_t)plen;
2501 			bcopy(hw_addr, &opt[1], hw_addr_len);
2502 		}
2503 	}
2504 	if (opt != NULL && opt->nd_opt_type == 0) {
2505 		/* If there's no link layer address option, then strip it. */
2506 		len -= plen * 8;
2507 		mp->b_wptr = mp->b_rptr + len;
2508 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2509 	}
2510 
2511 	icmp6->icmp6_type = (uint8_t)operation;
2512 	icmp6->icmp6_code = 0;
2513 	/*
2514 	 * Prepare for checksum by putting icmp length in the icmp
2515 	 * checksum field. The checksum is calculated in ip_output.c.
2516 	 */
2517 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2518 
2519 	(void) ip_output_simple(mp, &ixas);
2520 	ixa_cleanup(&ixas);
2521 	if (need_refrele)
2522 		ill_refrele(ill);
2523 	return (B_FALSE);
2524 }
2525 
2526 /*
2527  * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2528  * The datapath uses this as an indication that there
2529  * is a problem (as opposed to a NCE that was just
2530  * reclaimed due to lack of memory.
2531  * Note that static ARP entries never become unreachable.
2532  */
2533 void
nce_make_unreachable(ncec_t * ncec)2534 nce_make_unreachable(ncec_t *ncec)
2535 {
2536 	mutex_enter(&ncec->ncec_lock);
2537 	ncec->ncec_state = ND_UNREACHABLE;
2538 	mutex_exit(&ncec->ncec_lock);
2539 }
2540 
2541 /*
2542  * NCE retransmit timer. Common to IPv4 and IPv6.
2543  * This timer goes off when:
2544  * a. It is time to retransmit a resolution for resolver.
2545  * b. It is time to send reachability probes.
2546  */
2547 void
nce_timer(void * arg)2548 nce_timer(void *arg)
2549 {
2550 	ncec_t		*ncec = arg;
2551 	ill_t		*ill = ncec->ncec_ill, *src_ill;
2552 	char		addrbuf[INET6_ADDRSTRLEN];
2553 	boolean_t	dropped = B_FALSE;
2554 	ip_stack_t	*ipst = ncec->ncec_ipst;
2555 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2556 	in_addr_t	sender4 = INADDR_ANY;
2557 	in6_addr_t	sender6 = ipv6_all_zeros;
2558 
2559 	/*
2560 	 * The timer has to be cancelled by ncec_delete before doing the final
2561 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2562 	 * until it clears the timeout_id. Before clearing the timeout_id
2563 	 * bump up the refcnt so that we can continue to use the ncec
2564 	 */
2565 	ASSERT(ncec != NULL);
2566 	mutex_enter(&ncec->ncec_lock);
2567 	ncec_refhold_locked(ncec);
2568 	ncec->ncec_timeout_id = 0;
2569 	mutex_exit(&ncec->ncec_lock);
2570 
2571 	src_ill = nce_resolve_src(ncec, &sender6);
2572 	/* if we could not find a sender address, return */
2573 	if (src_ill == NULL) {
2574 		if (!isv6) {
2575 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2576 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2577 			    &sender4, addrbuf, sizeof (addrbuf))));
2578 		} else {
2579 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2580 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2581 		}
2582 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2583 		ncec_refrele(ncec);
2584 		return;
2585 	}
2586 	if (!isv6)
2587 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2588 
2589 	mutex_enter(&ncec->ncec_lock);
2590 	/*
2591 	 * Check the reachability state.
2592 	 */
2593 	switch (ncec->ncec_state) {
2594 	case ND_DELAY:
2595 		ASSERT(ncec->ncec_lladdr != NULL);
2596 		ncec->ncec_state = ND_PROBE;
2597 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2598 		if (isv6) {
2599 			mutex_exit(&ncec->ncec_lock);
2600 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2601 			    src_ill->ill_phys_addr,
2602 			    src_ill->ill_phys_addr_length,
2603 			    &sender6, &ncec->ncec_addr,
2604 			    NDP_UNICAST);
2605 		} else {
2606 			dropped = (arp_request(ncec, sender4, src_ill) == 0);
2607 			mutex_exit(&ncec->ncec_lock);
2608 		}
2609 		if (!dropped) {
2610 			mutex_enter(&ncec->ncec_lock);
2611 			ncec->ncec_pcnt--;
2612 			mutex_exit(&ncec->ncec_lock);
2613 		}
2614 		if (ip_debug > 3) {
2615 			/* ip2dbg */
2616 			pr_addr_dbg("nce_timer: state for %s changed "
2617 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2618 		}
2619 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2620 		break;
2621 	case ND_PROBE:
2622 		/* must be retransmit timer */
2623 		ASSERT(ncec->ncec_pcnt >= -1);
2624 		if (ncec->ncec_pcnt > 0) {
2625 			/*
2626 			 * As per RFC2461, the ncec gets deleted after
2627 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2628 			 * Note that the first unicast solicitation is sent
2629 			 * during the DELAY state.
2630 			 */
2631 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2632 			    ncec->ncec_pcnt,
2633 			    inet_ntop((isv6? AF_INET6 : AF_INET),
2634 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2635 			if (NCE_PUBLISH(ncec)) {
2636 				mutex_exit(&ncec->ncec_lock);
2637 				/*
2638 				 * send out a probe; note that src_ill
2639 				 * is ignored by nce_dad() for all
2640 				 * DAD message types other than IPv6
2641 				 * unicast probes
2642 				 */
2643 				nce_dad(ncec, src_ill, B_TRUE);
2644 			} else {
2645 				ASSERT(src_ill != NULL);
2646 				if (isv6) {
2647 					mutex_exit(&ncec->ncec_lock);
2648 					dropped = ndp_xmit(src_ill,
2649 					    ND_NEIGHBOR_SOLICIT,
2650 					    src_ill->ill_phys_addr,
2651 					    src_ill->ill_phys_addr_length,
2652 					    &sender6, &ncec->ncec_addr,
2653 					    NDP_UNICAST);
2654 				} else {
2655 					/*
2656 					 * since the nce is REACHABLE,
2657 					 * the ARP request will be sent out
2658 					 * as a link-layer unicast.
2659 					 */
2660 					dropped = (arp_request(ncec, sender4,
2661 					    src_ill) == 0);
2662 					mutex_exit(&ncec->ncec_lock);
2663 				}
2664 				if (!dropped) {
2665 					mutex_enter(&ncec->ncec_lock);
2666 					ncec->ncec_pcnt--;
2667 					mutex_exit(&ncec->ncec_lock);
2668 				}
2669 				nce_restart_timer(ncec,
2670 				    ill->ill_reachable_retrans_time);
2671 			}
2672 		} else if (ncec->ncec_pcnt < 0) {
2673 			/* No hope, delete the ncec */
2674 			/* Tell datapath it went bad */
2675 			ncec->ncec_state = ND_UNREACHABLE;
2676 			mutex_exit(&ncec->ncec_lock);
2677 			if (ip_debug > 2) {
2678 				/* ip1dbg */
2679 				pr_addr_dbg("nce_timer: Delete NCE for"
2680 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
2681 				    &ncec->ncec_addr);
2682 			}
2683 			/* if static ARP can't delete. */
2684 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2685 				ncec_delete(ncec);
2686 
2687 		} else if (!NCE_PUBLISH(ncec)) {
2688 			/*
2689 			 * Probe count is 0 for a dynamic entry (one that we
2690 			 * ourselves are not publishing). We should never get
2691 			 * here if NONUD was requested, hence the ASSERT below.
2692 			 */
2693 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2694 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2695 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
2696 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2697 			ncec->ncec_pcnt--;
2698 			mutex_exit(&ncec->ncec_lock);
2699 			/* Wait one interval before killing */
2700 			nce_restart_timer(ncec,
2701 			    ill->ill_reachable_retrans_time);
2702 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2703 			ipif_t *ipif;
2704 			ipaddr_t ncec_addr;
2705 
2706 			/*
2707 			 * We're done probing, and we can now declare this
2708 			 * address to be usable.  Let IP know that it's ok to
2709 			 * use.
2710 			 */
2711 			ncec->ncec_state = ND_REACHABLE;
2712 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2713 			mutex_exit(&ncec->ncec_lock);
2714 			if (isv6) {
2715 				ipif = ipif_lookup_addr_exact_v6(
2716 				    &ncec->ncec_addr, ill, ipst);
2717 			} else {
2718 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2719 				    ncec_addr);
2720 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2721 				    ipst);
2722 			}
2723 			if (ipif != NULL) {
2724 				if (ipif->ipif_was_dup) {
2725 					char ibuf[LIFNAMSIZ];
2726 					char sbuf[INET6_ADDRSTRLEN];
2727 
2728 					ipif->ipif_was_dup = B_FALSE;
2729 					(void) inet_ntop(AF_INET6,
2730 					    &ipif->ipif_v6lcl_addr,
2731 					    sbuf, sizeof (sbuf));
2732 					ipif_get_name(ipif, ibuf,
2733 					    sizeof (ibuf));
2734 					cmn_err(CE_NOTE, "recovered address "
2735 					    "%s on %s", sbuf, ibuf);
2736 				}
2737 				if ((ipif->ipif_flags & IPIF_UP) &&
2738 				    !ipif->ipif_addr_ready)
2739 					ipif_up_notify(ipif);
2740 				ipif->ipif_addr_ready = 1;
2741 				ipif_refrele(ipif);
2742 			}
2743 			if (!isv6 && arp_no_defense)
2744 				break;
2745 			/* Begin defending our new address */
2746 			if (ncec->ncec_unsolicit_count > 0) {
2747 				ncec->ncec_unsolicit_count--;
2748 				if (isv6) {
2749 					dropped = ndp_announce(ncec);
2750 				} else {
2751 					dropped = arp_announce(ncec);
2752 				}
2753 
2754 				if (dropped)
2755 					ncec->ncec_unsolicit_count++;
2756 				else
2757 					ncec->ncec_last_time_defended =
2758 					    ddi_get_lbolt();
2759 			}
2760 			if (ncec->ncec_unsolicit_count > 0) {
2761 				nce_restart_timer(ncec,
2762 				    ANNOUNCE_INTERVAL(isv6));
2763 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
2764 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2765 			}
2766 		} else {
2767 			/*
2768 			 * This is an address we're probing to be our own, but
2769 			 * the ill is down.  Wait until it comes back before
2770 			 * doing anything, but switch to reachable state so
2771 			 * that the restart will work.
2772 			 */
2773 			ncec->ncec_state = ND_REACHABLE;
2774 			mutex_exit(&ncec->ncec_lock);
2775 		}
2776 		break;
2777 	case ND_INCOMPLETE: {
2778 		mblk_t	*mp, *nextmp;
2779 		mblk_t	**prevmpp;
2780 
2781 		/*
2782 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2783 		 * for any IPMP probe packets, and toss them.  IPMP probe
2784 		 * packets will always be at the head of ncec_qd_mp, so that
2785 		 * we can stop at the first queued ND packet that is
2786 		 * not a probe packet.
2787 		 */
2788 		prevmpp = &ncec->ncec_qd_mp;
2789 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2790 			nextmp = mp->b_next;
2791 
2792 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2793 				inet_freemsg(mp);
2794 				ncec->ncec_nprobes--;
2795 				*prevmpp = nextmp;
2796 			} else {
2797 				prevmpp = &mp->b_next;
2798 			}
2799 		}
2800 
2801 		/*
2802 		 * Must be resolver's retransmit timer.
2803 		 */
2804 		mutex_exit(&ncec->ncec_lock);
2805 		ip_ndp_resolve(ncec);
2806 		break;
2807 	}
2808 	case ND_REACHABLE:
2809 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2810 		    ncec->ncec_unsolicit_count != 0) ||
2811 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2812 			if (ncec->ncec_unsolicit_count > 0) {
2813 				ncec->ncec_unsolicit_count--;
2814 				mutex_exit(&ncec->ncec_lock);
2815 				/*
2816 				 * When we get to zero announcements left,
2817 				 * switch to address defense
2818 				 */
2819 			} else {
2820 				boolean_t rate_limit;
2821 
2822 				mutex_exit(&ncec->ncec_lock);
2823 				rate_limit = ill_defend_rate_limit(ill, ncec);
2824 				if (rate_limit) {
2825 					nce_restart_timer(ncec,
2826 					    DEFENSE_INTERVAL(isv6));
2827 					break;
2828 				}
2829 			}
2830 			if (isv6) {
2831 				dropped = ndp_announce(ncec);
2832 			} else {
2833 				dropped = arp_announce(ncec);
2834 			}
2835 			mutex_enter(&ncec->ncec_lock);
2836 			if (dropped) {
2837 				ncec->ncec_unsolicit_count++;
2838 			} else {
2839 				ncec->ncec_last_time_defended =
2840 				    ddi_get_lbolt();
2841 			}
2842 			mutex_exit(&ncec->ncec_lock);
2843 			if (ncec->ncec_unsolicit_count != 0) {
2844 				nce_restart_timer(ncec,
2845 				    ANNOUNCE_INTERVAL(isv6));
2846 			} else {
2847 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2848 			}
2849 		} else {
2850 			mutex_exit(&ncec->ncec_lock);
2851 		}
2852 		break;
2853 	default:
2854 		mutex_exit(&ncec->ncec_lock);
2855 		break;
2856 	}
2857 done:
2858 	ncec_refrele(ncec);
2859 	ill_refrele(src_ill);
2860 }
2861 
2862 /*
2863  * Set a link layer address from the ll_addr passed in.
2864  * Copy SAP from ill.
2865  */
2866 static void
nce_set_ll(ncec_t * ncec,uchar_t * ll_addr)2867 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2868 {
2869 	ill_t	*ill = ncec->ncec_ill;
2870 
2871 	ASSERT(ll_addr != NULL);
2872 	if (ill->ill_phys_addr_length > 0) {
2873 		/*
2874 		 * The bcopy() below used to be called for the physical address
2875 		 * length rather than the link layer address length. For
2876 		 * ethernet and many other media, the phys_addr and lla are
2877 		 * identical.
2878 		 *
2879 		 * The phys_addr and lla may not be the same for devices that
2880 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2881 		 * no known instances of these.
2882 		 *
2883 		 * For PPP or other interfaces with a zero length
2884 		 * physical address, don't do anything here.
2885 		 * The bcopy() with a zero phys_addr length was previously
2886 		 * a no-op for interfaces with a zero-length physical address.
2887 		 * Using the lla for them would change the way they operate.
2888 		 * Doing nothing in such cases preserves expected behavior.
2889 		 */
2890 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2891 	}
2892 }
2893 
2894 boolean_t
nce_cmp_ll_addr(const ncec_t * ncec,const uchar_t * ll_addr,uint32_t ll_addr_len)2895 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2896     uint32_t ll_addr_len)
2897 {
2898 	ASSERT(ncec->ncec_lladdr != NULL);
2899 	if (ll_addr == NULL)
2900 		return (B_FALSE);
2901 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2902 		return (B_TRUE);
2903 	return (B_FALSE);
2904 }
2905 
2906 /*
2907  * Updates the link layer address or the reachability state of
2908  * a cache entry.  Reset probe counter if needed.
2909  */
2910 void
nce_update(ncec_t * ncec,uint16_t new_state,uchar_t * new_ll_addr)2911 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2912 {
2913 	ill_t	*ill = ncec->ncec_ill;
2914 	boolean_t need_stop_timer = B_FALSE;
2915 	boolean_t need_fastpath_update = B_FALSE;
2916 	nce_t	*nce = NULL;
2917 	timeout_id_t tid;
2918 
2919 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2920 	/*
2921 	 * If this interface does not do NUD, there is no point
2922 	 * in allowing an update to the cache entry.  Although
2923 	 * we will respond to NS.
2924 	 * The only time we accept an update for a resolver when
2925 	 * NUD is turned off is when it has just been created.
2926 	 * Non-Resolvers will always be created as REACHABLE.
2927 	 */
2928 	if (new_state != ND_UNCHANGED) {
2929 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2930 		    (ncec->ncec_state != ND_INCOMPLETE))
2931 			return;
2932 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2933 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2934 		need_stop_timer = B_TRUE;
2935 		if (new_state == ND_REACHABLE)
2936 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2937 		else {
2938 			/* We force NUD in this case */
2939 			ncec->ncec_last = 0;
2940 		}
2941 		ncec->ncec_state = new_state;
2942 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2943 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2944 		    new_state == ND_INCOMPLETE);
2945 	}
2946 
2947 	tid = 0;
2948 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2949 		tid = ncec->ncec_timeout_id;
2950 		ncec->ncec_timeout_id = 0;
2951 	}
2952 	/*
2953 	 * Re-trigger fastpath probe and
2954 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2955 	 * whatever packets that happens to be transmitting at the time.
2956 	 */
2957 	if (new_ll_addr != NULL) {
2958 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2959 		    ill->ill_phys_addr_length);
2960 		need_fastpath_update = B_TRUE;
2961 	}
2962 	mutex_exit(&ncec->ncec_lock);
2963 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2964 		if (tid != 0)
2965 			(void) untimeout(tid);
2966 	}
2967 	if (need_fastpath_update) {
2968 		/*
2969 		 * Delete any existing existing dlur_mp and fp_mp information.
2970 		 * For IPMP interfaces, all underlying ill's must be checked
2971 		 * and purged.
2972 		 */
2973 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2974 		/*
2975 		 * add the new dlur_mp and fp_mp
2976 		 */
2977 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2978 		if (nce != NULL)
2979 			nce_refrele(nce);
2980 	}
2981 	mutex_enter(&ncec->ncec_lock);
2982 }
2983 
2984 static void
nce_queue_mp_common(ncec_t * ncec,mblk_t * mp,boolean_t head_insert)2985 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2986 {
2987 	uint_t	count = 0;
2988 	mblk_t  **mpp, *tmp;
2989 
2990 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2991 
2992 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2993 		if (++count > ncec->ncec_ill->ill_max_buf) {
2994 			tmp = ncec->ncec_qd_mp->b_next;
2995 			ncec->ncec_qd_mp->b_next = NULL;
2996 			/*
2997 			 * if we never create data addrs on the under_ill
2998 			 * does this matter?
2999 			 */
3000 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
3001 			    ipIfStatsOutDiscards);
3002 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
3003 			    ncec->ncec_ill);
3004 			freemsg(ncec->ncec_qd_mp);
3005 			ncec->ncec_qd_mp = tmp;
3006 		}
3007 	}
3008 
3009 	if (head_insert) {
3010 		ncec->ncec_nprobes++;
3011 		mp->b_next = ncec->ncec_qd_mp;
3012 		ncec->ncec_qd_mp = mp;
3013 	} else {
3014 		*mpp = mp;
3015 	}
3016 }
3017 
3018 /*
3019  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
3020  * queued at the head or tail of the queue based on the input argument
3021  * 'head_insert'. The caller should specify this argument as B_TRUE if this
3022  * packet is an IPMP probe packet, in which case the following happens:
3023  *
3024  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
3025  *	(non-ipmp_probe) load-speading case where the source address of the ND
3026  *	packet is not tied to ncec_ill. If the ill bound to the source address
3027  *	cannot receive, the response to the ND packet will not be received.
3028  *	However, if ND packets for ncec_ill's probes are queued	behind that ND
3029  *	packet, those probes will also fail to be sent, and thus in.mpathd will
3030  *	 erroneously conclude that ncec_ill has also failed.
3031  *
3032  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did	not succeed on
3033  *	the first attempt.  This ensures that ND problems do not manifest as
3034  *	probe RTT spikes.
3035  *
3036  * We achieve this by inserting ipmp_probe() packets at the head of the
3037  * nce_queue.
3038  *
3039  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
3040  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
3041  */
3042 void
nce_queue_mp(ncec_t * ncec,mblk_t * mp,boolean_t head_insert)3043 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
3044 {
3045 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3046 	nce_queue_mp_common(ncec, mp, head_insert);
3047 }
3048 
3049 /*
3050  * Called when address resolution failed due to a timeout.
3051  * Send an ICMP unreachable in response to all queued packets.
3052  */
3053 void
ndp_resolv_failed(ncec_t * ncec)3054 ndp_resolv_failed(ncec_t *ncec)
3055 {
3056 	mblk_t	*mp, *nxt_mp;
3057 	char	buf[INET6_ADDRSTRLEN];
3058 	ill_t *ill = ncec->ncec_ill;
3059 	ip_recv_attr_t	iras;
3060 
3061 	bzero(&iras, sizeof (iras));
3062 	iras.ira_flags = 0;
3063 	/*
3064 	 * we are setting the ira_rill to the ipmp_ill (instead of
3065 	 * the actual ill on which the packet was received), but this
3066 	 * is ok because we don't actually need the real ira_rill.
3067 	 * to send the icmp unreachable to the sender.
3068 	 */
3069 	iras.ira_ill = iras.ira_rill = ill;
3070 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3071 	iras.ira_rifindex = iras.ira_ruifindex;
3072 
3073 	ip1dbg(("ndp_resolv_failed: dst %s\n",
3074 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
3075 	mutex_enter(&ncec->ncec_lock);
3076 	mp = ncec->ncec_qd_mp;
3077 	ncec->ncec_qd_mp = NULL;
3078 	ncec->ncec_nprobes = 0;
3079 	mutex_exit(&ncec->ncec_lock);
3080 	while (mp != NULL) {
3081 		nxt_mp = mp->b_next;
3082 		mp->b_next = NULL;
3083 
3084 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3085 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3086 		    mp, ill);
3087 		icmp_unreachable_v6(mp,
3088 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
3089 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3090 		mp = nxt_mp;
3091 	}
3092 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3093 }
3094 
3095 /*
3096  * Handle the completion of NDP and ARP resolution.
3097  */
3098 void
nce_resolv_ok(ncec_t * ncec)3099 nce_resolv_ok(ncec_t *ncec)
3100 {
3101 	mblk_t *mp;
3102 	uint_t pkt_len;
3103 	iaflags_t ixaflags = IXAF_NO_TRACE;
3104 	nce_t *nce;
3105 	ill_t	*ill = ncec->ncec_ill;
3106 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
3107 	ip_stack_t *ipst = ill->ill_ipst;
3108 
3109 	if (IS_IPMP(ncec->ncec_ill)) {
3110 		nce_resolv_ipmp_ok(ncec);
3111 		return;
3112 	}
3113 	/* non IPMP case */
3114 
3115 	mutex_enter(&ncec->ncec_lock);
3116 	ASSERT(ncec->ncec_nprobes == 0);
3117 	mp = ncec->ncec_qd_mp;
3118 	ncec->ncec_qd_mp = NULL;
3119 	mutex_exit(&ncec->ncec_lock);
3120 
3121 	while (mp != NULL) {
3122 		mblk_t *nxt_mp;
3123 
3124 		if (ill->ill_isv6) {
3125 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
3126 
3127 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
3128 		} else {
3129 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
3130 
3131 			ixaflags |= IXAF_IS_IPV4;
3132 			pkt_len = ntohs(ipha->ipha_length);
3133 		}
3134 		nxt_mp = mp->b_next;
3135 		mp->b_next = NULL;
3136 		/*
3137 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
3138 		 * longer available, but it's ok to drop this flag because TCP
3139 		 * has its own flow-control in effect, so TCP packets
3140 		 * are not likely to get here when flow-control is in effect.
3141 		 */
3142 		mutex_enter(&ill->ill_lock);
3143 		nce = nce_lookup(ill, &ncec->ncec_addr);
3144 		mutex_exit(&ill->ill_lock);
3145 
3146 		if (nce == NULL) {
3147 			if (isv6) {
3148 				BUMP_MIB(&ipst->ips_ip6_mib,
3149 				    ipIfStatsOutDiscards);
3150 			} else {
3151 				BUMP_MIB(&ipst->ips_ip_mib,
3152 				    ipIfStatsOutDiscards);
3153 			}
3154 			ip_drop_output("ipIfStatsOutDiscards - no nce",
3155 			    mp, NULL);
3156 			freemsg(mp);
3157 		} else {
3158 			/*
3159 			 * We don't know the zoneid, but
3160 			 * ip_xmit does not care since IXAF_NO_TRACE
3161 			 * is set. (We traced the packet the first
3162 			 * time through ip_xmit.)
3163 			 */
3164 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
3165 			    ALL_ZONES, 0, NULL);
3166 			nce_refrele(nce);
3167 		}
3168 		mp = nxt_mp;
3169 	}
3170 
3171 	ncec_cb_dispatch(ncec); /* complete callbacks */
3172 }
3173 
3174 /*
3175  * Called by SIOCSNDP* ioctl to add/change an ncec entry
3176  * and the corresponding attributes.
3177  * Disallow states other than ND_REACHABLE or ND_STALE.
3178  */
3179 int
ndp_sioc_update(ill_t * ill,lif_nd_req_t * lnr)3180 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3181 {
3182 	sin6_t		*sin6;
3183 	in6_addr_t	*addr;
3184 	ncec_t		*ncec;
3185 	nce_t		*nce;
3186 	int		err = 0;
3187 	uint16_t	new_flags = 0;
3188 	uint16_t	old_flags = 0;
3189 	int		inflags = lnr->lnr_flags;
3190 	ip_stack_t	*ipst = ill->ill_ipst;
3191 	boolean_t	do_postprocess = B_FALSE;
3192 
3193 	ASSERT(ill->ill_isv6);
3194 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3195 	    (lnr->lnr_state_create != ND_STALE))
3196 		return (EINVAL);
3197 
3198 	sin6 = (sin6_t *)&lnr->lnr_addr;
3199 	addr = &sin6->sin6_addr;
3200 
3201 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3202 	ASSERT(!IS_UNDER_IPMP(ill));
3203 	nce = nce_lookup_addr(ill, addr);
3204 	if (nce != NULL)
3205 		new_flags = nce->nce_common->ncec_flags;
3206 
3207 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3208 	case NDF_ISROUTER_ON:
3209 		new_flags |= NCE_F_ISROUTER;
3210 		break;
3211 	case NDF_ISROUTER_OFF:
3212 		new_flags &= ~NCE_F_ISROUTER;
3213 		break;
3214 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3215 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3216 		if (nce != NULL)
3217 			nce_refrele(nce);
3218 		return (EINVAL);
3219 	}
3220 	if (inflags & NDF_STATIC)
3221 		new_flags |= NCE_F_STATIC;
3222 
3223 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3224 	case NDF_ANYCAST_ON:
3225 		new_flags |= NCE_F_ANYCAST;
3226 		break;
3227 	case NDF_ANYCAST_OFF:
3228 		new_flags &= ~NCE_F_ANYCAST;
3229 		break;
3230 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3231 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3232 		if (nce != NULL)
3233 			nce_refrele(nce);
3234 		return (EINVAL);
3235 	}
3236 
3237 	if (nce == NULL) {
3238 		err = nce_add_v6(ill,
3239 		    (uchar_t *)lnr->lnr_hdw_addr,
3240 		    ill->ill_phys_addr_length,
3241 		    addr,
3242 		    new_flags,
3243 		    lnr->lnr_state_create,
3244 		    &nce);
3245 		if (err != 0) {
3246 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3247 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3248 			return (err);
3249 		} else {
3250 			do_postprocess = B_TRUE;
3251 		}
3252 	}
3253 	ncec = nce->nce_common;
3254 	old_flags = ncec->ncec_flags;
3255 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3256 		ncec_router_to_host(ncec);
3257 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3258 		if (do_postprocess)
3259 			err = nce_add_v6_postprocess(nce);
3260 		nce_refrele(nce);
3261 		return (0);
3262 	}
3263 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3264 
3265 	if (do_postprocess)
3266 		err = nce_add_v6_postprocess(nce);
3267 	/*
3268 	 * err cannot be anything other than 0 because we don't support
3269 	 * proxy arp of static addresses.
3270 	 */
3271 	ASSERT(err == 0);
3272 
3273 	mutex_enter(&ncec->ncec_lock);
3274 	ncec->ncec_flags = new_flags;
3275 	mutex_exit(&ncec->ncec_lock);
3276 	/*
3277 	 * Note that we ignore the state at this point, which
3278 	 * should be either STALE or REACHABLE.  Instead we let
3279 	 * the link layer address passed in to determine the state
3280 	 * much like incoming packets.
3281 	 */
3282 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3283 	nce_refrele(nce);
3284 	return (0);
3285 }
3286 
3287 /*
3288  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3289  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3290  * be held to ensure that they are in the same group.
3291  */
3292 static nce_t *
nce_fastpath_create(ill_t * ill,ncec_t * ncec)3293 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3294 {
3295 
3296 	nce_t *nce;
3297 
3298 	nce = nce_ill_lookup_then_add(ill, ncec);
3299 
3300 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3301 		return (nce);
3302 
3303 	/*
3304 	 * hold the ncec_lock to synchronize with nce_update() so that,
3305 	 * at the end of this function, the contents of nce_dlur_mp are
3306 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3307 	 * packet may have been sent out with a mangled address, which would
3308 	 * only be a transient condition.
3309 	 */
3310 	mutex_enter(&ncec->ncec_lock);
3311 	if (ncec->ncec_lladdr != NULL) {
3312 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3313 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3314 	} else {
3315 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3316 		    ill->ill_sap_length);
3317 	}
3318 	mutex_exit(&ncec->ncec_lock);
3319 	return (nce);
3320 }
3321 
3322 /*
3323  * we make nce_fp_mp to have an M_DATA prepend.
3324  * The caller ensures there is hold on ncec for this function.
3325  * Note that since ill_fastpath_probe() copies the mblk there is
3326  * no need to hold the nce or ncec beyond this function.
3327  *
3328  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3329  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3330  * and will be returned back by this function, so that no extra nce_refrele
3331  * is required for the caller. The calls from nce_add_common() use this
3332  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3333  * nce_refrele of the returned nce (when it is non-null).
3334  */
3335 static nce_t *
nce_fastpath(ncec_t * ncec,boolean_t trigger_fp_req,nce_t * ncec_nce)3336 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3337 {
3338 	nce_t *nce;
3339 	ill_t *ill = ncec->ncec_ill;
3340 
3341 	ASSERT(ill != NULL);
3342 
3343 	if (IS_IPMP(ill) && trigger_fp_req) {
3344 		trigger_fp_req = B_FALSE;
3345 		ipmp_ncec_refresh_nce(ncec);
3346 	}
3347 
3348 	/*
3349 	 * If the caller already has the nce corresponding to the ill, use
3350 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3351 	 * nce_add_common() fall in the former category, and have just done
3352 	 * the nce lookup/add that can be reused.
3353 	 */
3354 	if (ncec_nce == NULL)
3355 		nce = nce_fastpath_create(ill, ncec);
3356 	else
3357 		nce = ncec_nce;
3358 
3359 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3360 		return (nce);
3361 
3362 	if (trigger_fp_req)
3363 		nce_fastpath_trigger(nce);
3364 	return (nce);
3365 }
3366 
3367 /*
3368  * Trigger fastpath on nce. No locks may be held.
3369  */
3370 static void
nce_fastpath_trigger(nce_t * nce)3371 nce_fastpath_trigger(nce_t *nce)
3372 {
3373 	int res;
3374 	ill_t *ill = nce->nce_ill;
3375 	ncec_t *ncec = nce->nce_common;
3376 
3377 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3378 	/*
3379 	 * EAGAIN is an indication of a transient error
3380 	 * i.e. allocation failure etc. leave the ncec in the list it
3381 	 * will be updated when another probe happens for another ire
3382 	 * if not it will be taken out of the list when the ire is
3383 	 * deleted.
3384 	 */
3385 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3386 		nce_fastpath_list_delete(ill, ncec, NULL);
3387 }
3388 
3389 /*
3390  * Add ncec to the nce fastpath list on ill.
3391  */
3392 static nce_t *
nce_ill_lookup_then_add_locked(ill_t * ill,ncec_t * ncec,list_t * graveyard)3393 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3394 {
3395 	nce_t *nce = NULL;
3396 
3397 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3398 	/*
3399 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3400 	 * down, before adding the NCE.
3401 	 */
3402 	if (ill->ill_state_flags & ILL_CONDEMNED)
3403 		return (NULL);
3404 	mutex_enter(&ncec->ncec_lock);
3405 	/*
3406 	 * if ncec has not been deleted and
3407 	 * is not already in the list add it.
3408 	 */
3409 	if (!NCE_ISCONDEMNED(ncec)) {
3410 		nce = nce_lookup(ill, &ncec->ncec_addr);
3411 		if (nce != NULL)
3412 			goto done;
3413 		nce = nce_add(ill, ncec, graveyard);
3414 	}
3415 done:
3416 	mutex_exit(&ncec->ncec_lock);
3417 	return (nce);
3418 }
3419 
3420 static nce_t *
nce_ill_lookup_then_add(ill_t * ill,ncec_t * ncec)3421 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3422 {
3423 	nce_t *nce;
3424 	list_t graveyard;
3425 
3426 	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3427 	mutex_enter(&ill->ill_lock);
3428 	nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3429 	mutex_exit(&ill->ill_lock);
3430 	nce_graveyard_free(&graveyard);
3431 	return (nce);
3432 }
3433 
3434 
3435 /*
3436  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3437  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3438  * entry after all locks have been dropped.
3439  */
3440 void
nce_fastpath_list_delete(ill_t * ill,ncec_t * ncec,list_t * dead)3441 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3442 {
3443 	nce_t *nce;
3444 
3445 	ASSERT(ill != NULL);
3446 
3447 	/* delete any nces referencing the ncec from underlying ills */
3448 	if (IS_IPMP(ill))
3449 		ipmp_ncec_delete_nce(ncec);
3450 
3451 	/* now the ill itself */
3452 	mutex_enter(&ill->ill_lock);
3453 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3454 	    nce = list_next(&ill->ill_nce, nce)) {
3455 		if (nce->nce_common == ncec) {
3456 			nce_refhold(nce);
3457 			nce_delete(nce);
3458 			break;
3459 		}
3460 	}
3461 	mutex_exit(&ill->ill_lock);
3462 	if (nce != NULL) {
3463 		if (dead == NULL)
3464 			nce_refrele(nce);
3465 		else
3466 			list_insert_tail(dead, nce);
3467 	}
3468 }
3469 
3470 /*
3471  * when the fastpath response does not fit in the datab
3472  * associated with the existing nce_fp_mp, we delete and
3473  * add the nce to retrigger fastpath based on the information
3474  * in the ncec_t.
3475  */
3476 static nce_t *
nce_delete_then_add(nce_t * nce)3477 nce_delete_then_add(nce_t *nce)
3478 {
3479 	ill_t		*ill = nce->nce_ill;
3480 	nce_t		*newnce = NULL;
3481 	list_t		graveyard;
3482 
3483 	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3484 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3485 	    (void *)nce, ill->ill_name));
3486 	mutex_enter(&ill->ill_lock);
3487 	mutex_enter(&nce->nce_common->ncec_lock);
3488 	nce_delete(nce);
3489 	/*
3490 	 * Make sure that ncec is not condemned before adding. We hold the
3491 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3492 	 * ipmp_ncec_delete_nce()
3493 	 */
3494 	if (!NCE_ISCONDEMNED(nce->nce_common))
3495 		newnce = nce_add(ill, nce->nce_common, &graveyard);
3496 	mutex_exit(&nce->nce_common->ncec_lock);
3497 	mutex_exit(&ill->ill_lock);
3498 	nce_graveyard_free(&graveyard);
3499 	nce_refrele(nce);
3500 	return (newnce); /* could be null if nomem */
3501 }
3502 
3503 typedef struct nce_fp_match_s {
3504 	nce_t	*nce_fp_match_res;
3505 	mblk_t	*nce_fp_match_ack_mp;
3506 } nce_fp_match_t;
3507 
3508 /* ARGSUSED */
3509 static int
nce_fastpath_match_dlur(ill_t * ill,nce_t * nce,void * arg)3510 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3511 {
3512 	nce_fp_match_t	*nce_fp_marg = arg;
3513 	ncec_t		*ncec = nce->nce_common;
3514 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3515 	uchar_t	*mp_rptr, *ud_mp_rptr;
3516 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3517 	ptrdiff_t	cmplen;
3518 
3519 	/*
3520 	 * mp is the mp associated with the fastpath ack.
3521 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3522 	 * under consideration. If the contents match, then the
3523 	 * fastpath ack is used to update the nce.
3524 	 */
3525 	if (ud_mp == NULL)
3526 		return (0);
3527 	mp_rptr = mp->b_rptr;
3528 	cmplen = mp->b_wptr - mp_rptr;
3529 	ASSERT(cmplen >= 0);
3530 
3531 	ud_mp_rptr = ud_mp->b_rptr;
3532 	/*
3533 	 * The ncec is locked here to prevent any other threads from accessing
3534 	 * and changing nce_dlur_mp when the address becomes resolved to an
3535 	 * lla while we're in the middle of looking at and comparing the
3536 	 * hardware address (lla). It is also locked to prevent multiple
3537 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3538 	 * time.
3539 	 */
3540 	mutex_enter(&ncec->ncec_lock);
3541 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3542 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3543 		nce_fp_marg->nce_fp_match_res = nce;
3544 		mutex_exit(&ncec->ncec_lock);
3545 		nce_refhold(nce);
3546 		return (1);
3547 	}
3548 	mutex_exit(&ncec->ncec_lock);
3549 	return (0);
3550 }
3551 
3552 /*
3553  * Update all NCE's that are not in fastpath mode and
3554  * have an nce_fp_mp that matches mp. mp->b_cont contains
3555  * the fastpath header.
3556  *
3557  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3558  */
3559 void
nce_fastpath_update(ill_t * ill,mblk_t * mp)3560 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3561 {
3562 	nce_fp_match_t nce_fp_marg;
3563 	nce_t *nce;
3564 	mblk_t *nce_fp_mp, *fp_mp;
3565 
3566 	nce_fp_marg.nce_fp_match_res = NULL;
3567 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3568 
3569 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3570 
3571 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3572 		return;
3573 
3574 	mutex_enter(&nce->nce_lock);
3575 	nce_fp_mp = nce->nce_fp_mp;
3576 
3577 	if (nce_fp_mp != NULL) {
3578 		fp_mp = mp->b_cont;
3579 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3580 		    nce_fp_mp->b_datap->db_lim) {
3581 			mutex_exit(&nce->nce_lock);
3582 			nce = nce_delete_then_add(nce);
3583 			if (nce == NULL) {
3584 				return;
3585 			}
3586 			mutex_enter(&nce->nce_lock);
3587 			nce_fp_mp = nce->nce_fp_mp;
3588 		}
3589 	}
3590 
3591 	/* Matched - install mp as the fastpath mp */
3592 	if (nce_fp_mp == NULL) {
3593 		fp_mp = dupb(mp->b_cont);
3594 		nce->nce_fp_mp = fp_mp;
3595 	} else {
3596 		fp_mp = mp->b_cont;
3597 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3598 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3599 		    + MBLKL(fp_mp);
3600 	}
3601 	mutex_exit(&nce->nce_lock);
3602 	nce_refrele(nce);
3603 }
3604 
3605 /*
3606  * Return a pointer to a given option in the packet.
3607  * Assumes that option part of the packet have already been validated.
3608  */
3609 nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t * opt,int optlen,int opt_type)3610 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3611 {
3612 	while (optlen > 0) {
3613 		if (opt->nd_opt_type == opt_type)
3614 			return (opt);
3615 		optlen -= 8 * opt->nd_opt_len;
3616 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3617 	}
3618 	return (NULL);
3619 }
3620 
3621 /*
3622  * Verify all option lengths present are > 0, also check to see
3623  * if the option lengths and packet length are consistent.
3624  */
3625 boolean_t
ndp_verify_optlen(nd_opt_hdr_t * opt,int optlen)3626 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3627 {
3628 	ASSERT(opt != NULL);
3629 	while (optlen > 0) {
3630 		if (opt->nd_opt_len == 0)
3631 			return (B_FALSE);
3632 		optlen -= 8 * opt->nd_opt_len;
3633 		if (optlen < 0)
3634 			return (B_FALSE);
3635 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3636 	}
3637 	return (B_TRUE);
3638 }
3639 
3640 /*
3641  * ncec_walk function.
3642  * Free a fraction of the NCE cache entries.
3643  *
3644  * A possible optimization here would be to use ncec_last where possible, and
3645  * delete the least-frequently used entry, which would require more complex
3646  * computation as we walk through the ncec's (e.g., track ncec entries by
3647  * order of ncec_last and/or maintain state)
3648  */
3649 static void
ncec_cache_reclaim(ncec_t * ncec,void * arg)3650 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3651 {
3652 	ip_stack_t	*ipst = ncec->ncec_ipst;
3653 	uint_t		fraction = *(uint_t *)arg;
3654 	uint_t		rand;
3655 
3656 	if ((ncec->ncec_flags &
3657 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3658 		return;
3659 	}
3660 
3661 	rand = (uint_t)ddi_get_lbolt() +
3662 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3663 	if ((rand/fraction)*fraction == rand) {
3664 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3665 		ncec_delete(ncec);
3666 	}
3667 }
3668 
3669 /*
3670  * kmem_cache callback to free up memory.
3671  *
3672  * For now we just delete a fixed fraction.
3673  */
3674 static void
ip_nce_reclaim_stack(ip_stack_t * ipst)3675 ip_nce_reclaim_stack(ip_stack_t *ipst)
3676 {
3677 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3678 
3679 	IP_STAT(ipst, ip_nce_reclaim_calls);
3680 
3681 	ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3682 
3683 	/*
3684 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3685 	 * Get them to update any stale references to drop any refholds they
3686 	 * have.
3687 	 */
3688 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3689 }
3690 
3691 /*
3692  * Called by the memory allocator subsystem directly, when the system
3693  * is running low on memory.
3694  */
3695 /* ARGSUSED */
3696 void
ip_nce_reclaim(void * args)3697 ip_nce_reclaim(void *args)
3698 {
3699 	netstack_handle_t nh;
3700 	netstack_t *ns;
3701 	ip_stack_t *ipst;
3702 
3703 	netstack_next_init(&nh);
3704 	while ((ns = netstack_next(&nh)) != NULL) {
3705 		/*
3706 		 * netstack_next() can return a netstack_t with a NULL
3707 		 * netstack_ip at boot time.
3708 		 */
3709 		if ((ipst = ns->netstack_ip) == NULL) {
3710 			netstack_rele(ns);
3711 			continue;
3712 		}
3713 		ip_nce_reclaim_stack(ipst);
3714 		netstack_rele(ns);
3715 	}
3716 	netstack_next_fini(&nh);
3717 }
3718 
#ifdef DEBUG
/*
 * Record a traced reference on ncec.  The caller must hold ncec_lock.
 * If th_trace_ref() fails, tracing is disabled for this ncec and its trace
 * state is cleaned up.
 */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable || th_trace_ref(ncec, ncec->ncec_ipst))
		return;

	ncec->ncec_trace_disable = B_TRUE;
	ncec_trace_cleanup(ncec);
}

/*
 * Drop a traced reference on ncec, unless tracing has been disabled for
 * it.  The caller must hold ncec_lock.
 */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;

	th_trace_unref(ncec);
}

/*
 * Hand the trace state of ncec to th_trace_cleanup().
 */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif
3749 
3750 /*
3751  * Called when address resolution fails due to a timeout.
3752  * Send an ICMP unreachable in response to all queued packets.
3753  */
3754 void
arp_resolv_failed(ncec_t * ncec)3755 arp_resolv_failed(ncec_t *ncec)
3756 {
3757 	mblk_t	*mp, *nxt_mp;
3758 	char	buf[INET6_ADDRSTRLEN];
3759 	struct in_addr ipv4addr;
3760 	ill_t *ill = ncec->ncec_ill;
3761 	ip_stack_t *ipst = ncec->ncec_ipst;
3762 	ip_recv_attr_t	iras;
3763 
3764 	bzero(&iras, sizeof (iras));
3765 	iras.ira_flags = IRAF_IS_IPV4;
3766 	/*
3767 	 * we are setting the ira_rill to the ipmp_ill (instead of
3768 	 * the actual ill on which the packet was received), but this
3769 	 * is ok because we don't actually need the real ira_rill.
3770 	 * to send the icmp unreachable to the sender.
3771 	 */
3772 	iras.ira_ill = iras.ira_rill = ill;
3773 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3774 	iras.ira_rifindex = iras.ira_ruifindex;
3775 
3776 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3777 	ip3dbg(("arp_resolv_failed: dst %s\n",
3778 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3779 	mutex_enter(&ncec->ncec_lock);
3780 	mp = ncec->ncec_qd_mp;
3781 	ncec->ncec_qd_mp = NULL;
3782 	ncec->ncec_nprobes = 0;
3783 	mutex_exit(&ncec->ncec_lock);
3784 	while (mp != NULL) {
3785 		nxt_mp = mp->b_next;
3786 		mp->b_next = NULL;
3787 
3788 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3789 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3790 		    mp, ill);
3791 		if (ipst->ips_ip_arp_icmp_error) {
3792 			ip3dbg(("arp_resolv_failed: "
3793 			    "Calling icmp_unreachable\n"));
3794 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3795 		} else {
3796 			freemsg(mp);
3797 		}
3798 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3799 		mp = nxt_mp;
3800 	}
3801 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3802 }
3803 
3804 /*
3805  * if ill is an under_ill, translate it to the ipmp_ill and add the
3806  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3807  * one on the underlying in_ill) will be created for the
3808  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3809  */
3810 int
nce_lookup_then_add_v4(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)3811 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3812     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3813 {
3814 	int	err;
3815 	in6_addr_t addr6;
3816 	ip_stack_t *ipst = ill->ill_ipst;
3817 	nce_t	*nce, *upper_nce = NULL;
3818 	ill_t	*in_ill = ill, *under = NULL;
3819 	boolean_t need_ill_refrele = B_FALSE;
3820 
3821 	if (flags & NCE_F_MCAST) {
3822 		/*
3823 		 * hw_addr will be figured out in nce_set_multicast_v4;
3824 		 * caller needs to pass in the cast_ill for ipmp
3825 		 */
3826 		ASSERT(hw_addr == NULL);
3827 		ASSERT(!IS_IPMP(ill));
3828 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3829 		return (err);
3830 	}
3831 
3832 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3833 		ill = ipmp_ill_hold_ipmp_ill(ill);
3834 		if (ill == NULL)
3835 			return (ENXIO);
3836 		need_ill_refrele = B_TRUE;
3837 	}
3838 	if ((flags & NCE_F_BCAST) != 0) {
3839 		/*
3840 		 * IPv4 broadcast ncec: compute the hwaddr.
3841 		 */
3842 		if (IS_IPMP(ill)) {
3843 			under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3844 			if (under == NULL)  {
3845 				if (need_ill_refrele)
3846 					ill_refrele(ill);
3847 				return (ENETDOWN);
3848 			}
3849 			hw_addr = under->ill_bcast_mp->b_rptr +
3850 			    NCE_LL_ADDR_OFFSET(under);
3851 			hw_addr_len = under->ill_phys_addr_length;
3852 		} else {
3853 			hw_addr = ill->ill_bcast_mp->b_rptr +
3854 			    NCE_LL_ADDR_OFFSET(ill),
3855 			    hw_addr_len = ill->ill_phys_addr_length;
3856 		}
3857 	}
3858 
3859 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3860 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3861 	nce = nce_lookup_addr(ill, &addr6);
3862 	if (nce == NULL) {
3863 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3864 		    state, &nce);
3865 	} else {
3866 		err = EEXIST;
3867 	}
3868 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3869 	if (err == 0)
3870 		err = nce_add_v4_postprocess(nce);
3871 
3872 	if (in_ill != ill && nce != NULL) {
3873 		nce_t *under_nce = NULL;
3874 
3875 		/*
3876 		 * in_ill was the under_ill. Try to create the under_nce.
3877 		 * Hold the ill_g_lock to prevent changes to group membership
3878 		 * until we are done.
3879 		 */
3880 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3881 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3882 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3883 			    ill_t *, ill);
3884 			rw_exit(&ipst->ips_ill_g_lock);
3885 			err = ENXIO;
3886 			nce_refrele(nce);
3887 			nce = NULL;
3888 			goto bail;
3889 		}
3890 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3891 		if (under_nce == NULL) {
3892 			rw_exit(&ipst->ips_ill_g_lock);
3893 			err = EINVAL;
3894 			nce_refrele(nce);
3895 			nce = NULL;
3896 			goto bail;
3897 		}
3898 		rw_exit(&ipst->ips_ill_g_lock);
3899 		upper_nce = nce;
3900 		nce = under_nce; /* will be returned to caller */
3901 		if (NCE_ISREACHABLE(nce->nce_common))
3902 			nce_fastpath_trigger(under_nce);
3903 	}
3904 	if (nce != NULL) {
3905 		if (newnce != NULL)
3906 			*newnce = nce;
3907 		else
3908 			nce_refrele(nce);
3909 	}
3910 bail:
3911 	if (under != NULL)
3912 		ill_refrele(under);
3913 	if (upper_nce != NULL)
3914 		nce_refrele(upper_nce);
3915 	if (need_ill_refrele)
3916 		ill_refrele(ill);
3917 
3918 	return (err);
3919 }
3920 
3921 /*
3922  * NDP Cache Entry creation routine for IPv4.
3923  * This routine must always be called with ndp4->ndp_g_lock held.
3924  * Prior to return, ncec_refcnt is incremented.
3925  *
3926  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3927  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3928  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3929  * entries will be created, both pointing at the same ncec_t. The nce_t
3930  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3931  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3932  * Local addresses are always created on the ill passed to nce_add_v4.
3933  */
3934 int
nce_add_v4(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)3935 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3936     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3937 {
3938 	int		err;
3939 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3940 	struct in6_addr	addr6;
3941 	nce_t		*nce;
3942 
3943 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3944 	ASSERT(!ill->ill_isv6);
3945 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3946 
3947 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3948 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3949 	    &nce);
3950 	ASSERT(newnce != NULL);
3951 	*newnce = nce;
3952 	return (err);
3953 }
3954 
3955 /*
3956  * Post-processing routine to be executed after nce_add_v4(). This function
3957  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3958  * and must be called without any locks held.
3959  *
3960  * Always returns 0, but we return an int to keep this symmetric with the
3961  * IPv6 counter-part.
3962  */
3963 int
nce_add_v4_postprocess(nce_t * nce)3964 nce_add_v4_postprocess(nce_t *nce)
3965 {
3966 	ncec_t		*ncec = nce->nce_common;
3967 	uint16_t	flags = ncec->ncec_flags;
3968 	boolean_t	ndp_need_dad = B_FALSE;
3969 	boolean_t	dropped;
3970 	clock_t		delay;
3971 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3972 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3973 	boolean_t	trigger_fastpath = B_TRUE;
3974 
3975 	/*
3976 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3977 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3978 	 * We call nce_fastpath from nce_update if the link layer address of
3979 	 * the peer changes from nce_update
3980 	 */
3981 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3982 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3983 		trigger_fastpath = B_FALSE;
3984 
3985 	if (trigger_fastpath)
3986 		nce_fastpath_trigger(nce);
3987 
3988 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3989 		/*
3990 		 * Either the caller (by passing in ND_PROBE)
3991 		 * or nce_add_common() (by the internally computed state
3992 		 * based on ncec_addr and ill_net_type) has determined
3993 		 * that this unicast entry needs DAD. Trigger DAD.
3994 		 */
3995 		ndp_need_dad = B_TRUE;
3996 	} else if (flags & NCE_F_UNSOL_ADV) {
3997 		/*
3998 		 * We account for the transmit below by assigning one
3999 		 * less than the ndd variable. Subsequent decrements
4000 		 * are done in nce_timer.
4001 		 */
4002 		mutex_enter(&ncec->ncec_lock);
4003 		ncec->ncec_unsolicit_count =
4004 		    ipst->ips_ip_arp_publish_count - 1;
4005 		mutex_exit(&ncec->ncec_lock);
4006 		dropped = arp_announce(ncec);
4007 		mutex_enter(&ncec->ncec_lock);
4008 		if (dropped)
4009 			ncec->ncec_unsolicit_count++;
4010 		else
4011 			ncec->ncec_last_time_defended = ddi_get_lbolt();
4012 		if (ncec->ncec_unsolicit_count != 0) {
4013 			nce_start_timer(ncec,
4014 			    ipst->ips_ip_arp_publish_interval);
4015 		}
4016 		mutex_exit(&ncec->ncec_lock);
4017 	}
4018 
4019 	/*
4020 	 * If ncec_xmit_interval is 0, user has configured us to send the first
4021 	 * probe right away.  Do so, and set up for the subsequent probes.
4022 	 */
4023 	if (ndp_need_dad) {
4024 		mutex_enter(&ncec->ncec_lock);
4025 		if (ncec->ncec_pcnt == 0) {
4026 			/*
4027 			 * DAD probes and announce can be
4028 			 * administratively disabled by setting the
4029 			 * probe_count to zero. Restart the timer in
4030 			 * this case to mark the ipif as ready.
4031 			 */
4032 			ncec->ncec_unsolicit_count = 0;
4033 			mutex_exit(&ncec->ncec_lock);
4034 			nce_restart_timer(ncec, 0);
4035 		} else {
4036 			mutex_exit(&ncec->ncec_lock);
4037 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
4038 			    ipst->ips_arp_probe_delay :
4039 			    ipst->ips_arp_fastprobe_delay);
4040 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
4041 		}
4042 	}
4043 	return (0);
4044 }
4045 
4046 /*
4047  * ncec_walk routine to update all entries that have a given destination or
4048  * gateway address and cached link layer (MAC) address.  This is used when ARP
4049  * informs us that a network-to-link-layer mapping may have changed.
4050  */
4051 void
nce_update_hw_changed(ncec_t * ncec,void * arg)4052 nce_update_hw_changed(ncec_t *ncec, void *arg)
4053 {
4054 	nce_hw_map_t *hwm = arg;
4055 	ipaddr_t ncec_addr;
4056 
4057 	if (ncec->ncec_state != ND_REACHABLE)
4058 		return;
4059 
4060 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
4061 	if (ncec_addr != hwm->hwm_addr)
4062 		return;
4063 
4064 	mutex_enter(&ncec->ncec_lock);
4065 	if (hwm->hwm_flags != 0)
4066 		ncec->ncec_flags = hwm->hwm_flags;
4067 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
4068 	mutex_exit(&ncec->ncec_lock);
4069 }
4070 
4071 void
ncec_refhold(ncec_t * ncec)4072 ncec_refhold(ncec_t *ncec)
4073 {
4074 	mutex_enter(&(ncec)->ncec_lock);
4075 	(ncec)->ncec_refcnt++;
4076 	ASSERT((ncec)->ncec_refcnt != 0);
4077 #ifdef DEBUG
4078 	ncec_trace_ref(ncec);
4079 #endif
4080 	mutex_exit(&(ncec)->ncec_lock);
4081 }
4082 
4083 void
ncec_refhold_notr(ncec_t * ncec)4084 ncec_refhold_notr(ncec_t *ncec)
4085 {
4086 	mutex_enter(&(ncec)->ncec_lock);
4087 	(ncec)->ncec_refcnt++;
4088 	ASSERT((ncec)->ncec_refcnt != 0);
4089 	mutex_exit(&(ncec)->ncec_lock);
4090 }
4091 
4092 static void
ncec_refhold_locked(ncec_t * ncec)4093 ncec_refhold_locked(ncec_t *ncec)
4094 {
4095 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
4096 	(ncec)->ncec_refcnt++;
4097 #ifdef DEBUG
4098 	ncec_trace_ref(ncec);
4099 #endif
4100 }
4101 
4102 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
4103 void
ncec_refrele(ncec_t * ncec)4104 ncec_refrele(ncec_t *ncec)
4105 {
4106 	mutex_enter(&(ncec)->ncec_lock);
4107 #ifdef DEBUG
4108 	ncec_untrace_ref(ncec);
4109 #endif
4110 	ASSERT((ncec)->ncec_refcnt != 0);
4111 	if (--(ncec)->ncec_refcnt == 0) {
4112 		ncec_inactive(ncec);
4113 	} else {
4114 		mutex_exit(&(ncec)->ncec_lock);
4115 	}
4116 }
4117 
4118 void
ncec_refrele_notr(ncec_t * ncec)4119 ncec_refrele_notr(ncec_t *ncec)
4120 {
4121 	mutex_enter(&(ncec)->ncec_lock);
4122 	ASSERT((ncec)->ncec_refcnt != 0);
4123 	if (--(ncec)->ncec_refcnt == 0) {
4124 		ncec_inactive(ncec);
4125 	} else {
4126 		mutex_exit(&(ncec)->ncec_lock);
4127 	}
4128 }
4129 
4130 /*
4131  * Common to IPv4 and IPv6.
4132  */
4133 void
nce_restart_timer(ncec_t * ncec,uint_t ms)4134 nce_restart_timer(ncec_t *ncec, uint_t ms)
4135 {
4136 	timeout_id_t tid;
4137 
4138 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
4139 
4140 	/* First cancel any running timer */
4141 	mutex_enter(&ncec->ncec_lock);
4142 	tid = ncec->ncec_timeout_id;
4143 	ncec->ncec_timeout_id = 0;
4144 	if (tid != 0) {
4145 		mutex_exit(&ncec->ncec_lock);
4146 		(void) untimeout(tid);
4147 		mutex_enter(&ncec->ncec_lock);
4148 	}
4149 
4150 	/* Restart timer */
4151 	nce_start_timer(ncec, ms);
4152 	mutex_exit(&ncec->ncec_lock);
4153 }
4154 
4155 static void
nce_start_timer(ncec_t * ncec,uint_t ms)4156 nce_start_timer(ncec_t *ncec, uint_t ms)
4157 {
4158 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4159 	/*
4160 	 * Don't start the timer if the ncec has been deleted, or if the timer
4161 	 * is already running
4162 	 */
4163 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
4164 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
4165 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
4166 	}
4167 }
4168 
4169 int
nce_set_multicast_v4(ill_t * ill,const in_addr_t * dst,uint16_t flags,nce_t ** newnce)4170 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
4171     uint16_t flags, nce_t **newnce)
4172 {
4173 	uchar_t		*hw_addr;
4174 	int		err = 0;
4175 	ip_stack_t	*ipst = ill->ill_ipst;
4176 	in6_addr_t	dst6;
4177 	nce_t		*nce;
4178 
4179 	ASSERT(!ill->ill_isv6);
4180 
4181 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
4182 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
4183 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
4184 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4185 		goto done;
4186 	}
4187 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
4188 		/*
4189 		 * For IRE_IF_RESOLVER a hardware mapping can be
4190 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
4191 		 * in the ill is copied in nce_add_v4().
4192 		 */
4193 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
4194 		if (hw_addr == NULL) {
4195 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4196 			return (ENOMEM);
4197 		}
4198 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
4199 	} else {
4200 		/*
4201 		 * IRE_IF_NORESOLVER type simply copies the resolution
4202 		 * cookie passed in.  So no hw_addr is needed.
4203 		 */
4204 		hw_addr = NULL;
4205 	}
4206 	ASSERT(flags & NCE_F_MCAST);
4207 	ASSERT(flags & NCE_F_NONUD);
4208 	/* nce_state will be computed by nce_add_common() */
4209 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
4210 	    ND_UNCHANGED, &nce);
4211 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4212 	if (err == 0)
4213 		err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
4214 	if (hw_addr != NULL)
4215 		kmem_free(hw_addr, ill->ill_phys_addr_length);
4216 	if (err != 0) {
4217 		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
4218 		return (err);
4219 	}
4220 done:
4221 	if (newnce != NULL)
4222 		*newnce = nce;
4223 	else
4224 		nce_refrele(nce);
4225 	return (0);
4226 }
4227 
4228 /*
4229  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
4230  * don't want to have to walk the list for every single one, so we gather up
4231  * batches at a time.
4232  */
4233 #define	NCE_RESCHED_LIST_LEN	8
4234 
4235 typedef struct {
4236 	ill_t	*ncert_ill;
4237 	uint_t	ncert_num;
4238 	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
4239 } nce_resched_t;
4240 
4241 /*
4242  * Pick the longest waiting NCEs for defense.
4243  */
4244 /* ARGSUSED */
4245 static int
ncec_reschedule(ill_t * ill,nce_t * nce,void * arg)4246 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4247 {
4248 	nce_resched_t *ncert = arg;
4249 	ncec_t **ncecs;
4250 	ncec_t **ncec_max;
4251 	ncec_t *ncec_temp;
4252 	ncec_t *ncec = nce->nce_common;
4253 
4254 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4255 	/*
4256 	 * Only reachable entries that are ready for announcement are eligible.
4257 	 */
4258 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4259 		return (0);
4260 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4261 		ncec_refhold(ncec);
4262 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
4263 	} else {
4264 		ncecs = ncert->ncert_nces;
4265 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4266 		ncec_refhold(ncec);
4267 		for (; ncecs < ncec_max; ncecs++) {
4268 			ASSERT(ncec != NULL);
4269 			if ((*ncecs)->ncec_last_time_defended >
4270 			    ncec->ncec_last_time_defended) {
4271 				ncec_temp = *ncecs;
4272 				*ncecs = ncec;
4273 				ncec = ncec_temp;
4274 			}
4275 		}
4276 		ncec_refrele(ncec);
4277 	}
4278 	return (0);
4279 }
4280 
4281 /*
4282  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4283  * doesn't happen very often (if at all), and thus it needn't be highly
4284  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4285  * outer loop is bounded by a constant rather than by the length of the list.)
4286  */
4287 static void
nce_ill_reschedule(ill_t * ill,nce_resched_t * ncert)4288 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4289 {
4290 	ncec_t		*ncec;
4291 	ip_stack_t	*ipst = ill->ill_ipst;
4292 	uint_t		i, defend_rate;
4293 
4294 	i = ill->ill_defend_count;
4295 	ill->ill_defend_count = 0;
4296 	if (ill->ill_isv6)
4297 		defend_rate = ipst->ips_ndp_defend_rate;
4298 	else
4299 		defend_rate = ipst->ips_arp_defend_rate;
4300 	/* If none could be sitting around, then don't reschedule */
4301 	if (i < defend_rate) {
4302 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4303 		return;
4304 	}
4305 	ncert->ncert_ill = ill;
4306 	while (ill->ill_defend_count < defend_rate) {
4307 		nce_walk_common(ill, ncec_reschedule, ncert);
4308 		for (i = 0; i < ncert->ncert_num; i++) {
4309 
4310 			ncec = ncert->ncert_nces[i];
4311 			mutex_enter(&ncec->ncec_lock);
4312 			ncec->ncec_flags |= NCE_F_DELAYED;
4313 			mutex_exit(&ncec->ncec_lock);
4314 			/*
4315 			 * we plan to schedule this ncec, so incr the
4316 			 * defend_count in anticipation.
4317 			 */
4318 			if (++ill->ill_defend_count >= defend_rate)
4319 				break;
4320 		}
4321 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4322 			break;
4323 	}
4324 }
4325 
4326 /*
4327  * Check if the current rate-limiting parameters permit the sending
4328  * of another address defense announcement for both IPv4 and IPv6.
4329  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4330  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4331  * determines how many address defense announcements are permitted
4332  * in any `defense_perio' interval.
4333  */
4334 static boolean_t
ill_defend_rate_limit(ill_t * ill,ncec_t * ncec)4335 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4336 {
4337 	clock_t		now = ddi_get_lbolt();
4338 	ip_stack_t	*ipst = ill->ill_ipst;
4339 	clock_t		start = ill->ill_defend_start;
4340 	uint32_t	elapsed, defend_period, defend_rate;
4341 	nce_resched_t	ncert;
4342 	boolean_t	ret;
4343 	int		i;
4344 
4345 	if (ill->ill_isv6) {
4346 		defend_period = ipst->ips_ndp_defend_period;
4347 		defend_rate = ipst->ips_ndp_defend_rate;
4348 	} else {
4349 		defend_period = ipst->ips_arp_defend_period;
4350 		defend_rate = ipst->ips_arp_defend_rate;
4351 	}
4352 	if (defend_rate == 0)
4353 		return (B_TRUE);
4354 	bzero(&ncert, sizeof (ncert));
4355 	mutex_enter(&ill->ill_lock);
4356 	if (start > 0) {
4357 		elapsed = now - start;
4358 		if (elapsed > SEC_TO_TICK(defend_period)) {
4359 			ill->ill_defend_start = now;
4360 			/*
4361 			 * nce_ill_reschedule will attempt to
4362 			 * prevent starvation by reschduling the
4363 			 * oldest entries, which are marked with
4364 			 * the NCE_F_DELAYED flag.
4365 			 */
4366 			nce_ill_reschedule(ill, &ncert);
4367 		}
4368 	} else {
4369 		ill->ill_defend_start = now;
4370 	}
4371 	ASSERT(ill->ill_defend_count <= defend_rate);
4372 	mutex_enter(&ncec->ncec_lock);
4373 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4374 		/*
4375 		 * This ncec was rescheduled as one of the really old
4376 		 * entries needing on-going defense. The
4377 		 * ill_defend_count was already incremented in
4378 		 * nce_ill_reschedule. Go ahead and send the announce.
4379 		 */
4380 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4381 		mutex_exit(&ncec->ncec_lock);
4382 		ret = B_FALSE;
4383 		goto done;
4384 	}
4385 	mutex_exit(&ncec->ncec_lock);
4386 	if (ill->ill_defend_count < defend_rate)
4387 		ill->ill_defend_count++;
4388 	if (ill->ill_defend_count == defend_rate) {
4389 		/*
4390 		 * we are no longer allowed to send unbidden defense
4391 		 * messages. Wait for rescheduling.
4392 		 */
4393 		ret = B_TRUE;
4394 	} else {
4395 		ret = B_FALSE;
4396 	}
4397 done:
4398 	mutex_exit(&ill->ill_lock);
4399 	/*
4400 	 * After all the locks have been dropped we can restart nce timer,
4401 	 * and refrele the delayed ncecs
4402 	 */
4403 	for (i = 0; i < ncert.ncert_num; i++) {
4404 		clock_t	xmit_interval;
4405 		ncec_t	*tmp;
4406 
4407 		tmp = ncert.ncert_nces[i];
4408 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4409 		    B_FALSE);
4410 		nce_restart_timer(tmp, xmit_interval);
4411 		ncec_refrele(tmp);
4412 	}
4413 	return (ret);
4414 }
4415 
4416 boolean_t
ndp_announce(ncec_t * ncec)4417 ndp_announce(ncec_t *ncec)
4418 {
4419 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4420 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4421 	    nce_advert_flags(ncec)));
4422 }
4423 
4424 ill_t *
nce_resolve_src(ncec_t * ncec,in6_addr_t * src)4425 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4426 {
4427 	mblk_t		*mp;
4428 	in6_addr_t	src6;
4429 	ipaddr_t	src4;
4430 	ill_t		*ill = ncec->ncec_ill;
4431 	ill_t		*src_ill = NULL;
4432 	ipif_t		*ipif = NULL;
4433 	boolean_t	is_myaddr = NCE_MYADDR(ncec);
4434 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4435 
4436 	ASSERT(src != NULL);
4437 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4438 	src4 = 0;
4439 	src6 = *src;
4440 	if (is_myaddr) {
4441 		src6 = ncec->ncec_addr;
4442 		if (!isv6)
4443 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4444 	} else {
4445 		/*
4446 		 * try to find one from the outgoing packet.
4447 		 */
4448 		mutex_enter(&ncec->ncec_lock);
4449 		mp = ncec->ncec_qd_mp;
4450 		if (mp != NULL) {
4451 			if (isv6) {
4452 				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
4453 
4454 				src6 = ip6h->ip6_src;
4455 			} else {
4456 				ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4457 
4458 				src4 = ipha->ipha_src;
4459 				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4460 			}
4461 		}
4462 		mutex_exit(&ncec->ncec_lock);
4463 	}
4464 
4465 	/*
4466 	 * For outgoing packets, if the src of outgoing packet is one
4467 	 * of the assigned interface addresses use it, otherwise we
4468 	 * will pick the source address below.
4469 	 * For local addresses (is_myaddr) doing DAD, NDP announce
4470 	 * messages are mcast. So we use the (IPMP) cast_ill or the
4471 	 * (non-IPMP) ncec_ill for these message types. The only case
4472 	 * of unicast DAD messages are for IPv6 ND probes, for which
4473 	 * we find the ipif_bound_ill corresponding to the ncec_addr.
4474 	 */
4475 	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4476 		if (isv6) {
4477 			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4478 			    ill->ill_ipst);
4479 		} else {
4480 			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4481 			    ill->ill_ipst);
4482 		}
4483 
4484 		/*
4485 		 * If no relevant ipif can be found, then it's not one of our
4486 		 * addresses.  Reset to :: and try to find a src for the NS or
4487 		 * ARP request using ipif_select_source_v[4,6]  below.
4488 		 * If an ipif can be found, but it's not yet done with
4489 		 * DAD verification, and we are not being invoked for
4490 		 * DAD (i.e., !is_myaddr), then just postpone this
4491 		 * transmission until later.
4492 		 */
4493 		if (ipif == NULL) {
4494 			src6 = ipv6_all_zeros;
4495 			src4 = INADDR_ANY;
4496 		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
4497 			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4498 			    ncec_t *, ncec, ipif_t *, ipif);
4499 			ipif_refrele(ipif);
4500 			return (NULL);
4501 		}
4502 	}
4503 
4504 	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4505 		/*
4506 		 * Pick a source address for this solicitation, but
4507 		 * restrict the selection to addresses assigned to the
4508 		 * output interface.  We do this because the destination will
4509 		 * create a neighbor cache entry for the source address of
4510 		 * this packet, so the source address had better be a valid
4511 		 * neighbor.
4512 		 */
4513 		if (isv6) {
4514 			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4515 			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4516 			    B_FALSE, NULL);
4517 		} else {
4518 			ipaddr_t nce_addr;
4519 
4520 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4521 			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4522 			    B_FALSE, NULL);
4523 		}
4524 		if (ipif == NULL && IS_IPMP(ill)) {
4525 			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4526 
4527 			if (send_ill != NULL) {
4528 				if (isv6) {
4529 					ipif = ipif_select_source_v6(send_ill,
4530 					    &ncec->ncec_addr, B_TRUE,
4531 					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4532 					    B_FALSE, NULL);
4533 				} else {
4534 					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4535 					    src4);
4536 					ipif = ipif_select_source_v4(send_ill,
4537 					    src4, ALL_ZONES, B_TRUE, NULL);
4538 				}
4539 				ill_refrele(send_ill);
4540 			}
4541 		}
4542 
4543 		if (ipif == NULL) {
4544 			char buf[INET6_ADDRSTRLEN];
4545 
4546 			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4547 			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
4548 			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4549 			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4550 			return (NULL);
4551 		}
4552 		src6 = ipif->ipif_v6lcl_addr;
4553 	}
4554 	*src = src6;
4555 	if (ipif != NULL) {
4556 		src_ill = ipif->ipif_ill;
4557 		if (IS_IPMP(src_ill))
4558 			src_ill = ipmp_ipif_hold_bound_ill(ipif);
4559 		else
4560 			ill_refhold(src_ill);
4561 		ipif_refrele(ipif);
4562 		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4563 		    ill_t *, src_ill);
4564 	}
4565 	return (src_ill);
4566 }
4567 
4568 void
ip_nce_lookup_and_update(ipaddr_t * addr,ipif_t * ipif,ip_stack_t * ipst,uchar_t * hwaddr,int hwaddr_len,int flags)4569 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4570     uchar_t *hwaddr, int hwaddr_len, int flags)
4571 {
4572 	ill_t	*ill;
4573 	ncec_t	*ncec;
4574 	nce_t	*nce;
4575 	uint16_t new_state;
4576 
4577 	ill = (ipif ? ipif->ipif_ill : NULL);
4578 	if (ill != NULL) {
4579 		/*
4580 		 * only one ncec is possible
4581 		 */
4582 		nce = nce_lookup_v4(ill, addr);
4583 		if (nce != NULL) {
4584 			ncec = nce->nce_common;
4585 			mutex_enter(&ncec->ncec_lock);
4586 			if (NCE_ISREACHABLE(ncec))
4587 				new_state = ND_UNCHANGED;
4588 			else
4589 				new_state = ND_STALE;
4590 			ncec->ncec_flags = flags;
4591 			nce_update(ncec, new_state, hwaddr);
4592 			mutex_exit(&ncec->ncec_lock);
4593 			nce_refrele(nce);
4594 			return;
4595 		}
4596 	} else {
4597 		/*
4598 		 * ill is wildcard; clean up all ncec's and ire's
4599 		 * that match on addr.
4600 		 */
4601 		nce_hw_map_t hwm;
4602 
4603 		hwm.hwm_addr = *addr;
4604 		hwm.hwm_hwlen = hwaddr_len;
4605 		hwm.hwm_hwaddr = hwaddr;
4606 		hwm.hwm_flags = flags;
4607 
4608 		ncec_walk_common(ipst->ips_ndp4, NULL,
4609 		    nce_update_hw_changed, &hwm, B_TRUE);
4610 	}
4611 }
4612 
4613 /*
4614  * Common function to add ncec entries.
4615  * we always add the ncec with ncec_ill == ill, and always create
4616  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4617  * ncec is !reachable.
4618  *
4619  * When the caller passes in an nce_state of ND_UNCHANGED,
4620  * nce_add_common() will determine the state of the created nce based
4621  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4622  * be created with state set to the passed in nce_state.
4623  */
4624 static int
nce_add_common(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * addr,uint16_t flags,uint16_t nce_state,nce_t ** retnce)4625 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4626     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4627 {
4628 	static	ncec_t		nce_nil;
4629 	uchar_t			*template = NULL;
4630 	int			err;
4631 	ncec_t			*ncec;
4632 	ncec_t			**ncep;
4633 	ip_stack_t		*ipst = ill->ill_ipst;
4634 	uint16_t		state;
4635 	boolean_t		fastprobe = B_FALSE;
4636 	struct ndp_g_s		*ndp;
4637 	nce_t			*nce = NULL;
4638 	list_t			graveyard;
4639 	mblk_t			*dlur_mp = NULL;
4640 
4641 	if (ill->ill_isv6)
4642 		ndp = ill->ill_ipst->ips_ndp6;
4643 	else
4644 		ndp = ill->ill_ipst->ips_ndp4;
4645 
4646 	*retnce = NULL;
4647 	state = 0;
4648 
4649 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4650 
4651 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4652 		ip0dbg(("nce_add_common: no addr\n"));
4653 		return (EINVAL);
4654 	}
4655 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4656 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4657 		return (EINVAL);
4658 	}
4659 
4660 	if (ill->ill_isv6) {
4661 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4662 	} else {
4663 		ipaddr_t v4addr;
4664 
4665 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4666 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4667 	}
4668 
4669 	/*
4670 	 * The caller has ensured that there is no nce on ill, but there could
4671 	 * still be an nce_common_t for the address, so that we find exisiting
4672 	 * ncec_t strucutures first, and atomically add a new nce_t if
4673 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4674 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4675 	 * compare for matches across the illgrp because this function is
4676 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4677 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4678 	 * appropriate.
4679 	 */
4680 	ncec = *ncep;
4681 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4682 		if (ncec->ncec_ill == ill) {
4683 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4684 				/*
4685 				 * We should never find *retnce to be
4686 				 * MYADDR, since the caller may then
4687 				 * incorrectly restart a DAD timer that's
4688 				 * already running.  However, if we are in
4689 				 * forwarding mode, and the interface is
4690 				 * moving in/out of groups, the data
4691 				 * path ire lookup (e.g., ire_revalidate_nce)
4692 				 * may  have determined that some destination
4693 				 * is offlink while the control path is adding
4694 				 * that address as a local address.
4695 				 * Recover from  this case by failing the
4696 				 * lookup
4697 				 */
4698 				if (NCE_MYADDR(ncec))
4699 					return (ENXIO);
4700 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4701 				if (*retnce != NULL)
4702 					break;
4703 			}
4704 		}
4705 	}
4706 	if (*retnce != NULL) /* caller must trigger fastpath on nce */
4707 		return (0);
4708 
4709 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4710 	if (ncec == NULL)
4711 		return (ENOMEM);
4712 	*ncec = nce_nil;
4713 	ncec->ncec_ill = ill;
4714 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4715 	ncec->ncec_flags = flags;
4716 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4717 
4718 	if (!ill->ill_isv6) {
4719 		ipaddr_t addr4;
4720 
4721 		/*
4722 		 * DAD probe interval and probe count are set based on
4723 		 * fast/slow probe settings. If the underlying link doesn't
4724 		 * have reliably up/down notifications or if we're working
4725 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
4726 		 * don't use the fast timers.  Otherwise, use them.
4727 		 */
4728 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4729 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4730 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4731 			fastprobe = B_TRUE;
4732 		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4733 		    !IS_IPV4_LL_SPACE(&addr4)) {
4734 			ill_t *hwaddr_ill;
4735 
4736 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4737 			    hw_addr_len);
4738 			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4739 				fastprobe = B_TRUE;
4740 		}
4741 		if (fastprobe) {
4742 			ncec->ncec_xmit_interval =
4743 			    ipst->ips_arp_fastprobe_interval;
4744 			ncec->ncec_pcnt =
4745 			    ipst->ips_arp_fastprobe_count;
4746 			ncec->ncec_flags |= NCE_F_FAST;
4747 		} else {
4748 			ncec->ncec_xmit_interval =
4749 			    ipst->ips_arp_probe_interval;
4750 			ncec->ncec_pcnt =
4751 			    ipst->ips_arp_probe_count;
4752 		}
4753 		if (NCE_PUBLISH(ncec)) {
4754 			ncec->ncec_unsolicit_count =
4755 			    ipst->ips_ip_arp_publish_count;
4756 		}
4757 	} else {
4758 		/*
4759 		 * probe interval is constant: ILL_PROBE_INTERVAL
4760 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4761 		 */
4762 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4763 		if (NCE_PUBLISH(ncec)) {
4764 			ncec->ncec_unsolicit_count =
4765 			    ipst->ips_ip_ndp_unsolicit_count;
4766 		}
4767 	}
4768 	ncec->ncec_rcnt = ill->ill_xmit_count;
4769 	ncec->ncec_addr = *addr;
4770 	ncec->ncec_qd_mp = NULL;
4771 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4772 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4773 	ncec->ncec_trace_disable = B_FALSE;
4774 
4775 	/*
4776 	 * ncec_lladdr holds link layer address
4777 	 */
4778 	if (hw_addr_len > 0) {
4779 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4780 		if (template == NULL) {
4781 			err = ENOMEM;
4782 			goto err_ret;
4783 		}
4784 		ncec->ncec_lladdr = template;
4785 		ncec->ncec_lladdr_length = hw_addr_len;
4786 		bzero(ncec->ncec_lladdr, hw_addr_len);
4787 	}
4788 	if ((flags & NCE_F_BCAST) != 0) {
4789 		state = ND_REACHABLE;
4790 		ASSERT(hw_addr_len > 0);
4791 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4792 		state = ND_INITIAL;
4793 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4794 		/*
4795 		 * NORESOLVER entries are always created in the REACHABLE
4796 		 * state.
4797 		 */
4798 		state = ND_REACHABLE;
4799 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4800 		    ill->ill_mactype != DL_IPV4 &&
4801 		    ill->ill_mactype != DL_6TO4) {
4802 			/*
4803 			 * We create a nce_res_mp with the IP nexthop address
4804 			 * as the destination address if the physical length
4805 			 * is exactly 4 bytes for point-to-multipoint links
4806 			 * that do their own resolution from IP to link-layer
4807 			 * address (e.g. IP over X.25).
4808 			 */
4809 			bcopy((uchar_t *)addr,
4810 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4811 		}
4812 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4813 		    ill->ill_mactype != DL_IPV6) {
4814 			/*
4815 			 * We create a nce_res_mp with the IP nexthop address
4816 			 * as the destination address if the physical legnth
4817 			 * is exactly 16 bytes for point-to-multipoint links
4818 			 * that do their own resolution from IP to link-layer
4819 			 * address.
4820 			 */
4821 			bcopy((uchar_t *)addr,
4822 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4823 		}
4824 		/*
4825 		 * Since NUD is not part of the base IPv4 protocol definition,
4826 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4827 		 * age, and are marked NCE_F_NONUD.
4828 		 */
4829 		if (!ill->ill_isv6)
4830 			ncec->ncec_flags |= NCE_F_NONUD;
4831 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4832 		state = ND_REACHABLE;
4833 	}
4834 
4835 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4836 		/*
4837 		 * We are adding an ncec with a deterministic hw_addr,
4838 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4839 		 *
4840 		 * if we are adding a unicast ncec for the local address
4841 		 * it would be REACHABLE; we would be adding a ND_STALE entry
4842 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4843 		 * addresses are added in PROBE to trigger DAD.
4844 		 */
4845 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4846 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4847 			state = ND_REACHABLE;
4848 		else if (!NCE_PUBLISH(ncec))
4849 			state = ND_STALE;
4850 		else
4851 			state = ND_PROBE;
4852 		if (hw_addr != NULL)
4853 			nce_set_ll(ncec, hw_addr);
4854 	}
4855 	/* caller overrides internally computed state */
4856 	if (nce_state != ND_UNCHANGED)
4857 		state = nce_state;
4858 
4859 	if (state == ND_PROBE)
4860 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4861 
4862 	ncec->ncec_state = state;
4863 
4864 	if (state == ND_REACHABLE) {
4865 		ncec->ncec_last = ncec->ncec_init_time =
4866 		    TICK_TO_MSEC(ddi_get_lbolt64());
4867 	} else {
4868 		ncec->ncec_last = 0;
4869 		if (state == ND_INITIAL)
4870 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4871 	}
4872 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4873 	    offsetof(ncec_cb_t, ncec_cb_node));
4874 	/*
4875 	 * have all the memory allocations out of the way before taking locks
4876 	 * and adding the nce.
4877 	 */
4878 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4879 	if (nce == NULL) {
4880 		err = ENOMEM;
4881 		goto err_ret;
4882 	}
4883 	if (ncec->ncec_lladdr != NULL ||
4884 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4885 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4886 		    ill->ill_phys_addr_length, ill->ill_sap,
4887 		    ill->ill_sap_length);
4888 		if (dlur_mp == NULL) {
4889 			err = ENOMEM;
4890 			goto err_ret;
4891 		}
4892 	}
4893 
4894 	/*
4895 	 * Atomically ensure that the ill is not CONDEMNED, before
4896 	 * adding the NCE.
4897 	 */
4898 	mutex_enter(&ill->ill_lock);
4899 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4900 		mutex_exit(&ill->ill_lock);
4901 		err = EINVAL;
4902 		goto err_ret;
4903 	}
4904 	if (!NCE_MYADDR(ncec) &&
4905 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4906 		mutex_exit(&ill->ill_lock);
4907 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4908 		err = EINVAL;
4909 		goto err_ret;
4910 	}
4911 	/*
4912 	 * Acquire the ncec_lock even before adding the ncec to the list
4913 	 * so that it cannot get deleted after the ncec is added, but
4914 	 * before we add the nce.
4915 	 */
4916 	mutex_enter(&ncec->ncec_lock);
4917 	if ((ncec->ncec_next = *ncep) != NULL)
4918 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4919 	*ncep = ncec;
4920 	ncec->ncec_ptpn = ncep;
4921 
4922 	/* Bump up the number of ncec's referencing this ill */
4923 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4924 	    (char *), "ncec", (void *), ncec);
4925 	ill->ill_ncec_cnt++;
4926 	/*
4927 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4928 	 * condemned, and we can safely add the nce.
4929 	 */
4930 	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
4931 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
4932 	mutex_exit(&ncec->ncec_lock);
4933 	mutex_exit(&ill->ill_lock);
4934 	nce_graveyard_free(&graveyard);
4935 
4936 	/* caller must trigger fastpath on *retnce */
4937 	return (0);
4938 
4939 err_ret:
4940 	if (ncec != NULL)
4941 		kmem_cache_free(ncec_cache, ncec);
4942 	if (nce != NULL)
4943 		kmem_cache_free(nce_cache, nce);
4944 	freemsg(dlur_mp);
4945 	if (template != NULL)
4946 		kmem_free(template, ill->ill_phys_addr_length);
4947 	return (err);
4948 }
4949 
4950 /*
4951  * take a ref on the nce
4952  */
4953 void
nce_refhold(nce_t * nce)4954 nce_refhold(nce_t *nce)
4955 {
4956 	mutex_enter(&nce->nce_lock);
4957 	nce->nce_refcnt++;
4958 	ASSERT((nce)->nce_refcnt != 0);
4959 	mutex_exit(&nce->nce_lock);
4960 }
4961 
4962 /*
4963  * release a ref on the nce; In general, this
4964  * cannot be called with locks held because nce_inactive
4965  * may result in nce_inactive which will take the ill_lock,
4966  * do ipif_ill_refrele_tail etc. Thus the one exception
4967  * where this can be called with locks held is when the caller
4968  * is certain that the nce_refcnt is sufficient to prevent
4969  * the invocation of nce_inactive.
4970  */
4971 void
nce_refrele(nce_t * nce)4972 nce_refrele(nce_t *nce)
4973 {
4974 	ASSERT((nce)->nce_refcnt != 0);
4975 	mutex_enter(&nce->nce_lock);
4976 	if (--nce->nce_refcnt == 0)
4977 		nce_inactive(nce); /* destroys the mutex */
4978 	else
4979 		mutex_exit(&nce->nce_lock);
4980 }
4981 
4982 /*
4983  * free the nce after all refs have gone away.
4984  */
4985 static void
nce_inactive(nce_t * nce)4986 nce_inactive(nce_t *nce)
4987 {
4988 	ill_t *ill = nce->nce_ill;
4989 
4990 	ASSERT(nce->nce_refcnt == 0);
4991 
4992 	ncec_refrele_notr(nce->nce_common);
4993 	nce->nce_common = NULL;
4994 	freemsg(nce->nce_fp_mp);
4995 	freemsg(nce->nce_dlur_mp);
4996 
4997 	mutex_enter(&ill->ill_lock);
4998 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4999 	    (char *), "nce", (void *), nce);
5000 	ill->ill_nce_cnt--;
5001 	nce->nce_ill = NULL;
5002 	/*
5003 	 * If the number of ncec's associated with this ill have dropped
5004 	 * to zero, check whether we need to restart any operation that
5005 	 * is waiting for this to happen.
5006 	 */
5007 	if (ILL_DOWN_OK(ill)) {
5008 		/* ipif_ill_refrele_tail drops the ill_lock */
5009 		ipif_ill_refrele_tail(ill);
5010 	} else {
5011 		mutex_exit(&ill->ill_lock);
5012 	}
5013 
5014 	mutex_destroy(&nce->nce_lock);
5015 	kmem_cache_free(nce_cache, nce);
5016 }
5017 
5018 /*
5019  * Add an nce to the ill_nce list.
5020  *
5021  * Adding multicast NCEs is subject to a per-ill limit. This function returns
5022  * NULL if that's the case, and it may reap a number of multicast nces.
5023  * Callers (and upstack) must be able to cope with NULL returns.
5024  */
5025 static nce_t *
nce_add_impl(ill_t * ill,ncec_t * ncec,nce_t * nce,mblk_t * dlur_mp,list_t * graveyard)5026 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
5027     list_t *graveyard)
5028 {
5029 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5030 
5031 	if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
5032 		if (nce_too_many_mcast(ill, graveyard)) {
5033 			kmem_cache_free(nce_cache, nce);
5034 			return (NULL);
5035 		}
5036 		ill->ill_mcast_nces++;
5037 	}
5038 
5039 	bzero(nce, sizeof (*nce));
5040 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
5041 	nce->nce_common = ncec;
5042 	nce->nce_addr = ncec->ncec_addr;
5043 	nce->nce_ill = ill;
5044 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
5045 	    (char *), "nce", (void *), nce);
5046 	ill->ill_nce_cnt++;
5047 
5048 	nce->nce_refcnt = 1; /* for the thread */
5049 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
5050 	nce->nce_dlur_mp = dlur_mp;
5051 
5052 	/* add nce to the ill's fastpath list.  */
5053 	nce->nce_refcnt++; /* for the list */
5054 	list_insert_head(&ill->ill_nce, nce);
5055 	return (nce);
5056 }
5057 
5058 static nce_t *
nce_add(ill_t * ill,ncec_t * ncec,list_t * graveyard)5059 nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
5060 {
5061 	nce_t	*nce;
5062 	mblk_t	*dlur_mp = NULL;
5063 
5064 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5065 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
5066 
5067 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
5068 	if (nce == NULL)
5069 		return (NULL);
5070 	if (ncec->ncec_lladdr != NULL ||
5071 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
5072 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
5073 		    ill->ill_phys_addr_length, ill->ill_sap,
5074 		    ill->ill_sap_length);
5075 		if (dlur_mp == NULL) {
5076 			kmem_cache_free(nce_cache, nce);
5077 			return (NULL);
5078 		}
5079 	}
5080 	/*
5081 	 * If nce_add_impl() returns NULL due to on multicast limiting, caller
5082 	 * will (correctly) assume ENOMEM.
5083 	 */
5084 	return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
5085 }
5086 
5087 /*
5088  * remove the nce from the ill_faspath list
5089  */
5090 void
nce_delete(nce_t * nce)5091 nce_delete(nce_t *nce)
5092 {
5093 	ill_t	*ill = nce->nce_ill;
5094 
5095 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5096 
5097 	mutex_enter(&nce->nce_lock);
5098 	if (nce->nce_is_condemned) {
5099 		/*
5100 		 * some other thread has removed this nce from the ill_nce list
5101 		 */
5102 		mutex_exit(&nce->nce_lock);
5103 		return;
5104 	}
5105 	nce->nce_is_condemned = B_TRUE;
5106 	mutex_exit(&nce->nce_lock);
5107 
5108 	/* Update the count of multicast NCEs. */
5109 	if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
5110 		ill->ill_mcast_nces--;
5111 
5112 	list_remove(&ill->ill_nce, nce);
5113 	/*
5114 	 * even though we are holding the ill_lock, it is ok to
5115 	 * call nce_refrele here because we know that we should have
5116 	 * at least 2 refs on the nce: one for the thread, and one
5117 	 * for the list. The refrele below will release the one for
5118 	 * the list.
5119 	 */
5120 	nce_refrele(nce);
5121 }
5122 
5123 nce_t *
nce_lookup(ill_t * ill,const in6_addr_t * addr)5124 nce_lookup(ill_t *ill, const in6_addr_t *addr)
5125 {
5126 	nce_t *nce = NULL;
5127 
5128 	ASSERT(ill != NULL);
5129 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5130 
5131 	for (nce = list_head(&ill->ill_nce); nce != NULL;
5132 	    nce = list_next(&ill->ill_nce, nce)) {
5133 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
5134 			break;
5135 	}
5136 
5137 	/*
5138 	 * if we found the nce on the ill_nce list while holding
5139 	 * the ill_lock, then it cannot be condemned yet.
5140 	 */
5141 	if (nce != NULL) {
5142 		ASSERT(!nce->nce_is_condemned);
5143 		nce_refhold(nce);
5144 	}
5145 	return (nce);
5146 }
5147 
5148 /*
5149  * Walk the ill_nce list on ill. The callback function func() cannot perform
5150  * any destructive actions.
5151  */
5152 static void
nce_walk_common(ill_t * ill,pfi_t func,void * arg)5153 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
5154 {
5155 	nce_t *nce = NULL, *nce_next;
5156 
5157 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5158 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
5159 		nce_next = list_next(&ill->ill_nce, nce);
5160 		if (func(ill, nce, arg) != 0)
5161 			break;
5162 		nce = nce_next;
5163 	}
5164 }
5165 
5166 void
nce_walk(ill_t * ill,pfi_t func,void * arg)5167 nce_walk(ill_t *ill, pfi_t func, void *arg)
5168 {
5169 	mutex_enter(&ill->ill_lock);
5170 	nce_walk_common(ill, func, arg);
5171 	mutex_exit(&ill->ill_lock);
5172 }
5173 
5174 void
nce_flush(ill_t * ill,boolean_t flushall)5175 nce_flush(ill_t *ill, boolean_t flushall)
5176 {
5177 	nce_t *nce, *nce_next;
5178 	list_t dead;
5179 
5180 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
5181 	mutex_enter(&ill->ill_lock);
5182 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
5183 		nce_next = list_next(&ill->ill_nce, nce);
5184 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
5185 			nce = nce_next;
5186 			continue;
5187 		}
5188 		/*
5189 		 * nce_delete requires that the caller should either not
5190 		 * be holding locks, or should hold a ref to ensure that
5191 		 * we wont hit ncec_inactive. So take a ref and clean up
5192 		 * after the list is flushed.
5193 		 */
5194 		nce_refhold(nce);
5195 		nce_delete(nce);
5196 		list_insert_tail(&dead, nce);
5197 		nce = nce_next;
5198 	}
5199 	mutex_exit(&ill->ill_lock);
5200 	while ((nce = list_head(&dead)) != NULL) {
5201 		list_remove(&dead, nce);
5202 		nce_refrele(nce);
5203 	}
5204 	ASSERT(list_is_empty(&dead));
5205 	list_destroy(&dead);
5206 }
5207 
5208 /* Return an interval that is anywhere in the [1 .. intv] range */
5209 static clock_t
nce_fuzz_interval(clock_t intv,boolean_t initial_time)5210 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
5211 {
5212 	clock_t rnd, frac;
5213 
5214 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
5215 	/* Note that clock_t is signed; must chop off bits */
5216 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
5217 	if (initial_time) {
5218 		if (intv <= 0)
5219 			intv = 1;
5220 		else
5221 			intv = (rnd % intv) + 1;
5222 	} else {
5223 		/* Compute 'frac' as 20% of the configured interval */
5224 		if ((frac = intv / 5) <= 1)
5225 			frac = 2;
5226 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
5227 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
5228 			intv = 1;
5229 	}
5230 	return (intv);
5231 }
5232 
5233 void
nce_resolv_ipmp_ok(ncec_t * ncec)5234 nce_resolv_ipmp_ok(ncec_t *ncec)
5235 {
5236 	mblk_t *mp;
5237 	uint_t pkt_len;
5238 	iaflags_t ixaflags = IXAF_NO_TRACE;
5239 	nce_t *under_nce;
5240 	ill_t	*ill = ncec->ncec_ill;
5241 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
5242 	ipif_t *src_ipif = NULL;
5243 	ip_stack_t *ipst = ill->ill_ipst;
5244 	ill_t *send_ill;
5245 	uint_t nprobes;
5246 
5247 	ASSERT(IS_IPMP(ill));
5248 
5249 	mutex_enter(&ncec->ncec_lock);
5250 	nprobes = ncec->ncec_nprobes;
5251 	mp = ncec->ncec_qd_mp;
5252 	ncec->ncec_qd_mp = NULL;
5253 	ncec->ncec_nprobes = 0;
5254 	mutex_exit(&ncec->ncec_lock);
5255 
5256 	while (mp != NULL) {
5257 		mblk_t *nxt_mp;
5258 
5259 		nxt_mp = mp->b_next;
5260 		mp->b_next = NULL;
5261 		if (isv6) {
5262 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
5263 
5264 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
5265 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
5266 			    ill, ALL_ZONES, ipst);
5267 		} else {
5268 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
5269 
5270 			ixaflags |= IXAF_IS_IPV4;
5271 			pkt_len = ntohs(ipha->ipha_length);
5272 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5273 			    ill, ALL_ZONES, ipst);
5274 		}
5275 
5276 		/*
5277 		 * find a new nce based on an under_ill. The first IPMP probe
5278 		 * packet gets queued, so we could still find a src_ipif that
5279 		 * matches an IPMP test address.
5280 		 */
5281 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5282 			/*
5283 			 * if src_ipif is null, this could be either a
5284 			 * forwarded packet or a probe whose src got deleted.
5285 			 * We identify the former case by looking for the
5286 			 * ncec_nprobes: the first ncec_nprobes packets are
5287 			 * probes;
5288 			 */
5289 			if (src_ipif == NULL && nprobes > 0)
5290 				goto drop_pkt;
5291 
5292 			/*
5293 			 * For forwarded packets, we use the ipmp rotor
5294 			 * to find send_ill.
5295 			 */
5296 			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5297 			    B_TRUE);
5298 		} else {
5299 			send_ill = src_ipif->ipif_ill;
5300 			ill_refhold(send_ill);
5301 		}
5302 
5303 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5304 		    (ncec_t *), ncec, (ipif_t *),
5305 		    src_ipif, (ill_t *), send_ill);
5306 
5307 		if (send_ill == NULL) {
5308 			if (src_ipif != NULL)
5309 				ipif_refrele(src_ipif);
5310 			goto drop_pkt;
5311 		}
5312 		/* create an under_nce on send_ill */
5313 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5314 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5315 			under_nce = nce_fastpath_create(send_ill, ncec);
5316 		else
5317 			under_nce = NULL;
5318 		rw_exit(&ipst->ips_ill_g_lock);
5319 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5320 			nce_fastpath_trigger(under_nce);
5321 
5322 		ill_refrele(send_ill);
5323 		if (src_ipif != NULL)
5324 			ipif_refrele(src_ipif);
5325 
5326 		if (under_nce != NULL) {
5327 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5328 			    ALL_ZONES, 0, NULL);
5329 			nce_refrele(under_nce);
5330 			if (nprobes > 0)
5331 				nprobes--;
5332 			mp = nxt_mp;
5333 			continue;
5334 		}
5335 drop_pkt:
5336 		if (isv6) {
5337 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5338 		} else {
5339 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5340 		}
5341 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5342 		freemsg(mp);
5343 		if (nprobes > 0)
5344 			nprobes--;
5345 		mp = nxt_mp;
5346 	}
5347 	ncec_cb_dispatch(ncec); /* complete callbacks */
5348 }
5349